From d838711ca87eee898b8a6264118c56842287aeca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Sat, 16 Sep 2023 23:14:39 -0300 Subject: [PATCH 01/18] Revert "Remove urllib3 dependency" This reverts commit 09114777c3e599cbe7a393dab6816eb4d71381c9. urllib3 is needed to postprocess/parse/URL for sanitization and privacy purpose (#192) --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 1e1e948..3893c3a 100644 --- a/setup.py +++ b/setup.py @@ -26,6 +26,7 @@ }, install_requires=[ 'internetarchive', + 'urllib3==1.26.13', 'docopt==0.6.2', 'yt-dlp', ] From add619cf92aec85129dd5a8152bb5446b7cb10f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Sat, 16 Sep 2023 23:12:58 -0300 Subject: [PATCH 02/18] IA currently leaks the IP address of the submitter. This is bad. We fix this by carefully redacting the IP address in the JSON fields known to contain it. --- tubeup/TubeUp.py | 7 ++++++- tubeup/utils.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index 7dd6801..98bdae7 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -10,7 +10,7 @@ from internetarchive.config import parse_config_file from datetime import datetime from yt_dlp import YoutubeDL -from .utils import (get_itemname, check_is_file_empty, +from .utils import (get_itemname, check_is_file_empty, strip_ip_from_meta, EMPTY_ANNOTATION_FILE) from logging import getLogger from urllib.parse import urlparse @@ -324,6 +324,11 @@ def upload_ia(self, videobasename, custom_meta=None): with open(json_metadata_filepath, 'r', encoding='utf-8') as f: vid_meta = json.load(f) + mod, new_meta = strip_ip_from_meta(vid_meta) + if mod: + with open(json_metadata_filepath, 'w') as f: + json.dump(new_meta, f) + # Exit if video download did not complete, don't upload .part files to IA for ext in ['*.part', '*.f303.*', '*.f302.*', '*.ytdl', '*.f251.*', '*.248.*', 
'*.f247.*', '*.temp']: if glob.glob(videobasename + ext): diff --git a/tubeup/utils.py b/tubeup/utils.py index bc12845..846c9fb 100644 --- a/tubeup/utils.py +++ b/tubeup/utils.py @@ -1,5 +1,6 @@ import os import re +from urllib.parse import urlparse, parse_qs, urlencode EMPTY_ANNOTATION_FILE = ('' @@ -29,3 +30,39 @@ def check_is_file_empty(filepath): return os.stat(filepath).st_size == 0 else: raise FileNotFoundError("Path '%s' doesn't exist" % filepath) + + +def strip_ip_from_url(url): + """ + Strip occurence of IP address as found in path segments like in /ip/1.2.3.4/ + or in an "ip" query-parameter, like in ?ip=1.2.3.4 + """ + u = urlparse(url) + u = u._replace(path=re.sub(r'/ip/[^/]+', r'/ip/REDACTED', u.path)) + if u.query != '': + qs = parse_qs(u.query) + try: + del(qs['ip']) + u = u._replace(query=urlencode(qs, True)) + except: + pass + return u.geturl() + + +def strip_ip_from_meta(meta): + modified = False + if 'url' in meta: + redacted_url = strip_ip_from_url(meta['url']) + if redacted_url != meta['url']: + meta['url'] = redacted_url + modified = True + + for _format in meta['formats']: + for field in ['manifest_url', 'fragment_base_url', 'url']: + if field in _format: + redacted_url = strip_ip_from_url(_format[field]) + if redacted_url != _format[field]: + _format[field] = redacted_url + modified = True + + return modified, meta From 7cb7b3ec81782f2fef37099dc0c5ef57829f4f1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Sat, 16 Sep 2023 23:38:46 -0300 Subject: [PATCH 03/18] added tests --- tests/test_utils.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 8c409d8..224b9c1 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,7 +1,12 @@ import unittest import os -from tubeup.utils import sanitize_identifier, check_is_file_empty +import json +from tubeup.utils import sanitize_identifier, check_is_file_empty, strip_ip_from_meta +current_path 
= os.path.dirname(os.path.realpath(__file__)) + +def get_testfile_path(name): + return os.path.join(current_path, 'test_tubeup_files', name) class UtilsTest(unittest.TestCase): @@ -48,3 +53,14 @@ def test_check_is_file_empty_when_file_doesnt_exist(self): FileNotFoundError, r"^Path 'file_that_doesnt_exist.txt' doesn't exist$"): check_is_file_empty('file_that_doesnt_exist.txt') + + def test_strip_ip_from_meta(self): + with open(get_testfile_path( + 'Mountain_3_-_Video_Background_HD_1080p-6iRV8liah8A.' + 'info.json') + ) as f: + vid_meta = json.load(f) + mod, new_meta = strip_ip_from_meta(vid_meta) + self.assertTrue(mod) + self.assertNotEqual(f.read(), json.dumps(new_meta)) + self.assertNotRegex(json.dumps(new_meta), r'36\.73\.93\.234') From 402ae19223993bbf74b178e18808e75f7a3f320b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Sun, 17 Sep 2023 01:39:40 -0300 Subject: [PATCH 04/18] flake8 linting --- tests/test_utils.py | 2 ++ tubeup/utils.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 224b9c1..813271a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -5,9 +5,11 @@ current_path = os.path.dirname(os.path.realpath(__file__)) + def get_testfile_path(name): return os.path.join(current_path, 'test_tubeup_files', name) + class UtilsTest(unittest.TestCase): def test_preserve_valid_identifiers(self): diff --git a/tubeup/utils.py b/tubeup/utils.py index 846c9fb..2be5b86 100644 --- a/tubeup/utils.py +++ b/tubeup/utils.py @@ -42,9 +42,9 @@ def strip_ip_from_url(url): if u.query != '': qs = parse_qs(u.query) try: - del(qs['ip']) + del (qs['ip']) u = u._replace(query=urlencode(qs, True)) - except: + except KeyError: pass return u.geturl() From 9c2ae72643c471623f871290e32f962c0962cc35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Mon, 18 Sep 2023 18:28:23 -0300 Subject: [PATCH 05/18] Dont leak ip address (#310) * Revert "Remove urllib3 dependency" This 
reverts commit 09114777c3e599cbe7a393dab6816eb4d71381c9. urllib3 is needed to postprocess/parse/URL for sanitization and privacy purpose (#192) * IA currently leaks the IP address of the submitter. This is bad. We fix this by carefully redacting the IP address in the JSON fields known to contain it. * added tests * flake8 linting --- setup.py | 1 + tests/test_utils.py | 20 +++++++++++++++++++- tubeup/TubeUp.py | 7 ++++++- tubeup/utils.py | 37 +++++++++++++++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 1e1e948..3893c3a 100644 --- a/setup.py +++ b/setup.py @@ -26,6 +26,7 @@ }, install_requires=[ 'internetarchive', + 'urllib3==1.26.13', 'docopt==0.6.2', 'yt-dlp', ] diff --git a/tests/test_utils.py b/tests/test_utils.py index 8c409d8..813271a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,13 @@ import unittest import os -from tubeup.utils import sanitize_identifier, check_is_file_empty +import json +from tubeup.utils import sanitize_identifier, check_is_file_empty, strip_ip_from_meta + +current_path = os.path.dirname(os.path.realpath(__file__)) + + +def get_testfile_path(name): + return os.path.join(current_path, 'test_tubeup_files', name) class UtilsTest(unittest.TestCase): @@ -48,3 +55,14 @@ def test_check_is_file_empty_when_file_doesnt_exist(self): FileNotFoundError, r"^Path 'file_that_doesnt_exist.txt' doesn't exist$"): check_is_file_empty('file_that_doesnt_exist.txt') + + def test_strip_ip_from_meta(self): + with open(get_testfile_path( + 'Mountain_3_-_Video_Background_HD_1080p-6iRV8liah8A.' 
+ 'info.json') + ) as f: + vid_meta = json.load(f) + mod, new_meta = strip_ip_from_meta(vid_meta) + self.assertTrue(mod) + self.assertNotEqual(f.read(), json.dumps(new_meta)) + self.assertNotRegex(json.dumps(new_meta), r'36\.73\.93\.234') diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index 7dd6801..98bdae7 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -10,7 +10,7 @@ from internetarchive.config import parse_config_file from datetime import datetime from yt_dlp import YoutubeDL -from .utils import (get_itemname, check_is_file_empty, +from .utils import (get_itemname, check_is_file_empty, strip_ip_from_meta, EMPTY_ANNOTATION_FILE) from logging import getLogger from urllib.parse import urlparse @@ -324,6 +324,11 @@ def upload_ia(self, videobasename, custom_meta=None): with open(json_metadata_filepath, 'r', encoding='utf-8') as f: vid_meta = json.load(f) + mod, new_meta = strip_ip_from_meta(vid_meta) + if mod: + with open(json_metadata_filepath, 'w') as f: + json.dump(new_meta, f) + # Exit if video download did not complete, don't upload .part files to IA for ext in ['*.part', '*.f303.*', '*.f302.*', '*.ytdl', '*.f251.*', '*.248.*', '*.f247.*', '*.temp']: if glob.glob(videobasename + ext): diff --git a/tubeup/utils.py b/tubeup/utils.py index bc12845..2be5b86 100644 --- a/tubeup/utils.py +++ b/tubeup/utils.py @@ -1,5 +1,6 @@ import os import re +from urllib.parse import urlparse, parse_qs, urlencode EMPTY_ANNOTATION_FILE = ('' @@ -29,3 +30,39 @@ def check_is_file_empty(filepath): return os.stat(filepath).st_size == 0 else: raise FileNotFoundError("Path '%s' doesn't exist" % filepath) + + +def strip_ip_from_url(url): + """ + Strip occurence of IP address as found in path segments like in /ip/1.2.3.4/ + or in an "ip" query-parameter, like in ?ip=1.2.3.4 + """ + u = urlparse(url) + u = u._replace(path=re.sub(r'/ip/[^/]+', r'/ip/REDACTED', u.path)) + if u.query != '': + qs = parse_qs(u.query) + try: + del (qs['ip']) + u = u._replace(query=urlencode(qs, True)) + 
except KeyError: + pass + return u.geturl() + + +def strip_ip_from_meta(meta): + modified = False + if 'url' in meta: + redacted_url = strip_ip_from_url(meta['url']) + if redacted_url != meta['url']: + meta['url'] = redacted_url + modified = True + + for _format in meta['formats']: + for field in ['manifest_url', 'fragment_base_url', 'url']: + if field in _format: + redacted_url = strip_ip_from_url(_format[field]) + if redacted_url != _format[field]: + _format[field] = redacted_url + modified = True + + return modified, meta From a7d8b42f3a09c143c271ba7e874e63a406c2698c Mon Sep 17 00:00:00 2001 From: Paul Henning Date: Mon, 18 Sep 2023 18:12:52 -0400 Subject: [PATCH 06/18] URLlib3 dependency version lift - Use latest urllib3 instead of that specific version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3893c3a..d1f9e39 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ }, install_requires=[ 'internetarchive', - 'urllib3==1.26.13', + 'urllib3', 'docopt==0.6.2', 'yt-dlp', ] From d0b9df63cbe199627cf03f78ef45e011e0c03baf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Sun, 17 Sep 2023 01:22:25 -0300 Subject: [PATCH 07/18] Idempotency: Keep a local "archive" (index) of files successfully uploaded the way yt-dlp keep track of downloaded items. - Amend #19 by adding optional idempotency between runs: While concurrent-instances can still rely on a "clean" download directory (#19) a single upload node can use --use-upload-archive can avoid source files removal. Subsequent runs of `tubeup` will omit files already uploaded. 
- Fixes #23 (and part of #233) - NB: Ability for upload_ia() to return None, paves the way to fix #36 or #109 --- tests/test_tubeup.py | 80 ++++++++++++++++++++++++++++++++++++++++++++ tubeup/TubeUp.py | 37 +++++++++++++++----- tubeup/__main__.py | 17 ++++++++-- 3 files changed, 122 insertions(+), 12 deletions(-) diff --git a/tests/test_tubeup.py b/tests/test_tubeup.py index 9983d56..7603e86 100644 --- a/tests/test_tubeup.py +++ b/tests/test_tubeup.py @@ -590,3 +590,83 @@ def test_archive_urls(self): 'scanner': SCANNER})] self.assertEqual(expected_result, result) + + def test_archive_deletion(self): + root_path = os.path.join(current_path, 'test_tubeup_rootdir') + # Clean up before test + shutil.rmtree(root_path, ignore_errors=True) + + tu = TubeUp(dir_path=root_path, + ia_config_path=get_testfile_path('ia_config_for_test.ini')) + + videobasename = os.path.join( + current_path, 'test_tubeup_rootdir', 'downloads', + 'KdsN9YhkDrY') + + copy_testfiles_to_tubeup_rootdir_test() + dest = os.path.join(root_path, 'downloads', '*') + files_before_upload = glob.glob(dest) + + vid_info = {'mediatype': 'movies', + 'creator': 'RelaxingWorld', + 'channel': 'http://www.youtube.com/channel/UCWpsozCMdAnfI16rZHQ9XDg', + 'collection': 'opensource_movies', + 'title': 'Epic Ramadan - Video Background HD1080p', + 'description': ('If you enjoy my work, please consider Subscribe to my NEW ' + 'channel for more videos:
' + 'https://www.youtube.com/MusicForRelaxation?sub_confirmation=1
' + '▷ If you use this video, please put credits to my channel ' + 'in description:
' + 'Source from RelaxingWorld: https://goo.gl/HsW75m
' + '
' + '▷ Also, do not forget to Subscribe to my channel. Thanks!'), + 'date': '2016-06-25', + 'year': '2016', + 'subject': ('Youtube;video;Film & Animation;Video Background;' + 'Footage;Animation;Cinema;Royalty Free Videos;' + 'Stock Video Footage;Video Backdrops;' + 'Amazing Nature;youtube;HD;1080p;Creative Commons Videos;' + 'relaxing music;Ramadan;'), + 'originalurl': 'https://www.youtube.com/watch?v=KdsN9YhkDrY', + 'licenseurl': '', + 'scanner': SCANNER} + + with requests_mock.Mocker() as m: + # Mock the request to s3.us.archive.org, so it will responds + # a custom json. `internetarchive` library sends GET request to + # that url to check that we don't violate the upload limit. + m.get('https://s3.us.archive.org', + content=b'{"over_limit": 0}', + headers={'content-type': 'application/json'}) + + m.get('https://archive.org/metadata/youtube-KdsN9YhkDrY', + content=b'{}', + headers={'content-type': 'application/json'}) + + # Mock the PUT requests for internetarchive urls that defined + # in mock_upload_response_by_videobasename(), so this test + # doesn't perform upload to the real archive.org server. + mock_upload_response_by_videobasename( + m, 'youtube-KdsN9YhkDrY', videobasename) + + # First upload, this actually get uploaded... + result = list(tu.archive_urls( + ['https://www.youtube.com/watch?v=KdsN9YhkDrY'], use_upload_archive=True)) + + # ... and returns a remote IA item name + expected_result = [('youtube-KdsN9YhkDrY', vid_info)] + self.assertEqual(expected_result, result) + + # ... and no file got deleted + files_after_upload = glob.glob(dest) + self.assertListEqual(files_before_upload, files_after_upload) + # ... and a upload-archive file was created + self.assertTrue(os.path.exists(os.path.join(root_path, '.iauparchive'))) + + # Second upload, nothing was actually uploaded... + result = list(tu.archive_urls( + ['https://www.youtube.com/watch?v=KdsN9YhkDrY'], use_upload_archive=True)) + + # ... 
and no remote IA item name is returned + expected_result = [(None, vid_info)] + self.assertEqual(expected_result, result) diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index 98bdae7..f4223f9 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -309,16 +309,21 @@ def generate_ydl_options(self, return ydl_opts - def upload_ia(self, videobasename, custom_meta=None): + def upload_ia(self, videobasename, use_upload_archive=False, custom_meta=None): """ Upload video to archive.org. - :param videobasename: A video base name. - :param custom_meta: A custom meta, will be used by internetarchive - library when uploading to archive.org. - :return: A tuple containing item name and metadata used - when uploading to archive.org and whether the item - already exists. + :param videobasename: A video base name. + :param use_upload_archive: Record the video url to the upload archive. + This will upload only videos not listed in + the archive file. Record the IDs of all + uploaded videos in it. + :param custom_meta: A custom meta, will be used by internetarchive + library when uploading to archive.org. + :return: A tuple containing item name and metadata used + when uploading to archive.org and whether the item + already exists. A null item name means upload + didn't happened. 
""" json_metadata_filepath = videobasename + '.info.json' with open(json_metadata_filepath, 'r', encoding='utf-8') as f: @@ -339,6 +344,12 @@ def upload_ia(self, videobasename, custom_meta=None): metadata = self.create_archive_org_metadata_from_youtubedl_meta( vid_meta) + if use_upload_archive: + ydl = YoutubeDL({'download_archive': os.path.join(self.dir_path['root'], '.iauparchive')}) + if ydl.in_download_archive(vid_meta): + self.logger.debug('Skipping already uploaded video: %s', metadata['title']) + return None, metadata + # Delete empty description file description_file_path = videobasename + '.description' if (os.path.exists(description_file_path) and @@ -380,16 +391,20 @@ def upload_ia(self, videobasename, custom_meta=None): raise Exception(msg) item.upload(files_to_upload, metadata=metadata, retries=9001, - request_kwargs=dict(timeout=9001), delete=True, + request_kwargs=dict(timeout=9001), delete=not use_upload_archive, verbose=self.verbose, access_key=s3_access_key, secret_key=s3_secret_key) + if use_upload_archive: + ydl.record_download_archive(vid_meta) + return itemname, metadata def archive_urls(self, urls, custom_meta=None, cookie_file=None, proxy=None, ydl_username=None, ydl_password=None, use_download_archive=False, + use_upload_archive=False, ignore_existing_item=False): """ Download and upload videos from youtube_dl supported sites to @@ -409,6 +424,10 @@ def archive_urls(self, urls, custom_meta=None, This will download only videos not listed in the archive file. Record the IDs of all downloaded videos in it. + :param use_upload_archive: Record the video url to the upload archive. + This will upload only videos not listed in + the archive file. Record the IDs of all + uploaded videos in it. :param ignore_existing_item: Ignores the check for existing items on archive.org. :return: Tuple containing identifier and metadata of the file that has been uploaded to archive.org. 
@@ -417,7 +436,7 @@ def archive_urls(self, urls, custom_meta=None, urls, cookie_file, proxy, ydl_username, ydl_password, use_download_archive, ignore_existing_item) for basename in downloaded_file_basenames: - identifier, meta = self.upload_ia(basename, custom_meta) + identifier, meta = self.upload_ia(basename, use_upload_archive, custom_meta) yield identifier, meta @staticmethod diff --git a/tubeup/__main__.py b/tubeup/__main__.py index 0841765..4bd199a 100644 --- a/tubeup/__main__.py +++ b/tubeup/__main__.py @@ -24,6 +24,7 @@ [--proxy ] [--quiet] [--debug] [--use-download-archive] + [--use-upload-archive] [--output ] [--ignore-existing-item] tubeup -h | --help @@ -45,6 +46,10 @@ This will download only videos not listed in the archive file. Record the IDs of all downloaded videos in it. + -U --use-upload-archive Record the video url to the upload archive. + This will upload only videos not listed in + the archive file. Record the IDs of all + uploaded videos in it. -q --quiet Just print errors. -d --debug Print all logs to stdout. -o --output Youtube-dlc output template. @@ -75,6 +80,7 @@ def main(): quiet_mode = args['--quiet'] debug_mode = args['--debug'] use_download_archive = args['--use-download-archive'] + use_upload_archive = args['--use-upload-archive'] ignore_existing_item = args['--ignore-existing-item'] if debug_mode: @@ -100,10 +106,15 @@ def main(): cookie_file, proxy_url, username, password, use_download_archive, + use_upload_archive, ignore_existing_item): - print('\n:: Upload Finished. Item information:') - print('Title: %s' % meta['title']) - print('Item URL: https://archive.org/details/%s\n' % identifier) + if identifier: + print('\n:: Upload Finished. Item information:') + print('Title: %s' % meta['title']) + print('Item URL: https://archive.org/details/%s\n' % identifier) + else: + print('\n:: Upload skipped. 
Item information:') + print('Title: %s' % meta['title']) except Exception: print('\n\033[91m' # Start red color text 'An exception just occured, if you found this ' From 545c8fe161229e926330fbf39b559516017311ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Thu, 21 Sep 2023 23:06:21 -0300 Subject: [PATCH 08/18] support arbitrary options from yt-dlp, fix #212 --- tubeup/TubeUp.py | 20 ++++++++++++++------ tubeup/__main__.py | 8 +++++++- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index f4223f9..f1f7a42 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -86,12 +86,13 @@ def get_resource_basenames(self, urls, cookie_file=None, proxy_url=None, ydl_username=None, ydl_password=None, use_download_archive=False, - ignore_existing_item=False): + ignore_existing_item=False, + yt_args=[]): """ Get resource basenames from an url. :param urls: A list of urls that will be downloaded with - youtubedl. + youtubedl (or their corresponding info-files) :param cookie_file: A cookie file for YoutubeDL. :param proxy_url: A proxy url for YoutubeDL. :param ydl_username: Username that will be used to download the @@ -103,6 +104,7 @@ def get_resource_basenames(self, urls, the archive file. Record the IDs of all downloaded videos in it. :param ignore_existing_item: Ignores the check for existing items on archive.org. + :param yt_args: Additional parameters passed to yt-dlp. :return: Set of videos basename that has been downloaded. 
""" downloaded_files_basename = set() @@ -174,6 +176,10 @@ def ydl_progress_hook(d): ydl_username, ydl_password, use_download_archive) + # Default yt-dlp overriden by tubeup specific options + yt_args.update(ydl_opts) + ydl_opts = yt_args + with YoutubeDL(ydl_opts) as ydl: for url in urls: if not ignore_existing_item: @@ -405,13 +411,14 @@ def archive_urls(self, urls, custom_meta=None, ydl_username=None, ydl_password=None, use_download_archive=False, use_upload_archive=False, - ignore_existing_item=False): + ignore_existing_item=False, + yt_args=[]): """ Download and upload videos from youtube_dl supported sites to archive.org - :param urls: List of url that will be downloaded and uploaded - to archive.org + :param urls: List of url or local info files that will + be downloaded and uploaded to archive.org :param custom_meta: A custom metadata that will be used when uploading the file with archive.org. :param cookie_file: A cookie file for YoutubeDL. @@ -429,12 +436,13 @@ def archive_urls(self, urls, custom_meta=None, the archive file. Record the IDs of all uploaded videos in it. :param ignore_existing_item: Ignores the check for existing items on archive.org. + :param yt_args: Additional parameters passed to yt-dlp. :return: Tuple containing identifier and metadata of the file that has been uploaded to archive.org. """ downloaded_file_basenames = self.get_resource_basenames( urls, cookie_file, proxy, ydl_username, ydl_password, use_download_archive, - ignore_existing_item) + ignore_existing_item, yt_args) for basename in downloaded_file_basenames: identifier, meta = self.upload_ia(basename, use_upload_archive, custom_meta) yield identifier, meta diff --git a/tubeup/__main__.py b/tubeup/__main__.py index 4bd199a..70ac6e0 100644 --- a/tubeup/__main__.py +++ b/tubeup/__main__.py @@ -27,6 +27,7 @@ [--use-upload-archive] [--output ] [--ignore-existing-item] + [--yt X...] tubeup -h | --help tubeup --version @@ -54,6 +55,7 @@ -d --debug Print all logs to stdout. 
-o --output Youtube-dlc output template. -i --ignore-existing-item Don't check if an item already exists on archive.org + --yt X... Any option to be passed to underlying yt-dlp. """ import sys @@ -61,6 +63,8 @@ import logging import traceback +from yt_dlp import parse_options + import internetarchive import internetarchive.cli @@ -82,6 +86,7 @@ def main(): use_download_archive = args['--use-download-archive'] use_upload_archive = args['--use-upload-archive'] ignore_existing_item = args['--ignore-existing-item'] + parser, opts, all_urls, yt_args = parse_options(args['--yt']) if debug_mode: # Display log messages. @@ -107,7 +112,8 @@ def main(): username, password, use_download_archive, use_upload_archive, - ignore_existing_item): + ignore_existing_item, + yt_args): if identifier: print('\n:: Upload Finished. Item information:') print('Title: %s' % meta['title']) From a52031cf902ab9096aab655c978f6f57bc1b874f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Thu, 21 Sep 2023 23:07:35 -0300 Subject: [PATCH 09/18] Ability, from an existing info-file, to upload existing associated media files. If `tubeup` is passed the path of info.json files instead of URL, then it will parse them and, if `--output` was set correctly (same value as previous `yt-dlp`) then recover the basename of the local files associated with this video. If used intelligently, especially in conjunction with: --ignore-existing-item # needed, for now, to avoid the download-archive codepath --yt=--format=... # use the same download format --yt=--no-overwrites ... then existing files may be uploaded without having been re-downloaded/overwritten. Usage isn't foolproof. Use at your own risks. 
--- tubeup/TubeUp.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index f1f7a42..37967b7 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -182,9 +182,16 @@ def ydl_progress_hook(d): with YoutubeDL(ydl_opts) as ydl: for url in urls: + info_dict = {} if not ignore_existing_item: - # Get the info dict of the url - info_dict = ydl.extract_info(url, download=False) + if os.path.exists(url): + p = ydl.download_with_info_file(url) + if p == 0: + with open(url, 'r') as f: + info_dict = json.load(f) + else: + # Get the info dict of the url + info_dict = ydl.extract_info(url, download=False) if info_dict.get('_type', 'video') == 'playlist': for entry in info_dict['entries']: @@ -192,7 +199,13 @@ def ydl_progress_hook(d): else: ydl_progress_each(info_dict) else: - info_dict = ydl.extract_info(url) + if os.path.exists(url): + p = ydl.download_with_info_file(url) + if p == 0: + with open(url, 'r') as f: + info_dict = json.load(f) + else: + info_dict = ydl.extract_info(url) downloaded_files_basename.update(self.create_basenames_from_ydl_info_dict(ydl, info_dict)) self.logger.debug( @@ -443,6 +456,8 @@ def archive_urls(self, urls, custom_meta=None, downloaded_file_basenames = self.get_resource_basenames( urls, cookie_file, proxy, ydl_username, ydl_password, use_download_archive, ignore_existing_item, yt_args) + self.logger.debug('Archiving files from %d videos: %s', len(downloaded_file_basenames), downloaded_file_basenames) + for basename in downloaded_file_basenames: identifier, meta = self.upload_ia(basename, use_upload_archive, custom_meta) yield identifier, meta From 40f173069a6f29a8957f7ee454e815eb01f9b066 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Fri, 22 Sep 2023 10:06:58 -0300 Subject: [PATCH 10/18] missed another occurence inside m3u8 manifest URL --- tubeup/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tubeup/utils.py 
b/tubeup/utils.py index 2be5b86..de8c3e4 100644 --- a/tubeup/utils.py +++ b/tubeup/utils.py @@ -38,7 +38,7 @@ def strip_ip_from_url(url): or in an "ip" query-parameter, like in ?ip=1.2.3.4 """ u = urlparse(url) - u = u._replace(path=re.sub(r'/ip/[^/]+', r'/ip/REDACTED', u.path)) + u = u._replace(path=re.sub(r'%26ip%3D[^%]+', r'%26ip%3DREDACTED%', re.sub(r'/ip/[^/]+', r'/ip/REDACTED', u.path))) if u.query != '': qs = parse_qs(u.query) try: From 33a85732e53e0ceb7467c611db77d997e1dc5143 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Tue, 26 Sep 2023 17:09:25 -0300 Subject: [PATCH 11/18] Fail-safe when multiple items are queued for upload --- tubeup/TubeUp.py | 14 ++------------ tubeup/__main__.py | 46 ++++++++++++++++++++++++++++------------------ 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index 37967b7..b784f9f 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -419,11 +419,10 @@ def upload_ia(self, videobasename, use_upload_archive=False, custom_meta=None): return itemname, metadata - def archive_urls(self, urls, custom_meta=None, + def download_urls(self, urls, cookie_file=None, proxy=None, ydl_username=None, ydl_password=None, use_download_archive=False, - use_upload_archive=False, ignore_existing_item=False, yt_args=[]): """ @@ -432,8 +431,6 @@ def archive_urls(self, urls, custom_meta=None, :param urls: List of url or local info files that will be downloaded and uploaded to archive.org - :param custom_meta: A custom metadata that will be used when - uploading the file with archive.org. :param cookie_file: A cookie file for YoutubeDL. :param proxy_url: A proxy url for YoutubeDL. :param ydl_username: Username that will be used to download the @@ -444,10 +441,6 @@ def archive_urls(self, urls, custom_meta=None, This will download only videos not listed in the archive file. Record the IDs of all downloaded videos in it. 
- :param use_upload_archive: Record the video url to the upload archive. - This will upload only videos not listed in - the archive file. Record the IDs of all - uploaded videos in it. :param ignore_existing_item: Ignores the check for existing items on archive.org. :param yt_args: Additional parameters passed to yt-dlp. :return: Tuple containing identifier and metadata of the @@ -457,10 +450,7 @@ def archive_urls(self, urls, custom_meta=None, urls, cookie_file, proxy, ydl_username, ydl_password, use_download_archive, ignore_existing_item, yt_args) self.logger.debug('Archiving files from %d videos: %s', len(downloaded_file_basenames), downloaded_file_basenames) - - for basename in downloaded_file_basenames: - identifier, meta = self.upload_ia(basename, use_upload_archive, custom_meta) - yield identifier, meta + return downloaded_file_basenames @staticmethod def determine_collection_type(url): diff --git a/tubeup/__main__.py b/tubeup/__main__.py index 70ac6e0..dac678c 100644 --- a/tubeup/__main__.py +++ b/tubeup/__main__.py @@ -27,6 +27,7 @@ [--use-upload-archive] [--output ] [--ignore-existing-item] + [--abort-on-error] [--yt X...] tubeup -h | --help tubeup --version @@ -53,6 +54,7 @@ uploaded videos in it. -q --quiet Just print errors. -d --debug Print all logs to stdout. + --abort-on-error Abort after the first failed upload. -o --output Youtube-dlc output template. -i --ignore-existing-item Don't check if an item already exists on archive.org --yt X... Any option to be passed to underlying yt-dlp. 
@@ -86,6 +88,7 @@ def main(): use_download_archive = args['--use-download-archive'] use_upload_archive = args['--use-upload-archive'] ignore_existing_item = args['--ignore-existing-item'] + abort_on_error = args['--abort-on-error'] parser, opts, all_urls, yt_args = parse_options(args['--yt']) if debug_mode: @@ -106,14 +109,17 @@ def main(): tu = TubeUp(verbose=not quiet_mode, output_template=args['--output']) - try: - for identifier, meta in tu.archive_urls(URLs, metadata, - cookie_file, proxy_url, - username, password, - use_download_archive, - use_upload_archive, - ignore_existing_item, - yt_args): + downloaded_file_basenames = tu.download_urls(URLs, + cookie_file, proxy_url, + username, password, + use_download_archive, + ignore_existing_item, + yt_args) + + failures = [] + for basename in downloaded_file_basenames: + try: + identifier, meta = tu.upload_ia(basename, use_upload_archive, metadata) if identifier: print('\n:: Upload Finished. Item information:') print('Title: %s' % meta['title']) @@ -121,16 +127,20 @@ def main(): else: print('\n:: Upload skipped. 
Item information:') print('Title: %s' % meta['title']) - except Exception: - print('\n\033[91m' # Start red color text - 'An exception just occured, if you found this ' - "exception isn't related with any of your connection problem, " - 'please report this issue to ' - 'https://github.com/bibanon/tubeup/issues') - traceback.print_exc() - print('\033[0m') # End the red color text - sys.exit(1) - + except Exception: + failures.append(basename) + print('\n\033[91m' # Start red color text + 'An exception just occured, if you found this ' + "exception isn't related with any of your connection problem, " + 'please report this issue to ' + 'https://github.com/bibanon/tubeup/issues') + traceback.print_exc() + print('\033[0m') # End the red color text + if abort_on_error: + break + + if len(failures) > 0: + print("Failed uploads:\n" + "\n".join(failures)) if __name__ == '__main__': main() From 8777dfad2b46a361a0b3c04bcc42cf94b7c83041 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Tue, 26 Sep 2023 17:18:26 -0300 Subject: [PATCH 12/18] network: Use sane (or at least default) values for timeouts --- tubeup/TubeUp.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index b784f9f..22685c2 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -282,8 +282,6 @@ def generate_ydl_options(self, 'progress_with_newline': True, 'forcetitle': True, 'continuedl': True, - 'retries': 9001, - 'fragment_retries': 9001, 'forcejson': False, 'writeinfojson': True, 'writedescription': True, @@ -409,8 +407,8 @@ def upload_ia(self, videobasename, use_upload_archive=False, custom_meta=None): print(msg) raise Exception(msg) - item.upload(files_to_upload, metadata=metadata, retries=9001, - request_kwargs=dict(timeout=9001), delete=not use_upload_archive, + item.upload(files_to_upload, metadata=metadata, retries=15, + request_kwargs=dict(timeout=60), delete=not use_upload_archive, verbose=self.verbose, 
access_key=s3_access_key, secret_key=s3_secret_key) From ff0c21fd93d2bf48634a28b1abf962f0bceb8081 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Tue, 26 Sep 2023 17:19:10 -0300 Subject: [PATCH 13/18] playlist: Omit empty playlists --- tubeup/TubeUp.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index 22685c2..406571d 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -232,6 +232,8 @@ def create_basenames_from_ydl_info_dict(self, ydl, info_dict): if info_type == 'playlist': # Iterate and get the filenames through the playlist + if 'entries' not in info_dict: + return set() for video in info_dict['entries']: filenames.add(ydl.prepare_filename(video)) else: From e962f216dc52bd05224ced7b2c936c2a4904254f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Tue, 26 Sep 2023 18:23:11 -0300 Subject: [PATCH 14/18] cmd: Further simplify option handling --- tubeup/TubeUp.py | 65 ++-------------------------------------------- tubeup/__main__.py | 52 +++++++++++++++++++++++-------------- 2 files changed, 34 insertions(+), 83 deletions(-) diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index 406571d..343721a 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -83,9 +83,6 @@ def dir_path(self, dir_path): } def get_resource_basenames(self, urls, - cookie_file=None, proxy_url=None, - ydl_username=None, ydl_password=None, - use_download_archive=False, ignore_existing_item=False, yt_args=[]): """ @@ -93,16 +90,6 @@ def get_resource_basenames(self, urls, :param urls: A list of urls that will be downloaded with youtubedl (or their corresponding info-files) - :param cookie_file: A cookie file for YoutubeDL. - :param proxy_url: A proxy url for YoutubeDL. - :param ydl_username: Username that will be used to download the - resources with youtube_dl. - :param ydl_password: Password of the related username, will be used - to download the resources with youtube_dl. 
- :param use_download_archive: Record the video url to the download archive. - This will download only videos not listed in - the archive file. Record the IDs of all - downloaded videos in it. :param ignore_existing_item: Ignores the check for existing items on archive.org. :param yt_args: Additional parameters passed to yt-dlp. :return: Set of videos basename that has been downloaded. @@ -171,10 +158,7 @@ def ydl_progress_hook(d): if self.verbose: print(msg) - ydl_opts = self.generate_ydl_options(ydl_progress_hook, - cookie_file, proxy_url, - ydl_username, ydl_password, - use_download_archive) + ydl_opts = self.generate_ydl_options(ydl_progress_hook) # Default yt-dlp overriden by tubeup specific options yt_args.update(ydl_opts) @@ -250,11 +234,6 @@ def create_basenames_from_ydl_info_dict(self, ydl, info_dict): def generate_ydl_options(self, ydl_progress_hook, - cookie_file=None, - proxy_url=None, - ydl_username=None, - ydl_password=None, - use_download_archive=False, ydl_output_template=None): """ Generate a dictionary that contains options that will be used @@ -262,16 +241,6 @@ def generate_ydl_options(self, :param ydl_progress_hook: A function that will be called during the download process by youtube_dl. - :param proxy_url: A proxy url for YoutubeDL. - :param ydl_username: Username that will be used to download the - resources with youtube_dl. - :param ydl_password: Password of the related username, will be - used to download the resources with - youtube_dl. - :param use_download_archive: Record the video url to the download archive. - This will download only videos not listed in - the archive file. Record the IDs of all - downloaded videos in it. :return: A dictionary that contains options that will be used by youtube_dl. 
""" @@ -310,22 +279,6 @@ def generate_ydl_options(self, 'progress_hooks': [ydl_progress_hook] } - if cookie_file is not None: - ydl_opts['cookiefile'] = cookie_file - - if proxy_url is not None: - ydl_opts['proxy'] = proxy_url - - if ydl_username is not None: - ydl_opts['username'] = ydl_username - - if ydl_password is not None: - ydl_opts['password'] = ydl_password - - if use_download_archive: - ydl_opts['download_archive'] = os.path.join(self.dir_path['root'], - '.ytdlarchive') - return ydl_opts def upload_ia(self, videobasename, use_upload_archive=False, custom_meta=None): @@ -420,9 +373,6 @@ def upload_ia(self, videobasename, use_upload_archive=False, custom_meta=None): return itemname, metadata def download_urls(self, urls, - cookie_file=None, proxy=None, - ydl_username=None, ydl_password=None, - use_download_archive=False, ignore_existing_item=False, yt_args=[]): """ @@ -431,24 +381,13 @@ def download_urls(self, urls, :param urls: List of url or local info files that will be downloaded and uploaded to archive.org - :param cookie_file: A cookie file for YoutubeDL. - :param proxy_url: A proxy url for YoutubeDL. - :param ydl_username: Username that will be used to download the - resources with youtube_dl. - :param ydl_password: Password of the related username, will be used - to download the resources with youtube_dl. - :param use_download_archive: Record the video url to the download archive. - This will download only videos not listed in - the archive file. Record the IDs of all - downloaded videos in it. :param ignore_existing_item: Ignores the check for existing items on archive.org. :param yt_args: Additional parameters passed to yt-dlp. :return: Tuple containing identifier and metadata of the file that has been uploaded to archive.org. 
""" downloaded_file_basenames = self.get_resource_basenames( - urls, cookie_file, proxy, ydl_username, ydl_password, use_download_archive, - ignore_existing_item, yt_args) + urls, ignore_existing_item, yt_args) self.logger.debug('Archiving files from %d videos: %s', len(downloaded_file_basenames), downloaded_file_basenames) return downloaded_file_basenames diff --git a/tubeup/__main__.py b/tubeup/__main__.py index dac678c..17dd6af 100644 --- a/tubeup/__main__.py +++ b/tubeup/__main__.py @@ -41,14 +41,11 @@ Options: -h --help Show this screen. - -p --proxy Use a proxy while uploading. - -u --username Provide a username, for sites like Nico Nico Douga. - -p --password Provide a password, for sites like Nico Nico Douga. - -a --use-download-archive Record the video url to the download archive. - This will download only videos not listed in - the archive file. Record the IDs of all - downloaded videos in it. - -U --use-upload-archive Record the video url to the upload archive. + -p --proxy Deprecated. Shortcut for the corresponding yt-dlp option. + -u --username Deprecated. Shortcut for the corresponding yt-dlp option. + -p --password Deprecated. Shortcut for the corresponding yt-dlp option. + -a --use-download-archive Shortcut for --yt=--download-archive=%s + -U --use-upload-archive Record the video url to the upload archive at %s This will upload only videos not listed in the archive file. Record the IDs of all uploaded videos in it. @@ -58,8 +55,23 @@ -o --output Youtube-dlc output template. -i --ignore-existing-item Don't check if an item already exists on archive.org --yt X... Any option to be passed to underlying yt-dlp. 
+ +Example: + Assuming that *.info.json files are consistent and + that yt-dlp output template led to uniform/predictable file names, + then a way to upload existing files without triggering new downloads + is to use a combination of the following: + * --output='' + * --use-upload-archive + * --use-download-archive + * --ignore-existing-item + * --yt=--no-playlist + * --yt=--match-filter=!playlist + * --yt=--no-overwrites + """ +import os import sys import docopt import logging @@ -73,19 +85,24 @@ from tubeup.TubeUp import TubeUp from tubeup import __version__ +DEFAULT_DOWNLOAD_ARCHIVE = os.path.join(os.path.expanduser('~/.tubeup'), '.ytdlarchive') +DEFAULT_UPLOAD_ARCHIVE = os.path.join(os.path.expanduser('~/.tubeup'), '.iauparchive') def main(): # Parse arguments from file docstring - args = docopt.docopt(__doc__, version=__version__) + args = docopt.docopt(__doc__ % (DEFAULT_DOWNLOAD_ARCHIVE, DEFAULT_UPLOAD_ARCHIVE), + version=__version__) URLs = args[''] - cookie_file = args['--cookies'] - proxy_url = args['--proxy'] - username = args['--username'] - password = args['--password'] + for v in ['--cookies', '--proxy', '--username', '--password']: + if v in args and args[v]: + args['--yt'].append('%s=%s' % (v, args[v])) + + if args['--use-download-archive']: + args['--yt'].append('--download-archive=' + DEFAULT_DOWNLOAD_ARCHIVE) + quiet_mode = args['--quiet'] debug_mode = args['--debug'] - use_download_archive = args['--use-download-archive'] use_upload_archive = args['--use-upload-archive'] ignore_existing_item = args['--ignore-existing-item'] abort_on_error = args['--abort-on-error'] @@ -109,12 +126,7 @@ def main(): tu = TubeUp(verbose=not quiet_mode, output_template=args['--output']) - downloaded_file_basenames = tu.download_urls(URLs, - cookie_file, proxy_url, - username, password, - use_download_archive, - ignore_existing_item, - yt_args) + downloaded_file_basenames = tu.download_urls(URLs, ignore_existing_item, yt_args) failures = [] for basename in
downloaded_file_basenames: From 4aaa6e7172ed5238dfe0276dd438d3abd2534da2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Tue, 26 Sep 2023 18:35:50 -0300 Subject: [PATCH 15/18] perfs: Do not reinstance over and over an (identical) yt-dlp object --- tubeup/TubeUp.py | 90 +++++++++++++++++++++++----------------------- tubeup/__main__.py | 6 ++-- 2 files changed, 48 insertions(+), 48 deletions(-) diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index 343721a..b8af440 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -27,7 +27,8 @@ def __init__(self, verbose=False, dir_path='~/.tubeup', ia_config_path=None, - output_template=None): + output_template=None, + yt_args=[]): """ `tubeup` is a tool to archive YouTube by downloading the videos and uploading it back to the archive.org. @@ -41,6 +42,7 @@ def __init__(self, be used in uploading the file. :param output_template: A template string that will be used to generate the output filenames. + :param yt_args: Additional parameters passed to yt-dlp. """ self.dir_path = dir_path self.verbose = verbose @@ -55,6 +57,8 @@ def __init__(self, if not self.verbose: self.logger.setLevel(logging.ERROR) + self.YDL = self.get_ytdlp_instance(yt_args) + @property def dir_path(self): return self._dir_path @@ -82,42 +86,7 @@ def dir_path(self, dir_path): DOWNLOAD_DIR_NAME) } - def get_resource_basenames(self, urls, - ignore_existing_item=False, - yt_args=[]): - """ - Get resource basenames from an url. - - :param urls: A list of urls that will be downloaded with - youtubedl (or their corresponding info-files) - :param ignore_existing_item: Ignores the check for existing items on archive.org. - :param yt_args: Additional parameters passed to yt-dlp. - :return: Set of videos basename that has been downloaded. 
- """ - downloaded_files_basename = set() - - def check_if_ia_item_exists(infodict): - itemname = get_itemname(infodict) - item = internetarchive.get_item(itemname) - if item.exists and self.verbose: - print("\n:: Item already exists. Not downloading.") - print('Title: %s' % infodict['title']) - print('Video URL: %s\n' % infodict['webpage_url']) - return True - return False - - def ydl_progress_each(entry): - if not entry: - self.logger.warning('Video "%s" is not available. Skipping.' % url) - return - if ydl.in_download_archive(entry): - return - if not check_if_ia_item_exists(entry): - ydl.extract_info(entry['webpage_url']) - downloaded_files_basename.update(self.create_basenames_from_ydl_info_dict(ydl, entry)) - else: - ydl.record_download_archive(entry) - + def get_ytdlp_instance(self, yt_args=[]): def ydl_progress_hook(d): if d['status'] == 'downloading' and self.verbose: if d.get('_total_bytes_str') is not None: @@ -162,9 +131,45 @@ def ydl_progress_hook(d): # Default yt-dlp overriden by tubeup specific options yt_args.update(ydl_opts) - ydl_opts = yt_args - with YoutubeDL(ydl_opts) as ydl: + return YoutubeDL(yt_args) + + + def get_resource_basenames(self, urls, + ignore_existing_item=False): + """ + Get resource basenames from an url. + + :param urls: A list of urls that will be downloaded with + youtubedl (or their corresponding info-files) + :param ignore_existing_item: Ignores the check for existing items on archive.org. + :return: Set of videos basename that has been downloaded. + """ + downloaded_files_basename = set() + + def check_if_ia_item_exists(infodict): + itemname = get_itemname(infodict) + item = internetarchive.get_item(itemname) + if item.exists and self.verbose: + print("\n:: Item already exists. Not downloading.") + print('Title: %s' % infodict['title']) + print('Video URL: %s\n' % infodict['webpage_url']) + return True + return False + + def ydl_progress_each(entry): + if not entry: + self.logger.warning('Video "%s" is not available. 
Skipping.' % url) + return + if ydl.in_download_archive(entry): + return + if not check_if_ia_item_exists(entry): + ydl.extract_info(entry['webpage_url']) + downloaded_files_basename.update(self.create_basenames_from_ydl_info_dict(ydl, entry)) + else: + ydl.record_download_archive(entry) + + with self.YDL as ydl: for url in urls: info_dict = {} if not ignore_existing_item: @@ -373,8 +378,7 @@ def upload_ia(self, videobasename, use_upload_archive=False, custom_meta=None): return itemname, metadata def download_urls(self, urls, - ignore_existing_item=False, - yt_args=[]): + ignore_existing_item=False): """ Download and upload videos from youtube_dl supported sites to archive.org @@ -382,12 +386,10 @@ def download_urls(self, urls, :param urls: List of url or local info files that will be downloaded and uploaded to archive.org :param ignore_existing_item: Ignores the check for existing items on archive.org. - :param yt_args: Additional parameters passed to yt-dlp. :return: Tuple containing identifier and metadata of the file that has been uploaded to archive.org. 
""" - downloaded_file_basenames = self.get_resource_basenames( - urls, ignore_existing_item, yt_args) + downloaded_file_basenames = self.get_resource_basenames(urls, ignore_existing_item) self.logger.debug('Archiving files from %d videos: %s', len(downloaded_file_basenames), downloaded_file_basenames) return downloaded_file_basenames diff --git a/tubeup/__main__.py b/tubeup/__main__.py index 17dd6af..7608f7d 100644 --- a/tubeup/__main__.py +++ b/tubeup/__main__.py @@ -123,10 +123,8 @@ def main(): metadata = internetarchive.cli.argparser.get_args_dict(args['--metadata']) - tu = TubeUp(verbose=not quiet_mode, - output_template=args['--output']) - - downloaded_file_basenames = tu.download_urls(URLs, ignore_existing_item, yt_args) + tu = TubeUp(verbose=not quiet_mode, output_template=args['--output'], yt_args=yt_args) + downloaded_file_basenames = tu.download_urls(URLs, ignore_existing_item) failures = [] for basename in downloaded_file_basenames: From e32cee9b74dc51f1514b9162c3adbed764574d74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Tue, 26 Sep 2023 19:00:20 -0300 Subject: [PATCH 16/18] perfs: Improve discard-logic of temporary files (and actually escape video basename) --- tubeup/TubeUp.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index b8af440..7f90101 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -2,6 +2,7 @@ import sys import re import glob +import fnmatch import time import json import logging @@ -312,8 +313,12 @@ def upload_ia(self, videobasename, use_upload_archive=False, custom_meta=None): json.dump(new_meta, f) # Exit if video download did not complete, don't upload .part files to IA + # One glob() + fnmatch() is ten times less expensive than 8 globs(), + # (Half a second vs 5 seconds on 250k files, what is significant when resuming large playlists) + filenames = glob.glob(glob.escape(videobasename) + '*') for ext in ['*.part', '*.f303.*', '*.f302.*', 
'*.ytdl', '*.f251.*', '*.248.*', '*.f247.*', '*.temp']: - if glob.glob(videobasename + ext): + matching = fnmatch.filter(filenames, ext) + if matching: msg = 'Video download incomplete, please re-run or delete video stubs in downloads folder, exiting...' raise Exception(msg) @@ -345,7 +350,7 @@ def upload_ia(self, videobasename, use_upload_archive=False, custom_meta=None): # Upload all files with videobase name: e.g. video.mp4, # video.info.json, video.srt, etc. - files_to_upload = glob.glob(videobasename + '*') + files_to_upload = glob.glob(glob.escape(videobasename) + '*') # Upload the item to the Internet Archive item = internetarchive.get_item(itemname) From 6f9cc3022090c86fa415f465121283757cf8ecfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Tue, 26 Sep 2023 19:04:26 -0300 Subject: [PATCH 17/18] add619cf92 follow-up: Replace JSON info-file if it's actually going to be uploaded --- tubeup/TubeUp.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index 7f90101..08fedb1 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -307,11 +307,6 @@ def upload_ia(self, videobasename, use_upload_archive=False, custom_meta=None): with open(json_metadata_filepath, 'r', encoding='utf-8') as f: vid_meta = json.load(f) - mod, new_meta = strip_ip_from_meta(vid_meta) - if mod: - with open(json_metadata_filepath, 'w') as f: - json.dump(new_meta, f) - # Exit if video download did not complete, don't upload .part files to IA # One glob() + fnmatch() is ten times less expensive than 8 globs(), # (Half a second vs 5 seconds on 250k files, what is significant when resuming large playlists) @@ -372,6 +367,11 @@ def upload_ia(self, videobasename, use_upload_archive=False, custom_meta=None): print(msg) raise Exception(msg) + mod, new_meta = strip_ip_from_meta(vid_meta) + if mod: + with open(json_metadata_filepath, 'w') as f: + json.dump(new_meta, f) + item.upload(files_to_upload, metadata=metadata, 
retries=15, request_kwargs=dict(timeout=60), delete=not use_upload_archive, verbose=self.verbose, access_key=s3_access_key, From 055302bc21c4d47a071c27d4d7000067b6b1e7be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Tue, 26 Sep 2023 19:23:19 -0300 Subject: [PATCH 18/18] doc: README.md, list changes --- README.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9926e38..a8794b6 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,15 @@ Tubeup - a multi-VOD service to Archive.org uploader It was designed by the [Bibliotheca Anonoma](https://github.com/bibanon/bibanon/wiki) to archive single videos, playlists (see warning below about more than video uploads) or accounts to the Internet Archive. +## Changes specific to this fork +- Clean-up IP addresses contained by Youtube-generated info files before IA upload +- Do not abort after a failure but try the next item + saner timeout values +- Accept arbitrary yt-dlp options (⚠️) +- Can upload existing resource (⚠️ under certain strict condition, see --help and a52031c) +- Can upload existing resource based on a local JSON info file +- More efficient at processing a large number of files/URLs +- Broken testsuite (⚠️) + ## Prerequisites This script strongly recommends Linux or some sort of POSIX system (such as macOS), preferably from a rented VPS and not your personal machine or phone. @@ -31,7 +40,7 @@ For Debian/Ubuntu: At a minimum Python 3.8 and up is required (latest Python preferred). ``` - python3 -m pip install -U pip tubeup + python3 -m pip install -U pip git+https://github.com/drzraf/tubeup ``` 3. If you don't already have an Internet Archive account, [register for one](https://archive.org/account/login.createaccount.php) to give the script upload privileges.