diff --git a/README.md b/README.md index 9926e38..a8794b6 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,15 @@ Tubeup - a multi-VOD service to Archive.org uploader It was designed by the [Bibliotheca Anonoma](https://github.com/bibanon/bibanon/wiki) to archive single videos, playlists (see warning below about more than video uploads) or accounts to the Internet Archive. +## Changes specific to this fork +- Clean-up IP addresses contained by Youtube-generated info files before IA upload +- Do not abort after a failure but try the next item + saner timeout values +- Accept arbitrary yt-dlp options (⚠️) +- Can upload existing resource (⚠️ under certain strict condition, see --help and a52031c) +- Can upload existing resource based on a local JSON info file +- More efficient at processing a large number of files/URL +- Broken testsuite (⚠️) + ## Prerequisites This script strongly recommends Linux or some sort of POSIX system (such as macOS), preferably from a rented VPS and not your personal machine or phone. @@ -31,7 +40,7 @@ For Debian/Ubuntu: At a minimum Python 3.8 and up is required (latest Python preferred). ``` - python3 -m pip install -U pip tubeup + python3 -m pip install -U pip git+https://github.com/drzraf/tubeup ``` 3. If you don't already have an Internet Archive account, [register for one](https://archive.org/account/login.createaccount.php) to give the script upload privileges. 
diff --git a/setup.py b/setup.py index 1e1e948..d1f9e39 100644 --- a/setup.py +++ b/setup.py @@ -26,6 +26,7 @@ }, install_requires=[ 'internetarchive', + 'urllib3', 'docopt==0.6.2', 'yt-dlp', ] diff --git a/tests/test_tubeup.py b/tests/test_tubeup.py index 9983d56..7603e86 100644 --- a/tests/test_tubeup.py +++ b/tests/test_tubeup.py @@ -590,3 +590,83 @@ def test_archive_urls(self): 'scanner': SCANNER})] self.assertEqual(expected_result, result) + + def test_archive_deletion(self): + root_path = os.path.join(current_path, 'test_tubeup_rootdir') + # Clean up before test + shutil.rmtree(root_path, ignore_errors=True) + + tu = TubeUp(dir_path=root_path, + ia_config_path=get_testfile_path('ia_config_for_test.ini')) + + videobasename = os.path.join( + current_path, 'test_tubeup_rootdir', 'downloads', + 'KdsN9YhkDrY') + + copy_testfiles_to_tubeup_rootdir_test() + dest = os.path.join(root_path, 'downloads', '*') + files_before_upload = glob.glob(dest) + + vid_info = {'mediatype': 'movies', + 'creator': 'RelaxingWorld', + 'channel': 'http://www.youtube.com/channel/UCWpsozCMdAnfI16rZHQ9XDg', + 'collection': 'opensource_movies', + 'title': 'Epic Ramadan - Video Background HD1080p', + 'description': ('If you enjoy my work, please consider Subscribe to my NEW ' + 'channel for more videos:
' + 'https://www.youtube.com/MusicForRelaxation?sub_confirmation=1
' + '▷ If you use this video, please put credits to my channel ' + 'in description:
' + 'Source from RelaxingWorld: https://goo.gl/HsW75m
' + '
' + '▷ Also, do not forget to Subscribe to my channel. Thanks!'), + 'date': '2016-06-25', + 'year': '2016', + 'subject': ('Youtube;video;Film & Animation;Video Background;' + 'Footage;Animation;Cinema;Royalty Free Videos;' + 'Stock Video Footage;Video Backdrops;' + 'Amazing Nature;youtube;HD;1080p;Creative Commons Videos;' + 'relaxing music;Ramadan;'), + 'originalurl': 'https://www.youtube.com/watch?v=KdsN9YhkDrY', + 'licenseurl': '', + 'scanner': SCANNER} + + with requests_mock.Mocker() as m: + # Mock the request to s3.us.archive.org, so it will responds + # a custom json. `internetarchive` library sends GET request to + # that url to check that we don't violate the upload limit. + m.get('https://s3.us.archive.org', + content=b'{"over_limit": 0}', + headers={'content-type': 'application/json'}) + + m.get('https://archive.org/metadata/youtube-KdsN9YhkDrY', + content=b'{}', + headers={'content-type': 'application/json'}) + + # Mock the PUT requests for internetarchive urls that defined + # in mock_upload_response_by_videobasename(), so this test + # doesn't perform upload to the real archive.org server. + mock_upload_response_by_videobasename( + m, 'youtube-KdsN9YhkDrY', videobasename) + + # First upload, this actually get uploaded... + result = list(tu.archive_urls( + ['https://www.youtube.com/watch?v=KdsN9YhkDrY'], use_upload_archive=True)) + + # ... and returns a remote IA item name + expected_result = [('youtube-KdsN9YhkDrY', vid_info)] + self.assertEqual(expected_result, result) + + # ... and no file got deleted + files_after_upload = glob.glob(dest) + self.assertListEqual(files_before_upload, files_after_upload) + # ... and a upload-archive file was created + self.assertTrue(os.path.exists(os.path.join(root_path, '.iauparchive'))) + + # Second upload, nothing was actually uploaded... + result = list(tu.archive_urls( + ['https://www.youtube.com/watch?v=KdsN9YhkDrY'], use_upload_archive=True)) + + # ... 
and no remote IA item name is returned + expected_result = [(None, vid_info)] + self.assertEqual(expected_result, result) diff --git a/tests/test_utils.py b/tests/test_utils.py index 8c409d8..813271a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,13 @@ import unittest import os -from tubeup.utils import sanitize_identifier, check_is_file_empty +import json +from tubeup.utils import sanitize_identifier, check_is_file_empty, strip_ip_from_meta + +current_path = os.path.dirname(os.path.realpath(__file__)) + + +def get_testfile_path(name): + return os.path.join(current_path, 'test_tubeup_files', name) class UtilsTest(unittest.TestCase): @@ -48,3 +55,14 @@ def test_check_is_file_empty_when_file_doesnt_exist(self): FileNotFoundError, r"^Path 'file_that_doesnt_exist.txt' doesn't exist$"): check_is_file_empty('file_that_doesnt_exist.txt') + + def test_strip_ip_from_meta(self): + with open(get_testfile_path( + 'Mountain_3_-_Video_Background_HD_1080p-6iRV8liah8A.' + 'info.json') + ) as f: + vid_meta = json.load(f) + mod, new_meta = strip_ip_from_meta(vid_meta) + self.assertTrue(mod) + self.assertNotEqual(f.read(), json.dumps(new_meta)) + self.assertNotRegex(json.dumps(new_meta), r'36\.73\.93\.234') diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index 7dd6801..08fedb1 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -2,6 +2,7 @@ import sys import re import glob +import fnmatch import time import json import logging @@ -10,7 +11,7 @@ from internetarchive.config import parse_config_file from datetime import datetime from yt_dlp import YoutubeDL -from .utils import (get_itemname, check_is_file_empty, +from .utils import (get_itemname, check_is_file_empty, strip_ip_from_meta, EMPTY_ANNOTATION_FILE) from logging import getLogger from urllib.parse import urlparse @@ -27,7 +28,8 @@ def __init__(self, verbose=False, dir_path='~/.tubeup', ia_config_path=None, - output_template=None): + output_template=None, + yt_args=[]): """ `tubeup` is a tool to 
archive YouTube by downloading the videos and uploading it back to the archive.org. @@ -41,6 +43,7 @@ def __init__(self, be used in uploading the file. :param output_template: A template string that will be used to generate the output filenames. + :param yt_args: Additional parameters passed to yt-dlp. """ self.dir_path = dir_path self.verbose = verbose @@ -55,6 +58,8 @@ def __init__(self, if not self.verbose: self.logger.setLevel(logging.ERROR) + self.YDL = self.get_ytdlp_instance(yt_args) + @property def dir_path(self): return self._dir_path @@ -82,53 +87,7 @@ def dir_path(self, dir_path): DOWNLOAD_DIR_NAME) } - def get_resource_basenames(self, urls, - cookie_file=None, proxy_url=None, - ydl_username=None, ydl_password=None, - use_download_archive=False, - ignore_existing_item=False): - """ - Get resource basenames from an url. - - :param urls: A list of urls that will be downloaded with - youtubedl. - :param cookie_file: A cookie file for YoutubeDL. - :param proxy_url: A proxy url for YoutubeDL. - :param ydl_username: Username that will be used to download the - resources with youtube_dl. - :param ydl_password: Password of the related username, will be used - to download the resources with youtube_dl. - :param use_download_archive: Record the video url to the download archive. - This will download only videos not listed in - the archive file. Record the IDs of all - downloaded videos in it. - :param ignore_existing_item: Ignores the check for existing items on archive.org. - :return: Set of videos basename that has been downloaded. - """ - downloaded_files_basename = set() - - def check_if_ia_item_exists(infodict): - itemname = get_itemname(infodict) - item = internetarchive.get_item(itemname) - if item.exists and self.verbose: - print("\n:: Item already exists. 
Not downloading.") - print('Title: %s' % infodict['title']) - print('Video URL: %s\n' % infodict['webpage_url']) - return True - return False - - def ydl_progress_each(entry): - if not entry: - self.logger.warning('Video "%s" is not available. Skipping.' % url) - return - if ydl.in_download_archive(entry): - return - if not check_if_ia_item_exists(entry): - ydl.extract_info(entry['webpage_url']) - downloaded_files_basename.update(self.create_basenames_from_ydl_info_dict(ydl, entry)) - else: - ydl.record_download_archive(entry) - + def get_ytdlp_instance(self, yt_args=[]): def ydl_progress_hook(d): if d['status'] == 'downloading' and self.verbose: if d.get('_total_bytes_str') is not None: @@ -169,16 +128,60 @@ def ydl_progress_hook(d): if self.verbose: print(msg) - ydl_opts = self.generate_ydl_options(ydl_progress_hook, - cookie_file, proxy_url, - ydl_username, ydl_password, - use_download_archive) + ydl_opts = self.generate_ydl_options(ydl_progress_hook) + + # Default yt-dlp overriden by tubeup specific options + yt_args.update(ydl_opts) + + return YoutubeDL(yt_args) + + + def get_resource_basenames(self, urls, + ignore_existing_item=False): + """ + Get resource basenames from an url. + + :param urls: A list of urls that will be downloaded with + youtubedl (or their corresponding info-files) + :param ignore_existing_item: Ignores the check for existing items on archive.org. + :return: Set of videos basename that has been downloaded. + """ + downloaded_files_basename = set() + + def check_if_ia_item_exists(infodict): + itemname = get_itemname(infodict) + item = internetarchive.get_item(itemname) + if item.exists and self.verbose: + print("\n:: Item already exists. Not downloading.") + print('Title: %s' % infodict['title']) + print('Video URL: %s\n' % infodict['webpage_url']) + return True + return False + + def ydl_progress_each(entry): + if not entry: + self.logger.warning('Video "%s" is not available. Skipping.' 
% url) + return + if ydl.in_download_archive(entry): + return + if not check_if_ia_item_exists(entry): + ydl.extract_info(entry['webpage_url']) + downloaded_files_basename.update(self.create_basenames_from_ydl_info_dict(ydl, entry)) + else: + ydl.record_download_archive(entry) - with YoutubeDL(ydl_opts) as ydl: + with self.YDL as ydl: for url in urls: + info_dict = {} if not ignore_existing_item: - # Get the info dict of the url - info_dict = ydl.extract_info(url, download=False) + if os.path.exists(url): + p = ydl.download_with_info_file(url) + if p == 0: + with open(url, 'r') as f: + info_dict = json.load(f) + else: + # Get the info dict of the url + info_dict = ydl.extract_info(url, download=False) if info_dict.get('_type', 'video') == 'playlist': for entry in info_dict['entries']: @@ -186,7 +189,13 @@ def ydl_progress_hook(d): else: ydl_progress_each(info_dict) else: - info_dict = ydl.extract_info(url) + if os.path.exists(url): + p = ydl.download_with_info_file(url) + if p == 0: + with open(url, 'r') as f: + info_dict = json.load(f) + else: + info_dict = ydl.extract_info(url) downloaded_files_basename.update(self.create_basenames_from_ydl_info_dict(ydl, info_dict)) self.logger.debug( @@ -213,6 +222,8 @@ def create_basenames_from_ydl_info_dict(self, ydl, info_dict): if info_type == 'playlist': # Iterate and get the filenames through the playlist + if 'entries' not in info_dict: + return set() for video in info_dict['entries']: filenames.add(ydl.prepare_filename(video)) else: @@ -229,11 +240,6 @@ def create_basenames_from_ydl_info_dict(self, ydl, info_dict): def generate_ydl_options(self, ydl_progress_hook, - cookie_file=None, - proxy_url=None, - ydl_username=None, - ydl_password=None, - use_download_archive=False, ydl_output_template=None): """ Generate a dictionary that contains options that will be used @@ -241,16 +247,6 @@ def generate_ydl_options(self, :param ydl_progress_hook: A function that will be called during the download process by youtube_dl. 
- :param proxy_url: A proxy url for YoutubeDL. - :param ydl_username: Username that will be used to download the - resources with youtube_dl. - :param ydl_password: Password of the related username, will be - used to download the resources with - youtube_dl. - :param use_download_archive: Record the video url to the download archive. - This will download only videos not listed in - the archive file. Record the IDs of all - downloaded videos in it. :return: A dictionary that contains options that will be used by youtube_dl. """ @@ -263,8 +259,6 @@ def generate_ydl_options(self, 'progress_with_newline': True, 'forcetitle': True, 'continuedl': True, - 'retries': 9001, - 'fragment_retries': 9001, 'forcejson': False, 'writeinfojson': True, 'writedescription': True, @@ -291,42 +285,35 @@ def generate_ydl_options(self, 'progress_hooks': [ydl_progress_hook] } - if cookie_file is not None: - ydl_opts['cookiefile'] = cookie_file - - if proxy_url is not None: - ydl_opts['proxy'] = proxy_url - - if ydl_username is not None: - ydl_opts['username'] = ydl_username - - if ydl_password is not None: - ydl_opts['password'] = ydl_password - - if use_download_archive: - ydl_opts['download_archive'] = os.path.join(self.dir_path['root'], - '.ytdlarchive') - return ydl_opts - def upload_ia(self, videobasename, custom_meta=None): + def upload_ia(self, videobasename, use_upload_archive=False, custom_meta=None): """ Upload video to archive.org. - :param videobasename: A video base name. - :param custom_meta: A custom meta, will be used by internetarchive - library when uploading to archive.org. - :return: A tuple containing item name and metadata used - when uploading to archive.org and whether the item - already exists. + :param videobasename: A video base name. + :param use_upload_archive: Record the video url to the upload archive. + This will upload only videos not listed in + the archive file. Record the IDs of all + uploaded videos in it. 
+ :param custom_meta: A custom meta, will be used by internetarchive + library when uploading to archive.org. + :return: A tuple containing item name and metadata used + when uploading to archive.org and whether the item + already exists. A null item name means upload + didn't happened. """ json_metadata_filepath = videobasename + '.info.json' with open(json_metadata_filepath, 'r', encoding='utf-8') as f: vid_meta = json.load(f) # Exit if video download did not complete, don't upload .part files to IA + # One glob() + fnmatch() is ten times less expensive than 8 globs(), + # (Half a second vs 5 seconds on 250k files, what is significant when resuming large playlists) + filenames = glob.glob(glob.escape(videobasename) + '*') for ext in ['*.part', '*.f303.*', '*.f302.*', '*.ytdl', '*.f251.*', '*.248.*', '*.f247.*', '*.temp']: - if glob.glob(videobasename + ext): + matching = fnmatch.filter(filenames, ext) + if matching: msg = 'Video download incomplete, please re-run or delete video stubs in downloads folder, exiting...' raise Exception(msg) @@ -334,6 +321,12 @@ def upload_ia(self, videobasename, custom_meta=None): metadata = self.create_archive_org_metadata_from_youtubedl_meta( vid_meta) + if use_upload_archive: + ydl = YoutubeDL({'download_archive': os.path.join(self.dir_path['root'], '.iauparchive')}) + if ydl.in_download_archive(vid_meta): + self.logger.debug('Skipping already uploaded video: %s', metadata['title']) + return None, metadata + # Delete empty description file description_file_path = videobasename + '.description' if (os.path.exists(description_file_path) and @@ -352,7 +345,7 @@ def upload_ia(self, videobasename, custom_meta=None): # Upload all files with videobase name: e.g. video.mp4, # video.info.json, video.srt, etc. 
- files_to_upload = glob.glob(videobasename + '*') + files_to_upload = glob.glob(glob.escape(videobasename) + '*') # Upload the item to the Internet Archive item = internetarchive.get_item(itemname) @@ -374,46 +367,36 @@ def upload_ia(self, videobasename, custom_meta=None): print(msg) raise Exception(msg) - item.upload(files_to_upload, metadata=metadata, retries=9001, - request_kwargs=dict(timeout=9001), delete=True, + mod, new_meta = strip_ip_from_meta(vid_meta) + if mod: + with open(json_metadata_filepath, 'w') as f: + json.dump(new_meta, f) + + item.upload(files_to_upload, metadata=metadata, retries=15, + request_kwargs=dict(timeout=60), delete=not use_upload_archive, verbose=self.verbose, access_key=s3_access_key, secret_key=s3_secret_key) + if use_upload_archive: + ydl.record_download_archive(vid_meta) + return itemname, metadata - def archive_urls(self, urls, custom_meta=None, - cookie_file=None, proxy=None, - ydl_username=None, ydl_password=None, - use_download_archive=False, + def download_urls(self, urls, ignore_existing_item=False): """ Download and upload videos from youtube_dl supported sites to archive.org - :param urls: List of url that will be downloaded and uploaded - to archive.org - :param custom_meta: A custom metadata that will be used when - uploading the file with archive.org. - :param cookie_file: A cookie file for YoutubeDL. - :param proxy_url: A proxy url for YoutubeDL. - :param ydl_username: Username that will be used to download the - resources with youtube_dl. - :param ydl_password: Password of the related username, will be used - to download the resources with youtube_dl. - :param use_download_archive: Record the video url to the download archive. - This will download only videos not listed in - the archive file. Record the IDs of all - downloaded videos in it. 
+ :param urls: List of url or local info files that will + be downloaded and uploaded to archive.org :param ignore_existing_item: Ignores the check for existing items on archive.org. :return: Tuple containing identifier and metadata of the file that has been uploaded to archive.org. """ - downloaded_file_basenames = self.get_resource_basenames( - urls, cookie_file, proxy, ydl_username, ydl_password, use_download_archive, - ignore_existing_item) - for basename in downloaded_file_basenames: - identifier, meta = self.upload_ia(basename, custom_meta) - yield identifier, meta + downloaded_file_basenames = self.get_resource_basenames(urls, ignore_existing_item) + self.logger.debug('Archiving files from %d videos: %s', len(downloaded_file_basenames), downloaded_file_basenames) + return downloaded_file_basenames @staticmethod def determine_collection_type(url): diff --git a/tubeup/__main__.py b/tubeup/__main__.py index 0841765..7608f7d 100644 --- a/tubeup/__main__.py +++ b/tubeup/__main__.py @@ -24,8 +24,11 @@ [--proxy ] [--quiet] [--debug] [--use-download-archive] + [--use-upload-archive] [--output ] [--ignore-existing-item] + [--abort-on-error] + [--yt X...] tubeup -h | --help tubeup --version @@ -38,44 +41,72 @@ Options: -h --help Show this screen. - -p --proxy Use a proxy while uploading. - -u --username Provide a username, for sites like Nico Nico Douga. - -p --password Provide a password, for sites like Nico Nico Douga. - -a --use-download-archive Record the video url to the download archive. - This will download only videos not listed in + -p --proxy Deprecated. Shortcut for the corresponding yt-dlp option. + -u --username Deprecated. Shortcut for the corresponding yt-dlp option. + -p --password Deprecated. Shortcut for the corresponding yt-dlp option. + -a --use-download-archive Shortcut for --yt=--download-archive=%s + -U --use-upload-archive Record the video url to the upload archive at %s + This will upload only videos not listed in the archive file. 
Record the IDs of all - downloaded videos in it. + uploaded videos in it. -q --quiet Just print errors. -d --debug Print all logs to stdout. + --abort-on-error Abort after the first failed upload. -o --output Youtube-dlc output template. -i --ignore-existing-item Don't check if an item already exists on archive.org + --yt X... Any option to be passed to underlying yt-dlp. + +Example: + Assuming that *.info.json files are consistent and + that yt-dlp output template led to uniform/predictible file names, + then a way to upload existing files based without triggering new downloads + is to use a combination of the following: + * --output='' + * --use-upload-archive + * --use-download-archive + * --ignore-existing-item + * --yt=--no-playlist + * --yt=--match-filter=!playlist + * --yt=--no-overwrites + """ +import os import sys import docopt import logging import traceback +from yt_dlp import parse_options + import internetarchive import internetarchive.cli from tubeup.TubeUp import TubeUp from tubeup import __version__ +DEFAULT_DOWNLOAD_ARCHIVE = os.path.join(os.path.expanduser('~/.tubeup'), '.ytdlarchive') +DEFAULT_UPLOAD_ARCHIVE = os.path.join(os.path.expanduser('~/.tubeup'), '.iauparchive') def main(): # Parse arguments from file docstring - args = docopt.docopt(__doc__, version=__version__) + args = docopt.docopt(__doc__ % (DEFAULT_DOWNLOAD_ARCHIVE, DEFAULT_UPLOAD_ARCHIVE), + version=__version__) URLs = args[''] - cookie_file = args['--cookies'] - proxy_url = args['--proxy'] - username = args['--username'] - password = args['--password'] + for v in ['--cookies', '--proxy', '--username', '--password']: + if v in args and args[v]: + args['--yt'].append('%s=%s' % (v, args[v])) + + if args['--use-download-archive']: + args['--yt'].append('--download-archive=' + DEFAULT_DOWNLOAD_ARCHIVE) + quiet_mode = args['--quiet'] debug_mode = args['--debug'] - use_download_archive = args['--use-download-archive'] + use_upload_archive = args['--use-upload-archive'] 
ignore_existing_item = args['--ignore-existing-item'] + abort_on_error = args['--abort-on-error'] + parser, opts, all_urls, yt_args = parse_options(args['--yt']) if debug_mode: # Display log messages. @@ -92,28 +123,34 @@ def main(): metadata = internetarchive.cli.argparser.get_args_dict(args['--metadata']) - tu = TubeUp(verbose=not quiet_mode, - output_template=args['--output']) - - try: - for identifier, meta in tu.archive_urls(URLs, metadata, - cookie_file, proxy_url, - username, password, - use_download_archive, - ignore_existing_item): - print('\n:: Upload Finished. Item information:') - print('Title: %s' % meta['title']) - print('Item URL: https://archive.org/details/%s\n' % identifier) - except Exception: - print('\n\033[91m' # Start red color text - 'An exception just occured, if you found this ' - "exception isn't related with any of your connection problem, " - 'please report this issue to ' - 'https://github.com/bibanon/tubeup/issues') - traceback.print_exc() - print('\033[0m') # End the red color text - sys.exit(1) - + tu = TubeUp(verbose=not quiet_mode, output_template=args['--output'], yt_args=yt_args) + downloaded_file_basenames = tu.download_urls(URLs, ignore_existing_item) + + failures = [] + for basename in downloaded_file_basenames: + try: + identifier, meta = tu.upload_ia(basename, use_upload_archive, metadata) + if identifier: + print('\n:: Upload Finished. Item information:') + print('Title: %s' % meta['title']) + print('Item URL: https://archive.org/details/%s\n' % identifier) + else: + print('\n:: Upload skipped. 
Item information:') + print('Title: %s' % meta['title']) + except Exception: + failures.append(basename) + print('\n\033[91m' # Start red color text + 'An exception just occured, if you found this ' + "exception isn't related with any of your connection problem, " + 'please report this issue to ' + 'https://github.com/bibanon/tubeup/issues') + traceback.print_exc() + print('\033[0m') # End the red color text + if abort_on_error: + break + + if len(failures) > 0: + print("Failed uploads:\n" + "\n".join(failures)) if __name__ == '__main__': main() diff --git a/tubeup/utils.py b/tubeup/utils.py index bc12845..de8c3e4 100644 --- a/tubeup/utils.py +++ b/tubeup/utils.py @@ -1,5 +1,6 @@ import os import re +from urllib.parse import urlparse, parse_qs, urlencode EMPTY_ANNOTATION_FILE = ('' @@ -29,3 +30,39 @@ def check_is_file_empty(filepath): return os.stat(filepath).st_size == 0 else: raise FileNotFoundError("Path '%s' doesn't exist" % filepath) + + +def strip_ip_from_url(url): + """ + Strip occurence of IP address as found in path segments like in /ip/1.2.3.4/ + or in an "ip" query-parameter, like in ?ip=1.2.3.4 + """ + u = urlparse(url) + u = u._replace(path=re.sub(r'%26ip%3D[^%]+', r'%26ip%3DREDACTED%', re.sub(r'/ip/[^/]+', r'/ip/REDACTED', u.path))) + if u.query != '': + qs = parse_qs(u.query) + try: + del (qs['ip']) + u = u._replace(query=urlencode(qs, True)) + except KeyError: + pass + return u.geturl() + + +def strip_ip_from_meta(meta): + modified = False + if 'url' in meta: + redacted_url = strip_ip_from_url(meta['url']) + if redacted_url != meta['url']: + meta['url'] = redacted_url + modified = True + + for _format in meta['formats']: + for field in ['manifest_url', 'fragment_base_url', 'url']: + if field in _format: + redacted_url = strip_ip_from_url(_format[field]) + if redacted_url != _format[field]: + _format[field] = redacted_url + modified = True + + return modified, meta