From d838711ca87eee898b8a6264118c56842287aeca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Sat, 16 Sep 2023 23:14:39 -0300 Subject: [PATCH 01/18] Revert "Remove urllib3 dependency" This reverts commit 09114777c3e599cbe7a393dab6816eb4d71381c9. urllib3 is needed to postprocess/parse/URL for sanitization and privacy purpose (#192) --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 1e1e948..3893c3a 100644 --- a/setup.py +++ b/setup.py @@ -26,6 +26,7 @@ }, install_requires=[ 'internetarchive', + 'urllib3==1.26.13', 'docopt==0.6.2', 'yt-dlp', ] From add619cf92aec85129dd5a8152bb5446b7cb10f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Sat, 16 Sep 2023 23:12:58 -0300 Subject: [PATCH 02/18] IA currently leaks the IP address of the submitter. This is bad. We fix this by carefully redacting the IP address in the JSON fields known to contain it. --- tubeup/TubeUp.py | 7 ++++++- tubeup/utils.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index 7dd6801..98bdae7 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -10,7 +10,7 @@ from internetarchive.config import parse_config_file from datetime import datetime from yt_dlp import YoutubeDL -from .utils import (get_itemname, check_is_file_empty, +from .utils import (get_itemname, check_is_file_empty, strip_ip_from_meta, EMPTY_ANNOTATION_FILE) from logging import getLogger from urllib.parse import urlparse @@ -324,6 +324,11 @@ def upload_ia(self, videobasename, custom_meta=None): with open(json_metadata_filepath, 'r', encoding='utf-8') as f: vid_meta = json.load(f) + mod, new_meta = strip_ip_from_meta(vid_meta) + if mod: + with open(json_metadata_filepath, 'w') as f: + json.dump(new_meta, f) + # Exit if video download did not complete, don't upload .part files to IA for ext in ['*.part', '*.f303.*', '*.f302.*', '*.ytdl', '*.f251.*', '*.248.*', 
'*.f247.*', '*.temp']: if glob.glob(videobasename + ext): diff --git a/tubeup/utils.py b/tubeup/utils.py index bc12845..846c9fb 100644 --- a/tubeup/utils.py +++ b/tubeup/utils.py @@ -1,5 +1,6 @@ import os import re +from urllib.parse import urlparse, parse_qs, urlencode EMPTY_ANNOTATION_FILE = ('' @@ -29,3 +30,39 @@ def check_is_file_empty(filepath): return os.stat(filepath).st_size == 0 else: raise FileNotFoundError("Path '%s' doesn't exist" % filepath) + + +def strip_ip_from_url(url): + """ + Strip occurence of IP address as found in path segments like in /ip/1.2.3.4/ + or in an "ip" query-parameter, like in ?ip=1.2.3.4 + """ + u = urlparse(url) + u = u._replace(path=re.sub(r'/ip/[^/]+', r'/ip/REDACTED', u.path)) + if u.query != '': + qs = parse_qs(u.query) + try: + del(qs['ip']) + u = u._replace(query=urlencode(qs, True)) + except: + pass + return u.geturl() + + +def strip_ip_from_meta(meta): + modified = False + if 'url' in meta: + redacted_url = strip_ip_from_url(meta['url']) + if redacted_url != meta['url']: + meta['url'] = redacted_url + modified = True + + for _format in meta['formats']: + for field in ['manifest_url', 'fragment_base_url', 'url']: + if field in _format: + redacted_url = strip_ip_from_url(_format[field]) + if redacted_url != _format[field]: + _format[field] = redacted_url + modified = True + + return modified, meta From 7cb7b3ec81782f2fef37099dc0c5ef57829f4f1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Sat, 16 Sep 2023 23:38:46 -0300 Subject: [PATCH 03/18] added tests --- tests/test_utils.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 8c409d8..224b9c1 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,7 +1,12 @@ import unittest import os -from tubeup.utils import sanitize_identifier, check_is_file_empty +import json +from tubeup.utils import sanitize_identifier, check_is_file_empty, strip_ip_from_meta +current_path 
= os.path.dirname(os.path.realpath(__file__)) + +def get_testfile_path(name): + return os.path.join(current_path, 'test_tubeup_files', name) class UtilsTest(unittest.TestCase): @@ -48,3 +53,14 @@ def test_check_is_file_empty_when_file_doesnt_exist(self): FileNotFoundError, r"^Path 'file_that_doesnt_exist.txt' doesn't exist$"): check_is_file_empty('file_that_doesnt_exist.txt') + + def test_strip_ip_from_meta(self): + with open(get_testfile_path( + 'Mountain_3_-_Video_Background_HD_1080p-6iRV8liah8A.' + 'info.json') + ) as f: + vid_meta = json.load(f) + mod, new_meta = strip_ip_from_meta(vid_meta) + self.assertTrue(mod) + self.assertNotEqual(f.read(), json.dumps(new_meta)) + self.assertNotRegex(json.dumps(new_meta), r'36\.73\.93\.234') From 402ae19223993bbf74b178e18808e75f7a3f320b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Sun, 17 Sep 2023 01:39:40 -0300 Subject: [PATCH 04/18] flake8 linting --- tests/test_utils.py | 2 ++ tubeup/utils.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 224b9c1..813271a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -5,9 +5,11 @@ current_path = os.path.dirname(os.path.realpath(__file__)) + def get_testfile_path(name): return os.path.join(current_path, 'test_tubeup_files', name) + class UtilsTest(unittest.TestCase): def test_preserve_valid_identifiers(self): diff --git a/tubeup/utils.py b/tubeup/utils.py index 846c9fb..2be5b86 100644 --- a/tubeup/utils.py +++ b/tubeup/utils.py @@ -42,9 +42,9 @@ def strip_ip_from_url(url): if u.query != '': qs = parse_qs(u.query) try: - del(qs['ip']) + del (qs['ip']) u = u._replace(query=urlencode(qs, True)) - except: + except KeyError: pass return u.geturl() From 9c2ae72643c471623f871290e32f962c0962cc35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Mon, 18 Sep 2023 18:28:23 -0300 Subject: [PATCH 05/18] Dont leak ip address (#310) * Revert "Remove urllib3 dependency" This 
reverts commit 09114777c3e599cbe7a393dab6816eb4d71381c9. urllib3 is needed to postprocess/parse/URL for sanitization and privacy purpose (#192) * IA currently leaks the IP address of the submitter. This is bad. We fix this by carefully redacting the IP address in the JSON fields known to contain it. * added tests * flake8 linting --- setup.py | 1 + tests/test_utils.py | 20 +++++++++++++++++++- tubeup/TubeUp.py | 7 ++++++- tubeup/utils.py | 37 +++++++++++++++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 1e1e948..3893c3a 100644 --- a/setup.py +++ b/setup.py @@ -26,6 +26,7 @@ }, install_requires=[ 'internetarchive', + 'urllib3==1.26.13', 'docopt==0.6.2', 'yt-dlp', ] diff --git a/tests/test_utils.py b/tests/test_utils.py index 8c409d8..813271a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,13 @@ import unittest import os -from tubeup.utils import sanitize_identifier, check_is_file_empty +import json +from tubeup.utils import sanitize_identifier, check_is_file_empty, strip_ip_from_meta + +current_path = os.path.dirname(os.path.realpath(__file__)) + + +def get_testfile_path(name): + return os.path.join(current_path, 'test_tubeup_files', name) class UtilsTest(unittest.TestCase): @@ -48,3 +55,14 @@ def test_check_is_file_empty_when_file_doesnt_exist(self): FileNotFoundError, r"^Path 'file_that_doesnt_exist.txt' doesn't exist$"): check_is_file_empty('file_that_doesnt_exist.txt') + + def test_strip_ip_from_meta(self): + with open(get_testfile_path( + 'Mountain_3_-_Video_Background_HD_1080p-6iRV8liah8A.' 
+ 'info.json') + ) as f: + vid_meta = json.load(f) + mod, new_meta = strip_ip_from_meta(vid_meta) + self.assertTrue(mod) + self.assertNotEqual(f.read(), json.dumps(new_meta)) + self.assertNotRegex(json.dumps(new_meta), r'36\.73\.93\.234') diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index 7dd6801..98bdae7 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -10,7 +10,7 @@ from internetarchive.config import parse_config_file from datetime import datetime from yt_dlp import YoutubeDL -from .utils import (get_itemname, check_is_file_empty, +from .utils import (get_itemname, check_is_file_empty, strip_ip_from_meta, EMPTY_ANNOTATION_FILE) from logging import getLogger from urllib.parse import urlparse @@ -324,6 +324,11 @@ def upload_ia(self, videobasename, custom_meta=None): with open(json_metadata_filepath, 'r', encoding='utf-8') as f: vid_meta = json.load(f) + mod, new_meta = strip_ip_from_meta(vid_meta) + if mod: + with open(json_metadata_filepath, 'w') as f: + json.dump(new_meta, f) + # Exit if video download did not complete, don't upload .part files to IA for ext in ['*.part', '*.f303.*', '*.f302.*', '*.ytdl', '*.f251.*', '*.248.*', '*.f247.*', '*.temp']: if glob.glob(videobasename + ext): diff --git a/tubeup/utils.py b/tubeup/utils.py index bc12845..2be5b86 100644 --- a/tubeup/utils.py +++ b/tubeup/utils.py @@ -1,5 +1,6 @@ import os import re +from urllib.parse import urlparse, parse_qs, urlencode EMPTY_ANNOTATION_FILE = ('' @@ -29,3 +30,39 @@ def check_is_file_empty(filepath): return os.stat(filepath).st_size == 0 else: raise FileNotFoundError("Path '%s' doesn't exist" % filepath) + + +def strip_ip_from_url(url): + """ + Strip occurence of IP address as found in path segments like in /ip/1.2.3.4/ + or in an "ip" query-parameter, like in ?ip=1.2.3.4 + """ + u = urlparse(url) + u = u._replace(path=re.sub(r'/ip/[^/]+', r'/ip/REDACTED', u.path)) + if u.query != '': + qs = parse_qs(u.query) + try: + del (qs['ip']) + u = u._replace(query=urlencode(qs, True)) + 
except KeyError: + pass + return u.geturl() + + +def strip_ip_from_meta(meta): + modified = False + if 'url' in meta: + redacted_url = strip_ip_from_url(meta['url']) + if redacted_url != meta['url']: + meta['url'] = redacted_url + modified = True + + for _format in meta['formats']: + for field in ['manifest_url', 'fragment_base_url', 'url']: + if field in _format: + redacted_url = strip_ip_from_url(_format[field]) + if redacted_url != _format[field]: + _format[field] = redacted_url + modified = True + + return modified, meta From a7d8b42f3a09c143c271ba7e874e63a406c2698c Mon Sep 17 00:00:00 2001 From: Paul Henning Date: Mon, 18 Sep 2023 18:12:52 -0400 Subject: [PATCH 06/18] URLlib3 dependency version lift - Use latest urllib3 instead of that specific version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3893c3a..d1f9e39 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ }, install_requires=[ 'internetarchive', - 'urllib3==1.26.13', + 'urllib3', 'docopt==0.6.2', 'yt-dlp', ] From d0b9df63cbe199627cf03f78ef45e011e0c03baf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Sun, 17 Sep 2023 01:22:25 -0300 Subject: [PATCH 07/18] Idempotency: Keep a local "archive" (index) of files successfully uploaded the way yt-dlp keep track of downloaded items. - Amend #19 by adding optional idempotency between runs: While concurrent-instances can still rely on a "clean" download directory (#19) a single upload node can use --use-upload-archive can avoid source files removal. Subsequent runs of `tubeup` will omit files already uploaded. 
- Fixes #23 (and part of #233) - NB: Ability for upload_ia() to return None, paves the way to fix #36 or #109 --- tests/test_tubeup.py | 80 ++++++++++++++++++++++++++++++++++++++++++++ tubeup/TubeUp.py | 37 +++++++++++++++----- tubeup/__main__.py | 17 ++++++++-- 3 files changed, 122 insertions(+), 12 deletions(-) diff --git a/tests/test_tubeup.py b/tests/test_tubeup.py index 9983d56..7603e86 100644 --- a/tests/test_tubeup.py +++ b/tests/test_tubeup.py @@ -590,3 +590,83 @@ def test_archive_urls(self): 'scanner': SCANNER})] self.assertEqual(expected_result, result) + + def test_archive_deletion(self): + root_path = os.path.join(current_path, 'test_tubeup_rootdir') + # Clean up before test + shutil.rmtree(root_path, ignore_errors=True) + + tu = TubeUp(dir_path=root_path, + ia_config_path=get_testfile_path('ia_config_for_test.ini')) + + videobasename = os.path.join( + current_path, 'test_tubeup_rootdir', 'downloads', + 'KdsN9YhkDrY') + + copy_testfiles_to_tubeup_rootdir_test() + dest = os.path.join(root_path, 'downloads', '*') + files_before_upload = glob.glob(dest) + + vid_info = {'mediatype': 'movies', + 'creator': 'RelaxingWorld', + 'channel': 'http://www.youtube.com/channel/UCWpsozCMdAnfI16rZHQ9XDg', + 'collection': 'opensource_movies', + 'title': 'Epic Ramadan - Video Background HD1080p', + 'description': ('If you enjoy my work, please consider Subscribe to my NEW ' + 'channel for more videos:
' + 'https://www.youtube.com/MusicForRelaxation?sub_confirmation=1
' + '▷ If you use this video, please put credits to my channel ' + 'in description:
' + 'Source from RelaxingWorld: https://goo.gl/HsW75m
' + '
' + '▷ Also, do not forget to Subscribe to my channel. Thanks!'), + 'date': '2016-06-25', + 'year': '2016', + 'subject': ('Youtube;video;Film & Animation;Video Background;' + 'Footage;Animation;Cinema;Royalty Free Videos;' + 'Stock Video Footage;Video Backdrops;' + 'Amazing Nature;youtube;HD;1080p;Creative Commons Videos;' + 'relaxing music;Ramadan;'), + 'originalurl': 'https://www.youtube.com/watch?v=KdsN9YhkDrY', + 'licenseurl': '', + 'scanner': SCANNER} + + with requests_mock.Mocker() as m: + # Mock the request to s3.us.archive.org, so it will responds + # a custom json. `internetarchive` library sends GET request to + # that url to check that we don't violate the upload limit. + m.get('https://s3.us.archive.org', + content=b'{"over_limit": 0}', + headers={'content-type': 'application/json'}) + + m.get('https://archive.org/metadata/youtube-KdsN9YhkDrY', + content=b'{}', + headers={'content-type': 'application/json'}) + + # Mock the PUT requests for internetarchive urls that defined + # in mock_upload_response_by_videobasename(), so this test + # doesn't perform upload to the real archive.org server. + mock_upload_response_by_videobasename( + m, 'youtube-KdsN9YhkDrY', videobasename) + + # First upload, this actually get uploaded... + result = list(tu.archive_urls( + ['https://www.youtube.com/watch?v=KdsN9YhkDrY'], use_upload_archive=True)) + + # ... and returns a remote IA item name + expected_result = [('youtube-KdsN9YhkDrY', vid_info)] + self.assertEqual(expected_result, result) + + # ... and no file got deleted + files_after_upload = glob.glob(dest) + self.assertListEqual(files_before_upload, files_after_upload) + # ... and a upload-archive file was created + self.assertTrue(os.path.exists(os.path.join(root_path, '.iauparchive'))) + + # Second upload, nothing was actually uploaded... + result = list(tu.archive_urls( + ['https://www.youtube.com/watch?v=KdsN9YhkDrY'], use_upload_archive=True)) + + # ... 
and no remote IA item name is returned + expected_result = [(None, vid_info)] + self.assertEqual(expected_result, result) diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index 98bdae7..f4223f9 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -309,16 +309,21 @@ def generate_ydl_options(self, return ydl_opts - def upload_ia(self, videobasename, custom_meta=None): + def upload_ia(self, videobasename, use_upload_archive=False, custom_meta=None): """ Upload video to archive.org. - :param videobasename: A video base name. - :param custom_meta: A custom meta, will be used by internetarchive - library when uploading to archive.org. - :return: A tuple containing item name and metadata used - when uploading to archive.org and whether the item - already exists. + :param videobasename: A video base name. + :param use_upload_archive: Record the video url to the upload archive. + This will upload only videos not listed in + the archive file. Record the IDs of all + uploaded videos in it. + :param custom_meta: A custom meta, will be used by internetarchive + library when uploading to archive.org. + :return: A tuple containing item name and metadata used + when uploading to archive.org and whether the item + already exists. A null item name means upload + didn't happened. 
""" json_metadata_filepath = videobasename + '.info.json' with open(json_metadata_filepath, 'r', encoding='utf-8') as f: @@ -339,6 +344,12 @@ def upload_ia(self, videobasename, custom_meta=None): metadata = self.create_archive_org_metadata_from_youtubedl_meta( vid_meta) + if use_upload_archive: + ydl = YoutubeDL({'download_archive': os.path.join(self.dir_path['root'], '.iauparchive')}) + if ydl.in_download_archive(vid_meta): + self.logger.debug('Skipping already uploaded video: %s', metadata['title']) + return None, metadata + # Delete empty description file description_file_path = videobasename + '.description' if (os.path.exists(description_file_path) and @@ -380,16 +391,20 @@ def upload_ia(self, videobasename, custom_meta=None): raise Exception(msg) item.upload(files_to_upload, metadata=metadata, retries=9001, - request_kwargs=dict(timeout=9001), delete=True, + request_kwargs=dict(timeout=9001), delete=not use_upload_archive, verbose=self.verbose, access_key=s3_access_key, secret_key=s3_secret_key) + if use_upload_archive: + ydl.record_download_archive(vid_meta) + return itemname, metadata def archive_urls(self, urls, custom_meta=None, cookie_file=None, proxy=None, ydl_username=None, ydl_password=None, use_download_archive=False, + use_upload_archive=False, ignore_existing_item=False): """ Download and upload videos from youtube_dl supported sites to @@ -409,6 +424,10 @@ def archive_urls(self, urls, custom_meta=None, This will download only videos not listed in the archive file. Record the IDs of all downloaded videos in it. + :param use_upload_archive: Record the video url to the upload archive. + This will upload only videos not listed in + the archive file. Record the IDs of all + uploaded videos in it. :param ignore_existing_item: Ignores the check for existing items on archive.org. :return: Tuple containing identifier and metadata of the file that has been uploaded to archive.org. 
@@ -417,7 +436,7 @@ def archive_urls(self, urls, custom_meta=None, urls, cookie_file, proxy, ydl_username, ydl_password, use_download_archive, ignore_existing_item) for basename in downloaded_file_basenames: - identifier, meta = self.upload_ia(basename, custom_meta) + identifier, meta = self.upload_ia(basename, use_upload_archive, custom_meta) yield identifier, meta @staticmethod diff --git a/tubeup/__main__.py b/tubeup/__main__.py index 0841765..4bd199a 100644 --- a/tubeup/__main__.py +++ b/tubeup/__main__.py @@ -24,6 +24,7 @@ [--proxy ] [--quiet] [--debug] [--use-download-archive] + [--use-upload-archive] [--output ] [--ignore-existing-item] tubeup -h | --help @@ -45,6 +46,10 @@ This will download only videos not listed in the archive file. Record the IDs of all downloaded videos in it. + -U --use-upload-archive Record the video url to the upload archive. + This will upload only videos not listed in + the archive file. Record the IDs of all + uploaded videos in it. -q --quiet Just print errors. -d --debug Print all logs to stdout. -o --output Youtube-dlc output template. @@ -75,6 +80,7 @@ def main(): quiet_mode = args['--quiet'] debug_mode = args['--debug'] use_download_archive = args['--use-download-archive'] + use_upload_archive = args['--use-upload-archive'] ignore_existing_item = args['--ignore-existing-item'] if debug_mode: @@ -100,10 +106,15 @@ def main(): cookie_file, proxy_url, username, password, use_download_archive, + use_upload_archive, ignore_existing_item): - print('\n:: Upload Finished. Item information:') - print('Title: %s' % meta['title']) - print('Item URL: https://archive.org/details/%s\n' % identifier) + if identifier: + print('\n:: Upload Finished. Item information:') + print('Title: %s' % meta['title']) + print('Item URL: https://archive.org/details/%s\n' % identifier) + else: + print('\n:: Upload skipped. 
Item information:') + print('Title: %s' % meta['title']) except Exception: print('\n\033[91m' # Start red color text 'An exception just occured, if you found this ' From 545c8fe161229e926330fbf39b559516017311ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Thu, 21 Sep 2023 23:06:21 -0300 Subject: [PATCH 08/18] support arbitrary options from yt-dlp, fix #212 --- tubeup/TubeUp.py | 20 ++++++++++++++------ tubeup/__main__.py | 8 +++++++- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index f4223f9..f1f7a42 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -86,12 +86,13 @@ def get_resource_basenames(self, urls, cookie_file=None, proxy_url=None, ydl_username=None, ydl_password=None, use_download_archive=False, - ignore_existing_item=False): + ignore_existing_item=False, + yt_args=[]): """ Get resource basenames from an url. :param urls: A list of urls that will be downloaded with - youtubedl. + youtubedl (or their corresponding info-files) :param cookie_file: A cookie file for YoutubeDL. :param proxy_url: A proxy url for YoutubeDL. :param ydl_username: Username that will be used to download the @@ -103,6 +104,7 @@ def get_resource_basenames(self, urls, the archive file. Record the IDs of all downloaded videos in it. :param ignore_existing_item: Ignores the check for existing items on archive.org. + :param yt_args: Additional parameters passed to yt-dlp. :return: Set of videos basename that has been downloaded. 
""" downloaded_files_basename = set() @@ -174,6 +176,10 @@ def ydl_progress_hook(d): ydl_username, ydl_password, use_download_archive) + # Default yt-dlp overriden by tubeup specific options + yt_args.update(ydl_opts) + ydl_opts = yt_args + with YoutubeDL(ydl_opts) as ydl: for url in urls: if not ignore_existing_item: @@ -405,13 +411,14 @@ def archive_urls(self, urls, custom_meta=None, ydl_username=None, ydl_password=None, use_download_archive=False, use_upload_archive=False, - ignore_existing_item=False): + ignore_existing_item=False, + yt_args=[]): """ Download and upload videos from youtube_dl supported sites to archive.org - :param urls: List of url that will be downloaded and uploaded - to archive.org + :param urls: List of url or local info files that will + be downloaded and uploaded to archive.org :param custom_meta: A custom metadata that will be used when uploading the file with archive.org. :param cookie_file: A cookie file for YoutubeDL. @@ -429,12 +436,13 @@ def archive_urls(self, urls, custom_meta=None, the archive file. Record the IDs of all uploaded videos in it. :param ignore_existing_item: Ignores the check for existing items on archive.org. + :param yt_args: Additional parameters passed to yt-dlp. :return: Tuple containing identifier and metadata of the file that has been uploaded to archive.org. """ downloaded_file_basenames = self.get_resource_basenames( urls, cookie_file, proxy, ydl_username, ydl_password, use_download_archive, - ignore_existing_item) + ignore_existing_item, yt_args) for basename in downloaded_file_basenames: identifier, meta = self.upload_ia(basename, use_upload_archive, custom_meta) yield identifier, meta diff --git a/tubeup/__main__.py b/tubeup/__main__.py index 4bd199a..70ac6e0 100644 --- a/tubeup/__main__.py +++ b/tubeup/__main__.py @@ -27,6 +27,7 @@ [--use-upload-archive] [--output ] [--ignore-existing-item] + [--yt X...] tubeup -h | --help tubeup --version @@ -54,6 +55,7 @@ -d --debug Print all logs to stdout. 
-o --output Youtube-dlc output template. -i --ignore-existing-item Don't check if an item already exists on archive.org + --yt X... Any option to be passed to underlying yt-dlp. """ import sys @@ -61,6 +63,8 @@ import logging import traceback +from yt_dlp import parse_options + import internetarchive import internetarchive.cli @@ -82,6 +86,7 @@ def main(): use_download_archive = args['--use-download-archive'] use_upload_archive = args['--use-upload-archive'] ignore_existing_item = args['--ignore-existing-item'] + parser, opts, all_urls, yt_args = parse_options(args['--yt']) if debug_mode: # Display log messages. @@ -107,7 +112,8 @@ def main(): username, password, use_download_archive, use_upload_archive, - ignore_existing_item): + ignore_existing_item, + yt_args): if identifier: print('\n:: Upload Finished. Item information:') print('Title: %s' % meta['title']) From a52031cf902ab9096aab655c978f6f57bc1b874f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Thu, 21 Sep 2023 23:07:35 -0300 Subject: [PATCH 09/18] Ability, from an existing info-file, to upload existing associated media files. If `tubeup` is passed the path of info.json files instead of URL, then it will parse them and, if `--output` was set correctly (same value as previous `yt-dlp`) then recover the basename of the local files associated with this video. If used intelligently, especially in conjunction with: --ignore-existing-item # needed, for now, to avoid the download-archive codepath --yt=--format=... # use the same download format --yt=--no-overwrites ... then existing files may be uploaded without having been re-downloaded/overwritten. Usage isn't foolproof. Use at your own risks. 
--- tubeup/TubeUp.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index f1f7a42..37967b7 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -182,9 +182,16 @@ def ydl_progress_hook(d): with YoutubeDL(ydl_opts) as ydl: for url in urls: + info_dict = {} if not ignore_existing_item: - # Get the info dict of the url - info_dict = ydl.extract_info(url, download=False) + if os.path.exists(url): + p = ydl.download_with_info_file(url) + if p == 0: + with open(url, 'r') as f: + info_dict = json.load(f) + else: + # Get the info dict of the url + info_dict = ydl.extract_info(url, download=False) if info_dict.get('_type', 'video') == 'playlist': for entry in info_dict['entries']: @@ -192,7 +199,13 @@ def ydl_progress_hook(d): else: ydl_progress_each(info_dict) else: - info_dict = ydl.extract_info(url) + if os.path.exists(url): + p = ydl.download_with_info_file(url) + if p == 0: + with open(url, 'r') as f: + info_dict = json.load(f) + else: + info_dict = ydl.extract_info(url) downloaded_files_basename.update(self.create_basenames_from_ydl_info_dict(ydl, info_dict)) self.logger.debug( @@ -443,6 +456,8 @@ def archive_urls(self, urls, custom_meta=None, downloaded_file_basenames = self.get_resource_basenames( urls, cookie_file, proxy, ydl_username, ydl_password, use_download_archive, ignore_existing_item, yt_args) + self.logger.debug('Archiving files from %d videos: %s', len(downloaded_file_basenames), downloaded_file_basenames) + for basename in downloaded_file_basenames: identifier, meta = self.upload_ia(basename, use_upload_archive, custom_meta) yield identifier, meta From 40f173069a6f29a8957f7ee454e815eb01f9b066 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Fri, 22 Sep 2023 10:06:58 -0300 Subject: [PATCH 10/18] missed another occurence inside m3u8 manifest URL --- tubeup/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tubeup/utils.py 
b/tubeup/utils.py index 2be5b86..de8c3e4 100644 --- a/tubeup/utils.py +++ b/tubeup/utils.py @@ -38,7 +38,7 @@ def strip_ip_from_url(url): or in an "ip" query-parameter, like in ?ip=1.2.3.4 """ u = urlparse(url) - u = u._replace(path=re.sub(r'/ip/[^/]+', r'/ip/REDACTED', u.path)) + u = u._replace(path=re.sub(r'%26ip%3D[^%]+', r'%26ip%3DREDACTED%', re.sub(r'/ip/[^/]+', r'/ip/REDACTED', u.path))) if u.query != '': qs = parse_qs(u.query) try: From 33a85732e53e0ceb7467c611db77d997e1dc5143 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Tue, 26 Sep 2023 17:09:25 -0300 Subject: [PATCH 11/18] Fail-safe when multiple items are queued for upload --- tubeup/TubeUp.py | 14 ++------------ tubeup/__main__.py | 46 ++++++++++++++++++++++++++++------------------ 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index 37967b7..b784f9f 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -419,11 +419,10 @@ def upload_ia(self, videobasename, use_upload_archive=False, custom_meta=None): return itemname, metadata - def archive_urls(self, urls, custom_meta=None, + def download_urls(self, urls, cookie_file=None, proxy=None, ydl_username=None, ydl_password=None, use_download_archive=False, - use_upload_archive=False, ignore_existing_item=False, yt_args=[]): """ @@ -432,8 +431,6 @@ def archive_urls(self, urls, custom_meta=None, :param urls: List of url or local info files that will be downloaded and uploaded to archive.org - :param custom_meta: A custom metadata that will be used when - uploading the file with archive.org. :param cookie_file: A cookie file for YoutubeDL. :param proxy_url: A proxy url for YoutubeDL. :param ydl_username: Username that will be used to download the @@ -444,10 +441,6 @@ def archive_urls(self, urls, custom_meta=None, This will download only videos not listed in the archive file. Record the IDs of all downloaded videos in it. 
- :param use_upload_archive: Record the video url to the upload archive. - This will upload only videos not listed in - the archive file. Record the IDs of all - uploaded videos in it. :param ignore_existing_item: Ignores the check for existing items on archive.org. :param yt_args: Additional parameters passed to yt-dlp. :return: Tuple containing identifier and metadata of the @@ -457,10 +450,7 @@ def archive_urls(self, urls, custom_meta=None, urls, cookie_file, proxy, ydl_username, ydl_password, use_download_archive, ignore_existing_item, yt_args) self.logger.debug('Archiving files from %d videos: %s', len(downloaded_file_basenames), downloaded_file_basenames) - - for basename in downloaded_file_basenames: - identifier, meta = self.upload_ia(basename, use_upload_archive, custom_meta) - yield identifier, meta + return downloaded_file_basenames @staticmethod def determine_collection_type(url): diff --git a/tubeup/__main__.py b/tubeup/__main__.py index 70ac6e0..dac678c 100644 --- a/tubeup/__main__.py +++ b/tubeup/__main__.py @@ -27,6 +27,7 @@ [--use-upload-archive] [--output ] [--ignore-existing-item] + [--abort-on-error] [--yt X...] tubeup -h | --help tubeup --version @@ -53,6 +54,7 @@ uploaded videos in it. -q --quiet Just print errors. -d --debug Print all logs to stdout. + --abort-on-error Abort after the first failed upload. -o --output Youtube-dlc output template. -i --ignore-existing-item Don't check if an item already exists on archive.org --yt X... Any option to be passed to underlying yt-dlp. 
@@ -86,6 +88,7 @@ def main(): use_download_archive = args['--use-download-archive'] use_upload_archive = args['--use-upload-archive'] ignore_existing_item = args['--ignore-existing-item'] + abort_on_error = args['--abort-on-error'] parser, opts, all_urls, yt_args = parse_options(args['--yt']) if debug_mode: @@ -106,14 +109,17 @@ def main(): tu = TubeUp(verbose=not quiet_mode, output_template=args['--output']) - try: - for identifier, meta in tu.archive_urls(URLs, metadata, - cookie_file, proxy_url, - username, password, - use_download_archive, - use_upload_archive, - ignore_existing_item, - yt_args): + downloaded_file_basenames = tu.download_urls(URLs, + cookie_file, proxy_url, + username, password, + use_download_archive, + ignore_existing_item, + yt_args) + + failures = [] + for basename in downloaded_file_basenames: + try: + identifier, meta = tu.upload_ia(basename, use_upload_archive, metadata) if identifier: print('\n:: Upload Finished. Item information:') print('Title: %s' % meta['title']) @@ -121,16 +127,20 @@ def main(): else: print('\n:: Upload skipped. 
Item information:') print('Title: %s' % meta['title']) - except Exception: - print('\n\033[91m' # Start red color text - 'An exception just occured, if you found this ' - "exception isn't related with any of your connection problem, " - 'please report this issue to ' - 'https://github.com/bibanon/tubeup/issues') - traceback.print_exc() - print('\033[0m') # End the red color text - sys.exit(1) - + except Exception: + failures.append(basename) + print('\n\033[91m' # Start red color text + 'An exception just occured, if you found this ' + "exception isn't related with any of your connection problem, " + 'please report this issue to ' + 'https://github.com/bibanon/tubeup/issues') + traceback.print_exc() + print('\033[0m') # End the red color text + if abort_on_error: + break + + if len(failures) > 0: + print("Failed uploads:\n" + "\n".join(failures)) if __name__ == '__main__': main() From 8777dfad2b46a361a0b3c04bcc42cf94b7c83041 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Tue, 26 Sep 2023 17:18:26 -0300 Subject: [PATCH 12/18] network: Use sane (or at least default) values for timeouts --- tubeup/TubeUp.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index b784f9f..22685c2 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -282,8 +282,6 @@ def generate_ydl_options(self, 'progress_with_newline': True, 'forcetitle': True, 'continuedl': True, - 'retries': 9001, - 'fragment_retries': 9001, 'forcejson': False, 'writeinfojson': True, 'writedescription': True, @@ -409,8 +407,8 @@ def upload_ia(self, videobasename, use_upload_archive=False, custom_meta=None): print(msg) raise Exception(msg) - item.upload(files_to_upload, metadata=metadata, retries=9001, - request_kwargs=dict(timeout=9001), delete=not use_upload_archive, + item.upload(files_to_upload, metadata=metadata, retries=15, + request_kwargs=dict(timeout=60), delete=not use_upload_archive, verbose=self.verbose, 
access_key=s3_access_key, secret_key=s3_secret_key) From ff0c21fd93d2bf48634a28b1abf962f0bceb8081 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Tue, 26 Sep 2023 17:19:10 -0300 Subject: [PATCH 13/18] playlist: Omit empty playlists --- tubeup/TubeUp.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index 22685c2..406571d 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -232,6 +232,8 @@ def create_basenames_from_ydl_info_dict(self, ydl, info_dict): if info_type == 'playlist': # Iterate and get the filenames through the playlist + if 'entries' not in info_dict: + return set() for video in info_dict['entries']: filenames.add(ydl.prepare_filename(video)) else: From e962f216dc52bd05224ced7b2c936c2a4904254f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Tue, 26 Sep 2023 18:23:11 -0300 Subject: [PATCH 14/18] cmd: Further simplify option handling --- tubeup/TubeUp.py | 65 ++-------------------------------------------- tubeup/__main__.py | 52 +++++++++++++++++++++++-------------- 2 files changed, 34 insertions(+), 83 deletions(-) diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index 406571d..343721a 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -83,9 +83,6 @@ def dir_path(self, dir_path): } def get_resource_basenames(self, urls, - cookie_file=None, proxy_url=None, - ydl_username=None, ydl_password=None, - use_download_archive=False, ignore_existing_item=False, yt_args=[]): """ @@ -93,16 +90,6 @@ def get_resource_basenames(self, urls, :param urls: A list of urls that will be downloaded with youtubedl (or their corresponding info-files) - :param cookie_file: A cookie file for YoutubeDL. - :param proxy_url: A proxy url for YoutubeDL. - :param ydl_username: Username that will be used to download the - resources with youtube_dl. - :param ydl_password: Password of the related username, will be used - to download the resources with youtube_dl. 
- :param use_download_archive: Record the video url to the download archive. - This will download only videos not listed in - the archive file. Record the IDs of all - downloaded videos in it. :param ignore_existing_item: Ignores the check for existing items on archive.org. :param yt_args: Additional parameters passed to yt-dlp. :return: Set of videos basename that has been downloaded. @@ -171,10 +158,7 @@ def ydl_progress_hook(d): if self.verbose: print(msg) - ydl_opts = self.generate_ydl_options(ydl_progress_hook, - cookie_file, proxy_url, - ydl_username, ydl_password, - use_download_archive) + ydl_opts = self.generate_ydl_options(ydl_progress_hook) # Default yt-dlp overriden by tubeup specific options yt_args.update(ydl_opts) @@ -250,11 +234,6 @@ def create_basenames_from_ydl_info_dict(self, ydl, info_dict): def generate_ydl_options(self, ydl_progress_hook, - cookie_file=None, - proxy_url=None, - ydl_username=None, - ydl_password=None, - use_download_archive=False, ydl_output_template=None): """ Generate a dictionary that contains options that will be used @@ -262,16 +241,6 @@ def generate_ydl_options(self, :param ydl_progress_hook: A function that will be called during the download process by youtube_dl. - :param proxy_url: A proxy url for YoutubeDL. - :param ydl_username: Username that will be used to download the - resources with youtube_dl. - :param ydl_password: Password of the related username, will be - used to download the resources with - youtube_dl. - :param use_download_archive: Record the video url to the download archive. - This will download only videos not listed in - the archive file. Record the IDs of all - downloaded videos in it. :return: A dictionary that contains options that will be used by youtube_dl. 
""" @@ -310,22 +279,6 @@ def generate_ydl_options(self, 'progress_hooks': [ydl_progress_hook] } - if cookie_file is not None: - ydl_opts['cookiefile'] = cookie_file - - if proxy_url is not None: - ydl_opts['proxy'] = proxy_url - - if ydl_username is not None: - ydl_opts['username'] = ydl_username - - if ydl_password is not None: - ydl_opts['password'] = ydl_password - - if use_download_archive: - ydl_opts['download_archive'] = os.path.join(self.dir_path['root'], - '.ytdlarchive') - return ydl_opts def upload_ia(self, videobasename, use_upload_archive=False, custom_meta=None): @@ -420,9 +373,6 @@ def upload_ia(self, videobasename, use_upload_archive=False, custom_meta=None): return itemname, metadata def download_urls(self, urls, - cookie_file=None, proxy=None, - ydl_username=None, ydl_password=None, - use_download_archive=False, ignore_existing_item=False, yt_args=[]): """ @@ -431,24 +381,13 @@ def download_urls(self, urls, :param urls: List of url or local info files that will be downloaded and uploaded to archive.org - :param cookie_file: A cookie file for YoutubeDL. - :param proxy_url: A proxy url for YoutubeDL. - :param ydl_username: Username that will be used to download the - resources with youtube_dl. - :param ydl_password: Password of the related username, will be used - to download the resources with youtube_dl. - :param use_download_archive: Record the video url to the download archive. - This will download only videos not listed in - the archive file. Record the IDs of all - downloaded videos in it. :param ignore_existing_item: Ignores the check for existing items on archive.org. :param yt_args: Additional parameters passed to yt-dlp. :return: Tuple containing identifier and metadata of the file that has been uploaded to archive.org. 
""" downloaded_file_basenames = self.get_resource_basenames( - urls, cookie_file, proxy, ydl_username, ydl_password, use_download_archive, - ignore_existing_item, yt_args) + urls, ignore_existing_item, yt_args) self.logger.debug('Archiving files from %d videos: %s', len(downloaded_file_basenames), downloaded_file_basenames) return downloaded_file_basenames diff --git a/tubeup/__main__.py b/tubeup/__main__.py index dac678c..17dd6af 100644 --- a/tubeup/__main__.py +++ b/tubeup/__main__.py @@ -41,14 +41,11 @@ Options: -h --help Show this screen. - -p --proxy Use a proxy while uploading. - -u --username Provide a username, for sites like Nico Nico Douga. - -p --password Provide a password, for sites like Nico Nico Douga. - -a --use-download-archive Record the video url to the download archive. - This will download only videos not listed in - the archive file. Record the IDs of all - downloaded videos in it. - -U --use-upload-archive Record the video url to the upload archive. + -p --proxy Deprecated. Shortcut for the corresponding yt-dlp option. + -u --username Deprecated. Shortcut for the corresponding yt-dlp option. + -p --password Deprecated. Shortcut for the corresponding yt-dlp option. + -a --use-download-archive Shortcut for --yt=--download-archive=%s + -U --use-upload-archive Record the video url to the upload archive at %s This will upload only videos not listed in the archive file. Record the IDs of all uploaded videos in it. @@ -58,8 +55,23 @@ -o --output Youtube-dlc output template. -i --ignore-existing-item Don't check if an item already exists on archive.org --yt X... Any option to be passed to underlying yt-dlp. 
+ +Example: + Assuming that *.info.json files are consistent and + that yt-dlp output template led to uniform/predictable file names, + then a way to upload existing files without triggering new downloads + is to use a combination of the following: + * --output='' + * --use-upload-archive + * --use-download-archive + * --ignore-existing-item + * --yt=--no-playlist + * --yt=--match-filter=!playlist + * --yt=--no-overwrites + """ +import os import sys import docopt import logging @@ -73,19 +85,24 @@ from tubeup.TubeUp import TubeUp from tubeup import __version__ +DEFAULT_DOWNLOAD_ARCHIVE = os.path.join(os.path.expanduser('~/.tubeup'), '.ytdlarchive') +DEFAULT_UPLOAD_ARCHIVE = os.path.join(os.path.expanduser('~/.tubeup'), '.iauparchive') def main(): # Parse arguments from file docstring - args = docopt.docopt(__doc__, version=__version__) + args = docopt.docopt(__doc__ % (DEFAULT_DOWNLOAD_ARCHIVE, DEFAULT_UPLOAD_ARCHIVE), + version=__version__) URLs = args[''] - cookie_file = args['--cookies'] - proxy_url = args['--proxy'] - username = args['--username'] - password = args['--password'] + for v in ['--cookies', '--proxy', '--username', '--password']: + if v in args and args[v]: + args['--yt'].append('%s=%s' % (v, args[v])) + + if args['--use-download-archive']: + args['--yt'].append('--download-archive=' + DEFAULT_DOWNLOAD_ARCHIVE) + quiet_mode = args['--quiet'] debug_mode = args['--debug'] - use_download_archive = args['--use-download-archive'] use_upload_archive = args['--use-upload-archive'] ignore_existing_item = args['--ignore-existing-item'] abort_on_error = args['--abort-on-error'] @@ -109,12 +126,7 @@ def main(): tu = TubeUp(verbose=not quiet_mode, output_template=args['--output']) - downloaded_file_basenames = tu.download_urls(URLs, - cookie_file, proxy_url, - username, password, - use_download_archive, - ignore_existing_item, - yt_args) + downloaded_file_basenames = tu.download_urls(URLs, ignore_existing_item, yt_args) failures = [] for basename in
downloaded_file_basenames: From 4aaa6e7172ed5238dfe0276dd438d3abd2534da2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Tue, 26 Sep 2023 18:35:50 -0300 Subject: [PATCH 15/18] perfs: Do not reinstance over and over an (identical) yt-dlp object --- tubeup/TubeUp.py | 90 +++++++++++++++++++++++----------------------- tubeup/__main__.py | 6 ++-- 2 files changed, 48 insertions(+), 48 deletions(-) diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index 343721a..b8af440 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -27,7 +27,8 @@ def __init__(self, verbose=False, dir_path='~/.tubeup', ia_config_path=None, - output_template=None): + output_template=None, + yt_args=[]): """ `tubeup` is a tool to archive YouTube by downloading the videos and uploading it back to the archive.org. @@ -41,6 +42,7 @@ def __init__(self, be used in uploading the file. :param output_template: A template string that will be used to generate the output filenames. + :param yt_args: Additional parameters passed to yt-dlp. """ self.dir_path = dir_path self.verbose = verbose @@ -55,6 +57,8 @@ def __init__(self, if not self.verbose: self.logger.setLevel(logging.ERROR) + self.YDL = self.get_ytdlp_instance(yt_args) + @property def dir_path(self): return self._dir_path @@ -82,42 +86,7 @@ def dir_path(self, dir_path): DOWNLOAD_DIR_NAME) } - def get_resource_basenames(self, urls, - ignore_existing_item=False, - yt_args=[]): - """ - Get resource basenames from an url. - - :param urls: A list of urls that will be downloaded with - youtubedl (or their corresponding info-files) - :param ignore_existing_item: Ignores the check for existing items on archive.org. - :param yt_args: Additional parameters passed to yt-dlp. - :return: Set of videos basename that has been downloaded. 
- """ - downloaded_files_basename = set() - - def check_if_ia_item_exists(infodict): - itemname = get_itemname(infodict) - item = internetarchive.get_item(itemname) - if item.exists and self.verbose: - print("\n:: Item already exists. Not downloading.") - print('Title: %s' % infodict['title']) - print('Video URL: %s\n' % infodict['webpage_url']) - return True - return False - - def ydl_progress_each(entry): - if not entry: - self.logger.warning('Video "%s" is not available. Skipping.' % url) - return - if ydl.in_download_archive(entry): - return - if not check_if_ia_item_exists(entry): - ydl.extract_info(entry['webpage_url']) - downloaded_files_basename.update(self.create_basenames_from_ydl_info_dict(ydl, entry)) - else: - ydl.record_download_archive(entry) - + def get_ytdlp_instance(self, yt_args=[]): def ydl_progress_hook(d): if d['status'] == 'downloading' and self.verbose: if d.get('_total_bytes_str') is not None: @@ -162,9 +131,45 @@ def ydl_progress_hook(d): # Default yt-dlp overriden by tubeup specific options yt_args.update(ydl_opts) - ydl_opts = yt_args - with YoutubeDL(ydl_opts) as ydl: + return YoutubeDL(yt_args) + + + def get_resource_basenames(self, urls, + ignore_existing_item=False): + """ + Get resource basenames from an url. + + :param urls: A list of urls that will be downloaded with + youtubedl (or their corresponding info-files) + :param ignore_existing_item: Ignores the check for existing items on archive.org. + :return: Set of videos basename that has been downloaded. + """ + downloaded_files_basename = set() + + def check_if_ia_item_exists(infodict): + itemname = get_itemname(infodict) + item = internetarchive.get_item(itemname) + if item.exists and self.verbose: + print("\n:: Item already exists. Not downloading.") + print('Title: %s' % infodict['title']) + print('Video URL: %s\n' % infodict['webpage_url']) + return True + return False + + def ydl_progress_each(entry): + if not entry: + self.logger.warning('Video "%s" is not available. 
Skipping.' % url) + return + if ydl.in_download_archive(entry): + return + if not check_if_ia_item_exists(entry): + ydl.extract_info(entry['webpage_url']) + downloaded_files_basename.update(self.create_basenames_from_ydl_info_dict(ydl, entry)) + else: + ydl.record_download_archive(entry) + + with self.YDL as ydl: for url in urls: info_dict = {} if not ignore_existing_item: @@ -373,8 +378,7 @@ def upload_ia(self, videobasename, use_upload_archive=False, custom_meta=None): return itemname, metadata def download_urls(self, urls, - ignore_existing_item=False, - yt_args=[]): + ignore_existing_item=False): """ Download and upload videos from youtube_dl supported sites to archive.org @@ -382,12 +386,10 @@ def download_urls(self, urls, :param urls: List of url or local info files that will be downloaded and uploaded to archive.org :param ignore_existing_item: Ignores the check for existing items on archive.org. - :param yt_args: Additional parameters passed to yt-dlp. :return: Tuple containing identifier and metadata of the file that has been uploaded to archive.org. 
""" - downloaded_file_basenames = self.get_resource_basenames( - urls, ignore_existing_item, yt_args) + downloaded_file_basenames = self.get_resource_basenames(urls, ignore_existing_item) self.logger.debug('Archiving files from %d videos: %s', len(downloaded_file_basenames), downloaded_file_basenames) return downloaded_file_basenames diff --git a/tubeup/__main__.py b/tubeup/__main__.py index 17dd6af..7608f7d 100644 --- a/tubeup/__main__.py +++ b/tubeup/__main__.py @@ -123,10 +123,8 @@ def main(): metadata = internetarchive.cli.argparser.get_args_dict(args['--metadata']) - tu = TubeUp(verbose=not quiet_mode, - output_template=args['--output']) - - downloaded_file_basenames = tu.download_urls(URLs, ignore_existing_item, yt_args) + tu = TubeUp(verbose=not quiet_mode, output_template=args['--output'], yt_args=yt_args) + downloaded_file_basenames = tu.download_urls(URLs, ignore_existing_item) failures = [] for basename in downloaded_file_basenames: From e32cee9b74dc51f1514b9162c3adbed764574d74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Tue, 26 Sep 2023 19:00:20 -0300 Subject: [PATCH 16/18] perfs: Improve discard-logic of temporary files (and actually escape video basename) --- tubeup/TubeUp.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index b8af440..7f90101 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -2,6 +2,7 @@ import sys import re import glob +import fnmatch import time import json import logging @@ -312,8 +313,12 @@ def upload_ia(self, videobasename, use_upload_archive=False, custom_meta=None): json.dump(new_meta, f) # Exit if video download did not complete, don't upload .part files to IA + # One glob() + fnmatch() is ten times less expensive than 8 globs(), + # (Half a second vs 5 seconds on 250k files, what is significant when resuming large playlists) + filenames = glob.glob(glob.escape(videobasename) + '*') for ext in ['*.part', '*.f303.*', '*.f302.*', 
'*.ytdl', '*.f251.*', '*.248.*', '*.f247.*', '*.temp']: - if glob.glob(videobasename + ext): + matching = fnmatch.filter(filenames, ext) + if matching: msg = 'Video download incomplete, please re-run or delete video stubs in downloads folder, exiting...' raise Exception(msg) @@ -345,7 +350,7 @@ def upload_ia(self, videobasename, use_upload_archive=False, custom_meta=None): # Upload all files with videobase name: e.g. video.mp4, # video.info.json, video.srt, etc. - files_to_upload = glob.glob(videobasename + '*') + files_to_upload = glob.glob(glob.escape(videobasename) + '*') # Upload the item to the Internet Archive item = internetarchive.get_item(itemname) From 6f9cc3022090c86fa415f465121283757cf8ecfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Tue, 26 Sep 2023 19:04:26 -0300 Subject: [PATCH 17/18] add619cf92 follow-up: Replace JSON info-file if it's actually going to be uploaded --- tubeup/TubeUp.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py index 7f90101..08fedb1 100644 --- a/tubeup/TubeUp.py +++ b/tubeup/TubeUp.py @@ -307,11 +307,6 @@ def upload_ia(self, videobasename, use_upload_archive=False, custom_meta=None): with open(json_metadata_filepath, 'r', encoding='utf-8') as f: vid_meta = json.load(f) - mod, new_meta = strip_ip_from_meta(vid_meta) - if mod: - with open(json_metadata_filepath, 'w') as f: - json.dump(new_meta, f) - # Exit if video download did not complete, don't upload .part files to IA # One glob() + fnmatch() is ten times less expensive than 8 globs(), # (Half a second vs 5 seconds on 250k files, what is significant when resuming large playlists) @@ -372,6 +367,11 @@ def upload_ia(self, videobasename, use_upload_archive=False, custom_meta=None): print(msg) raise Exception(msg) + mod, new_meta = strip_ip_from_meta(vid_meta) + if mod: + with open(json_metadata_filepath, 'w') as f: + json.dump(new_meta, f) + item.upload(files_to_upload, metadata=metadata, 
retries=15, request_kwargs=dict(timeout=60), delete=not use_upload_archive, verbose=self.verbose, access_key=s3_access_key, From 055302bc21c4d47a071c27d4d7000067b6b1e7be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Tue, 26 Sep 2023 19:23:19 -0300 Subject: [PATCH 18/18] doc: README.md, list changes --- README.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9926e38..a8794b6 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,15 @@ Tubeup - a multi-VOD service to Archive.org uploader It was designed by the [Bibliotheca Anonoma](https://github.com/bibanon/bibanon/wiki) to archive single videos, playlists (see warning below about more than video uploads) or accounts to the Internet Archive. +## Changes specific to this fork +- Clean-up IP addresses contained by Youtube-generated info files before IA upload +- Do not abort after a failure but try the next item + saner timeout values +- Accept arbitrary yt-dlp options (⚠️) +- Can upload existing resource (⚠️ under certain strict condition, see --help and a52031c) +- Can upload existing resource based on a local JSON info file +- More efficient at processing a large number of files/URLs +- Broken testsuite (⚠️) + ## Prerequisites This script strongly recommends Linux or some sort of POSIX system (such as macOS), preferably from a rented VPS and not your personal machine or phone. @@ -31,7 +40,7 @@ For Debian/Ubuntu: At a minimum Python 3.8 and up is required (latest Python preferred). ``` - python3 -m pip install -U pip tubeup + python3 -m pip install -U pip git+https://github.com/drzraf/tubeup ``` 3. If you don't already have an Internet Archive account, [register for one](https://archive.org/account/login.createaccount.php) to give the script upload privileges.