Skip to content

Commit

Permalink
Idempotency: Keep a local "archive" (index) of files successfully upl…
Browse files Browse the repository at this point in the history
…oaded the way

  yt-dlp keeps track of downloaded items.

- Amend #19 by adding optional idempotency between runs:
  While concurrent instances can still rely on a "clean" download directory (#19),
  a single upload node can use --use-upload-archive to avoid removal of source files.
  Subsequent runs of `tubeup` will skip files that were already uploaded.

- Fixes #23 (and part of #233)
- NB: The ability for upload_ia() to return a None item name paves the way to fix #36 or #109
  • Loading branch information
drzraf committed Sep 17, 2023
1 parent 2655580 commit feaa124
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 12 deletions.
80 changes: 80 additions & 0 deletions tests/test_tubeup.py
Original file line number Diff line number Diff line change
Expand Up @@ -590,3 +590,83 @@ def test_archive_urls(self):
'scanner': SCANNER})]

self.assertEqual(expected_result, result)

def test_archive_deletion(self):
    """
    Check that `use_upload_archive=True` makes `archive_urls` idempotent.

    The same URL is archived twice against a mocked archive.org:

    * the first call must yield the IA item name, leave the files in the
      downloads directory untouched and create the `.iauparchive` file in
      the root directory;
    * the second call must yield `None` as item name (upload skipped)
      along with the same metadata.
    """
    root_path = os.path.join(current_path, 'test_tubeup_rootdir')
    # Clean up before test
    shutil.rmtree(root_path, ignore_errors=True)

    tu = TubeUp(dir_path=root_path,
                ia_config_path=get_testfile_path('ia_config_for_test.ini'))

    videobasename = os.path.join(root_path, 'downloads', 'KdsN9YhkDrY')

    copy_testfiles_to_tubeup_rootdir_test()
    dest = os.path.join(root_path, 'downloads', '*')
    # glob ordering depends on the file system, so sort the listing to
    # make the before/after comparison order-independent.
    files_before_upload = sorted(glob.glob(dest))

    vid_info = {'mediatype': 'movies',
                'creator': 'RelaxingWorld',
                'channel': 'http://www.youtube.com/channel/UCWpsozCMdAnfI16rZHQ9XDg',
                'collection': 'opensource_movies',
                'title': 'Epic Ramadan - Video Background HD1080p',
                'description': ('If you enjoy my work, please consider Subscribe to my NEW '
                                'channel for more videos: <br>'
                                'https://www.youtube.com/MusicForRelaxation?sub_confirmation=1 <br>'
                                '▷ If you use this video, please put credits to my channel '
                                'in description: <br>'
                                'Source from RelaxingWorld: https://goo.gl/HsW75m<br>'
                                '<br>'
                                '▷ Also, do not forget to Subscribe to my channel. Thanks!'),
                'date': '2016-06-25',
                'year': '2016',
                'subject': ('Youtube;video;Film & Animation;Video Background;'
                            'Footage;Animation;Cinema;Royalty Free Videos;'
                            'Stock Video Footage;Video Backdrops;'
                            'Amazing Nature;youtube;HD;1080p;Creative Commons Videos;'
                            'relaxing music;Ramadan;'),
                'originalurl': 'https://www.youtube.com/watch?v=KdsN9YhkDrY',
                'licenseurl': '',
                'scanner': SCANNER}

    with requests_mock.Mocker() as m:
        # Mock the request to s3.us.archive.org, so it will respond with
        # a custom json. `internetarchive` library sends GET request to
        # that url to check that we don't violate the upload limit.
        m.get('https://s3.us.archive.org',
              content=b'{"over_limit": 0}',
              headers={'content-type': 'application/json'})

        m.get('https://archive.org/metadata/youtube-KdsN9YhkDrY',
              content=b'{}',
              headers={'content-type': 'application/json'})

        # Mock the PUT requests for internetarchive urls that are defined
        # in mock_upload_response_by_videobasename(), so this test
        # doesn't perform upload to the real archive.org server.
        mock_upload_response_by_videobasename(
            m, 'youtube-KdsN9YhkDrY', videobasename)

        # First upload, this actually gets uploaded...
        result = list(tu.archive_urls(
            ['https://www.youtube.com/watch?v=KdsN9YhkDrY'], use_upload_archive=True))

        # ... and returns a remote IA item name
        expected_result = [('youtube-KdsN9YhkDrY', vid_info)]
        self.assertEqual(expected_result, result)

        # ... and no file got deleted
        files_after_upload = sorted(glob.glob(dest))
        self.assertListEqual(files_before_upload, files_after_upload)
        # ... and an upload-archive file was created
        self.assertTrue(os.path.exists(os.path.join(root_path, '.iauparchive')))

        # Second upload, nothing was actually uploaded...
        result = list(tu.archive_urls(
            ['https://www.youtube.com/watch?v=KdsN9YhkDrY'], use_upload_archive=True))

        # ... and no remote IA item name is returned
        expected_result = [(None, vid_info)]
        self.assertEqual(expected_result, result)
37 changes: 28 additions & 9 deletions tubeup/TubeUp.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,16 +309,21 @@ def generate_ydl_options(self,

return ydl_opts

def upload_ia(self, videobasename, custom_meta=None):
def upload_ia(self, videobasename, use_upload_archive=False, custom_meta=None):
"""
Upload video to archive.org.
:param videobasename: A video base name.
:param custom_meta: A custom meta, will be used by internetarchive
library when uploading to archive.org.
:return: A tuple containing item name and metadata used
when uploading to archive.org and whether the item
already exists.
:param videobasename: A video base name.
:param use_upload_archive: Record the video url to the upload archive.
This will upload only videos not listed in
the archive file. Record the IDs of all
uploaded videos in it.
:param custom_meta: A custom meta, will be used by internetarchive
library when uploading to archive.org.
:return: A tuple containing item name and metadata used
when uploading to archive.org and whether the item
already exists. A null item name means the upload
did not happen.
"""
json_metadata_filepath = videobasename + '.info.json'
with open(json_metadata_filepath, 'r', encoding='utf-8') as f:
Expand All @@ -334,6 +339,12 @@ def upload_ia(self, videobasename, custom_meta=None):
metadata = self.create_archive_org_metadata_from_youtubedl_meta(
vid_meta)

if use_upload_archive:
ydl = YoutubeDL({'download_archive': os.path.join(self.dir_path['root'], '.iauparchive')})
if ydl.in_download_archive(vid_meta):
self.logger.debug('Skipping already uploaded video: %s', metadata['title'])
return None, metadata

# Delete empty description file
description_file_path = videobasename + '.description'
if (os.path.exists(description_file_path) and
Expand Down Expand Up @@ -375,16 +386,20 @@ def upload_ia(self, videobasename, custom_meta=None):
raise Exception(msg)

item.upload(files_to_upload, metadata=metadata, retries=9001,
request_kwargs=dict(timeout=9001), delete=True,
request_kwargs=dict(timeout=9001), delete=not use_upload_archive,
verbose=self.verbose, access_key=s3_access_key,
secret_key=s3_secret_key)

if use_upload_archive:
ydl.record_download_archive(vid_meta)

return itemname, metadata

def archive_urls(self, urls, custom_meta=None,
cookie_file=None, proxy=None,
ydl_username=None, ydl_password=None,
use_download_archive=False,
use_upload_archive=False,
ignore_existing_item=False):
"""
Download and upload videos from youtube_dl supported sites to
Expand All @@ -404,6 +419,10 @@ def archive_urls(self, urls, custom_meta=None,
This will download only videos not listed in
the archive file. Record the IDs of all
downloaded videos in it.
:param use_upload_archive: Record the video url to the upload archive.
This will upload only videos not listed in
the archive file. Record the IDs of all
uploaded videos in it.
:param ignore_existing_item: Ignores the check for existing items on archive.org.
:return: Tuple containing identifier and metadata of the
file that has been uploaded to archive.org.
Expand All @@ -412,7 +431,7 @@ def archive_urls(self, urls, custom_meta=None,
urls, cookie_file, proxy, ydl_username, ydl_password, use_download_archive,
ignore_existing_item)
for basename in downloaded_file_basenames:
identifier, meta = self.upload_ia(basename, custom_meta)
identifier, meta = self.upload_ia(basename, use_upload_archive, custom_meta)
yield identifier, meta

@staticmethod
Expand Down
17 changes: 14 additions & 3 deletions tubeup/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
[--proxy <prox>]
[--quiet] [--debug]
[--use-download-archive]
[--use-upload-archive]
[--output <output>]
[--ignore-existing-item]
tubeup -h | --help
Expand All @@ -45,6 +46,10 @@
This will download only videos not listed in
the archive file. Record the IDs of all
downloaded videos in it.
-U --use-upload-archive Record the video url to the upload archive.
This will upload only videos not listed in
the archive file. Record the IDs of all
uploaded videos in it.
-q --quiet Just print errors.
-d --debug Print all logs to stdout.
-o --output <output> Youtube-dlc output template.
Expand Down Expand Up @@ -75,6 +80,7 @@ def main():
quiet_mode = args['--quiet']
debug_mode = args['--debug']
use_download_archive = args['--use-download-archive']
use_upload_archive = args['--use-upload-archive']
ignore_existing_item = args['--ignore-existing-item']

if debug_mode:
Expand All @@ -100,10 +106,15 @@ def main():
cookie_file, proxy_url,
username, password,
use_download_archive,
use_upload_archive,
ignore_existing_item):
print('\n:: Upload Finished. Item information:')
print('Title: %s' % meta['title'])
print('Item URL: https://archive.org/details/%s\n' % identifier)
if identifier:
print('\n:: Upload Finished. Item information:')
print('Title: %s' % meta['title'])
print('Item URL: https://archive.org/details/%s\n' % identifier)
else:
print('\n:: Upload skipped. Item information:')
print('Title: %s' % meta['title'])
except Exception:
print('\n\033[91m' # Start red color text
'An exception just occured, if you found this '
Expand Down

0 comments on commit feaa124

Please sign in to comment.