diff --git a/README.md b/README.md
index 9926e38..a8794b6 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,15 @@ Tubeup - a multi-VOD service to Archive.org uploader
It was designed by the [Bibliotheca Anonoma](https://github.com/bibanon/bibanon/wiki) to archive single videos, playlists (see warning below about more than video uploads) or accounts to the Internet Archive.
+## Changes specific to this fork
+- Clean up IP addresses contained in YouTube-generated info files before IA upload
+- Do not abort after a failure but try the next item + saner timeout values
+- Accept arbitrary yt-dlp options (⚠️)
+- Can upload existing resource (⚠️ under certain strict conditions, see --help and a52031c)
+- Can upload existing resource based on a local JSON info file
+- More efficient at processing a large number of files/URLs
+- Broken testsuite (⚠️)
+
## Prerequisites
This script strongly recommends Linux or some sort of POSIX system (such as macOS), preferably from a rented VPS and not your personal machine or phone.
@@ -31,7 +40,7 @@ For Debian/Ubuntu:
At a minimum Python 3.8 and up is required (latest Python preferred).
```
- python3 -m pip install -U pip tubeup
+ python3 -m pip install -U pip git+https://github.com/drzraf/tubeup
```
3. If you don't already have an Internet Archive account, [register for one](https://archive.org/account/login.createaccount.php) to give the script upload privileges.
diff --git a/setup.py b/setup.py
index 1e1e948..d1f9e39 100644
--- a/setup.py
+++ b/setup.py
@@ -26,6 +26,7 @@
},
install_requires=[
'internetarchive',
+ 'urllib3',
'docopt==0.6.2',
'yt-dlp',
]
diff --git a/tests/test_tubeup.py b/tests/test_tubeup.py
index 9983d56..7603e86 100644
--- a/tests/test_tubeup.py
+++ b/tests/test_tubeup.py
@@ -590,3 +590,83 @@ def test_archive_urls(self):
'scanner': SCANNER})]
self.assertEqual(expected_result, result)
+
+ def test_archive_deletion(self):
+ root_path = os.path.join(current_path, 'test_tubeup_rootdir')
+ # Clean up before test
+ shutil.rmtree(root_path, ignore_errors=True)
+
+ tu = TubeUp(dir_path=root_path,
+ ia_config_path=get_testfile_path('ia_config_for_test.ini'))
+
+ videobasename = os.path.join(
+ current_path, 'test_tubeup_rootdir', 'downloads',
+ 'KdsN9YhkDrY')
+
+ copy_testfiles_to_tubeup_rootdir_test()
+ dest = os.path.join(root_path, 'downloads', '*')
+ files_before_upload = glob.glob(dest)
+
+ vid_info = {'mediatype': 'movies',
+ 'creator': 'RelaxingWorld',
+ 'channel': 'http://www.youtube.com/channel/UCWpsozCMdAnfI16rZHQ9XDg',
+ 'collection': 'opensource_movies',
+ 'title': 'Epic Ramadan - Video Background HD1080p',
+ 'description': ('If you enjoy my work, please consider Subscribe to my NEW '
+ 'channel for more videos:\n'
+ 'https://www.youtube.com/MusicForRelaxation?sub_confirmation=1\n'
+ '▷ If you use this video, please put credits to my channel '
+ 'in description:\n'
+ 'Source from RelaxingWorld: https://goo.gl/HsW75m\n'
+ '\n'
+ '▷ Also, do not forget to Subscribe to my channel. Thanks!'),
+ 'date': '2016-06-25',
+ 'year': '2016',
+ 'subject': ('Youtube;video;Film & Animation;Video Background;'
+ 'Footage;Animation;Cinema;Royalty Free Videos;'
+ 'Stock Video Footage;Video Backdrops;'
+ 'Amazing Nature;youtube;HD;1080p;Creative Commons Videos;'
+ 'relaxing music;Ramadan;'),
+ 'originalurl': 'https://www.youtube.com/watch?v=KdsN9YhkDrY',
+ 'licenseurl': '',
+ 'scanner': SCANNER}
+
+ with requests_mock.Mocker() as m:
+ # Mock the request to s3.us.archive.org, so it will respond with
+ # a custom json. The `internetarchive` library sends a GET request
+ # to that url to check that we don't violate the upload limit.
+ m.get('https://s3.us.archive.org',
+ content=b'{"over_limit": 0}',
+ headers={'content-type': 'application/json'})
+
+ m.get('https://archive.org/metadata/youtube-KdsN9YhkDrY',
+ content=b'{}',
+ headers={'content-type': 'application/json'})
+
+ # Mock the PUT requests for internetarchive urls that are defined
+ # in mock_upload_response_by_videobasename(), so this test
+ # doesn't perform an upload to the real archive.org server.
+ mock_upload_response_by_videobasename(
+ m, 'youtube-KdsN9YhkDrY', videobasename)
+
+ # First upload; this one actually gets uploaded...
+ result = list(tu.archive_urls(
+ ['https://www.youtube.com/watch?v=KdsN9YhkDrY'], use_upload_archive=True))
+
+ # ... and returns a remote IA item name
+ expected_result = [('youtube-KdsN9YhkDrY', vid_info)]
+ self.assertEqual(expected_result, result)
+
+ # ... and no file got deleted
+ files_after_upload = glob.glob(dest)
+ self.assertListEqual(files_before_upload, files_after_upload)
+ # ... and an upload-archive file was created
+ self.assertTrue(os.path.exists(os.path.join(root_path, '.iauparchive')))
+
+ # Second upload, nothing was actually uploaded...
+ result = list(tu.archive_urls(
+ ['https://www.youtube.com/watch?v=KdsN9YhkDrY'], use_upload_archive=True))
+
+ # ... and no remote IA item name is returned
+ expected_result = [(None, vid_info)]
+ self.assertEqual(expected_result, result)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 8c409d8..813271a 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,6 +1,13 @@
import unittest
import os
-from tubeup.utils import sanitize_identifier, check_is_file_empty
+import json
+from tubeup.utils import sanitize_identifier, check_is_file_empty, strip_ip_from_meta
+
+current_path = os.path.dirname(os.path.realpath(__file__))
+
+
+def get_testfile_path(name):
+ return os.path.join(current_path, 'test_tubeup_files', name)
class UtilsTest(unittest.TestCase):
@@ -48,3 +55,14 @@ def test_check_is_file_empty_when_file_doesnt_exist(self):
FileNotFoundError,
r"^Path 'file_that_doesnt_exist.txt' doesn't exist$"):
check_is_file_empty('file_that_doesnt_exist.txt')
+
+ def test_strip_ip_from_meta(self):
+ with open(get_testfile_path(
+ 'Mountain_3_-_Video_Background_HD_1080p-6iRV8liah8A.'
+ 'info.json')
+ ) as f:
+ vid_meta = json.load(f)
+ mod, new_meta = strip_ip_from_meta(vid_meta)
+ self.assertTrue(mod)
+ self.assertNotEqual(f.read(), json.dumps(new_meta))
+ self.assertNotRegex(json.dumps(new_meta), r'36\.73\.93\.234')
diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py
index 7dd6801..08fedb1 100644
--- a/tubeup/TubeUp.py
+++ b/tubeup/TubeUp.py
@@ -2,6 +2,7 @@
import sys
import re
import glob
+import fnmatch
import time
import json
import logging
@@ -10,7 +11,7 @@
from internetarchive.config import parse_config_file
from datetime import datetime
from yt_dlp import YoutubeDL
-from .utils import (get_itemname, check_is_file_empty,
+from .utils import (get_itemname, check_is_file_empty, strip_ip_from_meta,
EMPTY_ANNOTATION_FILE)
from logging import getLogger
from urllib.parse import urlparse
@@ -27,7 +28,8 @@ def __init__(self,
verbose=False,
dir_path='~/.tubeup',
ia_config_path=None,
- output_template=None):
+ output_template=None,
+ yt_args=[]):
"""
`tubeup` is a tool to archive YouTube by downloading the videos and
uploading it back to the archive.org.
@@ -41,6 +43,7 @@ def __init__(self,
be used in uploading the file.
:param output_template: A template string that will be used to
generate the output filenames.
+ :param yt_args: Additional parameters passed to yt-dlp.
"""
self.dir_path = dir_path
self.verbose = verbose
@@ -55,6 +58,8 @@ def __init__(self,
if not self.verbose:
self.logger.setLevel(logging.ERROR)
+ self.YDL = self.get_ytdlp_instance(yt_args)
+
@property
def dir_path(self):
return self._dir_path
@@ -82,53 +87,7 @@ def dir_path(self, dir_path):
DOWNLOAD_DIR_NAME)
}
- def get_resource_basenames(self, urls,
- cookie_file=None, proxy_url=None,
- ydl_username=None, ydl_password=None,
- use_download_archive=False,
- ignore_existing_item=False):
- """
- Get resource basenames from an url.
-
- :param urls: A list of urls that will be downloaded with
- youtubedl.
- :param cookie_file: A cookie file for YoutubeDL.
- :param proxy_url: A proxy url for YoutubeDL.
- :param ydl_username: Username that will be used to download the
- resources with youtube_dl.
- :param ydl_password: Password of the related username, will be used
- to download the resources with youtube_dl.
- :param use_download_archive: Record the video url to the download archive.
- This will download only videos not listed in
- the archive file. Record the IDs of all
- downloaded videos in it.
- :param ignore_existing_item: Ignores the check for existing items on archive.org.
- :return: Set of videos basename that has been downloaded.
- """
- downloaded_files_basename = set()
-
- def check_if_ia_item_exists(infodict):
- itemname = get_itemname(infodict)
- item = internetarchive.get_item(itemname)
- if item.exists and self.verbose:
- print("\n:: Item already exists. Not downloading.")
- print('Title: %s' % infodict['title'])
- print('Video URL: %s\n' % infodict['webpage_url'])
- return True
- return False
-
- def ydl_progress_each(entry):
- if not entry:
- self.logger.warning('Video "%s" is not available. Skipping.' % url)
- return
- if ydl.in_download_archive(entry):
- return
- if not check_if_ia_item_exists(entry):
- ydl.extract_info(entry['webpage_url'])
- downloaded_files_basename.update(self.create_basenames_from_ydl_info_dict(ydl, entry))
- else:
- ydl.record_download_archive(entry)
-
+ def get_ytdlp_instance(self, yt_args=[]):
def ydl_progress_hook(d):
if d['status'] == 'downloading' and self.verbose:
if d.get('_total_bytes_str') is not None:
@@ -169,16 +128,60 @@ def ydl_progress_hook(d):
if self.verbose:
print(msg)
- ydl_opts = self.generate_ydl_options(ydl_progress_hook,
- cookie_file, proxy_url,
- ydl_username, ydl_password,
- use_download_archive)
+ ydl_opts = self.generate_ydl_options(ydl_progress_hook)
+
+ # Default yt-dlp options are overridden by tubeup-specific options
+ yt_args.update(ydl_opts)
+
+ return YoutubeDL(yt_args)
+
+
+ def get_resource_basenames(self, urls,
+ ignore_existing_item=False):
+ """
+ Get resource basenames from an url.
+
+ :param urls: A list of urls that will be downloaded with
+ youtubedl (or their corresponding info-files)
+ :param ignore_existing_item: Ignores the check for existing items on archive.org.
+ :return: Set of videos basename that has been downloaded.
+ """
+ downloaded_files_basename = set()
+
+ def check_if_ia_item_exists(infodict):
+ itemname = get_itemname(infodict)
+ item = internetarchive.get_item(itemname)
+ if item.exists and self.verbose:
+ print("\n:: Item already exists. Not downloading.")
+ print('Title: %s' % infodict['title'])
+ print('Video URL: %s\n' % infodict['webpage_url'])
+ return True
+ return False
+
+ def ydl_progress_each(entry):
+ if not entry:
+ self.logger.warning('Video "%s" is not available. Skipping.' % url)
+ return
+ if ydl.in_download_archive(entry):
+ return
+ if not check_if_ia_item_exists(entry):
+ ydl.extract_info(entry['webpage_url'])
+ downloaded_files_basename.update(self.create_basenames_from_ydl_info_dict(ydl, entry))
+ else:
+ ydl.record_download_archive(entry)
- with YoutubeDL(ydl_opts) as ydl:
+ with self.YDL as ydl:
for url in urls:
+ info_dict = {}
if not ignore_existing_item:
- # Get the info dict of the url
- info_dict = ydl.extract_info(url, download=False)
+ if os.path.exists(url):
+ p = ydl.download_with_info_file(url)
+ if p == 0:
+ with open(url, 'r') as f:
+ info_dict = json.load(f)
+ else:
+ # Get the info dict of the url
+ info_dict = ydl.extract_info(url, download=False)
if info_dict.get('_type', 'video') == 'playlist':
for entry in info_dict['entries']:
@@ -186,7 +189,13 @@ def ydl_progress_hook(d):
else:
ydl_progress_each(info_dict)
else:
- info_dict = ydl.extract_info(url)
+ if os.path.exists(url):
+ p = ydl.download_with_info_file(url)
+ if p == 0:
+ with open(url, 'r') as f:
+ info_dict = json.load(f)
+ else:
+ info_dict = ydl.extract_info(url)
downloaded_files_basename.update(self.create_basenames_from_ydl_info_dict(ydl, info_dict))
self.logger.debug(
@@ -213,6 +222,8 @@ def create_basenames_from_ydl_info_dict(self, ydl, info_dict):
if info_type == 'playlist':
# Iterate and get the filenames through the playlist
+ if 'entries' not in info_dict:
+ return set()
for video in info_dict['entries']:
filenames.add(ydl.prepare_filename(video))
else:
@@ -229,11 +240,6 @@ def create_basenames_from_ydl_info_dict(self, ydl, info_dict):
def generate_ydl_options(self,
ydl_progress_hook,
- cookie_file=None,
- proxy_url=None,
- ydl_username=None,
- ydl_password=None,
- use_download_archive=False,
ydl_output_template=None):
"""
Generate a dictionary that contains options that will be used
@@ -241,16 +247,6 @@ def generate_ydl_options(self,
:param ydl_progress_hook: A function that will be called during the
download process by youtube_dl.
- :param proxy_url: A proxy url for YoutubeDL.
- :param ydl_username: Username that will be used to download the
- resources with youtube_dl.
- :param ydl_password: Password of the related username, will be
- used to download the resources with
- youtube_dl.
- :param use_download_archive: Record the video url to the download archive.
- This will download only videos not listed in
- the archive file. Record the IDs of all
- downloaded videos in it.
:return: A dictionary that contains options that will
be used by youtube_dl.
"""
@@ -263,8 +259,6 @@ def generate_ydl_options(self,
'progress_with_newline': True,
'forcetitle': True,
'continuedl': True,
- 'retries': 9001,
- 'fragment_retries': 9001,
'forcejson': False,
'writeinfojson': True,
'writedescription': True,
@@ -291,42 +285,35 @@ def generate_ydl_options(self,
'progress_hooks': [ydl_progress_hook]
}
- if cookie_file is not None:
- ydl_opts['cookiefile'] = cookie_file
-
- if proxy_url is not None:
- ydl_opts['proxy'] = proxy_url
-
- if ydl_username is not None:
- ydl_opts['username'] = ydl_username
-
- if ydl_password is not None:
- ydl_opts['password'] = ydl_password
-
- if use_download_archive:
- ydl_opts['download_archive'] = os.path.join(self.dir_path['root'],
- '.ytdlarchive')
-
return ydl_opts
- def upload_ia(self, videobasename, custom_meta=None):
+ def upload_ia(self, videobasename, use_upload_archive=False, custom_meta=None):
"""
Upload video to archive.org.
- :param videobasename: A video base name.
- :param custom_meta: A custom meta, will be used by internetarchive
- library when uploading to archive.org.
- :return: A tuple containing item name and metadata used
- when uploading to archive.org and whether the item
- already exists.
+ :param videobasename: A video base name.
+ :param use_upload_archive: Record the video url to the upload archive.
+ This will upload only videos not listed in
+ the archive file. Record the IDs of all
+ uploaded videos in it.
+ :param custom_meta: A custom meta, will be used by internetarchive
+ library when uploading to archive.org.
+ :return: A tuple containing item name and metadata used
+ when uploading to archive.org and whether the item
+ already exists. A null item name means the upload
+ didn't happen.
"""
json_metadata_filepath = videobasename + '.info.json'
with open(json_metadata_filepath, 'r', encoding='utf-8') as f:
vid_meta = json.load(f)
# Exit if video download did not complete, don't upload .part files to IA
+ # One glob() + fnmatch() pass is ten times less expensive than 8 glob() calls
+ # (half a second vs 5 seconds on 250k files, which is significant when resuming large playlists)
+ filenames = glob.glob(glob.escape(videobasename) + '*')
for ext in ['*.part', '*.f303.*', '*.f302.*', '*.ytdl', '*.f251.*', '*.248.*', '*.f247.*', '*.temp']:
- if glob.glob(videobasename + ext):
+ matching = fnmatch.filter(filenames, ext)
+ if matching:
msg = 'Video download incomplete, please re-run or delete video stubs in downloads folder, exiting...'
raise Exception(msg)
@@ -334,6 +321,12 @@ def upload_ia(self, videobasename, custom_meta=None):
metadata = self.create_archive_org_metadata_from_youtubedl_meta(
vid_meta)
+ if use_upload_archive:
+ ydl = YoutubeDL({'download_archive': os.path.join(self.dir_path['root'], '.iauparchive')})
+ if ydl.in_download_archive(vid_meta):
+ self.logger.debug('Skipping already uploaded video: %s', metadata['title'])
+ return None, metadata
+
# Delete empty description file
description_file_path = videobasename + '.description'
if (os.path.exists(description_file_path) and
@@ -352,7 +345,7 @@ def upload_ia(self, videobasename, custom_meta=None):
# Upload all files with videobase name: e.g. video.mp4,
# video.info.json, video.srt, etc.
- files_to_upload = glob.glob(videobasename + '*')
+ files_to_upload = glob.glob(glob.escape(videobasename) + '*')
# Upload the item to the Internet Archive
item = internetarchive.get_item(itemname)
@@ -374,46 +367,36 @@ def upload_ia(self, videobasename, custom_meta=None):
print(msg)
raise Exception(msg)
- item.upload(files_to_upload, metadata=metadata, retries=9001,
- request_kwargs=dict(timeout=9001), delete=True,
+ mod, new_meta = strip_ip_from_meta(vid_meta)
+ if mod:
+ with open(json_metadata_filepath, 'w') as f:
+ json.dump(new_meta, f)
+
+ item.upload(files_to_upload, metadata=metadata, retries=15,
+ request_kwargs=dict(timeout=60), delete=not use_upload_archive,
verbose=self.verbose, access_key=s3_access_key,
secret_key=s3_secret_key)
+ if use_upload_archive:
+ ydl.record_download_archive(vid_meta)
+
return itemname, metadata
- def archive_urls(self, urls, custom_meta=None,
- cookie_file=None, proxy=None,
- ydl_username=None, ydl_password=None,
- use_download_archive=False,
+ def download_urls(self, urls,
ignore_existing_item=False):
"""
Download and upload videos from youtube_dl supported sites to
archive.org
- :param urls: List of url that will be downloaded and uploaded
- to archive.org
- :param custom_meta: A custom metadata that will be used when
- uploading the file with archive.org.
- :param cookie_file: A cookie file for YoutubeDL.
- :param proxy_url: A proxy url for YoutubeDL.
- :param ydl_username: Username that will be used to download the
- resources with youtube_dl.
- :param ydl_password: Password of the related username, will be used
- to download the resources with youtube_dl.
- :param use_download_archive: Record the video url to the download archive.
- This will download only videos not listed in
- the archive file. Record the IDs of all
- downloaded videos in it.
+ :param urls: List of url or local info files that will
+ be downloaded and uploaded to archive.org
:param ignore_existing_item: Ignores the check for existing items on archive.org.
:return: Tuple containing identifier and metadata of the
file that has been uploaded to archive.org.
"""
- downloaded_file_basenames = self.get_resource_basenames(
- urls, cookie_file, proxy, ydl_username, ydl_password, use_download_archive,
- ignore_existing_item)
- for basename in downloaded_file_basenames:
- identifier, meta = self.upload_ia(basename, custom_meta)
- yield identifier, meta
+ downloaded_file_basenames = self.get_resource_basenames(urls, ignore_existing_item)
+ self.logger.debug('Archiving files from %d videos: %s', len(downloaded_file_basenames), downloaded_file_basenames)
+ return downloaded_file_basenames
@staticmethod
def determine_collection_type(url):
diff --git a/tubeup/__main__.py b/tubeup/__main__.py
index 0841765..7608f7d 100644
--- a/tubeup/__main__.py
+++ b/tubeup/__main__.py
@@ -24,8 +24,11 @@
[--proxy ]
[--quiet] [--debug]
[--use-download-archive]
+ [--use-upload-archive]
[--output