From add619cf92aec85129dd5a8152bb5446b7cb10f2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= <raphael.droz@gmail.com>
Date: Sat, 16 Sep 2023 23:12:58 -0300
Subject: [PATCH] IA currently leaks the IP address of the submitter. This is
 bad. We fix this by carefully redacting the IP address in the JSON fields
 known to contain it.

---
 tubeup/TubeUp.py |  7 ++++++-
 tubeup/utils.py  | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py
index 7dd6801..98bdae7 100644
--- a/tubeup/TubeUp.py
+++ b/tubeup/TubeUp.py
@@ -10,7 +10,7 @@
 from internetarchive.config import parse_config_file
 from datetime import datetime
 from yt_dlp import YoutubeDL
-from .utils import (get_itemname, check_is_file_empty,
+from .utils import (get_itemname, check_is_file_empty, strip_ip_from_meta,
                     EMPTY_ANNOTATION_FILE)
 from logging import getLogger
 from urllib.parse import urlparse
@@ -324,6 +324,11 @@ def upload_ia(self, videobasename, custom_meta=None):
         with open(json_metadata_filepath, 'r', encoding='utf-8') as f:
             vid_meta = json.load(f)
 
+        mod, new_meta = strip_ip_from_meta(vid_meta)
+        if mod:
+            with open(json_metadata_filepath, 'w', encoding='utf-8') as f:
+                json.dump(new_meta, f)
+
         # Exit if video download did not complete, don't upload .part files to IA
         for ext in ['*.part', '*.f303.*', '*.f302.*', '*.ytdl', '*.f251.*', '*.248.*', '*.f247.*', '*.temp']:
             if glob.glob(videobasename + ext):
diff --git a/tubeup/utils.py b/tubeup/utils.py
index bc12845..846c9fb 100644
--- a/tubeup/utils.py
+++ b/tubeup/utils.py
@@ -1,5 +1,6 @@
 import os
 import re
+from urllib.parse import urlparse, parse_qs, urlencode
 
 
 EMPTY_ANNOTATION_FILE = ('<?xml version="1.0" encoding="UTF-8" ?>'
@@ -29,3 +30,59 @@ def check_is_file_empty(filepath):
         return os.stat(filepath).st_size == 0
     else:
         raise FileNotFoundError("Path '%s' doesn't exist" % filepath)
+
+
+def strip_ip_from_url(url):
+    """
+    Strip occurrences of an IP address from a URL.
+
+    The address is redacted when found in a path segment, like in
+    /ip/1.2.3.4/, and removed when found in an "ip" query-parameter,
+    like in ?ip=1.2.3.4
+
+    :param url: URL string to sanitize.
+    :return: The sanitized URL. The query-string is only rewritten when
+             it contains an "ip" parameter.
+    """
+    u = urlparse(url)
+    u = u._replace(path=re.sub(r'/ip/[^/]+', r'/ip/REDACTED', u.path))
+    if u.query:
+        # keep_blank_values: do not drop unrelated "key=" parameters
+        qs = parse_qs(u.query, keep_blank_values=True)
+        # Only rebuild the query-string if an "ip" parameter was removed
+        if qs.pop('ip', None) is not None:
+            u = u._replace(query=urlencode(qs, doseq=True))
+    return u.geturl()
+
+
+def strip_ip_from_meta(meta):
+    """
+    Redact IP addresses from the URL fields of a video metadata dict.
+
+    The top-level "url" and, for each entry of "formats", the
+    "manifest_url", "fragment_base_url" and "url" fields are sanitized
+    with strip_ip_from_url(). The dict is modified in place.
+
+    :param meta: Metadata dict (as loaded from the .info.json file).
+    :return: A (modified, meta) tuple where modified tells whether at
+             least one URL was changed.
+    """
+    modified = False
+    if isinstance(meta.get('url'), str):
+        redacted_url = strip_ip_from_url(meta['url'])
+        if redacted_url != meta['url']:
+            meta['url'] = redacted_url
+            modified = True
+
+    # "formats" may be absent or null: there is nothing to iterate then
+    for _format in meta.get('formats') or []:
+        for field in ('manifest_url', 'fragment_base_url', 'url'):
+            # Skip missing or null fields
+            if not isinstance(_format.get(field), str):
+                continue
+            redacted_url = strip_ip_from_url(_format[field])
+            if redacted_url != _format[field]:
+                _format[field] = redacted_url
+                modified = True
+
+    return modified, meta