Skip to content

Commit

Permalink
IA currently leaks the IP address of the submitter. This is bad.
Browse files Browse the repository at this point in the history
We fix this by carefully redacting the IP address in the JSON fields known to contain it.
  • Loading branch information
drzraf committed Sep 17, 2023
1 parent d838711 commit add619c
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 1 deletion.
7 changes: 6 additions & 1 deletion tubeup/TubeUp.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from internetarchive.config import parse_config_file
from datetime import datetime
from yt_dlp import YoutubeDL
from .utils import (get_itemname, check_is_file_empty,
from .utils import (get_itemname, check_is_file_empty, strip_ip_from_meta,
EMPTY_ANNOTATION_FILE)
from logging import getLogger
from urllib.parse import urlparse
Expand Down Expand Up @@ -324,6 +324,11 @@ def upload_ia(self, videobasename, custom_meta=None):
with open(json_metadata_filepath, 'r', encoding='utf-8') as f:
vid_meta = json.load(f)

mod, new_meta = strip_ip_from_meta(vid_meta)
if mod:
with open(json_metadata_filepath, 'w') as f:
json.dump(new_meta, f)

# Exit if video download did not complete, don't upload .part files to IA
for ext in ['*.part', '*.f303.*', '*.f302.*', '*.ytdl', '*.f251.*', '*.248.*', '*.f247.*', '*.temp']:
if glob.glob(videobasename + ext):
Expand Down
37 changes: 37 additions & 0 deletions tubeup/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import re
from urllib.parse import urlparse, parse_qs, urlencode


EMPTY_ANNOTATION_FILE = ('<?xml version="1.0" encoding="UTF-8" ?>'
Expand Down Expand Up @@ -29,3 +30,39 @@ def check_is_file_empty(filepath):
return os.stat(filepath).st_size == 0
else:
raise FileNotFoundError("Path '%s' doesn't exist" % filepath)


def strip_ip_from_url(url):
    """
    Return `url` with occurrences of the submitter's IP address removed.

    Two locations are handled:
      - a path segment following "/ip/" (like in /ip/1.2.3.4/): the segment
        is replaced by the literal "REDACTED";
      - an "ip" query parameter (like in ?ip=1.2.3.4): the parameter is
        dropped and the remaining parameters are re-encoded.

    The query string is only rebuilt when an "ip" parameter is actually
    present; otherwise it is passed through as parsed by `urlparse`.
    """
    u = urlparse(url)
    u = u._replace(path=re.sub(r'/ip/[^/]+', r'/ip/REDACTED', u.path))
    if u.query:
        # keep_blank_values: without it, rebuilding the query would silently
        # drop unrelated parameters that have an empty value (e.g. "&foo=").
        qs = parse_qs(u.query, keep_blank_values=True)
        # Explicit membership handling instead of a bare `except:` so that
        # unrelated errors are not swallowed.
        if qs.pop('ip', None) is not None:
            # doseq=True: parse_qs maps each key to a list of values.
            u = u._replace(query=urlencode(qs, doseq=True))
    return u.geturl()


def strip_ip_from_meta(meta):
    """
    Redact IP addresses from the URL fields of a video metadata dict.

    The top-level "url" field and, for every entry of "formats", the
    "manifest_url", "fragment_base_url" and "url" fields are passed through
    `strip_ip_from_url`. `meta` is modified in place.

    A missing or null "formats" key is treated as an empty list, and
    fields whose value is not a string (e.g. JSON null) are left untouched.

    Returns a `(modified, meta)` tuple where `modified` is True if at least
    one field was changed, and `meta` is the same (mutated) dict.
    """
    modified = False

    # (container, fields to inspect) pairs: the metadata itself first,
    # then each of its formats.
    targets = [(meta, ('url',))]
    for _format in meta.get('formats') or []:
        targets.append((_format, ('manifest_url', 'fragment_base_url', 'url')))

    for container, fields in targets:
        for field in fields:
            value = container.get(field)
            if not isinstance(value, str):
                continue
            redacted_url = strip_ip_from_url(value)
            if redacted_url != value:
                container[field] = redacted_url
                modified = True

    return modified, meta

0 comments on commit add619c

Please sign in to comment.