From add619cf92aec85129dd5a8152bb5446b7cb10f2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= <raphael.droz@gmail.com>
Date: Sat, 16 Sep 2023 23:12:58 -0300
Subject: [PATCH] IA currently leaks the IP address of the submitter. This is
 bad.

We fix this by carefully redacting the IP address in the JSON fields known to contain it.
---
 tubeup/TubeUp.py |  7 ++++++-
 tubeup/utils.py  | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/tubeup/TubeUp.py b/tubeup/TubeUp.py
index 7dd6801..98bdae7 100644
--- a/tubeup/TubeUp.py
+++ b/tubeup/TubeUp.py
@@ -10,7 +10,7 @@
 from internetarchive.config import parse_config_file
 from datetime import datetime
 from yt_dlp import YoutubeDL
-from .utils import (get_itemname, check_is_file_empty,
+from .utils import (get_itemname, check_is_file_empty, strip_ip_from_meta,
                     EMPTY_ANNOTATION_FILE)
 from logging import getLogger
 from urllib.parse import urlparse
@@ -324,6 +324,11 @@ def upload_ia(self, videobasename, custom_meta=None):
         with open(json_metadata_filepath, 'r', encoding='utf-8') as f:
             vid_meta = json.load(f)
 
+        mod, new_meta = strip_ip_from_meta(vid_meta)
+        if mod:
+            with open(json_metadata_filepath, 'w') as f:
+                json.dump(new_meta, f)
+
         # Exit if video download did not complete, don't upload .part files to IA
         for ext in ['*.part', '*.f303.*', '*.f302.*', '*.ytdl', '*.f251.*', '*.248.*', '*.f247.*', '*.temp']:
             if glob.glob(videobasename + ext):
diff --git a/tubeup/utils.py b/tubeup/utils.py
index bc12845..846c9fb 100644
--- a/tubeup/utils.py
+++ b/tubeup/utils.py
@@ -1,5 +1,6 @@
 import os
 import re
+from urllib.parse import urlparse, parse_qs, urlencode
 
 
 EMPTY_ANNOTATION_FILE = ('<?xml version="1.0" encoding="UTF-8" ?>'
@@ -29,3 +30,39 @@ def check_is_file_empty(filepath):
         return os.stat(filepath).st_size == 0
     else:
         raise FileNotFoundError("Path '%s' doesn't exist" % filepath)
+
+
+def strip_ip_from_url(url):
+    """
+    Strip occurrences of an IP address found in path segments like /ip/1.2.3.4/
+    or in an "ip" query-parameter, like in ?ip=1.2.3.4
+    """
+    u = urlparse(url)
+    u = u._replace(path=re.sub(r'/ip/[^/]+', r'/ip/REDACTED', u.path))
+    if u.query:
+        # keep_blank_values: do not lose unrelated empty parameters ("&foo=")
+        qs = parse_qs(u.query, keep_blank_values=True)
+        # Re-encode the query only if an "ip" parameter was actually removed,
+        # leaving every other parameter (and its repeated values) in place.
+        if qs.pop('ip', None) is not None:
+            u = u._replace(query=urlencode(qs, True))
+    return u.geturl()
+
+
+def strip_ip_from_meta(meta):
+    """Redact IPs from URLs in `meta` (in place); return (modified, meta)."""
+    modified = False
+    if meta.get('url'):
+        redacted_url = strip_ip_from_url(meta['url'])
+        if redacted_url != meta['url']:
+            meta['url'] = redacted_url
+            modified = True
+    # A missing or null 'formats' (or URL field) means nothing to redact there.
+    for _format in meta.get('formats') or []:
+        for field in ['manifest_url', 'fragment_base_url', 'url']:
+            if _format.get(field):
+                redacted_url = strip_ip_from_url(_format[field])
+                if redacted_url != _format[field]:
+                    _format[field] = redacted_url
+                    modified = True
+    return modified, meta