diff --git a/requirements.txt b/requirements.txt
index 065a744..95fb19a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,8 +3,10 @@ aiohttp==3.9.3
 aiosignal==1.3.1
 async-timeout==4.0.3
 attrs==23.2.0
+beautifulsoup4==4.12.3
 black==22.10.0
-cachetools==5.3.2
+bs4==0.0.1
+cachetools==5.3.3
 certifi==2022.12.7
 cfgv==3.4.0
 charset-normalizer==2.1.1
@@ -22,13 +24,14 @@ flake8==5.0.4
 frozenlist==1.4.1
 fsspec==2023.10.0
 funcsigs==1.0.2
-future==0.18.3
+future==1.0.0
 google-api-core==2.17.1
 google-api-python-client==2.118.0
-google-auth==2.28.0
+google-auth==2.28.1
 google-auth-httplib2==0.2.0
 googleapis-common-protos==1.62.0
 h11==0.14.0
+html2text==2024.2.26
 httplib2==0.22.0
 huggingface-hub==0.20.3
 identify==2.5.35
@@ -45,11 +48,13 @@ mccabe==0.7.0
 mpmath==1.3.0
 multidict==6.0.5
 multiprocess==0.70.15
+mwclient==0.10.1
 mypy==1.5.1
 mypy-extensions==1.0.0
 networkx==3.2.1
 nodeenv==1.8.0
 numpy==1.26.4
+oauthlib==3.2.2
 outcome==1.3.0.post0
 packaging==23.2
 pandas==2.1.4
@@ -66,23 +71,25 @@ pyasn1==0.5.1
 pyasn1-modules==0.3.0
 pycodestyle==2.9.1
 pyflakes==2.5.0
-pyparsing==3.1.1
+pyparsing==3.1.2
 PySocks==1.7.1
 pytest==7.2.2
-python-dateutil==2.8.2
+python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
 pytz==2024.1
 PyWavelets==1.5.0
 PyYAML==6.0.1
 requests==2.31.0
+requests-oauthlib==1.3.1
 rsa==4.9
 scikit-learn==1.4.1.post1
 scipy==1.12.0
 selenium==4.17.2
 shutilwhich==1.1.0
 six==1.16.0
-sniffio==1.3.0
+sniffio==1.3.1
 sortedcontainers==2.4.0
+soupsieve==2.5
 sympy==1.12
 tempdir==0.7.1
 threadpoolctl==3.3.0
@@ -94,11 +101,11 @@ tqdm==4.66.2
 trio==0.24.0
 trio-websocket==0.11.1
 triton==2.0.0
-typing_extensions==4.9.0
+typing_extensions==4.10.0
 tzdata==2024.1
 uritemplate==4.1.1
 urllib3==1.26.13
-virtualenv==20.25.0
+virtualenv==20.25.1
 wsproto==1.2.0
 xxhash==3.4.1
 yarl==1.9.4
diff --git a/src/image2structure/collect.py b/src/image2structure/collect.py
index 0069946..461848f 100644
--- a/src/image2structure/collect.py
+++ b/src/image2structure/collect.py
@@ -9,6 +9,7 @@
 import shutil
 import tarfile
 import time
+import uuid
 
 from .runner import Runner
 from .run_specs import _RUNNER_REGISTRY
@@ -271,7 +272,7 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
         done: bool = False
         for compilation_result in accepted_results:
            category: str = compilation_result.category
-            num_id: int = 0
+            file_name: str = str(uuid.uuid4())
             if category not in num_instances_collected:
                 # First time we collect this category
                 # Create the directories
@@ -280,30 +281,25 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
                         os.path.join(output_path, category, dir), exist_ok=True
                     )
                 num_instances_collected[category] = 0
-            else:
-                # Increment the number of instances collected
-                num_id = num_files_in_dir(
-                    os.path.join(output_path, category, "metadata")
-                )
 
             # Copy shared metadata to compiled metadata
             compiled_metadata: Dict[str, Any] = {
                 **remove_unparsable_object_from_dict(metadata),
                 "assets": compilation_result.assets_path,
                 "category": category,
-                "num_id": num_id,
+                "uuid": file_name,
             }
 
             # Save the metadata
             instance_metadata_path: str = os.path.join(
-                output_path, category, "metadata", f"{num_id}.json"
+                output_path, category, "metadata", f"{file_name}.json"
             )
             with open(instance_metadata_path, "w") as f:
                 json.dump(compiled_metadata, f, indent=4)
 
             # Save the image
             instance_image_path: str = os.path.join(
-                output_path, category, "images", f"{num_id}.png"
+                output_path, category, "images", f"{file_name}.png"
             )
             shutil.copy(compilation_result.rendering_path, instance_image_path)
 
@@ -319,7 +315,7 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
             # Save the text
             if compilation_result.text is not None:
                 instance_text_path: str = os.path.join(
-                    output_path, category, "text", f"{num_id}.txt"
+                    output_path, category, "text", f"{file_name}.txt"
                 )
                 with open(instance_text_path, "w") as f:
                     f.write(compilation_result.text)
@@ -332,7 +328,7 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
                 else ""
             )
             instance_structure_path: str = os.path.join(
-                output_path, category, "structures", f"{num_id}{extension}"
+                output_path, category, "structures", f"{file_name}{extension}"
             )
             if os.path.isdir(compilation_result.data_path):
                 # First delete all files that we do not want to include
@@ -361,7 +357,7 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
             assert category in num_instances_collected
             num_instances_collected[category] += 1
             runner.compiler.acknowledge_compilation(category)
-            print(f"Instance number {num_id} of category {category} collected")
+            print(f"Instance {file_name} of category {category} collected")
             done = True
 
     for category in num_instances_collected.keys():
diff --git a/src/image2structure/upload.py b/src/image2structure/upload.py
index e512074..676e886 100644
--- a/src/image2structure/upload.py
+++ b/src/image2structure/upload.py
@@ -115,21 +115,25 @@ def main():
     # Figure out the extension of the structure files
     extension: str = ""
     if has_structure:
-        file_name: str = os.listdir(structure_path)[0]
-        extension = os.path.splitext(file_name)[-1]
-        if file_name.endswith(".tar.gz"):
+        first_file_name: str = os.listdir(structure_path)[0]
+        extension = os.path.splitext(first_file_name)[-1]
+        if first_file_name.endswith(".tar.gz"):
             extension = ".tar.gz"
 
     # Load the structure
     df: pd.DataFrame = pd.DataFrame()
     structure_set = set()
+    file_names: List[str] = os.listdir(structure_path)
     image_set = set()
 
     for i in tqdm(range(num_data_points), desc="Loading data"):
         try:
             values = {}
+            file_name: str = file_names[i].replace(extension, "")
             if has_structure:
-                structure_file = os.path.join(structure_path, f"{i}{extension}")
+                structure_file = os.path.join(
+                    structure_path, f"{file_name}{extension}"
+                )
                 structure: str
                 if extension == ".tar.gz" or extension == ".zip":
                     structure = load_archive(structure_file)
@@ -141,23 +145,25 @@ def main():
                 structure_set.add(structure)
 
             if has_text:
-                text: str = load_file(os.path.join(text_path, f"{i}.txt"))
+                text: str = load_file(os.path.join(text_path, f"{file_name}.txt"))
                 values["text"] = [text]
 
-            image = os.path.join(image_path, f"{i}.png")
+            image = os.path.join(image_path, f"{file_name}.png")
             hashed_img: str = str(imagehash.average_hash(load_image(image)))
             if hashed_img in image_set:
                 continue
             image_set.add(hashed_img)
             values["image"] = [image]
 
-            metadata = os.path.join(metadata_path, f"{i}.json")
+            metadata = os.path.join(metadata_path, f"{file_name}.json")
             values["metadata"] = [metadata]
 
             df = pd.concat([df, pd.DataFrame(values)])
 
         except FileNotFoundError as e:
-            print(f"Skipping {i} as it is missing one of the required files: {e}")
+            print(
+                f"Skipping {file_name} as it is missing one of the required files: {e}"
+            )
             continue
 
     # Remove duplicates