diff --git a/requirements.txt b/requirements.txt
index 065a744..95fb19a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,8 +3,10 @@ aiohttp==3.9.3
 aiosignal==1.3.1
 async-timeout==4.0.3
 attrs==23.2.0
+beautifulsoup4==4.12.3
 black==22.10.0
-cachetools==5.3.2
+bs4==0.0.1
+cachetools==5.3.3
 certifi==2022.12.7
 cfgv==3.4.0
 charset-normalizer==2.1.1
@@ -22,13 +24,14 @@ flake8==5.0.4
 frozenlist==1.4.1
 fsspec==2023.10.0
 funcsigs==1.0.2
-future==0.18.3
+future==1.0.0
 google-api-core==2.17.1
 google-api-python-client==2.118.0
-google-auth==2.28.0
+google-auth==2.28.1
 google-auth-httplib2==0.2.0
 googleapis-common-protos==1.62.0
 h11==0.14.0
+html2text==2024.2.26
 httplib2==0.22.0
 huggingface-hub==0.20.3
 identify==2.5.35
@@ -45,11 +48,13 @@ mccabe==0.7.0
 mpmath==1.3.0
 multidict==6.0.5
 multiprocess==0.70.15
+mwclient==0.10.1
 mypy==1.5.1
 mypy-extensions==1.0.0
 networkx==3.2.1
 nodeenv==1.8.0
 numpy==1.26.4
+oauthlib==3.2.2
 outcome==1.3.0.post0
 packaging==23.2
 pandas==2.1.4
@@ -66,23 +71,25 @@ pyasn1==0.5.1
 pyasn1-modules==0.3.0
 pycodestyle==2.9.1
 pyflakes==2.5.0
-pyparsing==3.1.1
+pyparsing==3.1.2
 PySocks==1.7.1
 pytest==7.2.2
-python-dateutil==2.8.2
+python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
 pytz==2024.1
 PyWavelets==1.5.0
 PyYAML==6.0.1
 requests==2.31.0
+requests-oauthlib==1.3.1
 rsa==4.9
 scikit-learn==1.4.1.post1
 scipy==1.12.0
 selenium==4.17.2
 shutilwhich==1.1.0
 six==1.16.0
-sniffio==1.3.0
+sniffio==1.3.1
 sortedcontainers==2.4.0
+soupsieve==2.5
 sympy==1.12
 tempdir==0.7.1
 threadpoolctl==3.3.0
@@ -94,11 +101,11 @@ tqdm==4.66.2
 trio==0.24.0
 trio-websocket==0.11.1
 triton==2.0.0
-typing_extensions==4.9.0
+typing_extensions==4.10.0
 tzdata==2024.1
 uritemplate==4.1.1
 urllib3==1.26.13
-virtualenv==20.25.0
+virtualenv==20.25.1
 wsproto==1.2.0
 xxhash==3.4.1
 yarl==1.9.4
diff --git a/src/image2structure/collect.py b/src/image2structure/collect.py
index 0069946..461848f 100644
--- a/src/image2structure/collect.py
+++ b/src/image2structure/collect.py
@@ -9,6 +9,7 @@
 import shutil
 import tarfile
 import time
+import uuid
 
 from .runner import Runner
 from .run_specs import _RUNNER_REGISTRY
@@ -271,7 +272,7 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
         done: bool = False
         for compilation_result in accepted_results:
            category: str = compilation_result.category
-            num_id: int = 0
+            file_name: str = str(uuid.uuid4())
             if category not in num_instances_collected:
                 # First time we collect this category
                 # Create the directories
@@ -280,30 +281,25 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
                         os.path.join(output_path, category, dir), exist_ok=True
                     )
                 num_instances_collected[category] = 0
-            else:
-                # Increment the number of instances collected
-                num_id = num_files_in_dir(
-                    os.path.join(output_path, category, "metadata")
-                )
 
             # Copy shared metadata to compiled metadata
             compiled_metadata: Dict[str, Any] = {
                 **remove_unparsable_object_from_dict(metadata),
                 "assets": compilation_result.assets_path,
                 "category": category,
-                "num_id": num_id,
+                "uuid": file_name,
             }
 
             # Save the metadata
             instance_metadata_path: str = os.path.join(
-                output_path, category, "metadata", f"{num_id}.json"
+                output_path, category, "metadata", f"{file_name}.json"
             )
             with open(instance_metadata_path, "w") as f:
                 json.dump(compiled_metadata, f, indent=4)
 
             # Save the image
             instance_image_path: str = os.path.join(
-                output_path, category, "images", f"{num_id}.png"
+                output_path, category, "images", f"{file_name}.png"
             )
             shutil.copy(compilation_result.rendering_path, instance_image_path)
 
@@ -319,7 +315,7 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
             # Save the text
             if compilation_result.text is not None:
                 instance_text_path: str = os.path.join(
-                    output_path, category, "text", f"{num_id}.txt"
+                    output_path, category, "text", f"{file_name}.txt"
                 )
                 with open(instance_text_path, "w") as f:
                     f.write(compilation_result.text)
@@ -332,7 +328,7 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
                 else ""
             )
             instance_structure_path: str = os.path.join(
-                output_path, category, "structures", f"{num_id}{extension}"
+                output_path, category, "structures", f"{file_name}{extension}"
             )
             if os.path.isdir(compilation_result.data_path):
                 # First delete all files that we do not want to include
@@ -361,7 +357,7 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
             assert category in num_instances_collected
             num_instances_collected[category] += 1
             runner.compiler.acknowledge_compilation(category)
-            print(f"Instance number {num_id} of category {category} collected")
+            print(f"Instance {file_name} of category {category} collected")
             done = True
 
     for category in num_instances_collected.keys():
diff --git a/src/image2structure/upload.py b/src/image2structure/upload.py
index e512074..676e886 100644
--- a/src/image2structure/upload.py
+++ b/src/image2structure/upload.py
@@ -115,21 +115,25 @@ def main():
     # Figure out the extension of the structure files
     extension: str = ""
     if has_structure:
-        file_name: str = os.listdir(structure_path)[0]
-        extension = os.path.splitext(file_name)[-1]
-        if file_name.endswith(".tar.gz"):
+        first_file_name: str = os.listdir(structure_path)[0]
+        extension = os.path.splitext(first_file_name)[-1]
+        if first_file_name.endswith(".tar.gz"):
             extension = ".tar.gz"
 
     # Load the structure
     df: pd.DataFrame = pd.DataFrame()
     structure_set = set()
+    file_names: List[str] = os.listdir(structure_path)
     image_set = set()
 
     for i in tqdm(range(num_data_points), desc="Loading data"):
         try:
             values = {}
+            file_name: str = file_names[i].replace(extension, "")
             if has_structure:
-                structure_file = os.path.join(structure_path, f"{i}{extension}")
+                structure_file = os.path.join(
+                    structure_path, f"{file_name}{extension}"
+                )
                 structure: str
                 if extension == ".tar.gz" or extension == ".zip":
                     structure = load_archive(structure_file)
@@ -141,23 +145,25 @@ def main():
                 structure_set.add(structure)
 
             if has_text:
-                text: str = load_file(os.path.join(text_path, f"{i}.txt"))
+                text: str = load_file(os.path.join(text_path, f"{file_name}.txt"))
                 values["text"] = [text]
 
-            image = os.path.join(image_path, f"{i}.png")
+            image = os.path.join(image_path, f"{file_name}.png")
             hashed_img: str = str(imagehash.average_hash(load_image(image)))
             if hashed_img in image_set:
                 continue
             image_set.add(hashed_img)
             values["image"] = [image]
 
-            metadata = os.path.join(metadata_path, f"{i}.json")
+            metadata = os.path.join(metadata_path, f"{file_name}.json")
             values["metadata"] = [metadata]
 
             df = pd.concat([df, pd.DataFrame(values)])
 
         except FileNotFoundError as e:
-            print(f"Skipping {i} as it is missing one of the required files: {e}")
+            print(
+                f"Skipping {file_name} as it is missing one of the required files: {e}"
+            )
             continue
 
     # Remove duplicates