Commit 3108f28: Merge main

JosselinSomervilleRoberts committed Mar 15, 2024
2 parents e45dfb3 + feb6848
Showing 3 changed files with 37 additions and 28 deletions.
requirements.txt (23 changes: 15 additions & 8 deletions)
@@ -3,8 +3,10 @@ aiohttp==3.9.3
 aiosignal==1.3.1
 async-timeout==4.0.3
 attrs==23.2.0
+beautifulsoup4==4.12.3
 black==22.10.0
-cachetools==5.3.2
+bs4==0.0.1
+cachetools==5.3.3
 certifi==2022.12.7
 cfgv==3.4.0
 charset-normalizer==2.1.1
@@ -22,13 +24,14 @@ flake8==5.0.4
 frozenlist==1.4.1
 fsspec==2023.10.0
 funcsigs==1.0.2
-future==0.18.3
+future==1.0.0
 google-api-core==2.17.1
 google-api-python-client==2.118.0
-google-auth==2.28.0
+google-auth==2.28.1
 google-auth-httplib2==0.2.0
 googleapis-common-protos==1.62.0
 h11==0.14.0
+html2text==2024.2.26
 httplib2==0.22.0
 huggingface-hub==0.20.3
 identify==2.5.35
@@ -45,11 +48,13 @@ mccabe==0.7.0
 mpmath==1.3.0
 multidict==6.0.5
 multiprocess==0.70.15
+mwclient==0.10.1
 mypy==1.5.1
 mypy-extensions==1.0.0
 networkx==3.2.1
 nodeenv==1.8.0
 numpy==1.26.4
+oauthlib==3.2.2
 outcome==1.3.0.post0
 packaging==23.2
 pandas==2.1.4
@@ -66,23 +71,25 @@ pyasn1==0.5.1
 pyasn1-modules==0.3.0
 pycodestyle==2.9.1
 pyflakes==2.5.0
-pyparsing==3.1.1
+pyparsing==3.1.2
 PySocks==1.7.1
 pytest==7.2.2
-python-dateutil==2.8.2
+python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
 pytz==2024.1
 PyWavelets==1.5.0
 PyYAML==6.0.1
 requests==2.31.0
+requests-oauthlib==1.3.1
 rsa==4.9
 scikit-learn==1.4.1.post1
 scipy==1.12.0
 selenium==4.17.2
 shutilwhich==1.1.0
 six==1.16.0
-sniffio==1.3.0
+sniffio==1.3.1
 sortedcontainers==2.4.0
+soupsieve==2.5
 sympy==1.12
 tempdir==0.7.1
 threadpoolctl==3.3.0
@@ -94,11 +101,11 @@ tqdm==4.66.2
 trio==0.24.0
 trio-websocket==0.11.1
 triton==2.0.0
-typing_extensions==4.9.0
+typing_extensions==4.10.0
 tzdata==2024.1
 uritemplate==4.1.1
 urllib3==1.26.13
-virtualenv==20.25.0
+virtualenv==20.25.1
 wsproto==1.2.0
 xxhash==3.4.1
 yarl==1.9.4
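
The new pins add HTML-scraping support (beautifulsoup4 with its soupsieve dependency, the bs4 shim, html2text), wiki access (mwclient), and OAuth (oauthlib, requests-oauthlib), alongside routine version bumps. A minimal sketch of how a consumer of this file might confirm the installed environment matches these pins; the subset of packages checked here is illustrative, not part of the commit:

# Hypothetical helper, not in the repository: compare installed versions
# against a handful of the pins added or bumped in this commit.
from importlib.metadata import PackageNotFoundError, version

PINS = {
    "beautifulsoup4": "4.12.3",
    "html2text": "2024.2.26",
    "mwclient": "0.10.1",
    "cachetools": "5.3.3",
}

for name, expected in PINS.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        print(f"{name}: not installed (pinned to {expected})")
        continue
    if installed != expected:
        print(f"{name}: found {installed}, pinned to {expected}")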
src/image2structure/collect.py (20 changes: 8 additions & 12 deletions)
@@ -9,6 +9,7 @@
 import shutil
 import tarfile
 import time
+import uuid
 
 from .runner import Runner
 from .run_specs import _RUNNER_REGISTRY
@@ -271,7 +272,7 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
     done: bool = False
     for compilation_result in accepted_results:
         category: str = compilation_result.category
-        num_id: int = 0
+        file_name: str = str(uuid.uuid4())
         if category not in num_instances_collected:
             # First time we collect this category
             # Create the directories
@@ -280,30 +281,25 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
                     os.path.join(output_path, category, dir), exist_ok=True
                 )
             num_instances_collected[category] = 0
-        else:
-            # Increment the number of instances collected
-            num_id = num_files_in_dir(
-                os.path.join(output_path, category, "metadata")
-            )
 
         # Copy shared metadata to compiled metadata
         compiled_metadata: Dict[str, Any] = {
             **remove_unparsable_object_from_dict(metadata),
             "assets": compilation_result.assets_path,
             "category": category,
-            "num_id": num_id,
+            "uuid": file_name,
         }
 
         # Save the metadata
         instance_metadata_path: str = os.path.join(
-            output_path, category, "metadata", f"{num_id}.json"
+            output_path, category, "metadata", f"{file_name}.json"
         )
         with open(instance_metadata_path, "w") as f:
             json.dump(compiled_metadata, f, indent=4)
 
         # Save the image
         instance_image_path: str = os.path.join(
-            output_path, category, "images", f"{num_id}.png"
+            output_path, category, "images", f"{file_name}.png"
         )
         shutil.copy(compilation_result.rendering_path, instance_image_path)
 
@@ -319,7 +315,7 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
         # Save the text
         if compilation_result.text is not None:
             instance_text_path: str = os.path.join(
-                output_path, category, "text", f"{num_id}.txt"
+                output_path, category, "text", f"{file_name}.txt"
             )
             with open(instance_text_path, "w") as f:
                 f.write(compilation_result.text)
@@ -332,7 +328,7 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
             else ""
         )
         instance_structure_path: str = os.path.join(
-            output_path, category, "structures", f"{num_id}{extension}"
+            output_path, category, "structures", f"{file_name}{extension}"
         )
         if os.path.isdir(compilation_result.data_path):
             # First delete all files that we do not want to include
@@ -361,7 +357,7 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
         assert category in num_instances_collected
         num_instances_collected[category] += 1
         runner.compiler.acknowledge_compilation(category)
-        print(f"Instance number {num_id} of category {category} collected")
+        print(f"Instance {file_name} of category {category} collected")
 
         done = True
         for category in num_instances_collected.keys():
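
The substantive change in collect.py: instances were previously numbered per category by counting the files already in the metadata directory (num_files_in_dir), which can reuse an id when a run is interrupted or when runs execute concurrently; each instance now gets a uuid4 stem shared by its metadata, image, text, and structure files. A minimal sketch of the naming scheme, with the paths and the "latex" category invented for illustration:

# Sketch (assumed simplification of the change above): derive every artifact
# path for one instance from a single random uuid4 stem, so no directory
# scan is needed to pick a fresh id.
import os
import uuid

def instance_paths(output_path: str, category: str) -> dict:
    file_name = str(uuid.uuid4())  # collision-safe without reading the directory
    return {
        kind: os.path.join(output_path, category, kind, f"{file_name}{ext}")
        for kind, ext in [("metadata", ".json"), ("images", ".png"), ("text", ".txt")]
    }

paths = instance_paths("output", "latex")  # illustrative arguments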
src/image2structure/upload.py (22 changes: 14 additions & 8 deletions)
@@ -115,21 +115,25 @@ def main():
     # Figure out the extension of the structure files
     extension: str = ""
     if has_structure:
-        file_name: str = os.listdir(structure_path)[0]
-        extension = os.path.splitext(file_name)[-1]
-        if file_name.endswith(".tar.gz"):
+        first_file_name: str = os.listdir(structure_path)[0]
+        extension = os.path.splitext(first_file_name)[-1]
+        if first_file_name.endswith(".tar.gz"):
             extension = ".tar.gz"
 
     # Load the structure
     df: pd.DataFrame = pd.DataFrame()
     structure_set = set()
+    file_names: List[str] = os.listdir(structure_path)
     image_set = set()
     for i in tqdm(range(num_data_points), desc="Loading data"):
         try:
             values = {}
+            file_name: str = file_names[i].replace(extension, "")
 
             if has_structure:
-                structure_file = os.path.join(structure_path, f"{i}{extension}")
+                structure_file = os.path.join(
+                    structure_path, f"{file_name}{extension}"
+                )
                 structure: str
                 if extension == ".tar.gz" or extension == ".zip":
                     structure = load_archive(structure_file)
@@ -141,23 +145,25 @@ def main():
                 structure_set.add(structure)
 
             if has_text:
-                text: str = load_file(os.path.join(text_path, f"{i}.txt"))
+                text: str = load_file(os.path.join(text_path, f"{file_name}.txt"))
                 values["text"] = [text]
 
-            image = os.path.join(image_path, f"{i}.png")
+            image = os.path.join(image_path, f"{file_name}.png")
             hashed_img: str = str(imagehash.average_hash(load_image(image)))
             if hashed_img in image_set:
                 continue
             image_set.add(hashed_img)
             values["image"] = [image]
 
-            metadata = os.path.join(metadata_path, f"{i}.json")
+            metadata = os.path.join(metadata_path, f"{file_name}.json")
             values["metadata"] = [metadata]
 
             df = pd.concat([df, pd.DataFrame(values)])
 
         except FileNotFoundError as e:
-            print(f"Skipping {i} as it is missing one of the required files: {e}")
+            print(
+                f"Skipping {file_name} as it is missing one of the required files: {e}"
+            )
             continue
 
     # Remove duplicates
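
upload.py is adjusted to match: since instance files are no longer named 0.json, 1.json, and so on, the loop now reads the actual file names from the structures directory and strips the extension to recover the stem that links structure, text, image, and metadata, skipping any instance with a missing file. A minimal sketch of that stem-based pairing, with directory names assumed for illustration:

# Sketch (assumption, not the repository's API): pair artifacts by shared
# file stem rather than by a dense integer index.
import os

structure_dir = "output/webpage/structures"  # illustrative paths
image_dir = "output/webpage/images"

for entry in sorted(os.listdir(structure_dir)):
    stem = entry.split(".", 1)[0]  # also handles double extensions like .tar.gz
    image_file = os.path.join(image_dir, f"{stem}.png")
    if not os.path.exists(image_file):
        print(f"Skipping {stem}: missing image")
        continue
    # ...load, deduplicate, and upload the paired files here...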
