Music Compiler E2E (#34)
JosselinSomervilleRoberts authored Mar 15, 2024
1 parent feb6848 commit 6f86d73
Showing 15 changed files with 616 additions and 107 deletions.
experimental/pull_music_sheets.py (96 changes: 57 additions & 39 deletions)
@@ -13,6 +13,7 @@
from tqdm import tqdm
from torchvision import transforms, models
import torch
import imslp

from image2structure.util.credentials_utils import get_credentials
from image2structure.util.hierarchical_logger import htrack_block, hlog
@@ -136,6 +137,7 @@ def fetch_music_sheets(
credentials: Dict[str, str] = get_credentials(credentials_path)
username: str = credentials["username"]
password: str = credentials["password"]
print(f"Username: {username}, password: {password}")

c = client.ImslpClient(username=username, password=password)
hlog("Login to IMSLP was successful. Created ImslpClient.\n")
@@ -157,33 +159,47 @@
with htrack_block(
"Searching for all works. Please be patient as this may take a few minutes."
):
results = c.search_works()
results = set(
imslp.interfaces.internal.list_works(
start=0,
count=100,
cache=False,
)
)
hlog(f"Found {len(results)} works.")
print(results)

generated_count: int = 0
with htrack_block("Processing the results..."):
for result in tqdm(results):
url: str = result["permlink"]
if not url.startswith(imslp_url):
print("Not an IMSLP URL")
continue

name: str = url.replace(imslp_url, "")
page = Page(c._site, name)
image_metadatas = fetch_images_metadata(page)

if len(image_metadatas) == 0:
print("No image metadata")

for metadata in image_metadatas:
if "obj" not in metadata or metadata["obj"] is None:
print("No obj in metadata")
continue

image: Image = metadata["obj"]
timestamp: str = image.imageinfo["timestamp"]
year: Optional[int] = int(timestamp[:4])

if year is None or year < year_range[0] or year > year_range[1]:
print("Year out of range")
continue

file_name: str = image.imageinfo["url"].split("/")[-1]
if not file_name.endswith(".pdf"):
print("Not a pdf")
continue

total_num_pages: Optional[int] = metadata["page_count"]
@@ -192,6 +208,7 @@ def fetch_music_sheets(
hlog(
f"Skipping {file_name} with {total_num_pages} pages. Too many pages."
)
print("Too many pages")
continue

file_path: str = os.path.join(output_dir, file_name)
@@ -200,44 +217,45 @@ def fetch_music_sheets(
# Download
with open(file_path, "wb") as f:
image.download(f)

image_path: str = os.path.join(
output_dir, file_name.replace(".pdf", ".png")
)

# Select a random page but preferably not the first two pages (which could be a title
# and not the sheet music) and the last two pages (which could be a blank page)
page_number: int
if total_num_pages > 4:
page_number = random.randint(3, total_num_pages - 2)
elif total_num_pages == 4:
page_number = 3
elif total_num_pages == 2 or total_num_pages == 3:
page_number = 2
else:
page_number = 1

generated: bool = generate_sheet_image(
file_path, image_path, page_number
)

# Remove the PDF file
os.remove(file_path)
if generated:
if not model.is_sheet_music(image_path):
hlog(
f"Removing {image_path} as it was identified as not a sheet music."
)
os.remove(image_path)
continue

generated_count += 1
hlog(f"Generated {generated_count} of {num_examples} examples.")
break

# Add a delay to avoid subscription prompt
hlog("Sleeping for 5 seconds...")
time.sleep(5)
generated_count += 1

# image_path: str = os.path.join(
# output_dir, file_name.replace(".pdf", ".png")
# )

# # Select a random page but preferably not the first two pages (which could be a title
# # and not the sheet music) and the last two pages (which could be a blank page)
# page_number: int
# if total_num_pages > 4:
# page_number = random.randint(3, total_num_pages - 2)
# elif total_num_pages == 4:
# page_number = 3
# elif total_num_pages == 2 or total_num_pages == 3:
# page_number = 2
# else:
# page_number = 1

# generated: bool = generate_sheet_image(
# file_path, image_path, page_number
# )

# # Remove the PDF file
# # os.remove(file_path)
# if generated:
# if not model.is_sheet_music(image_path):
# hlog(
# f"Removing {image_path} as it was identified as not a sheet music."
# )
# os.remove(image_path)
# continue

# generated_count += 1
# hlog(f"Generated {generated_count} of {num_examples} examples.")
# break

# # Add a delay to avoid subscription prompt
# hlog("Sleeping for 5 seconds...")
# time.sleep(5)

if generated_count >= num_examples:
hlog(f"Generated {num_examples} examples. Exiting...")
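Note on the search change in this file: the generic c.search_works() call is replaced by a direct imslp.interfaces.internal.list_works(start=0, count=100, cache=False) call, which pulls only the first batch of 100 works. Below is a minimal sketch of how the same call could page through the whole catalogue, assuming list_works keeps honoring start/count offsets and yields an empty batch past the end (an assumption about the imslp package, not verified):

import imslp.interfaces.internal
from typing import Any, Dict, Iterator

def iter_all_works(batch_size: int = 100) -> Iterator[Dict[str, Any]]:
    # Assumption: list_works yields work dicts (each carrying the "permlink"
    # key used by the processing loop above) and returns nothing once `start`
    # runs past the end of the catalogue.
    start = 0
    while True:
        batch = list(
            imslp.interfaces.internal.list_works(
                start=start, count=batch_size, cache=False
            )
        )
        if not batch:
            break
        yield from batch
        start += batch_size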
src/image2structure/collect.py (93 changes: 58 additions & 35 deletions)
@@ -19,6 +19,18 @@
from image2structure.fetch.fetcher import DownloadError


def remove_unparsable_object_from_dict(d: Dict[str, Any]) -> Dict[str, Any]:
for key in list(d.keys()):
if isinstance(d[key], dict):
d[key] = remove_unparsable_object_from_dict(d[key])
# If it's not JSON parsable, remove it
try:
json.dumps(d[key])
except TypeError:
del d[key]
return d


def get_args_parser() -> (
Tuple[argparse.ArgumentParser, Dict[str, argparse.ArgumentParser]]
):
@@ -128,9 +140,9 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
for scrape_result in scrape_results:
            # Create a clean temporary working directory
if os.path.exists(tmp_dir):
shutil.rmtree(tmp_dir)
pass # raise ValueError("strop") # shutil.rmtree(tmp_dir)
for path in [tmp_dir, tmp_structure_path, tmp_image_path]:
os.makedirs(path, exist_ok=False)
os.makedirs(path, exist_ok=True)

# Flag to continue to the next instance
should_continue: bool = False
@@ -272,7 +284,7 @@ def run(runner: Runner, args: argparse.Namespace) -> None:

# Copy shared metadata to compiled metadata
compiled_metadata: Dict[str, Any] = {
**metadata,
**remove_unparsable_object_from_dict(metadata),
"assets": compilation_result.assets_path,
"category": category,
"uuid": file_name,
@@ -301,41 +313,45 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
shutil.copy(asset_path, instance_asset_path)

# Save the text
instance_text_path: str = os.path.join(
output_path, category, "text", f"{file_name}.txt"
)
with open(instance_text_path, "w") as f:
f.write(compilation_result.text)
if compilation_result.text is not None:
instance_text_path: str = os.path.join(
output_path, category, "text", f"{file_name}.txt"
)
with open(instance_text_path, "w") as f:
f.write(compilation_result.text)

# Save the structure
extension: str = (
os.path.splitext(compilation_result.data_path)[-1]
if "." in compilation_result.data_path
else ""
)
instance_structure_path: str = os.path.join(
output_path, category, "structures", f"{file_name}{extension}"
)
if os.path.isdir(compilation_result.data_path):
# First delete all files that we do not want to include
# in the tar.gz. This is to avoid including the .git
# directory and other files that are not necessary such
# as the _site directory. We filter these files
# by removing the folder that starts with an underscore
# or a dot.
for root, dirs, files in os.walk(compilation_result.data_path):
for dir in dirs:
if dir.startswith(("_site", ".")):
shutil.rmtree(os.path.join(root, dir))

# Compress the directory in .tar.gz to the instance_structure_path
shutil.make_archive(
instance_structure_path,
"gztar",
compilation_result.data_path,
if compilation_result.data_path is not None:
extension: str = (
os.path.splitext(compilation_result.data_path)[-1]
if "." in compilation_result.data_path
else ""
)
instance_structure_path: str = os.path.join(
output_path, category, "structures", f"{file_name}{extension}"
)
else:
shutil.copy(compilation_result.data_path, instance_structure_path)
if os.path.isdir(compilation_result.data_path):
# First delete all files that we do not want to include
# in the tar.gz. This is to avoid including the .git
# directory and other files that are not necessary such
# as the _site directory. We filter these files
# by removing the folder that starts with an underscore
# or a dot.
for root, dirs, files in os.walk(compilation_result.data_path):
for dir in dirs:
if dir.startswith(("_site", ".")):
shutil.rmtree(os.path.join(root, dir))

# Compress the directory in .tar.gz to the instance_structure_path
shutil.make_archive(
instance_structure_path,
"gztar",
compilation_result.data_path,
)
else:
shutil.copy(
compilation_result.data_path, instance_structure_path
)

# Increment the number of instances collected
assert category in num_instances_collected
@@ -353,6 +369,13 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
if done:
break

# Check if there are elements in the text/structures folder, otherwise remove it
for category in num_instances_collected.keys():
for dir in ["text", "structures"]:
path = os.path.join(output_path, category, dir)
if not os.listdir(path):
os.rmdir(path)

print("Scraping complete!")
print(f" - {num_instances_downloaded} instances downloaded")
print(f" - {num_instances_compiled} instances compiled")
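The new remove_unparsable_object_from_dict helper at the top of this file walks a dict recursively, mutating it in place, and deletes every value that json.dumps cannot serialize, so the compiled metadata can always be written out as JSON. A minimal usage sketch (the datetime and object() values are stand-ins for non-serializable metadata entries; the import path assumes the helper is importable from image2structure.collect):

import datetime
import json

from image2structure.collect import remove_unparsable_object_from_dict

metadata = {
    "title": "Sonata",
    "fetched_at": datetime.datetime.now(),     # not JSON-serializable: dropped
    "nested": {"pages": 12, "raw": object()},  # inner value dropped, dict kept
}
clean = remove_unparsable_object_from_dict(metadata)  # mutates and returns metadata
print(json.dumps(clean))  # {"title": "Sonata", "nested": {"pages": 12}}

Only TypeError is caught, which is what json.dumps raises for unsupported types. Relatedly, the cleanup loop added at the end of run() deletes a category's text/ or structures/ folder only when os.listdir reports it empty, so populated folders are never touched.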
src/image2structure/compilation/compiler.py (6 changes: 3 additions & 3 deletions)
@@ -9,9 +9,6 @@
class CompilationResult:
"""The result of a compilation."""

data_path: str
"""The path to the compiled data."""

rendering_path: str
"""The path to the rendering of the compiled data."""

@@ -20,6 +17,9 @@ class CompilationResult:
For TeX, this is the type of the environment (e.g. equation, figure, table, algorithm).
For web pages, this can be the language used (HTML, CSS, JavaScript, etc.)."""

data_path: Optional[str] = None
"""The path to the compiled data."""

text: Optional[str] = None
"""The text extracted from the compiled data."""

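The compiler.py hunk does more than reorder fields: once data_path becomes Optional[str] = None, it has to move below the fields without defaults, because dataclass fields with defaults cannot precede required ones (Python raises a TypeError at class-definition time). A reduced sketch of the resulting layout, assuming CompilationResult is a @dataclass (the decorator sits above this hunk) and using category as a hypothetical name for the second required field, whose docstring appears in the hunk but whose name does not:

from dataclasses import dataclass
from typing import Optional

@dataclass
class CompilationResult:
    """The result of a compilation."""

    rendering_path: str
    """The path to the rendering of the compiled data."""

    category: str  # hypothetical field name; only its docstring is visible in the diff
    """For TeX, the environment type (equation, figure, ...); for web pages, the language used."""

    # Defaulted fields must come after all required fields.
    data_path: Optional[str] = None
    """The path to the compiled data."""

    text: Optional[str] = None
    """The text extracted from the compiled data."""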
[Diffs for the remaining 12 changed files are not shown.]
