Music Compiler E2E (#34)
JosselinSomervilleRoberts authored Mar 15, 2024
1 parent feb6848 commit 6f86d73
Showing 15 changed files with 616 additions and 107 deletions.
experimental/pull_music_sheets.py (96 changes: 57 additions & 39 deletions)
@@ -13,6 +13,7 @@
from tqdm import tqdm
from torchvision import transforms, models
import torch
import imslp

from image2structure.util.credentials_utils import get_credentials
from image2structure.util.hierarchical_logger import htrack_block, hlog
@@ -136,6 +137,7 @@ def fetch_music_sheets(
credentials: Dict[str, str] = get_credentials(credentials_path)
username: str = credentials["username"]
password: str = credentials["password"]
print(f"Username: {username}, password: {password}")

c = client.ImslpClient(username=username, password=password)
hlog("Login to IMSLP was successful. Created ImslpClient.\n")
@@ -157,33 +159,47 @@
with htrack_block(
"Searching for all works. Please be patient as this may take a few minutes."
):
results = c.search_works()
results = set(
imslp.interfaces.internal.list_works(
start=0,
count=100,
cache=False,
)
)
hlog(f"Found {len(results)} works.")
print(results)

generated_count: int = 0
with htrack_block("Processing the results..."):
for result in tqdm(results):
url: str = result["permlink"]
if not url.startswith(imslp_url):
print("Not an IMSLP URL")
continue

name: str = url.replace(imslp_url, "")
page = Page(c._site, name)
image_metadatas = fetch_images_metadata(page)

if len(image_metadatas) == 0:
print("No image metadata")

for metadata in image_metadatas:
if "obj" not in metadata or metadata["obj"] is None:
print("No obj in metadata")
continue

image: Image = metadata["obj"]
timestamp: str = image.imageinfo["timestamp"]
year: Optional[int] = int(timestamp[:4])

if year is None or year < year_range[0] or year > year_range[1]:
print("Year out of range")
continue

file_name: str = image.imageinfo["url"].split("/")[-1]
if not file_name.endswith(".pdf"):
print("Not a pdf")
continue

total_num_pages: Optional[int] = metadata["page_count"]
@@ -192,6 +208,7 @@ def fetch_music_sheets(
hlog(
f"Skipping {file_name} with {total_num_pages} pages. Too many pages."
)
print("Too many pages")
continue

file_path: str = os.path.join(output_dir, file_name)
@@ -200,44 +217,45 @@ def fetch_music_sheets(
# Download
with open(file_path, "wb") as f:
image.download(f)

image_path: str = os.path.join(
output_dir, file_name.replace(".pdf", ".png")
)

# Select a random page but preferably not the first two pages (which could be a title
# and not the sheet music) and the last two pages (which could be a blank page)
page_number: int
if total_num_pages > 4:
page_number = random.randint(3, total_num_pages - 2)
elif total_num_pages == 4:
page_number = 3
elif total_num_pages == 2 or total_num_pages == 3:
page_number = 2
else:
page_number = 1

generated: bool = generate_sheet_image(
file_path, image_path, page_number
)

# Remove the PDF file
os.remove(file_path)
if generated:
if not model.is_sheet_music(image_path):
hlog(
f"Removing {image_path} as it was identified as not a sheet music."
)
os.remove(image_path)
continue

generated_count += 1
hlog(f"Generated {generated_count} of {num_examples} examples.")
break

# Add a delay to avoid subscription prompt
hlog("Sleeping for 5 seconds...")
time.sleep(5)
generated_count += 1

# image_path: str = os.path.join(
# output_dir, file_name.replace(".pdf", ".png")
# )

# # Select a random page but preferably not the first two pages (which could be a title
# # and not the sheet music) and the last two pages (which could be a blank page)
# page_number: int
# if total_num_pages > 4:
# page_number = random.randint(3, total_num_pages - 2)
# elif total_num_pages == 4:
# page_number = 3
# elif total_num_pages == 2 or total_num_pages == 3:
# page_number = 2
# else:
# page_number = 1

# generated: bool = generate_sheet_image(
# file_path, image_path, page_number
# )

# # Remove the PDF file
# # os.remove(file_path)
# if generated:
# if not model.is_sheet_music(image_path):
# hlog(
# f"Removing {image_path} as it was identified as not a sheet music."
# )
# os.remove(image_path)
# continue

# generated_count += 1
# hlog(f"Generated {generated_count} of {num_examples} examples.")
# break

# # Add a delay to avoid subscription prompt
# hlog("Sleeping for 5 seconds...")
# time.sleep(5)

if generated_count >= num_examples:
hlog(f"Generated {num_examples} examples. Exiting...")
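Note on the search change in this file: the generic c.search_works() call is replaced by a direct imslp.interfaces.internal.list_works(start=0, count=100, cache=False) call, which pulls only the first batch of 100 works. Below is a minimal sketch of how the same call could page through the whole catalogue, assuming list_works keeps honoring start/count offsets and yields an empty batch past the end (an assumption about the imslp package, not verified):

import imslp.interfaces.internal
from typing import Any, Dict, Iterator

def iter_all_works(batch_size: int = 100) -> Iterator[Dict[str, Any]]:
    # Assumption: list_works yields work dicts (each carrying the "permlink"
    # key used by the processing loop above) and returns nothing once `start`
    # runs past the end of the catalogue.
    start = 0
    while True:
        batch = list(
            imslp.interfaces.internal.list_works(
                start=start, count=batch_size, cache=False
            )
        )
        if not batch:
            break
        yield from batch
        start += batch_size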
src/image2structure/collect.py (93 changes: 58 additions & 35 deletions)
@@ -19,6 +19,18 @@
from image2structure.fetch.fetcher import DownloadError


def remove_unparsable_object_from_dict(d: Dict[str, Any]) -> Dict[str, Any]:
for key in list(d.keys()):
if isinstance(d[key], dict):
d[key] = remove_unparsable_object_from_dict(d[key])
# If it's not JSON parsable, remove it
try:
json.dumps(d[key])
except TypeError:
del d[key]
return d


def get_args_parser() -> (
Tuple[argparse.ArgumentParser, Dict[str, argparse.ArgumentParser]]
):
@@ -128,9 +140,9 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
for scrape_result in scrape_results:
            # Create a clean temporary working directory
if os.path.exists(tmp_dir):
shutil.rmtree(tmp_dir)
pass # raise ValueError("strop") # shutil.rmtree(tmp_dir)
for path in [tmp_dir, tmp_structure_path, tmp_image_path]:
os.makedirs(path, exist_ok=False)
os.makedirs(path, exist_ok=True)

# Flag to continue to the next instance
should_continue: bool = False
@@ -272,7 +284,7 @@ def run(runner: Runner, args: argparse.Namespace) -> None:

# Copy shared metadata to compiled metadata
compiled_metadata: Dict[str, Any] = {
**metadata,
**remove_unparsable_object_from_dict(metadata),
"assets": compilation_result.assets_path,
"category": category,
"uuid": file_name,
@@ -301,41 +313,45 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
shutil.copy(asset_path, instance_asset_path)

# Save the text
instance_text_path: str = os.path.join(
output_path, category, "text", f"{file_name}.txt"
)
with open(instance_text_path, "w") as f:
f.write(compilation_result.text)
if compilation_result.text is not None:
instance_text_path: str = os.path.join(
output_path, category, "text", f"{file_name}.txt"
)
with open(instance_text_path, "w") as f:
f.write(compilation_result.text)

# Save the structure
extension: str = (
os.path.splitext(compilation_result.data_path)[-1]
if "." in compilation_result.data_path
else ""
)
instance_structure_path: str = os.path.join(
output_path, category, "structures", f"{file_name}{extension}"
)
if os.path.isdir(compilation_result.data_path):
# First delete all files that we do not want to include
# in the tar.gz. This is to avoid including the .git
# directory and other files that are not necessary such
# as the _site directory. We filter these files
# by removing the folder that starts with an underscore
# or a dot.
for root, dirs, files in os.walk(compilation_result.data_path):
for dir in dirs:
if dir.startswith(("_site", ".")):
shutil.rmtree(os.path.join(root, dir))

# Compress the directory in .tar.gz to the instance_structure_path
shutil.make_archive(
instance_structure_path,
"gztar",
compilation_result.data_path,
if compilation_result.data_path is not None:
extension: str = (
os.path.splitext(compilation_result.data_path)[-1]
if "." in compilation_result.data_path
else ""
)
instance_structure_path: str = os.path.join(
output_path, category, "structures", f"{file_name}{extension}"
)
else:
shutil.copy(compilation_result.data_path, instance_structure_path)
if os.path.isdir(compilation_result.data_path):
# First delete all files that we do not want to include
# in the tar.gz. This is to avoid including the .git
# directory and other files that are not necessary such
# as the _site directory. We filter these files
# by removing the folder that starts with an underscore
# or a dot.
for root, dirs, files in os.walk(compilation_result.data_path):
for dir in dirs:
if dir.startswith(("_site", ".")):
shutil.rmtree(os.path.join(root, dir))

# Compress the directory in .tar.gz to the instance_structure_path
shutil.make_archive(
instance_structure_path,
"gztar",
compilation_result.data_path,
)
else:
shutil.copy(
compilation_result.data_path, instance_structure_path
)

# Increment the number of instances collected
assert category in num_instances_collected
@@ -353,6 +369,13 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
if done:
break

# Check if there are elements in the text/structures folder, otherwise remove it
for category in num_instances_collected.keys():
for dir in ["text", "structures"]:
path = os.path.join(output_path, category, dir)
if not os.listdir(path):
os.rmdir(path)

print("Scraping complete!")
print(f" - {num_instances_downloaded} instances downloaded")
print(f" - {num_instances_compiled} instances compiled")
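The new remove_unparsable_object_from_dict helper at the top of this file walks a dict recursively, mutating it in place, and deletes every value that json.dumps cannot serialize, so the compiled metadata can always be written out as JSON. A minimal usage sketch (the datetime and object() values are stand-ins for non-serializable metadata entries; the import path assumes the helper is importable from image2structure.collect):

import datetime
import json

from image2structure.collect import remove_unparsable_object_from_dict

metadata = {
    "title": "Sonata",
    "fetched_at": datetime.datetime.now(),     # not JSON-serializable: dropped
    "nested": {"pages": 12, "raw": object()},  # inner value dropped, dict kept
}
clean = remove_unparsable_object_from_dict(metadata)  # mutates and returns metadata
print(json.dumps(clean))  # {"title": "Sonata", "nested": {"pages": 12}}

Only TypeError is caught, which is what json.dumps raises for unsupported types. Relatedly, the cleanup loop added at the end of run() deletes a category's text/ or structures/ folder only when os.listdir reports it empty, so populated folders are never touched.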
src/image2structure/compilation/compiler.py (6 changes: 3 additions & 3 deletions)
@@ -9,9 +9,6 @@
class CompilationResult:
"""The result of a compilation."""

data_path: str
"""The path to the compiled data."""

rendering_path: str
"""The path to the rendering of the compiled data."""

@@ -20,6 +17,9 @@ class CompilationResult:
For TeX, this is the type of the environment (e.g. equation, figure, table, algorithm).
For web pages, this can be the language used (HTML, CSS, JavaScript, etc.)."""

data_path: Optional[str] = None
"""The path to the compiled data."""

text: Optional[str] = None
"""The text extracted from the compiled data."""

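The compiler.py hunk does more than reorder fields: once data_path becomes Optional[str] = None, it has to move below the fields without defaults, because dataclass fields with defaults cannot precede required ones (Python raises a TypeError at class-definition time). A reduced sketch of the resulting layout, assuming CompilationResult is a @dataclass (the decorator sits above this hunk) and using category as a hypothetical name for the second required field, whose docstring appears in the hunk but whose name does not:

from dataclasses import dataclass
from typing import Optional

@dataclass
class CompilationResult:
    """The result of a compilation."""

    rendering_path: str
    """The path to the rendering of the compiled data."""

    category: str  # hypothetical field name; only its docstring is visible in the diff
    """For TeX, the environment type (equation, figure, ...); for web pages, the language used."""

    # Defaulted fields must come after all required fields.
    data_path: Optional[str] = None
    """The path to the compiled data."""

    text: Optional[str] = None
    """The text extracted from the compiled data."""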
[Diffs for the remaining 12 changed files are not shown.]
