add encoding explicitly to all calls to open #409

Closed
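Background, not part of the diff: when open() is given no encoding argument, Python falls back to the platform's locale encoding (often cp1252 on Windows), so UTF-8 files written on one machine can fail with UnicodeDecodeError on another. A minimal sketch of the before/after pattern this PR applies throughout the codebase (function names here are illustrative):

import json

def load_json(path):
    # Locale-dependent: decodes with the platform default and can break on Windows.
    with open(path, "r") as f:
        return json.load(f)

def load_json_utf8(path):
    # Explicit encoding: decodes identically on every platform.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

On Python 3.10+, running with -X warn_default_encoding (or PYTHONWARNDEFAULTENCODING=1) emits an EncodingWarning for every open() call that still relies on the default, which is one way to confirm no call sites were missed.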
67 changes: 50 additions & 17 deletions benchmarks/overall.py
@@ -23,20 +23,40 @@

def nougat_prediction(pdf_filename, batch_size=1):
out_dir = tempfile.mkdtemp()
subprocess.run(["nougat", pdf_filename, "-o", out_dir, "--no-skipping", "--recompute", "--batchsize", str(batch_size)], check=True)
subprocess.run(
[
"nougat",
pdf_filename,
"-o",
out_dir,
"--no-skipping",
"--recompute",
"--batchsize",
str(batch_size),
],
check=True,
)
md_file = os.listdir(out_dir)[0]
with open(os.path.join(out_dir, md_file), "r") as f:
with open(os.path.join(out_dir, md_file), "r", encoding="utf-8") as f:
data = f.read()
shutil.rmtree(out_dir)
return data


@click.command(help="Benchmark PDF to MD conversion.")
@click.argument("in_folder", type=str)
@click.argument("reference_folder", type=str)
@click.argument("out_file", type=str)
@click.option("--nougat", is_flag=True, help="Run nougat and compare")
@click.option("--md_out_path", type=str, default=None, help="Output path for generated markdown files")
def main(in_folder: str, reference_folder: str, out_file: str, nougat: bool, md_out_path: str):
@click.option(
"--md_out_path",
type=str,
default=None,
help="Output path for generated markdown files",
)
def main(
in_folder: str, reference_folder: str, out_file: str, nougat: bool, md_out_path: str
):
methods = ["marker"]
if nougat:
methods.append("nougat")
@@ -68,7 +88,7 @@ def main(in_folder: str, reference_folder: str, out_file: str, nougat: bool, md_
config=config_parser.generate_config_dict(),
artifact_dict=model_dict,
processor_list=None,
renderer=config_parser.get_renderer()
renderer=config_parser.get_renderer(),
)
full_text = converter(pdf_filename).markdown
elif method == "nougat":
@@ -85,29 +105,29 @@ def main(in_folder: str, reference_folder: str, out_file: str, nougat: bool, md_

if md_out_path:
md_out_filename = f"{method}_{md_filename}"
with open(os.path.join(md_out_path, md_out_filename), "w+") as f:
with open(
os.path.join(md_out_path, md_out_filename), "w+", encoding="utf-8"
) as f:
f.write(full_text)

total_pages = sum(pages.values())
with open(out_file, "w+") as f:
with open(out_file, "w+", encoding="utf-8") as f:
write_data = defaultdict(dict)
for method in methods:
total_time = sum(times[method].values())
file_stats = {
fname:
{
fname: {
"time": times[method][fname],
"score": scores[method][fname],
"pages": pages[fname]
"pages": pages[fname],
}

for fname in benchmark_files
}
write_data[method] = {
"files": file_stats,
"avg_score": sum(scores[method].values()) / len(scores[method]),
"time_per_page": total_time / total_pages,
"time_per_doc": total_time / len(scores[method])
"time_per_doc": total_time / len(scores[method]),
}

json.dump(write_data, f, indent=4)
@@ -116,15 +136,28 @@ def main(in_folder: str, reference_folder: str, out_file: str, nougat: bool, md_
score_table = []
score_headers = benchmark_files
for method in methods:
summary_table.append([method, write_data[method]["avg_score"], write_data[method]["time_per_page"], write_data[method]["time_per_doc"]])
score_table.append([method, *[write_data[method]["files"][h]["score"] for h in score_headers]])

print(tabulate(summary_table, headers=["Method", "Average Score", "Time per page", "Time per document"]))
summary_table.append(
[
method,
write_data[method]["avg_score"],
write_data[method]["time_per_page"],
write_data[method]["time_per_doc"],
]
)
score_table.append(
[method, *[write_data[method]["files"][h]["score"] for h in score_headers]]
)

print(
tabulate(
summary_table,
headers=["Method", "Average Score", "Time per page", "Time per document"],
)
)
print("")
print("Scores by file")
print(tabulate(score_table, headers=["Method", *score_headers]))


if __name__ == "__main__":
main()
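For reference, not part of the diff: the out_file written above ends up shaped roughly as below, and benchmarks/verify_scores.py (next file) reads scores back out of it. The numbers are illustrative, not real benchmark results:

{
    "marker": {
        "files": {
            "multicolcnn.pdf": {"time": 35.2, "score": 0.84, "pages": 9}
        },
        "avg_score": 0.84,
        "time_per_page": 3.9,
        "time_per_doc": 35.2
    }
}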

8 changes: 5 additions & 3 deletions benchmarks/verify_scores.py
@@ -3,7 +3,7 @@


def verify_scores(file_path):
with open(file_path, 'r') as file:
with open(file_path, "r", encoding="utf-8") as file:
data = json.load(file)

multicolcnn_score = data["marker"]["files"]["multicolcnn.pdf"]["score"]
@@ -14,7 +14,7 @@


def verify_table_scores(file_path):
with open(file_path, 'r') as file:
with open(file_path, "r", encoding="utf-8") as file:
data = json.load(file)

avg = sum([r["score"] for r in data]) / len(data)
@@ -25,7 +25,9 @@
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Verify benchmark scores")
parser.add_argument("file_path", type=str, help="Path to the json file")
parser.add_argument("--type", type=str, help="Type of file to verify", default="marker")
parser.add_argument(
"--type", type=str, help="Type of file to verify", default="marker"
)
args = parser.parse_args()
if args.type == "marker":
verify_scores(args.file_path)
64 changes: 48 additions & 16 deletions marker/config/parser.py
@@ -17,21 +17,53 @@ def __init__(self, cli_options: dict):

@staticmethod
def common_options(fn):
fn = click.option("--output_dir", type=click.Path(exists=False), required=False, default=settings.OUTPUT_DIR,
help="Directory to save output.")(fn)
fn = click.option('--debug', '-d', is_flag=True, help='Enable debug mode.')(fn)
fn = click.option("--output_format", type=click.Choice(["markdown", "json", "html"]), default="markdown",
help="Format to output results in.")(fn)
fn = click.option("--page_range", type=str, default=None,
help="Page range to convert, specify comma separated page numbers or ranges. Example: 0,5-10,20")(
fn)
fn = click.option("--force_ocr", is_flag=True, help="Force OCR on the whole document.")(fn)
fn = click.option("--processors", type=str, default=None,
help="Comma separated list of processors to use. Must use full module path.")(fn)
fn = click.option("--config_json", type=str, default=None,
help="Path to JSON file with additional configuration.")(fn)
fn = click.option("--languages", type=str, default=None, help="Comma separated list of languages to use for OCR.")(fn)
fn = click.option("--disable_multiprocessing", is_flag=True, default=False, help="Disable multiprocessing.")(fn)
fn = click.option(
"--output_dir",
type=click.Path(exists=False),
required=False,
default=settings.OUTPUT_DIR,
help="Directory to save output.",
)(fn)
fn = click.option("--debug", "-d", is_flag=True, help="Enable debug mode.")(fn)
fn = click.option(
"--output_format",
type=click.Choice(["markdown", "json", "html"]),
default="markdown",
help="Format to output results in.",
)(fn)
fn = click.option(
"--page_range",
type=str,
default=None,
help="Page range to convert, specify comma separated page numbers or ranges. Example: 0,5-10,20",
)(fn)
fn = click.option(
"--force_ocr", is_flag=True, help="Force OCR on the whole document."
)(fn)
fn = click.option(
"--processors",
type=str,
default=None,
help="Comma separated list of processors to use. Must use full module path.",
)(fn)
fn = click.option(
"--config_json",
type=str,
default=None,
help="Path to JSON file with additional configuration.",
)(fn)
fn = click.option(
"--languages",
type=str,
default=None,
help="Comma separated list of languages to use for OCR.",
)(fn)
fn = click.option(
"--disable_multiprocessing",
is_flag=True,
default=False,
help="Disable multiprocessing.",
)(fn)
return fn

def generate_config_dict(self) -> Dict[str, any]:
@@ -56,7 +88,7 @@ def generate_config_dict(self) -> Dict[str, any]:
config["languages"] = v.split(",")
case "config_json":
if v:
with open(v, "r") as f:
with open(v, "r", encoding="utf-8") as f:
config.update(json.load(f))
case "disable_multiprocessing":
if v:
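Not part of the diff: a sketch of how a stacked-decorator helper like common_options is typically attached to a click command. The class name ConfigParser and the command name are assumptions for illustration, not taken from this PR:

import click
from marker.config.parser import ConfigParser  # class name assumed

@click.command()
@ConfigParser.common_options  # attaches every option defined above
def convert(**kwargs):
    parser = ConfigParser(kwargs)
    config = parser.generate_config_dict()  # resolves page_range, languages, config_json, ...
    ...

Because common_options both takes and returns the command function, the same option set can be reused across several CLI entry points without repeating the click.option declarations.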
8 changes: 6 additions & 2 deletions marker/output.py
@@ -30,9 +30,13 @@ def text_from_rendered(rendered: BaseModel):
def save_output(rendered: BaseModel, output_dir: str, fname_base: str):
text, ext, images = text_from_rendered(rendered)

with open(os.path.join(output_dir, f"{fname_base}.{ext}"), "w+") as f:
with open(
os.path.join(output_dir, f"{fname_base}.{ext}"), "w+", encoding="utf-8"
) as f:
f.write(text)
with open(os.path.join(output_dir, f"{fname_base}_meta.json"), "w+") as f:
with open(
os.path.join(output_dir, f"{fname_base}_meta.json"), "w+", encoding="utf-8"
) as f:
f.write(json.dumps(rendered.metadata, indent=2))

for img_name, img in images.items():
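Not part of the diff: an illustrative call to save_output, assuming a rendered result like the one the converter produces in benchmarks/overall.py (paths and names are made up):

rendered = converter("paper.pdf")  # a pydantic BaseModel produced by the configured renderer
save_output(rendered, output_dir="out", fname_base="paper")
# writes out/paper.md (or .json / .html, depending on the renderer),
# out/paper_meta.json, and any extracted images next to them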