add encoding explicitly to all calls to open #409

Closed
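Background, not part of the diff: when open() is given no encoding argument, Python falls back to the platform's locale encoding (often cp1252 on Windows), so UTF-8 files written on one machine can fail with UnicodeDecodeError on another. A minimal sketch of the before/after pattern this PR applies throughout the codebase (function names here are illustrative):

import json

def load_json(path):
    # Locale-dependent: decodes with the platform default and can break on Windows.
    with open(path, "r") as f:
        return json.load(f)

def load_json_utf8(path):
    # Explicit encoding: decodes identically on every platform.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

On Python 3.10+, running with -X warn_default_encoding (or PYTHONWARNDEFAULTENCODING=1) emits an EncodingWarning for every open() call that still relies on the default, which is one way to confirm no call sites were missed.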
67 changes: 50 additions & 17 deletions benchmarks/overall.py
@@ -23,20 +23,40 @@

def nougat_prediction(pdf_filename, batch_size=1):
out_dir = tempfile.mkdtemp()
subprocess.run(["nougat", pdf_filename, "-o", out_dir, "--no-skipping", "--recompute", "--batchsize", str(batch_size)], check=True)
subprocess.run(
[
"nougat",
pdf_filename,
"-o",
out_dir,
"--no-skipping",
"--recompute",
"--batchsize",
str(batch_size),
],
check=True,
)
md_file = os.listdir(out_dir)[0]
with open(os.path.join(out_dir, md_file), "r") as f:
with open(os.path.join(out_dir, md_file), "r", encoding="utf-8") as f:
data = f.read()
shutil.rmtree(out_dir)
return data


@click.command(help="Benchmark PDF to MD conversion.")
@click.argument("in_folder", type=str)
@click.argument("reference_folder", type=str)
@click.argument("out_file", type=str)
@click.option("--nougat", is_flag=True, help="Run nougat and compare")
@click.option("--md_out_path", type=str, default=None, help="Output path for generated markdown files")
def main(in_folder: str, reference_folder: str, out_file: str, nougat: bool, md_out_path: str):
@click.option(
"--md_out_path",
type=str,
default=None,
help="Output path for generated markdown files",
)
def main(
in_folder: str, reference_folder: str, out_file: str, nougat: bool, md_out_path: str
):
methods = ["marker"]
if nougat:
methods.append("nougat")
@@ -68,7 +88,7 @@ def main(in_folder: str, reference_folder: str, out_file: str, nougat: bool, md_
config=config_parser.generate_config_dict(),
artifact_dict=model_dict,
processor_list=None,
renderer=config_parser.get_renderer()
renderer=config_parser.get_renderer(),
)
full_text = converter(pdf_filename).markdown
elif method == "nougat":
@@ -85,29 +105,29 @@ def main(in_folder: str, reference_folder: str, out_file: str, nougat: bool, md_

if md_out_path:
md_out_filename = f"{method}_{md_filename}"
with open(os.path.join(md_out_path, md_out_filename), "w+") as f:
with open(
os.path.join(md_out_path, md_out_filename), "w+", encoding="utf-8"
) as f:
f.write(full_text)

total_pages = sum(pages.values())
with open(out_file, "w+") as f:
with open(out_file, "w+", encoding="utf-8") as f:
write_data = defaultdict(dict)
for method in methods:
total_time = sum(times[method].values())
file_stats = {
fname:
{
fname: {
"time": times[method][fname],
"score": scores[method][fname],
"pages": pages[fname]
"pages": pages[fname],
}

for fname in benchmark_files
}
write_data[method] = {
"files": file_stats,
"avg_score": sum(scores[method].values()) / len(scores[method]),
"time_per_page": total_time / total_pages,
"time_per_doc": total_time / len(scores[method])
"time_per_doc": total_time / len(scores[method]),
}

json.dump(write_data, f, indent=4)
@@ -116,15 +136,28 @@ def main(in_folder: str, reference_folder: str, out_file: str, nougat: bool, md_
score_table = []
score_headers = benchmark_files
for method in methods:
summary_table.append([method, write_data[method]["avg_score"], write_data[method]["time_per_page"], write_data[method]["time_per_doc"]])
score_table.append([method, *[write_data[method]["files"][h]["score"] for h in score_headers]])

print(tabulate(summary_table, headers=["Method", "Average Score", "Time per page", "Time per document"]))
summary_table.append(
[
method,
write_data[method]["avg_score"],
write_data[method]["time_per_page"],
write_data[method]["time_per_doc"],
]
)
score_table.append(
[method, *[write_data[method]["files"][h]["score"] for h in score_headers]]
)

print(
tabulate(
summary_table,
headers=["Method", "Average Score", "Time per page", "Time per document"],
)
)
print("")
print("Scores by file")
print(tabulate(score_table, headers=["Method", *score_headers]))


if __name__ == "__main__":
main()
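For reference, not part of the diff: the out_file written above ends up shaped roughly as below, and benchmarks/verify_scores.py (next file) reads scores back out of it. The numbers are illustrative, not real benchmark results:

{
    "marker": {
        "files": {
            "multicolcnn.pdf": {"time": 35.2, "score": 0.84, "pages": 9}
        },
        "avg_score": 0.84,
        "time_per_page": 3.9,
        "time_per_doc": 35.2
    }
}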

8 changes: 5 additions & 3 deletions benchmarks/verify_scores.py
@@ -3,7 +3,7 @@


def verify_scores(file_path):
with open(file_path, 'r') as file:
with open(file_path, "r", encoding="utf-8") as file:
data = json.load(file)

multicolcnn_score = data["marker"]["files"]["multicolcnn.pdf"]["score"]
@@ -14,7 +14,7 @@


def verify_table_scores(file_path):
with open(file_path, 'r') as file:
with open(file_path, "r", encoding="utf-8") as file:
data = json.load(file)

avg = sum([r["score"] for r in data]) / len(data)
@@ -25,7 +25,9 @@
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Verify benchmark scores")
parser.add_argument("file_path", type=str, help="Path to the json file")
parser.add_argument("--type", type=str, help="Type of file to verify", default="marker")
parser.add_argument(
"--type", type=str, help="Type of file to verify", default="marker"
)
args = parser.parse_args()
if args.type == "marker":
verify_scores(args.file_path)
64 changes: 48 additions & 16 deletions marker/config/parser.py
@@ -17,21 +17,53 @@ def __init__(self, cli_options: dict):

@staticmethod
def common_options(fn):
fn = click.option("--output_dir", type=click.Path(exists=False), required=False, default=settings.OUTPUT_DIR,
help="Directory to save output.")(fn)
fn = click.option('--debug', '-d', is_flag=True, help='Enable debug mode.')(fn)
fn = click.option("--output_format", type=click.Choice(["markdown", "json", "html"]), default="markdown",
help="Format to output results in.")(fn)
fn = click.option("--page_range", type=str, default=None,
help="Page range to convert, specify comma separated page numbers or ranges. Example: 0,5-10,20")(
fn)
fn = click.option("--force_ocr", is_flag=True, help="Force OCR on the whole document.")(fn)
fn = click.option("--processors", type=str, default=None,
help="Comma separated list of processors to use. Must use full module path.")(fn)
fn = click.option("--config_json", type=str, default=None,
help="Path to JSON file with additional configuration.")(fn)
fn = click.option("--languages", type=str, default=None, help="Comma separated list of languages to use for OCR.")(fn)
fn = click.option("--disable_multiprocessing", is_flag=True, default=False, help="Disable multiprocessing.")(fn)
fn = click.option(
"--output_dir",
type=click.Path(exists=False),
required=False,
default=settings.OUTPUT_DIR,
help="Directory to save output.",
)(fn)
fn = click.option("--debug", "-d", is_flag=True, help="Enable debug mode.")(fn)
fn = click.option(
"--output_format",
type=click.Choice(["markdown", "json", "html"]),
default="markdown",
help="Format to output results in.",
)(fn)
fn = click.option(
"--page_range",
type=str,
default=None,
help="Page range to convert, specify comma separated page numbers or ranges. Example: 0,5-10,20",
)(fn)
fn = click.option(
"--force_ocr", is_flag=True, help="Force OCR on the whole document."
)(fn)
fn = click.option(
"--processors",
type=str,
default=None,
help="Comma separated list of processors to use. Must use full module path.",
)(fn)
fn = click.option(
"--config_json",
type=str,
default=None,
help="Path to JSON file with additional configuration.",
)(fn)
fn = click.option(
"--languages",
type=str,
default=None,
help="Comma separated list of languages to use for OCR.",
)(fn)
fn = click.option(
"--disable_multiprocessing",
is_flag=True,
default=False,
help="Disable multiprocessing.",
)(fn)
return fn

def generate_config_dict(self) -> Dict[str, any]:
@@ -56,7 +88,7 @@ def generate_config_dict(self) -> Dict[str, any]:
config["languages"] = v.split(",")
case "config_json":
if v:
with open(v, "r") as f:
with open(v, "r", encoding="utf-8") as f:
config.update(json.load(f))
case "disable_multiprocessing":
if v:
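Not part of the diff: a sketch of how a stacked-decorator helper like common_options is typically attached to a click command. The class name ConfigParser and the command name are assumptions for illustration, not taken from this PR:

import click
from marker.config.parser import ConfigParser  # class name assumed

@click.command()
@ConfigParser.common_options  # attaches every option defined above
def convert(**kwargs):
    parser = ConfigParser(kwargs)
    config = parser.generate_config_dict()  # resolves page_range, languages, config_json, ...
    ...

Because common_options both takes and returns the command function, the same option set can be reused across several CLI entry points without repeating the click.option declarations.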
8 changes: 6 additions & 2 deletions marker/output.py
@@ -30,9 +30,13 @@ def text_from_rendered(rendered: BaseModel):
def save_output(rendered: BaseModel, output_dir: str, fname_base: str):
text, ext, images = text_from_rendered(rendered)

with open(os.path.join(output_dir, f"{fname_base}.{ext}"), "w+") as f:
with open(
os.path.join(output_dir, f"{fname_base}.{ext}"), "w+", encoding="utf-8"
) as f:
f.write(text)
with open(os.path.join(output_dir, f"{fname_base}_meta.json"), "w+") as f:
with open(
os.path.join(output_dir, f"{fname_base}_meta.json"), "w+", encoding="utf-8"
) as f:
f.write(json.dumps(rendered.metadata, indent=2))

for img_name, img in images.items():
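Not part of the diff: an illustrative call to save_output, assuming a rendered result like the one the converter produces in benchmarks/overall.py (paths and names are made up):

rendered = converter("paper.pdf")  # a pydantic BaseModel produced by the configured renderer
save_output(rendered, output_dir="out", fname_base="paper")
# writes out/paper.md (or .json / .html, depending on the renderer),
# out/paper_meta.json, and any extracted images next to them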