Skip to content

Commit

Permalink
Merge pull request #66 from CambioML/seis-dev
Browse files: browse the repository at this point in the history
chore: update output schema for parse and extract_tables
  • Loading branch information
lingjiekong authored Nov 19, 2024
2 parents ad7cb88 + 1e80ab0 commit a27a68b
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 16 deletions.
9 changes: 3 additions & 6 deletions any_parser/any_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,7 @@ def parse(

try:
response_data = response.json()
result = "\n".join(
response_data["markdown"]
) # Using direct extraction instead of extract_key
result = response_data["markdown"]
return result, f"Time Elapsed: {info}"
except json.JSONDecodeError:
return f"Error: Invalid JSON response: {response.text}", ""
Expand Down Expand Up @@ -213,7 +211,7 @@ def extract_tables(

try:
response_data = response.json()
result = "\n".join(response_data["markdown"])
result = response_data["markdown"]
return result, f"Time Elapsed: {info}"
except json.JSONDecodeError:
return f"Error: Invalid JSON response: {response.text}", ""
Expand Down Expand Up @@ -438,8 +436,7 @@ def async_fetch(
elif "pii_extraction" in result:
return result["pii_extraction"]
elif "markdown" in result:
markdown_list = result["markdown"]
return "\n".join(markdown_list)
return result["markdown"]
return f"Error: Invalid response format\n {result}"
if response.status_code == 202:
return ""
Expand Down
30 changes: 20 additions & 10 deletions tests/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ def test_pdf_sync_parse(self):
correct_output_file = "./tests/outputs/correct_pdf_output.txt"

# extract
markdown, elapsed_time = self.ap.parse(file_path=working_file)
markdown_list, elapsed_time = self.ap.parse(file_path=working_file)
markdown = "\n".join(markdown_list)

self.assertFalse(markdown.startswith("Error:"), markdown)
correct_output = get_ground_truth(correct_output_file)
Expand All @@ -73,9 +74,10 @@ def test_pdf_sync_parse_with_file_content(self):
file_type = Path(working_file).suffix.lower().lstrip(".")

# extract
markdown, elapsed_time = self.ap.parse(
markdown_list, elapsed_time = self.ap.parse(
file_content=file_content, file_type=file_type
)
markdown = "\n".join(markdown_list)

self.assertFalse(markdown.startswith("Error:"), markdown)
correct_output = get_ground_truth(correct_output_file)
Expand All @@ -95,7 +97,8 @@ def test_pdf_async_parse_and_fetch(self):
file_id = self.ap.async_parse(file_path=working_file)
self.assertFalse(file_id.startswith("Error:"), file_id)
# fetch
markdown = self.ap.async_fetch(file_id=file_id)
markdown_list = self.ap.async_fetch(file_id=file_id)
markdown = "\n".join(markdown_list)
self.assertFalse(markdown.startswith("Error:"), markdown)
correct_output = get_ground_truth(correct_output_file)
percentage = compare_markdown(markdown, correct_output)
Expand All @@ -117,7 +120,8 @@ def test_pdf_async_parse_and_fetch_with_file_content(self):
file_id = self.ap.async_parse(file_content=file_content, file_type=file_type)
self.assertFalse(file_id.startswith("Error:"), file_id)
# fetch
markdown = self.ap.async_fetch(file_id=file_id)
markdown_list = self.ap.async_fetch(file_id=file_id)
markdown = "\n".join(markdown_list)
self.assertFalse(markdown.startswith("Error:"), markdown)
correct_output = get_ground_truth(correct_output_file)
percentage = compare_markdown(markdown, correct_output)
Expand All @@ -132,7 +136,8 @@ def test_docx_sync_extract(self):
correct_output_file = "./tests/outputs/correct_docx_output.txt"

# extract
markdown, elapsed_time = self.ap.parse(file_path=working_file)
markdown_list, elapsed_time = self.ap.parse(file_path=working_file)
markdown = "\n".join(markdown_list)
self.assertFalse(markdown.startswith("Error:"), markdown)
correct_output = get_ground_truth(correct_output_file)
percentage = compare_markdown(markdown, correct_output)
Expand All @@ -151,7 +156,8 @@ def test_docx_async_parse_and_fetch(self):
file_id = self.ap.async_parse(file_path=working_file)
self.assertFalse(file_id.startswith("Error:"), file_id)
# fetch
markdown = self.ap.async_fetch(file_id=file_id)
markdown_list = self.ap.async_fetch(file_id=file_id)
markdown = "\n".join(markdown_list)
self.assertFalse(markdown.startswith("Error:"), markdown)
correct_output = get_ground_truth(correct_output_file)
percentage = compare_markdown(markdown, correct_output)
Expand All @@ -166,7 +172,8 @@ def test_pptx_sync_extract(self):
correct_output_file = "./tests/outputs/correct_pptx_output.txt"

# extract
markdown, elapsed_time = self.ap.parse(file_path=working_file)
markdown_list, elapsed_time = self.ap.parse(file_path=working_file)
markdown = "\n".join(markdown_list)
self.assertFalse(markdown.startswith("Error:"), markdown)
correct_output = get_ground_truth(correct_output_file)
percentage = compare_markdown(markdown, correct_output)
Expand All @@ -185,7 +192,8 @@ def test_pptx_async_parse_and_fetch(self):
file_id = self.ap.async_parse(file_path=working_file)
self.assertFalse(file_id.startswith("Error:"), file_id)
# fetch
markdown = self.ap.async_fetch(file_id=file_id)
markdown_list = self.ap.async_fetch(file_id=file_id)
markdown = "\n".join(markdown_list)
self.assertFalse(markdown.startswith("Error:"), markdown)
correct_output = get_ground_truth(correct_output_file)
percentage = compare_markdown(markdown, correct_output)
Expand All @@ -200,7 +208,8 @@ def test_image_sync_extract(self):
correct_output_file = "./tests/outputs/correct_png_output.txt"

# extract
markdown, elapsed_time = self.ap.parse(file_path=working_file)
markdown_list, elapsed_time = self.ap.parse(file_path=working_file)
markdown = "\n".join(markdown_list)
self.assertFalse(markdown.startswith("Error:"), markdown)
correct_output = get_ground_truth(correct_output_file)
percentage = compare_markdown(markdown, correct_output)
Expand All @@ -219,7 +228,8 @@ def test_image_async_parse_and_fetch(self):
file_id = self.ap.async_parse(file_path=working_file)
self.assertFalse(file_id.startswith("Error:"), file_id)
# fetch
markdown = self.ap.async_fetch(file_id=file_id)
markdown_list = self.ap.async_fetch(file_id=file_id)
markdown = "\n".join(markdown_list)
self.assertFalse(markdown.startswith("Error:"), markdown)
correct_output = get_ground_truth(correct_output_file)
percentage = compare_markdown(markdown, correct_output)
Expand Down

0 comments on commit a27a68b

Please sign in to comment.