Merge pull request #66 from CambioML/seis-dev

chore: update output schema for parse and extract_tables
CambioML · Nov 19, 2024 · a27a68b
2 parents ad7cb88 + 1e80ab0
commit a27a68b
Show file tree

Hide file tree

Showing 2 changed files with 23 additions and 16 deletions.
diff --git a/any_parser/any_parser.py b/any_parser/any_parser.py
@@ -153,9 +153,7 @@ def parse(
 
         try:
             response_data = response.json()
-            result = "\n".join(
-                response_data["markdown"]
-            )  # Using direct extraction instead of extract_key
+            result = response_data["markdown"]
             return result, f"Time Elapsed: {info}"
         except json.JSONDecodeError:
             return f"Error: Invalid JSON response: {response.text}", ""
@@ -213,7 +211,7 @@ def extract_tables(
 
         try:
             response_data = response.json()
-            result = "\n".join(response_data["markdown"])
+            result = response_data["markdown"]
             return result, f"Time Elapsed: {info}"
         except json.JSONDecodeError:
             return f"Error: Invalid JSON response: {response.text}", ""
@@ -438,8 +436,7 @@ def async_fetch(
             elif "pii_extraction" in result:
                 return result["pii_extraction"]
             elif "markdown" in result:
-                markdown_list = result["markdown"]
-                return "\n".join(markdown_list)
+                return result["markdown"]
             return f"Error: Invalid response format\n {result}"
         if response.status_code == 202:
             return ""

diff --git a/tests/test.py b/tests/test.py
@@ -52,7 +52,8 @@ def test_pdf_sync_parse(self):
         correct_output_file = "./tests/outputs/correct_pdf_output.txt"
 
         # extract
-        markdown, elapsed_time = self.ap.parse(file_path=working_file)
+        markdown_list, elapsed_time = self.ap.parse(file_path=working_file)
+        markdown = "\n".join(markdown_list)
 
         self.assertFalse(markdown.startswith("Error:"), markdown)
         correct_output = get_ground_truth(correct_output_file)
@@ -73,9 +74,10 @@ def test_pdf_sync_parse_with_file_content(self):
             file_type = Path(working_file).suffix.lower().lstrip(".")
 
         # extract
-        markdown, elapsed_time = self.ap.parse(
+        markdown_list, elapsed_time = self.ap.parse(
             file_content=file_content, file_type=file_type
         )
+        markdown = "\n".join(markdown_list)
 
         self.assertFalse(markdown.startswith("Error:"), markdown)
         correct_output = get_ground_truth(correct_output_file)
@@ -95,7 +97,8 @@ def test_pdf_async_parse_and_fetch(self):
         file_id = self.ap.async_parse(file_path=working_file)
         self.assertFalse(file_id.startswith("Error:"), file_id)
         # fetch
-        markdown = self.ap.async_fetch(file_id=file_id)
+        markdown_list = self.ap.async_fetch(file_id=file_id)
+        markdown = "\n".join(markdown_list)
         self.assertFalse(markdown.startswith("Error:"), markdown)
         correct_output = get_ground_truth(correct_output_file)
         percentage = compare_markdown(markdown, correct_output)
@@ -117,7 +120,8 @@ def test_pdf_async_parse_and_fetch_with_file_content(self):
         file_id = self.ap.async_parse(file_content=file_content, file_type=file_type)
         self.assertFalse(file_id.startswith("Error:"), file_id)
         # fetch
-        markdown = self.ap.async_fetch(file_id=file_id)
+        markdown_list = self.ap.async_fetch(file_id=file_id)
+        markdown = "\n".join(markdown_list)
         self.assertFalse(markdown.startswith("Error:"), markdown)
         correct_output = get_ground_truth(correct_output_file)
         percentage = compare_markdown(markdown, correct_output)
@@ -132,7 +136,8 @@ def test_docx_sync_extract(self):
         correct_output_file = "./tests/outputs/correct_docx_output.txt"
 
         # extract
-        markdown, elapsed_time = self.ap.parse(file_path=working_file)
+        markdown_list, elapsed_time = self.ap.parse(file_path=working_file)
+        markdown = "\n".join(markdown_list)
         self.assertFalse(markdown.startswith("Error:"), markdown)
         correct_output = get_ground_truth(correct_output_file)
         percentage = compare_markdown(markdown, correct_output)
@@ -151,7 +156,8 @@ def test_docx_async_parse_and_fetch(self):
         file_id = self.ap.async_parse(file_path=working_file)
         self.assertFalse(file_id.startswith("Error:"), file_id)
         # fetch
-        markdown = self.ap.async_fetch(file_id=file_id)
+        markdown_list = self.ap.async_fetch(file_id=file_id)
+        markdown = "\n".join(markdown_list)
         self.assertFalse(markdown.startswith("Error:"), markdown)
         correct_output = get_ground_truth(correct_output_file)
         percentage = compare_markdown(markdown, correct_output)
@@ -166,7 +172,8 @@ def test_pptx_sync_extract(self):
         correct_output_file = "./tests/outputs/correct_pptx_output.txt"
 
         # extract
-        markdown, elapsed_time = self.ap.parse(file_path=working_file)
+        markdown_list, elapsed_time = self.ap.parse(file_path=working_file)
+        markdown = "\n".join(markdown_list)
         self.assertFalse(markdown.startswith("Error:"), markdown)
         correct_output = get_ground_truth(correct_output_file)
         percentage = compare_markdown(markdown, correct_output)
@@ -185,7 +192,8 @@ def test_pptx_async_parse_and_fetch(self):
         file_id = self.ap.async_parse(file_path=working_file)
         self.assertFalse(file_id.startswith("Error:"), file_id)
         # fetch
-        markdown = self.ap.async_fetch(file_id=file_id)
+        markdown_list = self.ap.async_fetch(file_id=file_id)
+        markdown = "\n".join(markdown_list)
         self.assertFalse(markdown.startswith("Error:"), markdown)
         correct_output = get_ground_truth(correct_output_file)
         percentage = compare_markdown(markdown, correct_output)
@@ -200,7 +208,8 @@ def test_image_sync_extract(self):
         correct_output_file = "./tests/outputs/correct_png_output.txt"
 
         # extract
-        markdown, elapsed_time = self.ap.parse(file_path=working_file)
+        markdown_list, elapsed_time = self.ap.parse(file_path=working_file)
+        markdown = "\n".join(markdown_list)
         self.assertFalse(markdown.startswith("Error:"), markdown)
         correct_output = get_ground_truth(correct_output_file)
         percentage = compare_markdown(markdown, correct_output)
@@ -219,7 +228,8 @@ def test_image_async_parse_and_fetch(self):
         file_id = self.ap.async_parse(file_path=working_file)
         self.assertFalse(file_id.startswith("Error:"), file_id)
         # fetch
-        markdown = self.ap.async_fetch(file_id=file_id)
+        markdown_list = self.ap.async_fetch(file_id=file_id)
+        markdown = "\n".join(markdown_list)
         self.assertFalse(markdown.startswith("Error:"), markdown)
         correct_output = get_ground_truth(correct_output_file)
         percentage = compare_markdown(markdown, correct_output)