Use new document output structure (#3)

* Added new models to represent the document with a new structure * Fixed boundingBox's model * Update README.md * Update README.md * Update README.md * Fixed code after merge request review (MR !2) * fixed a bug in `pdfact_to_document` * fixed a bug in `pdfact_to_document` * Added method that filters headings * Imports refactor --------- Co-authored-by: AnnaMarika01 <[email protected]> Co-authored-by: Andrea Ponti <[email protected]>
OneOffTech · Jul 16, 2024 · e4dd97f · e4dd97f
1 parent 546120a
commit e4dd97f
Show file tree

Hide file tree

Showing 11 changed files with 192 additions and 104 deletions.
diff --git a/README.md b/README.md
@@ -35,26 +35,34 @@ with the following input as a `json` body:
 
 > **warning** The processing is performed synchronously
 
-
-The response is a JSON with the extracted text splitted in chunks. In particular, the structure is as follows:
-
-- `text`: The list of chunks, each composed by:
-    - `text`: The text extracted from the chunk.
-    - `metadata`: A json with additional information regarding the chunk.
-- `fonts`: The list of fonts used in the document. 
-Each font is represented by `name`, `id`, `is-bold`, `is-type3` and `is-italic`. 
-Available only using `pdfact` driver.
-- `colors`: The list of colors used in the document.
-Each color is represented by `r`, `g`, `b` and `id`.
-Available only using `pdfact` driver.
-
-The `metadata` of each chunk contains the following information:
-- `page`: The page number from which the chunk has been extracted.
-- `role`: The role of the chunk in the document (e.g., _heading_, _body_, etc.)
-- `positions`: A list of bounding box containing the text. 
-Each bounding box is identified by 4 coordinated: `minY`, `minX`, `maxY` and `maxX`.
-- `font`: The font of the chunk.
-- `color`: The color of the chunk.
+The response is a JSON with the extracted text organized into typed nodes, making it easy to navigate and understand the different components of a document.
+In particular, the structure is as follows:
+- `category`: A string specifying the node category, which is `doc`
+- `content`: A list of `page` nodes representing the pages within the document.
+
+Each page node contains the following information:
+- `category`: A string specifying the node category, which is `page`.
+- `attributes`: An object containing the attributes of the page. Currently, it includes only `page`, the page number.
+- `content`: A list of chunks, each representing a segment of text extracted from the page.
+
+In particular, each chunk in `content` contains the following information:
+  - `role`: The role of the chunk in the document (e.g., _heading_, _body_, etc.)
+  - `text`: The text extracted from the chunk.
+  - `marks`: A list of marks that characterize the text extracted from the chunk.
+  - `attributes`: An object containing the attributes of the chunk, currently including:
+    - `bounding_box`: A list of bounding boxes that contain the text. Each bounding box is identified by 4 coordinates
+    (`min_x`, `min_y`, `max_x`, `max_y`) and by `page`, which is the page number where the bounding box is located.
+
+Each mark in the `marks` list of a chunk contains:
+- `category`: the type of the mark, which can be: `bold`, `italic`, `textStyle`, `link`
+
+If the mark type is `textStyle`, it includes additional attributes:
+- `font`: An object representing the font of the text chunk. 
+Each font is represented by `name`, `id`, and `size`. Available only using `pdfact` driver.
+- `color`: An object representing the color of the text chunk.
+Each color is represented by `r`, `g`, `b` and `id`. Available only using `pdfact` driver.
+
+If the mark category is `link`, it provides the `url` of the link.
 
 ### Error handling
 

diff --git a/text_extractor/models/__init__.py b/text_extractor/models/__init__.py
@@ -1,5 +1,4 @@
+from .chunk import Chunk
 from .color import Color
-from .document import Document
+from .document import Document, Attributes, BoundingBox, Content, NodeAttributes, Node
 from .font import Font
-from .paragraph import Paragraph, Metadata
-from .position import Position
diff --git a/text_extractor/models/document.py b/text_extractor/models/document.py
@@ -1,13 +1,40 @@
-from typing import List, Optional
+from typing import List, Union
 
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
+from typing_extensions import TypedDict
 
-from text_extractor.models.color import Color
-from text_extractor.models.font import Font
-from text_extractor.models.paragraph import Paragraph
+from text_extractor.models.marks import Mark, TextStyleMark
+
+
+class BoundingBox(TypedDict):
+    """Rectangle that contains a piece of text, together with its page number."""
+    min_x: float  # smallest x coordinate of the box
+    min_y: float  # smallest y coordinate of the box
+    max_x: float  # largest x coordinate of the box
+    max_y: float  # largest y coordinate of the box
+    page: int  # page number where the bounding box is located
+
+
+class Attributes(BaseModel):
+    """Attributes attached to a chunk of extracted text."""
+    # Bounding boxes that contain the chunk's text; defaults to an empty list.
+    bounding_box: List[BoundingBox] = []
+
+
+class Content(BaseModel):
+    """A chunk of text extracted from a page, with its role, marks and attributes."""
+    role: str = "body"  # role of the chunk in the document (e.g. heading, body)
+    text: str  # text extracted for the chunk
+    # Marks that characterize the text (bold, italic, textStyle, ...); empty by default.
+    marks: List[Union[Mark, TextStyleMark]] = []
+    # Chunk attributes; defaults to an Attributes with no bounding boxes.
+    attributes: Attributes = Attributes()
+
+
+class NodeAttributes(BaseModel):
+    """Attributes of a page node."""
+    page: int  # page number of the node
+
+
+class Node(BaseModel):
+    """A page of the document: its attributes plus the chunks extracted from it."""
+    category: str = Field("page")  # node category; defaults to "page"
+    attributes: NodeAttributes  # page-level attributes (the page number)
+    content: List[Content]  # chunks of text belonging to this page
 
 
 class Document(BaseModel):
-    fonts: Optional[List[Font]] = None
-    text: List[Paragraph]
-    colors: Optional[List[Color]] = None
+    type: str = Field("doc")
+    content: List[Node]
diff --git a/text_extractor/models/font.py b/text_extractor/models/font.py
@@ -1,9 +1,9 @@
-from pydantic import BaseModel, Field
+from typing import Optional
+
+from pydantic import BaseModel
 
 
 class Font(BaseModel):
     name: str
     id: str
-    is_bold: bool = Field(False, alias='is-bold')
-    is_type3: bool = Field(False, alias='is-type3')
-    is_italic: bool = Field(False, alias='is-italic')
+    size: Optional[int] = None
diff --git a/text_extractor/models/marks.py b/text_extractor/models/marks.py
@@ -0,0 +1,39 @@
+from typing import Any
+from typing import Literal, Optional
+
+from pydantic import BaseModel, model_validator
+
+from text_extractor.models.color import Color
+from text_extractor.models.font import Font
+
+
+class Mark(BaseModel):
+    """Mark that characterizes a chunk of text.
+
+    ``category`` names the kind of mark. ``textStyle`` marks are expected to
+    carry ``color`` and/or ``font``, and ``link`` marks are expected to carry
+    ``url``; the validator below enforces this on the raw input.
+    """
+    category: Literal['bold', 'italic', 'textStyle', 'link']
+
+    @model_validator(mode='before')
+    @classmethod
+    def check_details(cls, data: Any) -> Any:
+        """Check that the raw input carries the keys its category requires.
+
+        Raises:
+            ValueError: if a ``textStyle`` mark has neither ``color`` nor
+                ``font`` or carries ``url``; if a ``link`` mark lacks ``url``
+                or carries ``textStyle``.
+        """
+        # A 'before' validator receives the raw input, which is not guaranteed
+        # to be a dict; anything else is passed through for pydantic to handle.
+        if not isinstance(data, dict):
+            return data
+
+        # The field is named ``category``. Looking up 'type' returned None for
+        # normal input, so none of the checks below were ever applied.
+        mark_category = data.get('category')
+
+        if mark_category == 'textStyle':
+            if 'color' not in data and 'font' not in data:
+                raise ValueError('color or font must be provided when type is textStyle')
+            if 'url' in data:
+                raise ValueError('url should not be provided when type is textStyle')
+
+        elif mark_category == 'link':
+            if 'url' not in data:
+                raise ValueError('url must be provided when type is link')
+            if 'textStyle' in data:
+                raise ValueError('textStyle should not be provided when type is link')
+
+        return data
+
+
+class TextStyleMark(Mark):
+    """Mark carrying the text style (color and/or font) of a chunk."""
+    color: Optional[Color] = None  # color of the text chunk, if known
+    font: Optional[Font] = None  # font of the text chunk, if known
+
+
+class UrlMark(Mark):
+    """Mark carrying the target of a link."""
+    url: str  # URL of the link
diff --git a/text_extractor/models/paragraph.py b/text_extractor/models/paragraph.py
diff --git a/text_extractor/models/position.py b/text_extractor/models/position.py
diff --git a/text_extractor/parser/pdfact_parser.py b/text_extractor/parser/pdfact_parser.py
@@ -1,10 +1,12 @@
 import logging
+from typing import List, Dict
 
 import requests
 from fastapi import HTTPException
 from requests.exceptions import RequestException
 
-from text_extractor.models import Document, Metadata, Paragraph, Position, Color, Font
+from text_extractor.models import Document, Color, Font, Attributes, BoundingBox, Content, NodeAttributes, Node
+from text_extractor.models.marks import Mark, TextStyleMark
 from text_extractor.parser.pdf_parser import PDFParser
 
 logger = logging.getLogger(__name__)
@@ -28,6 +30,7 @@ def parse(self, filename: str, **kwargs) -> Document:
             res = response.json()
             if unit == 'paragraph' or unit is None:
                 res = pdfact_formatter(res)
+                res = heading_filter(res)
             document = pdfact_to_document(res)
             return document
         except RequestException as e:
@@ -37,51 +40,79 @@ def parse(self, filename: str, **kwargs) -> Document:
 
 def pdfact_to_document(json_data: dict) -> Document:
     colors = [Color(**color) for color in json_data.get('colors', [])]
-
     fonts = [Font(**font) for font in json_data.get('fonts', [])]
+    pages: Dict[int, List[Content]] = {}
 
-    paragraphs = []
     for para in json_data.get('paragraphs', []):
         paragraph_detail = para['paragraph']
+        page = paragraph_detail['positions'][0]['page'] if paragraph_detail.get('positions') else None
         color_id = paragraph_detail['color']['id']
-
         color = next((c for c in colors if c.id == color_id), None)
 
         font_id = paragraph_detail['font']['id']
-        font = next((f for f in fonts if f.id == font_id), None)
-
-        positions = [
-            Position(
-                minY=pos['minY'],
-                minX=pos['minX'],
-                maxY=pos['maxY'],
-                maxX=pos['maxX']
+        font_size = paragraph_detail['font']['font-size']
+        original_font = next((f for f in fonts if f.id == font_id), None)
+
+        if original_font and font_size:
+            font = Font(name=original_font.name, id=original_font.id, size=round(font_size))
+        else:
+            font = original_font
+
+        font_info = next((font for font in json_data.get('fonts', []) if font.get('id') == font_id), None)
+
+        is_bold = False
+        is_italic = False
+        if font_info:
+            is_bold = font_info.get('is-bold')
+            is_italic = font_info.get('is-italic')
+
+        # TODO implement logic for links
+        marks = []
+        if color or font:
+            mark = TextStyleMark(category='textStyle', color=color, font=font)
+            marks.append(mark)
+        if is_bold:
+            mark = Mark(category='bold')
+            marks.append(mark)
+        if is_italic:
+            mark = Mark(category='italic')
+            marks.append(mark)
+
+        bounding_boxs = [
+            BoundingBox(
+                min_x=pos['minX'],
+                min_y=pos['minY'],
+                max_x=pos['maxX'],
+                max_y=pos['maxY'],
+                page=pos['page']
             ) for pos in paragraph_detail.get('positions', [])
         ]
 
-        page = paragraph_detail['positions'][0]['page'] if paragraph_detail.get('positions') else None
+        attributes = Attributes(bounding_box=bounding_boxs)
 
-        metadata = Metadata(
+        content = Content(
             role=paragraph_detail['role'],
-            color=color,
-            positions=positions,
-            font=font,
-            page=page
-        )
-        paragraph = Paragraph(
             text=paragraph_detail['text'],
-            metadata=metadata
+            marks=marks,
+            attributes=attributes
         )
 
-        paragraphs.append(paragraph)
+        if page not in pages:
+            pages[page] = []
+        pages[page].append(content)
 
-    document = Document(
-        fonts=fonts,
-        text=paragraphs,
-        colors=colors
+    nodes = [
+        Node(
+            attributes=NodeAttributes(page=page),
+            content=content_list
+        ) for page, content_list in pages.items()
+    ]
+
+    doc = Document(
+        content=nodes
     )
 
-    return document
+    return doc
 
 
 def pdfact_formatter(json_file):
@@ -133,6 +164,11 @@ def aggregate_paragraphs(json_file):
 def compare_paragraphs(p1, p2, tr=25):
     if p1["paragraph"]["role"] != p2["paragraph"]["role"]:
         return False
+    if p1["paragraph"]["color"] != p2["paragraph"]["color"]:
+        return False
+    if p1["paragraph"]["font"] != p2["paragraph"]["font"]:
+        return False
+
     positions1, positions2 = p1["paragraph"]["positions"], p2["paragraph"]["positions"]
 
     for pos1 in positions1:
@@ -173,3 +209,15 @@ def merge_pargraphs(p1, p2):
     }
 
     return paragraph
+
+
+def heading_filter(json_file):
+    """Relabel headings whose font size equals the smallest body font size.
+
+    A paragraph with role ``heading`` whose ``font-size`` is equal to the
+    minimum ``font-size`` found among ``body`` paragraphs gets its role
+    changed to ``body``. The input is modified in place and returned.
+
+    Args:
+        json_file: dict with a ``paragraphs`` list; each entry holds a
+            ``paragraph`` dict with ``role`` and ``font["font-size"]``.
+
+    Returns:
+        The same ``json_file`` object, with roles updated.
+    """
+    paragraphs = json_file["paragraphs"]
+    min_font_size_body = min(
+        (
+            entry["paragraph"]["font"]["font-size"]
+            for entry in paragraphs
+            if entry["paragraph"]["role"] == "body"
+        ),
+        default=None,
+    )
+    # Without any body paragraph there is no reference size to compare with;
+    # min() on an empty sequence would raise ValueError, so leave roles as is.
+    if min_font_size_body is None:
+        return json_file
+
+    for entry in paragraphs:
+        detail = entry["paragraph"]
+        if detail["role"] == "heading" and detail["font"]["font-size"] == min_font_size_body:
+            detail["role"] = "body"
+    return json_file
diff --git a/text_extractor/parser/pymupdf_parser.py b/text_extractor/parser/pymupdf_parser.py
@@ -2,8 +2,7 @@
 
 import fitz
 
-from text_extractor.models import Document, Metadata, Paragraph
-from text_extractor.models.chunk import Chunk
+from text_extractor.models import Document, Chunk, Content, NodeAttributes, Node
 from text_extractor.parser.pdf_parser import PDFParser, clean_text
 
 
@@ -19,20 +18,20 @@ def parse(self, filename: str, **kwargs) -> Document:
 
 
 def chunks_to_document(doc_parsed: List[Chunk]) -> Document:
-    paragraphs = []
+    nodes = []
     for page in doc_parsed:
         page_number = page.metadata['page_number']
+        attributes = NodeAttributes(page=page_number)
+        content = [Content(text=page.text)]
 
-        metadata = Metadata(page=page_number)
-
-        paragraph = Paragraph(
-            text=page.text,
-            metadata=metadata
+        node = Node(
+            attributes=attributes,
+            content=content
         )
 
-        paragraphs.append(paragraph)
+        nodes.append(node)
 
     document = Document(
-        text=paragraphs,
+        content=nodes,
     )
     return document
diff --git a/text_extractor_api/__init__.py b/text_extractor_api/__init__.py
@@ -1 +0,0 @@
-