From e4dd97fb682823cde25afc4cb09c8df0c5d9d2ad Mon Sep 17 00:00:00 2001 From: Alessio Date: Tue, 16 Jul 2024 12:25:17 +0200 Subject: [PATCH] Use new document output structure (#3) * Added new models to represent the document with a new structure * Fixed boundingBox's model * Update README.md * Update README.md * Update README.md * Fixed code after merge request review (MR !2) * fixed a bug in `pdfact_to_document` * fixed a bug in `pdfact_to_document` * Added method that filters headings * Imports refactor --------- Co-authored-by: AnnaMarika01 Co-authored-by: Andrea Ponti --- README.md | 48 ++++++----- text_extractor/models/__init__.py | 5 +- text_extractor/models/document.py | 43 ++++++++-- text_extractor/models/font.py | 8 +- text_extractor/models/marks.py | 39 +++++++++ text_extractor/models/paragraph.py | 20 ----- text_extractor/models/position.py | 10 --- text_extractor/parser/pdfact_parser.py | 102 +++++++++++++++++------- text_extractor/parser/pymupdf_parser.py | 19 +++-- text_extractor_api/__init__.py | 1 - text_extractor_api/main.py | 1 - 11 files changed, 192 insertions(+), 104 deletions(-) create mode 100644 text_extractor/models/marks.py delete mode 100644 text_extractor/models/paragraph.py delete mode 100644 text_extractor/models/position.py diff --git a/README.md b/README.md index 05c9610..904d4ef 100644 --- a/README.md +++ b/README.md @@ -35,26 +35,34 @@ with the following input as a `json` body: > **warning** The processing is performed synchronously - -The response is a JSON with the extracted text splitted in chunks. In particular, the structure is as follows: - -- `text`: The list of chunks, each composed by: - - `text`: The text extracted from the chunk. - - `metadata`: A json with additional information regarding the chunk. -- `fonts`: The list of fonts used in the document. -Each font is represented by `name`, `id`, `is-bold`, `is-type3` and `is-italic`. -Available only using `pdfact` driver. 
-- `colors`: The list of colors used in the document. -Each color is represented by `r`, `g`, `b` and `id`. -Available only using `pdfact` driver. - -The `metadata` of each chunk contains the following information: -- `page`: The page number from which the chunk has been extracted. -- `role`: The role of the chunk in the document (e.g., _heading_, _body_, etc.) -- `positions`: A list of bounding box containing the text. -Each bounding box is identified by 4 coordinated: `minY`, `minX`, `maxY` and `maxX`. -- `font`: The font of the chunk. -- `color`: The color of the chunk. +The response is a JSON document with the extracted text organized into typed nodes, making it easy to navigate and understand the different components of a document. +In particular, the structure is as follows: +- `category`: A string specifying the node category, which is `doc`. +- `content`: A list of `page` nodes representing the pages within the document. + +Each page node contains the following information: +- `category`: A string specifying the node category, which is `page`. +- `attributes`: An object containing the attributes of the page. Currently, it includes only `page`, the page number of the node. +- `content`: A list of chunks, each representing a segment of text extracted from the page. + +In particular, each chunk in `content` contains the following information: + - `role`: The role of the chunk in the document (e.g., _heading_, _body_, etc.) + - `text`: The text extracted from the chunk. + - `marks`: A list of marks that characterize the text extracted from the chunk. + - `attributes`: An object containing the attributes of the chunk, currently including: + - `bounding_box`: A list of bounding boxes that contain the text. Each bounding box is identified by 4 coordinates + (`min_x`, `min_y`, `max_x`, `max_y`) and by `page`, the page number where the bounding box is located. 
+ +Each mark in `marks` contains: +- `category`: The type of the mark, which can be `bold`, `italic`, `textStyle` or `link`. + +If the mark category is `textStyle`, it includes additional attributes: +- `font`: An object representing the font of the text chunk. +Each font is represented by `name`, `id`, and `size`. Available only using the `pdfact` driver. +- `color`: An object representing the color of the text chunk. +Each color is represented by `r`, `g`, `b` and `id`. Available only using the `pdfact` driver. + +If the mark category is `link`, it provides the `url` of the link. ### Error handling diff --git a/text_extractor/models/__init__.py b/text_extractor/models/__init__.py index df07121..1ab5d9b 100644 --- a/text_extractor/models/__init__.py +++ b/text_extractor/models/__init__.py @@ -1,5 +1,4 @@ +from .chunk import Chunk from .color import Color -from .document import Document +from .document import Document, Attributes, BoundingBox, Content, NodeAttributes, Node from .font import Font -from .paragraph import Paragraph, Metadata -from .position import Position diff --git a/text_extractor/models/document.py b/text_extractor/models/document.py index 11e4f87..9ec4d11 100644 --- a/text_extractor/models/document.py +++ b/text_extractor/models/document.py @@ -1,13 +1,40 @@ -from typing import List, Optional +from typing import List, Union -from pydantic import BaseModel +from pydantic import BaseModel, Field +from typing_extensions import TypedDict -from text_extractor.models.color import Color -from text_extractor.models.font import Font -from text_extractor.models.paragraph import Paragraph +from text_extractor.models.marks import Mark, TextStyleMark + + +class BoundingBox(TypedDict): + min_x: float + min_y: float + max_x: float + max_y: float + page: int + + +class Attributes(BaseModel): + bounding_box: List[BoundingBox] = [] + + +class Content(BaseModel): + role: str = "body" + text: str + marks: List[Union[Mark, TextStyleMark]] = [] + attributes: Attributes = Attributes() + + 
+class NodeAttributes(BaseModel): + page: int + + +class Node(BaseModel): + category: str = Field("page") + attributes: NodeAttributes + content: List[Content] class Document(BaseModel): - fonts: Optional[List[Font]] = None - text: List[Paragraph] - colors: Optional[List[Color]] = None + type: str = Field("doc") + content: List[Node] diff --git a/text_extractor/models/font.py b/text_extractor/models/font.py index 277f6bf..a6ee95d 100644 --- a/text_extractor/models/font.py +++ b/text_extractor/models/font.py @@ -1,9 +1,9 @@ -from pydantic import BaseModel, Field +from typing import Optional + +from pydantic import BaseModel class Font(BaseModel): name: str id: str - is_bold: bool = Field(False, alias='is-bold') - is_type3: bool = Field(False, alias='is-type3') - is_italic: bool = Field(False, alias='is-italic') + size: Optional[int] = None diff --git a/text_extractor/models/marks.py b/text_extractor/models/marks.py new file mode 100644 index 0000000..4a3a05e --- /dev/null +++ b/text_extractor/models/marks.py @@ -0,0 +1,39 @@ +from typing import Any +from typing import Literal, Optional + +from pydantic import BaseModel, model_validator + +from text_extractor.models.color import Color +from text_extractor.models.font import Font + + +class Mark(BaseModel): + category: Literal['bold', 'italic', 'textStyle', 'link'] + + @model_validator(mode='before') + @classmethod + def check_details(cls, data: Any) -> Any: + mark_type = data.get('type') + + if mark_type == 'textStyle': + if 'color' not in data and 'font' not in data: + raise ValueError('color or font must be provided when type is textStyle') + if 'url' in data: + raise ValueError('url should not be provided when type is textStyle') + + elif mark_type == 'link': + if 'url' not in data: + raise ValueError('url must be provided when type is link') + if 'textStyle' in data: + raise ValueError('textStyle should not be provided when type is link') + + return data + + +class TextStyleMark(Mark): + color: Optional[Color] = 
None + font: Optional[Font] = None + + +class UrlMark(Mark): + url: str diff --git a/text_extractor/models/paragraph.py b/text_extractor/models/paragraph.py deleted file mode 100644 index a4b9c6f..0000000 --- a/text_extractor/models/paragraph.py +++ /dev/null @@ -1,20 +0,0 @@ -from typing import List, Optional - -from pydantic import BaseModel - -from text_extractor.models.color import Color -from text_extractor.models.font import Font -from text_extractor.models.position import Position - - -class Metadata(BaseModel): - role: Optional[str] = None - color: Optional[Color] = None - positions: Optional[List[Position]] = None - font: Optional[Font] = None - page: int - - -class Paragraph(BaseModel): - text: str - metadata: Metadata diff --git a/text_extractor/models/position.py b/text_extractor/models/position.py deleted file mode 100644 index 69a4527..0000000 --- a/text_extractor/models/position.py +++ /dev/null @@ -1,10 +0,0 @@ -from typing import Optional - -from pydantic import BaseModel - - -class Position(BaseModel): - minY: Optional[float] = None - minX: Optional[float] = None - maxY: Optional[float] = None - maxX: Optional[float] = None diff --git a/text_extractor/parser/pdfact_parser.py b/text_extractor/parser/pdfact_parser.py index df6ed35..6958fe2 100644 --- a/text_extractor/parser/pdfact_parser.py +++ b/text_extractor/parser/pdfact_parser.py @@ -1,10 +1,12 @@ import logging +from typing import List, Dict import requests from fastapi import HTTPException from requests.exceptions import RequestException -from text_extractor.models import Document, Metadata, Paragraph, Position, Color, Font +from text_extractor.models import Document, Color, Font, Attributes, BoundingBox, Content, NodeAttributes, Node +from text_extractor.models.marks import Mark, TextStyleMark from text_extractor.parser.pdf_parser import PDFParser logger = logging.getLogger(__name__) @@ -28,6 +30,7 @@ def parse(self, filename: str, **kwargs) -> Document: res = response.json() if unit == 
'paragraph' or unit is None: res = pdfact_formatter(res) + res = heading_filter(res) document = pdfact_to_document(res) return document except RequestException as e: @@ -37,51 +40,79 @@ def parse(self, filename: str, **kwargs) -> Document: def pdfact_to_document(json_data: dict) -> Document: colors = [Color(**color) for color in json_data.get('colors', [])] - fonts = [Font(**font) for font in json_data.get('fonts', [])] + pages: Dict[int, List[Content]] = {} - paragraphs = [] for para in json_data.get('paragraphs', []): paragraph_detail = para['paragraph'] + page = paragraph_detail['positions'][0]['page'] if paragraph_detail.get('positions') else None color_id = paragraph_detail['color']['id'] - color = next((c for c in colors if c.id == color_id), None) font_id = paragraph_detail['font']['id'] - font = next((f for f in fonts if f.id == font_id), None) - - positions = [ - Position( - minY=pos['minY'], - minX=pos['minX'], - maxY=pos['maxY'], - maxX=pos['maxX'] + font_size = paragraph_detail['font']['font-size'] + original_font = next((f for f in fonts if f.id == font_id), None) + + if original_font and font_size: + font = Font(name=original_font.name, id=original_font.id, size=round(font_size)) + else: + font = original_font + + font_info = next((font for font in json_data.get('fonts', []) if font.get('id') == font_id), None) + + is_bold = False + is_italic = False + if font_info: + is_bold = font_info.get('is-bold') + is_italic = font_info.get('is-italic') + + # TODO implement logic for links + marks = [] + if color or font: + mark = TextStyleMark(category='textStyle', color=color, font=font) + marks.append(mark) + if is_bold: + mark = Mark(category='bold') + marks.append(mark) + if is_italic: + mark = Mark(category='italic') + marks.append(mark) + + bounding_boxs = [ + BoundingBox( + min_x=pos['minX'], + min_y=pos['minY'], + max_x=pos['maxX'], + max_y=pos['maxY'], + page=pos['page'] ) for pos in paragraph_detail.get('positions', []) ] - page = 
paragraph_detail['positions'][0]['page'] if paragraph_detail.get('positions') else None + attributes = Attributes(bounding_box=bounding_boxs) - metadata = Metadata( + content = Content( role=paragraph_detail['role'], - color=color, - positions=positions, - font=font, - page=page - ) - paragraph = Paragraph( text=paragraph_detail['text'], - metadata=metadata + marks=marks, + attributes=attributes ) - paragraphs.append(paragraph) + if page not in pages: + pages[page] = [] + pages[page].append(content) - document = Document( - fonts=fonts, - text=paragraphs, - colors=colors + nodes = [ + Node( + attributes=NodeAttributes(page=page), + content=content_list + ) for page, content_list in pages.items() + ] + + doc = Document( + content=nodes ) - return document + return doc def pdfact_formatter(json_file): @@ -133,6 +164,11 @@ def aggregate_paragraphs(json_file): def compare_paragraphs(p1, p2, tr=25): if p1["paragraph"]["role"] != p2["paragraph"]["role"]: return False + if p1["paragraph"]["color"] != p2["paragraph"]["color"]: + return False + if p1["paragraph"]["font"] != p2["paragraph"]["font"]: + return False + positions1, positions2 = p1["paragraph"]["positions"], p2["paragraph"]["positions"] for pos1 in positions1: @@ -173,3 +209,15 @@ def merge_pargraphs(p1, p2): } return paragraph + + +def heading_filter(json_file): + min_font_size_body = min(paragraph["paragraph"]["font"]["font-size"] for paragraph in json_file["paragraphs"] if + paragraph["paragraph"]["role"] == "body") + for i in range(len(json_file["paragraphs"])): + paragraph = json_file["paragraphs"][i] + if paragraph["paragraph"]["role"] == "heading": + font_size = paragraph["paragraph"]["font"]["font-size"] + if font_size == min_font_size_body: + paragraph["paragraph"]["role"] = "body" + return json_file diff --git a/text_extractor/parser/pymupdf_parser.py b/text_extractor/parser/pymupdf_parser.py index bde4c89..f045def 100644 --- a/text_extractor/parser/pymupdf_parser.py +++ 
b/text_extractor/parser/pymupdf_parser.py @@ -2,8 +2,7 @@ import fitz -from text_extractor.models import Document, Metadata, Paragraph -from text_extractor.models.chunk import Chunk +from text_extractor.models import Document, Chunk, Content, NodeAttributes, Node from text_extractor.parser.pdf_parser import PDFParser, clean_text @@ -19,20 +18,20 @@ def parse(self, filename: str, **kwargs) -> Document: def chunks_to_document(doc_parsed: List[Chunk]) -> Document: - paragraphs = [] + nodes = [] for page in doc_parsed: page_number = page.metadata['page_number'] + attributes = NodeAttributes(page=page_number) + content = [Content(text=page.text)] - metadata = Metadata(page=page_number) - - paragraph = Paragraph( - text=page.text, - metadata=metadata + node = Node( + attributes=attributes, + content=content ) - paragraphs.append(paragraph) + nodes.append(node) document = Document( - text=paragraphs, + content=nodes, ) return document diff --git a/text_extractor_api/__init__.py b/text_extractor_api/__init__.py index 8b13789..e69de29 100644 --- a/text_extractor_api/__init__.py +++ b/text_extractor_api/__init__.py @@ -1 +0,0 @@ - diff --git a/text_extractor_api/main.py b/text_extractor_api/main.py index 1082a20..c24cc6c 100644 --- a/text_extractor_api/main.py +++ b/text_extractor_api/main.py @@ -11,7 +11,6 @@ app.include_router(parser.router) -@app.get("/") async def root(): logger.info("Welcome to text extractor!") return {"message": "Welcome to text extractor!"}