Skip to content

Commit

Permalink
Use new document output structure (#3)
Browse files Browse the repository at this point in the history
* Added new models to represent the document with a new structure

* Fixed boundingBox's model

* Update README.md

* Update README.md

* Update README.md

* Fixed code after merge request review (MR !2)

* fixed a bug in `pdfact_to_document`

* fixed a bug in `pdfact_to_document`

* Added method that filters headings

* Imports refactor

---------

Co-authored-by: AnnaMarika01 <[email protected]>
Co-authored-by: Andrea Ponti <[email protected]>
  • Loading branch information
3 people authored Jul 16, 2024
1 parent 546120a commit e4dd97f
Show file tree
Hide file tree
Showing 11 changed files with 192 additions and 104 deletions.
48 changes: 28 additions & 20 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,26 +35,34 @@ with the following input as a `json` body:

> **warning** The processing is performed synchronously

The response is a JSON with the extracted text split into chunks. In particular, the structure is as follows:

- `text`: The list of chunks, each composed by:
- `text`: The text extracted from the chunk.
- `metadata`: A json with additional information regarding the chunk.
- `fonts`: The list of fonts used in the document.
Each font is represented by `name`, `id`, `is-bold`, `is-type3` and `is-italic`.
Available only using `pdfact` driver.
- `colors`: The list of colors used in the document.
Each color is represented by `r`, `g`, `b` and `id`.
Available only using `pdfact` driver.

The `metadata` of each chunk contains the following information:
- `page`: The page number from which the chunk has been extracted.
- `role`: The role of the chunk in the document (e.g., _heading_, _body_, etc.)
- `positions`: A list of bounding boxes containing the text.
Each bounding box is identified by 4 coordinates: `minY`, `minX`, `maxY` and `maxX`.
- `font`: The font of the chunk.
- `color`: The color of the chunk.
The response is a JSON with the extracted text organized into typed nodes, making it easy to navigate and understand the different components of a document.
In particular, the structure is as follows:
- `category`: A string specifying the node category, which is `doc`
- `content`: A list of `page` nodes representing the pages within the document.

Each page node contains the following information:
- `category`: A string specifying the node category, which is `page`.
- `attributes`: An object containing the attributes of the page. Currently, it includes only `page`, the page number of the node.
- `content`: A list of chunks, each representing a segment of text extracted from the page.

In particular, each chunk in `content` contains the following information:
- `role`: The role of the chunk in the document (e.g., _heading_, _body_, etc.)
- `text`: The text extracted from the chunk.
- `marks`: A list of marks that characterize the text extracted from the chunk.
- `attributes`: An object containing the attributes of the chunk, currently including:
- `bounding_box`: A list of bounding boxes that contain the text. Each bounding box is identified by 4 coordinates,
`min_x`, `min_y`, `max_x` and `max_y`, plus `page`, which is the page number where the bounding box is located.

Each mark in the `marks` of a chunk contains:
- `category`: the type of the mark, which can be: `bold`, `italic`, `textStyle`, `link`

If the mark category is `textStyle`, it includes additional attributes:
- `font`: An object representing the font of the text chunk.
Each font is represented by `name`, `id`, and `size`. Available only using `pdfact` driver.
- `color`: The color of the text chunk.
Each color is represented by `r`, `g`, `b` and `id`. Available only using `pdfact` driver.

If the mark category is `link`, it provides the `url` of the link.

### Error handling

Expand Down
5 changes: 2 additions & 3 deletions text_extractor/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from .chunk import Chunk
from .color import Color
from .document import Document
from .document import Document, Attributes, BoundingBox, Content, NodeAttributes, Node
from .font import Font
from .paragraph import Paragraph, Metadata
from .position import Position
43 changes: 35 additions & 8 deletions text_extractor/models/document.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,40 @@
from typing import List, Optional
from typing import List, Union

from pydantic import BaseModel
from pydantic import BaseModel, Field
from typing_extensions import TypedDict

from text_extractor.models.color import Color
from text_extractor.models.font import Font
from text_extractor.models.paragraph import Paragraph
from text_extractor.models.marks import Mark, TextStyleMark


class BoundingBox(TypedDict):
    """Rectangle enclosing a piece of extracted text, plus the page it lies on."""

    # Corner coordinates of the rectangle.
    min_x: float
    min_y: float
    max_x: float
    max_y: float
    # Page number where the bounding box is located.
    page: int


class Attributes(BaseModel):
    """Attributes attached to a chunk of text; currently only its bounding boxes."""

    # Bounding boxes that contain the chunk's text; empty when none are known.
    bounding_box: List[BoundingBox] = []


class Content(BaseModel):
    """A chunk of text extracted from a page, with its role, marks and attributes."""

    # Role of the chunk in the document; defaults to "body".
    role: str = "body"
    # Text extracted from the chunk.
    text: str
    # Marks (bold, italic, text style, ...) that characterize the text.
    marks: List[Union[Mark, TextStyleMark]] = []
    # Chunk attributes; defaults to an Attributes with no bounding boxes.
    attributes: Attributes = Attributes()


class NodeAttributes(BaseModel):
    """Attributes of a page node; currently only the page number."""

    # Page number of the node.
    page: int


class Node(BaseModel):
    """A page node: its category, page attributes and the chunks it contains."""

    # Node category; defaults to "page".
    category: str = Field("page")
    # Attributes of the page (see NodeAttributes).
    attributes: NodeAttributes
    # Chunks of text extracted from the page.
    content: List[Content]


class Document(BaseModel):
fonts: Optional[List[Font]] = None
text: List[Paragraph]
colors: Optional[List[Color]] = None
type: str = Field("doc")
content: List[Node]
8 changes: 4 additions & 4 deletions text_extractor/models/font.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from pydantic import BaseModel, Field
from typing import Optional

from pydantic import BaseModel


class Font(BaseModel):
name: str
id: str
is_bold: bool = Field(False, alias='is-bold')
is_type3: bool = Field(False, alias='is-type3')
is_italic: bool = Field(False, alias='is-italic')
size: Optional[int] = None
39 changes: 39 additions & 0 deletions text_extractor/models/marks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from typing import Any
from typing import Literal, Optional

from pydantic import BaseModel, model_validator

from text_extractor.models.color import Color
from text_extractor.models.font import Font


class Mark(BaseModel):
    """A formatting mark attached to a chunk of text.

    ``category`` selects the kind of mark. Subclasses add the payload fields
    that some categories need (``color``/``font`` for ``textStyle``, ``url``
    for ``link``).
    """

    category: Literal['bold', 'italic', 'textStyle', 'link']

    @model_validator(mode='before')
    @classmethod
    def check_details(cls, data: Any) -> Any:
        """Check that the raw input carries the payload its category requires.

        Args:
            data: The raw input handed to the model before field validation.

        Returns:
            ``data`` unchanged.

        Raises:
            ValueError: If a ``textStyle`` mark has neither ``color`` nor
                ``font`` or carries a ``url``, or if a ``link`` mark has no
                ``url`` or carries a ``textStyle`` key.
        """
        # A 'before' validator receives the raw input, which is not
        # guaranteed to be a dict; leave anything else to pydantic itself.
        if not isinstance(data, dict):
            return data

        # The discriminating field is `category`. Looking up `type` (as this
        # validator used to) always yielded None, so no check ever ran.
        mark_type = data.get('category')

        if mark_type == 'textStyle':
            if 'color' not in data and 'font' not in data:
                raise ValueError('color or font must be provided when type is textStyle')
            if 'url' in data:
                raise ValueError('url should not be provided when type is textStyle')

        elif mark_type == 'link':
            if 'url' not in data:
                raise ValueError('url must be provided when type is link')
            if 'textStyle' in data:
                raise ValueError('textStyle should not be provided when type is link')

        return data


class TextStyleMark(Mark):
    """A mark carrying the text style (color and/or font) of a chunk."""

    # Color of the text chunk, if known.
    color: Optional[Color] = None
    # Font of the text chunk, if known.
    font: Optional[Font] = None


class UrlMark(Mark):
    """A mark carrying the target URL of a link."""

    # URL of the link.
    url: str
20 changes: 0 additions & 20 deletions text_extractor/models/paragraph.py

This file was deleted.

10 changes: 0 additions & 10 deletions text_extractor/models/position.py

This file was deleted.

102 changes: 75 additions & 27 deletions text_extractor/parser/pdfact_parser.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import logging
from typing import List, Dict

import requests
from fastapi import HTTPException
from requests.exceptions import RequestException

from text_extractor.models import Document, Metadata, Paragraph, Position, Color, Font
from text_extractor.models import Document, Color, Font, Attributes, BoundingBox, Content, NodeAttributes, Node
from text_extractor.models.marks import Mark, TextStyleMark
from text_extractor.parser.pdf_parser import PDFParser

logger = logging.getLogger(__name__)
Expand All @@ -28,6 +30,7 @@ def parse(self, filename: str, **kwargs) -> Document:
res = response.json()
if unit == 'paragraph' or unit is None:
res = pdfact_formatter(res)
res = heading_filter(res)
document = pdfact_to_document(res)
return document
except RequestException as e:
Expand All @@ -37,51 +40,79 @@ def parse(self, filename: str, **kwargs) -> Document:

def pdfact_to_document(json_data: dict) -> Document:
colors = [Color(**color) for color in json_data.get('colors', [])]

fonts = [Font(**font) for font in json_data.get('fonts', [])]
pages: Dict[int, List[Content]] = {}

paragraphs = []
for para in json_data.get('paragraphs', []):
paragraph_detail = para['paragraph']
page = paragraph_detail['positions'][0]['page'] if paragraph_detail.get('positions') else None
color_id = paragraph_detail['color']['id']

color = next((c for c in colors if c.id == color_id), None)

font_id = paragraph_detail['font']['id']
font = next((f for f in fonts if f.id == font_id), None)

positions = [
Position(
minY=pos['minY'],
minX=pos['minX'],
maxY=pos['maxY'],
maxX=pos['maxX']
font_size = paragraph_detail['font']['font-size']
original_font = next((f for f in fonts if f.id == font_id), None)

if original_font and font_size:
font = Font(name=original_font.name, id=original_font.id, size=round(font_size))
else:
font = original_font

font_info = next((font for font in json_data.get('fonts', []) if font.get('id') == font_id), None)

is_bold = False
is_italic = False
if font_info:
is_bold = font_info.get('is-bold')
is_italic = font_info.get('is-italic')

# TODO implement logic for links
marks = []
if color or font:
mark = TextStyleMark(category='textStyle', color=color, font=font)
marks.append(mark)
if is_bold:
mark = Mark(category='bold')
marks.append(mark)
if is_italic:
mark = Mark(category='italic')
marks.append(mark)

bounding_boxs = [
BoundingBox(
min_x=pos['minX'],
min_y=pos['minY'],
max_x=pos['maxX'],
max_y=pos['maxY'],
page=pos['page']
) for pos in paragraph_detail.get('positions', [])
]

page = paragraph_detail['positions'][0]['page'] if paragraph_detail.get('positions') else None
attributes = Attributes(bounding_box=bounding_boxs)

metadata = Metadata(
content = Content(
role=paragraph_detail['role'],
color=color,
positions=positions,
font=font,
page=page
)
paragraph = Paragraph(
text=paragraph_detail['text'],
metadata=metadata
marks=marks,
attributes=attributes
)

paragraphs.append(paragraph)
if page not in pages:
pages[page] = []
pages[page].append(content)

document = Document(
fonts=fonts,
text=paragraphs,
colors=colors
nodes = [
Node(
attributes=NodeAttributes(page=page),
content=content_list
) for page, content_list in pages.items()
]

doc = Document(
content=nodes
)

return document
return doc


def pdfact_formatter(json_file):
Expand Down Expand Up @@ -133,6 +164,11 @@ def aggregate_paragraphs(json_file):
def compare_paragraphs(p1, p2, tr=25):
if p1["paragraph"]["role"] != p2["paragraph"]["role"]:
return False
if p1["paragraph"]["color"] != p2["paragraph"]["color"]:
return False
if p1["paragraph"]["font"] != p2["paragraph"]["font"]:
return False

positions1, positions2 = p1["paragraph"]["positions"], p2["paragraph"]["positions"]

for pos1 in positions1:
Expand Down Expand Up @@ -173,3 +209,15 @@ def merge_pargraphs(p1, p2):
}

return paragraph


def heading_filter(json_file):
    """Demote headings whose font size equals the smallest body font size.

    Every paragraph with role ``"heading"`` whose ``font-size`` equals the
    minimum ``font-size`` found among ``"body"`` paragraphs is relabelled as
    ``"body"``. The input is modified in place.

    Args:
        json_file: A dict with a ``"paragraphs"`` list; each entry holds a
            ``"paragraph"`` dict with ``"role"`` and ``"font"["font-size"]``.

    Returns:
        The same ``json_file`` object, after relabelling.
    """
    paragraphs = json_file["paragraphs"]

    # `default=None` keeps a document with no body paragraphs from raising
    # ValueError on an empty sequence; there is then nothing to compare with.
    min_font_size_body = min(
        (
            entry["paragraph"]["font"]["font-size"]
            for entry in paragraphs
            if entry["paragraph"]["role"] == "body"
        ),
        default=None,
    )
    if min_font_size_body is None:
        return json_file

    for entry in paragraphs:
        detail = entry["paragraph"]
        if detail["role"] == "heading" and detail["font"]["font-size"] == min_font_size_body:
            detail["role"] = "body"
    return json_file
19 changes: 9 additions & 10 deletions text_extractor/parser/pymupdf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@

import fitz

from text_extractor.models import Document, Metadata, Paragraph
from text_extractor.models.chunk import Chunk
from text_extractor.models import Document, Chunk, Content, NodeAttributes, Node
from text_extractor.parser.pdf_parser import PDFParser, clean_text


Expand All @@ -19,20 +18,20 @@ def parse(self, filename: str, **kwargs) -> Document:


def chunks_to_document(doc_parsed: List[Chunk]) -> Document:
paragraphs = []
nodes = []
for page in doc_parsed:
page_number = page.metadata['page_number']
attributes = NodeAttributes(page=page_number)
content = [Content(text=page.text)]

metadata = Metadata(page=page_number)

paragraph = Paragraph(
text=page.text,
metadata=metadata
node = Node(
attributes=attributes,
content=content
)

paragraphs.append(paragraph)
nodes.append(node)

document = Document(
text=paragraphs,
content=nodes,
)
return document
1 change: 0 additions & 1 deletion text_extractor_api/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@

Loading

0 comments on commit e4dd97f

Please sign in to comment.