Skip to content

Commit

Permalink
Add heading level (#7)
Browse files Browse the repository at this point in the history
* Add a basic way to identify heading levels

* Integrate the release document model

---------

Co-authored-by: AnnaMarika01 <[email protected]>
Co-authored-by: Andrea Ponti <[email protected]>
  • Loading branch information
3 people authored Oct 2, 2024
1 parent f611871 commit 62e977f
Show file tree
Hide file tree
Showing 10 changed files with 183 additions and 123 deletions.
5 changes: 3 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ pymupdf==1.22.5
numpy~=1.24.3
requests==2.32.3
fastapi~=0.111.0
pydantic~=2.7.1
pydantic~=2.9.0
pydantic_settings~=2.2.1
uvicorn==0.22.0
uvicorn==0.22.0
parse-document-model==0.2.0
3 changes: 0 additions & 3 deletions text_extractor/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1 @@
from .chunk import Chunk
from .color import Color
from .document import Document, Attributes, BoundingBox, Content, NodeAttributes, Node
from .font import Font
8 changes: 0 additions & 8 deletions text_extractor/models/color.py

This file was deleted.

40 changes: 0 additions & 40 deletions text_extractor/models/document.py

This file was deleted.

9 changes: 0 additions & 9 deletions text_extractor/models/font.py

This file was deleted.

39 changes: 0 additions & 39 deletions text_extractor/models/marks.py

This file was deleted.

2 changes: 1 addition & 1 deletion text_extractor/parser/pdf_parser.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import re
from abc import ABC, abstractmethod

from text_extractor.models import Document
from parse_document_model import Document


class PDFParser(ABC):
Expand Down
181 changes: 168 additions & 13 deletions text_extractor/parser/pdfact_parser.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import logging
from typing import List, Dict
from collections import Counter
from typing import List, Dict, Any

import requests
from parse_document_model import Document, Page
from parse_document_model.attributes import TextAttributes, PageAttributes, BoundingBox
from parse_document_model.document import Text
from parse_document_model.marks import Mark, TextStyleMark, Color, Font
from requests.exceptions import RequestException

from text_extractor.models import Document, Color, Font, Attributes, BoundingBox, Content, NodeAttributes, Node
from text_extractor.models.marks import Mark, TextStyleMark
from text_extractor.parser.pdf_parser import PDFParser

logger = logging.getLogger(__name__)
Expand All @@ -31,6 +34,7 @@ def parse(self, filename: str, **kwargs) -> Document:
res = pdfact_formatter(res)
res = heading_filter(res)
document = pdfact_to_document(res)
document = determine_heading_level(document)
return document
except RequestException as e:
logger.exception(f"An error occurred while trying to reach the API: {e}", exc_info=True)
Expand All @@ -39,8 +43,8 @@ def parse(self, filename: str, **kwargs) -> Document:

def pdfact_to_document(json_data: dict) -> Document:
colors = [Color(**color) for color in json_data.get('colors', [])]
fonts = [Font(**font) for font in json_data.get('fonts', [])]
pages: Dict[int, List[Content]] = {}
fonts = [Font(id=font['id'], name=font['name'], size=-1) for font in json_data.get('fonts', [])]
pages: Dict[int, List[Text]] = {}

for para in json_data.get('paragraphs', []):
paragraph_detail = para['paragraph']
Expand Down Expand Up @@ -87,11 +91,11 @@ def pdfact_to_document(json_data: dict) -> Document:
) for pos in paragraph_detail.get('positions', [])
]

attributes = Attributes(bounding_box=bounding_boxs)
attributes = TextAttributes(bounding_box=bounding_boxs)

content = Content(
role=paragraph_detail['role'],
text=paragraph_detail['text'],
content = Text(
category=paragraph_detail['role'],
content=paragraph_detail['text'],
marks=marks,
attributes=attributes
)
Expand All @@ -100,15 +104,15 @@ def pdfact_to_document(json_data: dict) -> Document:
pages[page] = []
pages[page].append(content)

nodes = [
Node(
attributes=NodeAttributes(page=page),
nodes_page = [
Page(
attributes=PageAttributes(page=page),
content=content_list
) for page, content_list in pages.items()
]

doc = Document(
content=nodes
content=nodes_page
)

return doc
Expand Down Expand Up @@ -231,3 +235,154 @@ def heading_filter(json_file):
if font_size == min_font_size_body:
paragraph["paragraph"]["role"] = "body"
return json_file


def determine_heading_level(document: Document) -> Document:
    """
    Classify the headings of a document as title or as level 1-4 headings.

    Every node whose category is ``"heading"`` is grouped by its font style
    (font name and font size, read from its ``textStyle`` mark). The styles are
    sorted by font size in descending order, assuming that larger font sizes
    correspond to higher-level headings. If the largest style is used by exactly
    one heading it is excluded from the ranking. The remaining styles are ranked
    by ``assign_heading_levels`` and the resulting level is written to
    ``node.attributes.level``.

    :param document: The input document object, containing content structured
        into pages. Each page consists of nodes representing portions of text.
    :return: The same document instance, updated in place. Headings using the
        largest font style get the category ``"title"``; all other headings get
        a level between 1 and 4.
    """
    # First pass: count the headings per (font name, font size) style. A dict
    # preserves insertion order, so styles stay in order of first appearance.
    style_occurrences = {}
    for page in document.content:
        for node in page.content:
            style = _heading_font_style(node)
            if style is not None:
                style_occurrences[style] = style_occurrences.get(style, 0) + 1

    if not style_occurrences:
        return document

    # Sort the styles by font size in descending order (stable for equal sizes).
    heading_styles = sorted(
        (
            {'font_name': name, 'font_size': size, 'occurrences': count}
            for (name, size), count in style_occurrences.items()
        ),
        key=lambda style: style['font_size'],
        reverse=True,
    )

    largest_font_style = heading_styles[0]
    if largest_font_style['occurrences'] == 1:
        # A largest style used only once is treated as the document title and
        # does not take part in the level ranking.
        heading_styles = heading_styles[1:]

    # The title may have been the only style. Ranking an empty list is
    # pointless and used to crash, so skip the call in that case.
    assigned_levels = assign_heading_levels(heading_styles) if heading_styles else []
    level_by_style = {
        (style['font_name'], style['font_size']): style['level']
        for style in assigned_levels
    }
    title_style = (largest_font_style['font_name'], largest_font_style['font_size'])

    # Second pass: apply the classification to every heading node.
    for page in document.content:
        for node in page.content:
            style = _heading_font_style(node)
            if style is None:
                continue
            if style == title_style:
                # NOTE(review): this also applies when the largest style occurs
                # more than once, so the level ranked for it is never used.
                # Confirm whether repeated largest headings should be level 1.
                node.category = "title"
            else:
                # Fall back to the lowest level for a style without a ranking.
                node.attributes.level = level_by_style.get(style, 4)

    return document


def _heading_font_style(node):
    """Return the (font name, font size) of a heading node, or None if it has no usable style."""
    if node.category != "heading" or not node.marks:
        return None

    font_name = None
    font_size = None
    # When several textStyle marks are present, the last one wins.
    for mark in node.marks:
        if mark.category == 'textStyle':
            font_name = mark.font.name
            font_size = mark.font.size

    if font_name and font_size:
        return font_name, font_size
    return None


def assign_heading_levels(heading_styles: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Assigns heading levels to a list of heading styles based on font name and font size.

    The font name shared by the largest number of styles is treated as the main
    heading font. Its styles are ranked by decreasing font size (levels 1 to 4,
    every further style stays at 4). Styles using any other font take the level
    of an already ranked style of the same size, or of the nearest ranked size.

    :param heading_styles: A list of dictionaries where each dictionary contains
        information about a heading's font name ('font_name')
        and its font size ('font_size').
    :return: A list of dictionaries in the same order as the input, where each
        dictionary includes 'font_name', 'font_size', and the assigned 'level'
        (from 1 to 4). Level 1 is for the largest and level 4 is for the smallest.
        An empty input yields an empty list.
    """
    # Nothing to rank. Without this guard most_common(1)[0] raises IndexError.
    if not heading_styles:
        return []

    # Count how many styles use each font name.
    font_count = Counter(style['font_name'] for style in heading_styles)

    # Identify the most common font (likely the main heading font).
    main_font = font_count.most_common(1)[0][0]
    # Sort the main font headings by decreasing font size.
    main_fonts = sorted((f for f in heading_styles if f['font_name'] == main_font),
                        key=lambda x: -x['font_size'])
    # Collect other fonts that are not the main font.
    other_fonts = [f for f in heading_styles if f['font_name'] != main_font]

    # Maps (font name, font size) -> level.
    levels_assigned = {}
    # Assign levels (1-4) to the main font headings based on font size.
    for i, font in enumerate(main_fonts):
        levels_assigned[(font['font_name'], font['font_size'])] = min(i + 1, 4)

    # For other fonts, assign levels based on font size comparisons. Each style
    # ranked here also becomes a reference point for the styles that follow.
    for font in other_fonts:
        size = font['font_size']
        same_size_fonts = [f for f in levels_assigned if f[1] == size]

        if same_size_fonts:
            # If the same size exists, assign its level.
            level = levels_assigned[same_size_fonts[0]]
        else:
            # Otherwise, assign level based on size relative to the ranked sizes.
            existing_sizes = sorted(f[1] for f in levels_assigned)

            if size > existing_sizes[-1]:
                level = 1
            elif size < existing_sizes[0]:
                level = 4
            else:
                # The size lies strictly between two ranked sizes: take the
                # level of whichever neighbour is closer (ties go to the larger).
                for smaller_size, larger_size in zip(existing_sizes, existing_sizes[1:]):
                    if larger_size > size > smaller_size:
                        mid_point = (smaller_size + larger_size) / 2
                        larger_font = next(f for f in levels_assigned if f[1] == larger_size)
                        smaller_font = next(f for f in levels_assigned if f[1] == smaller_size)
                        if size >= mid_point:
                            level = levels_assigned[larger_font]
                        else:
                            level = levels_assigned[smaller_font]
                        break

        levels_assigned[(font['font_name'], font['font_size'])] = level

    return [
        {
            'font_name': font['font_name'],
            'font_size': font['font_size'],
            'level': levels_assigned[(font['font_name'], font['font_size'])]
        }
        for font in heading_styles
    ]
17 changes: 10 additions & 7 deletions text_extractor/parser/pymupdf_parser.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
from typing import List

import fitz
from parse_document_model import Document, Page
from parse_document_model.attributes import PageAttributes
from parse_document_model.document import Text

from text_extractor.models import Document, Chunk, Content, NodeAttributes, Node
from text_extractor.models import Chunk
from text_extractor.parser.pdf_parser import PDFParser, clean_text


Expand All @@ -18,20 +21,20 @@ def parse(self, filename: str, **kwargs) -> Document:


def chunks_to_document(doc_parsed: List[Chunk]) -> Document:
nodes = []
pages = []
for page in doc_parsed:
page_number = page.metadata['page_number']
attributes = NodeAttributes(page=page_number)
content = [Content(text=page.text)]
attributes = PageAttributes(page=page_number)
content = [Text(content=page.text, category="body")]

node = Node(
page = Page(
attributes=attributes,
content=content
)

nodes.append(node)
pages.append(page)

document = Document(
content=nodes,
content=pages,
)
return document
Loading

0 comments on commit 62e977f

Please sign in to comment.