From b18be2392550fda4f9009e39aa7cda59190af79d Mon Sep 17 00:00:00 2001 From: Andrea Ponti <59694427+andreaponti5@users.noreply.github.com> Date: Tue, 4 Jun 2024 11:46:54 +0200 Subject: [PATCH] FastAPI and PDFAct new implementation (#2) * Replace Flask with FastAPI * Add PDFAct driver --------- Co-authored-by: AnnaMarika01 --- .github/workflows/ci.yml | 2 +- .gitignore | 1 + Dockerfile | 10 +- README.md | 73 ++++---- docker-compose.yaml | 19 +- gunicorn.sh | 2 - parsing_service/__init__.py | 80 -------- parsing_service/implementation/chunk.py | 10 - .../implementation/parser_factory.py | 19 -- parsing_service/implementation/pdf_parser.py | 41 ---- parsing_service/models/parser.py | 20 -- requirements.txt | 8 +- text_extractor/__init__.py | 5 + {parsing_service => text_extractor}/logger.py | 0 text_extractor/models/__init__.py | 5 + .../models/chunk.py | 10 +- text_extractor/models/color.py | 8 + text_extractor/models/document.py | 13 ++ text_extractor/models/font.py | 9 + text_extractor/models/paragraph.py | 20 ++ text_extractor/models/position.py | 10 + text_extractor/parser/__init__.py | 0 text_extractor/parser/pdf_parser.py | 24 +++ text_extractor/parser/pdfact_parser.py | 175 ++++++++++++++++++ text_extractor/parser/pymupdf_parser.py | 38 ++++ text_extractor_api/__init__.py | 1 + text_extractor_api/config.py | 10 + text_extractor_api/main.py | 17 ++ text_extractor_api/models/__init__.py | 1 + .../models/extract_text_request.py | 11 ++ text_extractor_api/routers/__init__.py | 0 text_extractor_api/routers/parser.py | 76 ++++++++ uvicorn.sh | 2 + 33 files changed, 485 insertions(+), 235 deletions(-) delete mode 100644 gunicorn.sh delete mode 100644 parsing_service/__init__.py delete mode 100644 parsing_service/implementation/chunk.py delete mode 100644 parsing_service/implementation/parser_factory.py delete mode 100644 parsing_service/implementation/pdf_parser.py delete mode 100644 parsing_service/models/parser.py create mode 100644 text_extractor/__init__.py 
rename {parsing_service => text_extractor}/logger.py (100%) create mode 100644 text_extractor/models/__init__.py rename parsing_service/models/chunck.py => text_extractor/models/chunk.py (70%) create mode 100644 text_extractor/models/color.py create mode 100644 text_extractor/models/document.py create mode 100644 text_extractor/models/font.py create mode 100644 text_extractor/models/paragraph.py create mode 100644 text_extractor/models/position.py create mode 100644 text_extractor/parser/__init__.py create mode 100644 text_extractor/parser/pdf_parser.py create mode 100644 text_extractor/parser/pdfact_parser.py create mode 100644 text_extractor/parser/pymupdf_parser.py create mode 100644 text_extractor_api/__init__.py create mode 100644 text_extractor_api/config.py create mode 100644 text_extractor_api/main.py create mode 100644 text_extractor_api/models/__init__.py create mode 100644 text_extractor_api/models/extract_text_request.py create mode 100644 text_extractor_api/routers/__init__.py create mode 100644 text_extractor_api/routers/parser.py create mode 100644 uvicorn.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 487d762..f1dcfd1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,4 +23,4 @@ jobs: fetch-depth: 1 - name: Lint the Shell scripts - run: shellcheck ./gunicorn.sh + run: shellcheck ./uvicorn.sh diff --git a/.gitignore b/.gitignore index 3bf780b..5cf0957 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .idea +logs .env \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 0aa64fd..6591947 100644 --- a/Dockerfile +++ b/Dockerfile @@ -33,13 +33,15 @@ WORKDIR /app COPY --from=build-image /opt/venv /opt/venv ENV PATH="/opt/venv/bin:$PATH" -COPY parsing_service/ parsing_service/ -COPY root.py gunicorn.sh ./ -RUN chmod +x ./gunicorn.sh +COPY text_extractor_api/ text_extractor_api/ +COPY text_extractor/ text_extractor/ +COPY root.py uvicorn.sh ./ + +RUN chmod +x ./uvicorn.sh EXPOSE 
5000/tcp ENTRYPOINT ["tini", "--"] -CMD ["/app/gunicorn.sh"] \ No newline at end of file +CMD ["/app/uvicorn.sh"] \ No newline at end of file diff --git a/README.md b/README.md index dd1c837..26e71df 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ [![CI](https://github.com/data-house/pdf-text-extractor/actions/workflows/ci.yml/badge.svg)](https://github.com/data-house/pdf-text-extractor/actions/workflows/ci.yml) [![Build Docker Image](https://github.com/data-house/pdf-text-extractor/actions/workflows/docker.yml/badge.svg)](https://github.com/data-house/pdf-text-extractor/actions/workflows/docker.yml) -# PDF Text extraction service for Data House +# PDF Text Extraction Service -Extract text from PDFs keeping page information. +A FastAPI application to extract text from PDF documents. ## Getting started @@ -18,18 +18,9 @@ A sample [`docker-compose.yaml` file](./docker-compose.yaml) is available within > Please refer to [Releases](https://github.com/data-house/pdf-text-extractor/releases) and [Packages](https://github.com/data-house/pdf-text-extractor/pkgs/container/pdf-text-extractor) for the available tags. -**Available environment variables** - -| variable | default | description | -|------|---------|-------------| -| `GUNICORN_WORKERS` | 2 | The number of [Gunicorn](https://docs.gunicorn.org/en/latest/settings.html#worker-class) sync workers | -| `GUNICORN_WORKERS_TIMEOUT` | 600 | The timeout, in seconds, of each worker | - - - ## Usage -The PDF Text Extract service expose a web application on port `5000`. The available API receive a PDF file via a URL and return the extracted text as a JSON response. +The PDF Text Extract service exposes a web application. The available API receives a PDF file via a URL and returns the extracted text as a JSON response. The exposed service is unauthenticated therefore consider exposing it only within a trusted network. If you plan to make it available publicly consider adding a reverse proxy with authentication in front. 
@@ -38,44 +29,44 @@ The exposed service is unauthenticated therefore consider exposing it only withi The service expose only one endpoint `/extract-text` that accepts a `POST` request with the following input as a `json` body: -- `url` the URL of the PDF file to process -- `mime_type` the mime type of the file (it is expected to be `application/pdf`) +- `url`: the URL of the PDF file to process. +- `mime_type`: the mime type of the file (it is expected to be `application/pdf`). +- `driver`: the extraction backend to use. Two drivers are currently implemented: `pymupdf` and `pdfact`. > **warning** The processing is performed synchronously -The response will be a JSON containing: - -- `status` the status of the operation. Usually `ok`. -- `content` a list of objects describing the chunked content with the page reference. Each object contains a `text` property with the part of the PDF text and a `metadata` object with the `page_number` property representing the page of the PDF from which the `text` was extracted. +The response is a JSON with the extracted text split into chunks. In particular, the structure is as follows: -The following code block shows a possible output: +- `text`: The list of chunks, each composed of: + - `text`: The text extracted from the chunk. + - `metadata`: A json with additional information regarding the chunk. +- `fonts`: The list of fonts used in the document. +Each font is represented by `name`, `id`, `is-bold`, `is-type3` and `is-italic`. +Available only when using the `pdfact` driver. +- `colors`: The list of colors used in the document. +Each color is represented by `r`, `g`, `b` and `id`. +Available only when using the `pdfact` driver. -```json -{ - "status": "ok", - "content": [ - { - "text": "This is a test PDF to be used as input in unit tests", - "metadata": { - "page_number": 1 - } - } - ] -} -``` +The `metadata` of each chunk contains the following information: +- `page`: The page number from which the chunk has been extracted. 
+- `role`: The role of the chunk in the document (e.g., _heading_, _body_, etc.) +- `positions`: A list of bounding boxes containing the text. +Each bounding box is identified by 4 coordinates: `minY`, `minX`, `maxY` and `maxX`. +- `font`: The font of the chunk. +- `color`: The color of the chunk. ### Error handling The service can return the following errors -| code | message | description | -|------|---------|-------------| -| `422` | No url found in request | In case the `url` field in the request is missing | -| `422` | No mime_type found in request | In case the `mime_type` field in the request is missing | -| `422` | Unsupported file type | In case the file is not a PDF | -| `500` | Error while saving file | In case it was not possible to download the file from the specified URL | -| `500` | Error while parsing file | In case it was not possible to open the file after download | +| code | message | description | +|-------|-------------------------------|-------------------------------------------------------------------------| +| `422` | No url found in request | In case the `url` field in the request is missing | +| `422` | No mime_type found in request | In case the `mime_type` field in the request is missing | +| `422` | Unsupported file type | In case the file is not a PDF | +| `500` | Error while saving file | In case it was not possible to download the file from the specified URL | +| `500` | Error while parsing file | In case it was not possible to open the file after download | The body of the response can contain a JSON with the following fields: @@ -94,7 +85,7 @@ The body of the response can contain a JSON with the following fields: ## Development -The PDF text extract service is built using [Flask](https://flask.palletsprojects.com/) on Python 3.9. +The PDF text extract service is built using [FastAPI](https://fastapi.tiangolo.com/) and Python 3.9. 
Given the selected stack the development requires: @@ -111,7 +102,7 @@ pip install -r requirements.txt Run the local development application using: ```bash -python -m flask --app parsing_service run +fastapi dev text_extractor_api/main.py ``` diff --git a/docker-compose.yaml b/docker-compose.yaml index e55cd2d..3393bbf 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,16 +1,21 @@ version: '3' networks: - web: + internal: driver: bridge services: app: - image: "ghcr.io/data-house/pdf-text-extractor:main" - environment: - GUNICORN_WORKERS: 2 - GUNICORN_WORKERS_TIMEOUT: 600 + build: + context: . networks: - - web + - internal + env_file: + - .env ports: - - "5200:5000" \ No newline at end of file + - "5002:5000" + + pdfact: + image: "ghcr.io/data-house/pdfact:main" + networks: + - internal \ No newline at end of file diff --git a/gunicorn.sh b/gunicorn.sh deleted file mode 100644 index c880228..0000000 --- a/gunicorn.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/sh -gunicorn "parsing_service:create_app()" -w "${GUNICORN_WORKERS:-2}" --timeout "${GUNICORN_WORKERS_TIMEOUT:-600}" -b 0.0.0.0:5000 \ No newline at end of file diff --git a/parsing_service/__init__.py b/parsing_service/__init__.py deleted file mode 100644 index 28e60a7..0000000 --- a/parsing_service/__init__.py +++ /dev/null @@ -1,80 +0,0 @@ -import hashlib -import logging -import os - -import requests -from requests.exceptions import HTTPError -from requests.exceptions import Timeout -from flask import Flask, request - -from parsing_service.implementation.parser_factory import parse_file -from parsing_service.logger import init_logger - -logger = logging.getLogger(__name__) - - -def create_app(): - init_logger() - app = Flask(__name__, instance_relative_config=True) - app.resource_path = os.environ.get("RESOURCE_PATH", "/tmp") - try: - os.mkdir(app.resource_path) - except FileExistsError: - pass - - @app.route("/extract-text", methods=["POST"]) - def text_extract_endpoint(): - - 
logger.info("Received parse request") - - if not request.json: - logger.warning("No json found in request") - return {"message": "No json found in request", "code": 422, "type": "Unprocessable Entity"}, 422 - if not request.json.get("url"): - logger.warning("No file found in request") - return {"message": "No url found in request", "code": 422, "type": "Unprocessable Entity"}, 422 - if not request.json.get("mime_type"): - logger.warning("No mime_type found in request") - return {"message": "No mime_type found in request", "code": 422, "type": "Unprocessable Entity"}, 422 - - if request.json.get("mime_type") != 'application/pdf': - mime = request.json.get("mime_type") - logger.warning(f"Unsupported format [{mime}]") - return {"message": f"Unsupported mime type. Expecting application/pdf received [{mime}]", "code": 422, "type": "Unprocessable Entity"}, 422 - - filename = hashlib.sha256(request.json.get("url").encode()).hexdigest() - extension = request.json.get("mime_type").split("/")[-1] - filename = f"{filename}.{extension}" - logger.info(f"Parsing {filename}") - - try: - resp = requests.get(request.json.get("url"), allow_redirects=True, timeout=120) - - resp.raise_for_status() - - open(os.path.join(app.resource_path, filename), 'wb').write(resp.content) - except HTTPError as http_err: - logger.exception("Error while downloading file", exc_info=True) - return {"message": f"Error while downloading file [{http_err}]", "code": 500, "type": "Internal Server Error"}, 500 - except Timeout as http_timeout: - logger.exception("Timeout while downloading file", exc_info=True) - return {"message": f"File download not completed [{http_timeout}]", "code": 500, "type": "Internal Server Error"}, 500 - except Exception as requestError: - logger.exception("Error while downloading file", exc_info=True) - return {"message": "Error while saving file", "code": 500, "type": "Internal Server Error"}, 500 - - try: - doc_parsed = parse_file(os.path.join(app.resource_path, filename), 
extension) - os.remove(os.path.join(app.resource_path, filename)) - except ValueError as ve: - logger.exception("Unsupported file type", exc_info=True) - return {"message": "Unsupported file type", "code": 422, "type": "Unprocessable Entity"}, 422 - except Exception as err: - logger.exception("Error while parsing file", exc_info=True) - return {"message": "Error while parsing file", "code": 500, "type": "Internal Server Error"}, 500 - - logger.info(f"Parse done for file {filename}") - - return {"status": "ok", "content": [chunk.to_dict() for chunk in doc_parsed]}, 200 - - return app diff --git a/parsing_service/implementation/chunk.py b/parsing_service/implementation/chunk.py deleted file mode 100644 index 47a4c5f..0000000 --- a/parsing_service/implementation/chunk.py +++ /dev/null @@ -1,10 +0,0 @@ -from parsing_service.models.chunck import AChunk - - -class Chunk(AChunk): - """ - A chunk of text. - """ - - def __init__(self, text: str, metadata: dict = None, embedded_vector: list = None): - super().__init__(text, metadata, embedded_vector) diff --git a/parsing_service/implementation/parser_factory.py b/parsing_service/implementation/parser_factory.py deleted file mode 100644 index cfc9953..0000000 --- a/parsing_service/implementation/parser_factory.py +++ /dev/null @@ -1,19 +0,0 @@ -from typing import List - -from parsing_service.implementation.pdf_parser import PDFParser -from parsing_service.models.chunck import AChunk - - -def parse_file(filename: str, filetype: str) -> List[AChunk]: - """ - Parse the given file and return a list of chunks. - :param filename: The name of the file to parse. - :param filetype: The type of the file to parse. - :return: A list of extracted chunks. 
- """ - if filetype != "pdf": - raise ValueError(f"Invalid filetype {filetype}") - - parser = PDFParser() - context = parser.parse(filename) - return context diff --git a/parsing_service/implementation/pdf_parser.py b/parsing_service/implementation/pdf_parser.py deleted file mode 100644 index e2fdbc8..0000000 --- a/parsing_service/implementation/pdf_parser.py +++ /dev/null @@ -1,41 +0,0 @@ -import os -import re -from typing import List - -import fitz - -from parsing_service.implementation.chunk import Chunk -from parsing_service.models.chunck import AChunk -from parsing_service.models.parser import Parser - - -class PDFParser(Parser): - - def __init__(self): - super().__init__() - - def parse(self, filename: str) -> List[AChunk]: - pdf = fitz.open(filename) - documents = [] - skipping = False - for page in pdf: - text = page.get_text() - if os.environ.get("REMOVE_METHODOLOGY_CHAPTER", "True").lower() == "true": - if text.startswith("2 EVALUIERUNGSDESIGN UND METHODOLOGIE"): - skipping = True - if text.startswith("3 ERGEBNISSE DER EVALUIERUNG"): - skipping = False - if skipping: - continue - # Merge hyphenated words - text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text) - # Fix newlines in the middle of sentences - text = re.sub(r"(? List[AChunk]: - """ - Read and extract the text from a document into a list of chunks. - - :param filename: a string representing the path to access the document. - :return: a list of chunk extracted from the document. 
- """ - pass diff --git a/requirements.txt b/requirements.txt index c7410dd..e01fc38 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,8 @@ -Flask==2.3.2 pandas==2.0.2 pymupdf==1.22.5 numpy~=1.24.3 -requests==2.31.0 -gunicorn==20.1.0; platform_system != "Windows" +requests==2.32.0 +fastapi~=0.111.0 +pydantic~=2.7.1 +pydantic_settings~=2.2.1 +uvicorn==0.22.0 \ No newline at end of file diff --git a/text_extractor/__init__.py b/text_extractor/__init__.py new file mode 100644 index 0000000..be05e01 --- /dev/null +++ b/text_extractor/__init__.py @@ -0,0 +1,5 @@ +import logging + +from text_extractor.logger import init_logger + +logger = logging.getLogger(__name__) diff --git a/parsing_service/logger.py b/text_extractor/logger.py similarity index 100% rename from parsing_service/logger.py rename to text_extractor/logger.py diff --git a/text_extractor/models/__init__.py b/text_extractor/models/__init__.py new file mode 100644 index 0000000..df07121 --- /dev/null +++ b/text_extractor/models/__init__.py @@ -0,0 +1,5 @@ +from .color import Color +from .document import Document +from .font import Font +from .paragraph import Paragraph, Metadata +from .position import Position diff --git a/parsing_service/models/chunck.py b/text_extractor/models/chunk.py similarity index 70% rename from parsing_service/models/chunck.py rename to text_extractor/models/chunk.py index 1f85ff3..e8b0a59 100644 --- a/parsing_service/models/chunck.py +++ b/text_extractor/models/chunk.py @@ -1,22 +1,18 @@ import json -from abc import ABC -from typing import List -class AChunk(ABC): +class Chunk: """ - Abstract class to represent a chunk of a document + A chunk of text """ - def __init__(self, text: str, metadata: dict = None, embedded_vector: List[float] = None): + def __init__(self, text: str, metadata: dict = None): """ :param text: the text contained in the chunk. :param metadata: additional data to identify the chunk in a document. - :param embedded_vector: the embedding of text. 
""" self.text = text self.metadata = metadata - self.embedded_vector = embedded_vector def __str__(self) -> str: """ diff --git a/text_extractor/models/color.py b/text_extractor/models/color.py new file mode 100644 index 0000000..0217604 --- /dev/null +++ b/text_extractor/models/color.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel + + +class Color(BaseModel): + r: int + b: int + g: int + id: str diff --git a/text_extractor/models/document.py b/text_extractor/models/document.py new file mode 100644 index 0000000..11e4f87 --- /dev/null +++ b/text_extractor/models/document.py @@ -0,0 +1,13 @@ +from typing import List, Optional + +from pydantic import BaseModel + +from text_extractor.models.color import Color +from text_extractor.models.font import Font +from text_extractor.models.paragraph import Paragraph + + +class Document(BaseModel): + fonts: Optional[List[Font]] = None + text: List[Paragraph] + colors: Optional[List[Color]] = None diff --git a/text_extractor/models/font.py b/text_extractor/models/font.py new file mode 100644 index 0000000..277f6bf --- /dev/null +++ b/text_extractor/models/font.py @@ -0,0 +1,9 @@ +from pydantic import BaseModel, Field + + +class Font(BaseModel): + name: str + id: str + is_bold: bool = Field(False, alias='is-bold') + is_type3: bool = Field(False, alias='is-type3') + is_italic: bool = Field(False, alias='is-italic') diff --git a/text_extractor/models/paragraph.py b/text_extractor/models/paragraph.py new file mode 100644 index 0000000..a4b9c6f --- /dev/null +++ b/text_extractor/models/paragraph.py @@ -0,0 +1,20 @@ +from typing import List, Optional + +from pydantic import BaseModel + +from text_extractor.models.color import Color +from text_extractor.models.font import Font +from text_extractor.models.position import Position + + +class Metadata(BaseModel): + role: Optional[str] = None + color: Optional[Color] = None + positions: Optional[List[Position]] = None + font: Optional[Font] = None + page: int + + +class 
Paragraph(BaseModel): + text: str + metadata: Metadata diff --git a/text_extractor/models/position.py b/text_extractor/models/position.py new file mode 100644 index 0000000..69a4527 --- /dev/null +++ b/text_extractor/models/position.py @@ -0,0 +1,10 @@ +from typing import Optional + +from pydantic import BaseModel + + +class Position(BaseModel): + minY: Optional[float] = None + minX: Optional[float] = None + maxY: Optional[float] = None + maxX: Optional[float] = None diff --git a/text_extractor/parser/__init__.py b/text_extractor/parser/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/text_extractor/parser/pdf_parser.py b/text_extractor/parser/pdf_parser.py new file mode 100644 index 0000000..a537532 --- /dev/null +++ b/text_extractor/parser/pdf_parser.py @@ -0,0 +1,24 @@ +import re +from abc import ABC, abstractmethod + +from text_extractor.models import Document + + +class PDFParser(ABC): + @abstractmethod + def parse(self, filename: str, **kwargs) -> Document: + pass + + +def clean_text(text: str) -> str: + # Merge hyphenated words + text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text) + # Fix newlines in the middle of sentences + text = re.sub(r"(? 
None: + self.url = url + + def parse(self, filename: str, **kwargs) -> Document: + body = {"url": filename} + unit = kwargs.get("unit", None) + roles = kwargs.get("roles", None) + if unit is not None: + body["unit"] = unit + if roles is not None: + body["roles"] = roles + try: + response = requests.post(self.url, json=body) + response.raise_for_status() + res = response.json() + if unit == 'paragraph' or unit is None: + res = pdfact_formatter(res) + document = pdfact_to_document(res) + return document + except RequestException as e: + logger.exception(f"An error occurred while trying to reach the API: {e}", exc_info=True) + raise HTTPException(status_code=503, detail="Error while trying to reach the API") + + +def pdfact_to_document(json_data: dict) -> Document: + colors = [Color(**color) for color in json_data.get('colors', [])] + + fonts = [Font(**font) for font in json_data.get('fonts', [])] + + paragraphs = [] + for para in json_data.get('paragraphs', []): + paragraph_detail = para['paragraph'] + color_id = paragraph_detail['color']['id'] + + color = next((c for c in colors if c.id == color_id), None) + + font_id = paragraph_detail['font']['id'] + font = next((f for f in fonts if f.id == font_id), None) + + positions = [ + Position( + minY=pos['minY'], + minX=pos['minX'], + maxY=pos['maxY'], + maxX=pos['maxX'] + ) for pos in paragraph_detail.get('positions', []) + ] + + page = paragraph_detail['positions'][0]['page'] if paragraph_detail.get('positions') else None + + metadata = Metadata( + role=paragraph_detail['role'], + color=color, + positions=positions, + font=font, + page=page + ) + paragraph = Paragraph( + text=paragraph_detail['text'], + metadata=metadata + ) + + paragraphs.append(paragraph) + + document = Document( + fonts=fonts, + text=paragraphs, + colors=colors + ) + + return document + + +def pdfact_formatter(json_file): + previous_length = None + current_json = json_file + current_length = len(current_json["paragraphs"]) + + while previous_length 
is None or previous_length != current_length: + previous_length = current_length + current_json = aggregate_paragraphs(current_json) + current_length = len(current_json["paragraphs"]) + + return current_json + + +def aggregate_paragraphs(json_file): + output = [] + fonts = json_file["fonts"] + colors = json_file["colors"] + i = 0 + while i < len(json_file["paragraphs"][:-1]): + paragraph1 = json_file["paragraphs"][i] + paragraph2 = json_file["paragraphs"][i + 1] + + if compare_paragraphs(paragraph1, paragraph2): + paragraph = merge_pargraphs(paragraph1, paragraph2) + output.append(paragraph) + + # After merging the two paragraphs, proceed to the paragraph following the (i+1)-th one + if i + 2 < len(json_file["paragraphs"][:-1]): + i += 2 + continue + # if the paragraph following the (i+1)-th one is the last one, then concatenate it + elif i + 2 == len(json_file["paragraphs"][:-1]): + output.append(json_file["paragraphs"][i + 2]) + break + else: + output.append(json_file["paragraphs"][i]) + + # If the next paragraph is the last one, then concatenate it to the list of paragraphs + if i + 1 == len(json_file["paragraphs"][:-1]): + output.append(json_file["paragraphs"][i + 1]) + i += 1 + + paragraphs = {'fonts': fonts, 'paragraphs': output, 'colors': colors} + return paragraphs + + +def compare_paragraphs(p1, p2, tr=25): + if p1["paragraph"]["role"] != p2["paragraph"]["role"]: + return False + positions1, positions2 = p1["paragraph"]["positions"], p2["paragraph"]["positions"] + + for pos1 in positions1: + for pos2 in positions2: + # Compare if they are aligned with respect to the x-axis and if their distance is less than a threshold + if (pos1["minX"] - pos2["minX"] == 0 + or pos1["maxX"] - pos2["maxX"] == 0 + or (pos1["minX"] + pos1["maxX"]) / 2 == (pos2["minX"] + pos2["maxX"]) / 2) \ + and (pos1["minY"] - pos2["maxY"] < tr): + return True + # Compare if they are aligned with respect to the y-axis and if their distance is less than a threshold + elif (pos1["minY"] - 
pos2["minY"] == 0 + or pos1["maxY"] - pos2["maxY"] == 0 + or (pos1["minY"] + pos1["maxY"]) / 2 == (pos2["minY"] + pos2["maxY"]) / 2) \ + and (pos2["minX"] - pos1["maxX"] < tr): + return True + + return False + + +def merge_pargraphs(p1, p2): + role = p1["paragraph"]["role"] + color = p1["paragraph"]["color"] + font = p1["paragraph"]["font"] + positions1 = p1["paragraph"]["positions"] + positions2 = p2["paragraph"]["positions"] + text1 = p1["paragraph"]["text"] + text2 = p2["paragraph"]["text"] + + paragraph = { + "paragraph": { + "role": role, + "color": color, + "positions": positions1 + positions2, + "text": text1 + '\n\n' + text2, + "font": font + } + } + + return paragraph diff --git a/text_extractor/parser/pymupdf_parser.py b/text_extractor/parser/pymupdf_parser.py new file mode 100644 index 0000000..bde4c89 --- /dev/null +++ b/text_extractor/parser/pymupdf_parser.py @@ -0,0 +1,38 @@ +from typing import List + +import fitz + +from text_extractor.models import Document, Metadata, Paragraph +from text_extractor.models.chunk import Chunk +from text_extractor.parser.pdf_parser import PDFParser, clean_text + + +class PymupdfParser(PDFParser): + def parse(self, filename: str, **kwargs) -> Document: + pdf = fitz.open(filename) + documents = [] + for page in pdf: + text = page.get_text() + text = clean_text(text) + documents.append(Chunk(text, {"page_number": page.number + 1})) + return chunks_to_document(documents) + + +def chunks_to_document(doc_parsed: List[Chunk]) -> Document: + paragraphs = [] + for page in doc_parsed: + page_number = page.metadata['page_number'] + + metadata = Metadata(page=page_number) + + paragraph = Paragraph( + text=page.text, + metadata=metadata + ) + + paragraphs.append(paragraph) + + document = Document( + text=paragraphs, + ) + return document diff --git a/text_extractor_api/__init__.py b/text_extractor_api/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/text_extractor_api/__init__.py @@ -0,0 +1 @@ + diff 
--git a/text_extractor_api/config.py b/text_extractor_api/config.py new file mode 100644 index 0000000..3fde0c8 --- /dev/null +++ b/text_extractor_api/config.py @@ -0,0 +1,10 @@ +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class Settings(BaseSettings): + pdfact_url: str + + model_config = SettingsConfigDict(env_file=".env") + + +settings = Settings() diff --git a/text_extractor_api/main.py b/text_extractor_api/main.py new file mode 100644 index 0000000..1082a20 --- /dev/null +++ b/text_extractor_api/main.py @@ -0,0 +1,17 @@ +import logging + +from fastapi import FastAPI + +from text_extractor import init_logger +from text_extractor_api.routers import parser + +init_logger() +logger = logging.getLogger(__name__) +app = FastAPI() +app.include_router(parser.router) + + +@app.get("/") +async def root(): + logger.info("Welcome to text extractor!") + return {"message": "Welcome to text extractor!"} diff --git a/text_extractor_api/models/__init__.py b/text_extractor_api/models/__init__.py new file mode 100644 index 0000000..93a382a --- /dev/null +++ b/text_extractor_api/models/__init__.py @@ -0,0 +1 @@ +from .extract_text_request import ExtractTextRequest diff --git a/text_extractor_api/models/extract_text_request.py b/text_extractor_api/models/extract_text_request.py new file mode 100644 index 0000000..75917c8 --- /dev/null +++ b/text_extractor_api/models/extract_text_request.py @@ -0,0 +1,11 @@ +from typing import List, Optional + +from pydantic import BaseModel + + +class ExtractTextRequest(BaseModel): + url: str + mime_type: str + driver: str + unit: Optional[str] = None + roles: Optional[List[str]] = None diff --git a/text_extractor_api/routers/__init__.py b/text_extractor_api/routers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/text_extractor_api/routers/parser.py b/text_extractor_api/routers/parser.py new file mode 100644 index 0000000..5b45e7a --- /dev/null +++ b/text_extractor_api/routers/parser.py @@ -0,0 +1,76 @@ 
+import hashlib +import logging +import os + +import requests +from fastapi import APIRouter, HTTPException +from requests.exceptions import HTTPError, Timeout + +from text_extractor.models import Document +from text_extractor.parser.pdfact_parser import PdfactParser +from text_extractor.parser.pymupdf_parser import PymupdfParser +from text_extractor_api.config import settings +from text_extractor_api.models import ExtractTextRequest + +router = APIRouter() +logger = logging.getLogger(__name__) + + +@router.post("/extract-text", response_model=Document) +async def parse_pdf(request: ExtractTextRequest) -> Document: + logger.info("Received parse request.") + resource_path: str = os.environ.get("RESOURCE_PATH", "/tmp") + + if request.mime_type != 'application/pdf': + mime = request.mime_type + raise HTTPException(status_code=422, detail=f"Unsupported mime type[{mime}]. Expecting application/pdf.") + + if request.driver.lower() not in ["pdfact", "pymupdf"]: + raise HTTPException(status_code=400, + detail=f"Unsupported driver. Expecting 'pdfact' or 'pymupdf', received [{request.driver}].") + + try: + os.mkdir(resource_path) + except FileExistsError: + pass + + if request.mime_type != 'application/pdf': + mime = request.mime_type + logger.warning(f"Unsupported format [{mime}]") + raise HTTPException(status_code=422, + detail=f"Unsupported mime type. 
Expecting application/pdf received [{mime}].") + + filename = hashlib.sha256(request.url.encode()).hexdigest() + extension = request.mime_type.split("/")[-1] + filename = f"{filename}.{extension}" + logger.info(f"Parsing {filename}") + + file_path = os.path.join(resource_path, filename) + + try: + resp = requests.get(request.url, allow_redirects=True, timeout=120) + resp.raise_for_status() + with open(file_path, 'wb') as f: + f.write(resp.content) + except HTTPError as http_err: + logger.exception("Error while downloading file.", exc_info=True) + raise HTTPException(status_code=500, detail=f"Error while downloading file [{http_err}]") + except Timeout as http_timeout: + logger.exception("Timeout while downloading file.", exc_info=True) + raise HTTPException(status_code=408, detail=f"File download not completed [{http_timeout}]") + + try: + document = None + if request.driver.lower() == "pdfact": + parser = PdfactParser(settings.pdfact_url) + document = parser.parse(filename=request.url, unit=request.unit, roles=request.roles) + elif request.driver.lower() == "pymupdf": + parser = PymupdfParser() + document = parser.parse(filename=file_path) + except Exception as err: + logger.exception(f"Error while parsing file. {str(err)}", exc_info=True) + raise HTTPException(status_code=502, detail="Error while parsing file") + finally: + if os.path.exists(file_path): + os.remove(file_path) + return document diff --git a/uvicorn.sh b/uvicorn.sh new file mode 100644 index 0000000..357ae69 --- /dev/null +++ b/uvicorn.sh @@ -0,0 +1,2 @@ +#!/bin/sh +uvicorn "text_extractor_api.main:app" --host 0.0.0.0 --port 5000