From b18be2392550fda4f9009e39aa7cda59190af79d Mon Sep 17 00:00:00 2001
From: Andrea Ponti <59694427+andreaponti5@users.noreply.github.com>
Date: Tue, 4 Jun 2024 11:46:54 +0200
Subject: [PATCH] FastAPI and PDFAct new implementation (#2)

* Replace Flask with FastAPI

* Add PDFAct driver

---------

Co-authored-by: AnnaMarika01 <a.biasco@campus.unimib.it>
---
 .github/workflows/ci.yml                      |   2 +-
 .gitignore                                    |   1 +
 Dockerfile                                    |  10 +-
 README.md                                     |  73 ++++----
 docker-compose.yaml                           |  19 +-
 gunicorn.sh                                   |   2 -
 parsing_service/__init__.py                   |  80 --------
 parsing_service/implementation/chunk.py       |  10 -
 .../implementation/parser_factory.py          |  19 --
 parsing_service/implementation/pdf_parser.py  |  41 ----
 parsing_service/models/parser.py              |  20 --
 requirements.txt                              |   8 +-
 text_extractor/__init__.py                    |   5 +
 {parsing_service => text_extractor}/logger.py |   0
 text_extractor/models/__init__.py             |   5 +
 .../models/chunk.py                           |  10 +-
 text_extractor/models/color.py                |   8 +
 text_extractor/models/document.py             |  13 ++
 text_extractor/models/font.py                 |   9 +
 text_extractor/models/paragraph.py            |  20 ++
 text_extractor/models/position.py             |  10 +
 text_extractor/parser/__init__.py             |   0
 text_extractor/parser/pdf_parser.py           |  24 +++
 text_extractor/parser/pdfact_parser.py        | 175 ++++++++++++++++++
 text_extractor/parser/pymupdf_parser.py       |  38 ++++
 text_extractor_api/__init__.py                |   1 +
 text_extractor_api/config.py                  |  10 +
 text_extractor_api/main.py                    |  17 ++
 text_extractor_api/models/__init__.py         |   1 +
 .../models/extract_text_request.py            |  11 ++
 text_extractor_api/routers/__init__.py        |   0
 text_extractor_api/routers/parser.py          |  76 ++++++++
 uvicorn.sh                                    |   2 +
 33 files changed, 485 insertions(+), 235 deletions(-)
 delete mode 100644 gunicorn.sh
 delete mode 100644 parsing_service/__init__.py
 delete mode 100644 parsing_service/implementation/chunk.py
 delete mode 100644 parsing_service/implementation/parser_factory.py
 delete mode 100644 parsing_service/implementation/pdf_parser.py
 delete mode 100644 parsing_service/models/parser.py
 create mode 100644 text_extractor/__init__.py
 rename {parsing_service => text_extractor}/logger.py (100%)
 create mode 100644 text_extractor/models/__init__.py
 rename parsing_service/models/chunck.py => text_extractor/models/chunk.py (70%)
 create mode 100644 text_extractor/models/color.py
 create mode 100644 text_extractor/models/document.py
 create mode 100644 text_extractor/models/font.py
 create mode 100644 text_extractor/models/paragraph.py
 create mode 100644 text_extractor/models/position.py
 create mode 100644 text_extractor/parser/__init__.py
 create mode 100644 text_extractor/parser/pdf_parser.py
 create mode 100644 text_extractor/parser/pdfact_parser.py
 create mode 100644 text_extractor/parser/pymupdf_parser.py
 create mode 100644 text_extractor_api/__init__.py
 create mode 100644 text_extractor_api/config.py
 create mode 100644 text_extractor_api/main.py
 create mode 100644 text_extractor_api/models/__init__.py
 create mode 100644 text_extractor_api/models/extract_text_request.py
 create mode 100644 text_extractor_api/routers/__init__.py
 create mode 100644 text_extractor_api/routers/parser.py
 create mode 100644 uvicorn.sh

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 487d762..f1dcfd1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -23,4 +23,4 @@ jobs:
         fetch-depth: 1
 
     - name: Lint the Shell scripts
-      run: shellcheck ./gunicorn.sh
+      run: shellcheck ./uvicorn.sh
diff --git a/.gitignore b/.gitignore
index 3bf780b..5cf0957 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 .idea
+logs
 .env
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 0aa64fd..6591947 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -33,13 +33,15 @@ WORKDIR /app
 COPY --from=build-image /opt/venv /opt/venv
 ENV PATH="/opt/venv/bin:$PATH"
 
-COPY parsing_service/ parsing_service/
-COPY root.py gunicorn.sh ./
 
-RUN chmod +x ./gunicorn.sh
+COPY text_extractor_api/ text_extractor_api/
+COPY text_extractor/ text_extractor/
+COPY root.py uvicorn.sh ./
+
+RUN chmod +x ./uvicorn.sh
 
 EXPOSE 5000/tcp
 
 ENTRYPOINT ["tini", "--"]
 
-CMD ["/app/gunicorn.sh"]
\ No newline at end of file
+CMD ["/app/uvicorn.sh"]
\ No newline at end of file
diff --git a/README.md b/README.md
index dd1c837..26e71df 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
 [![CI](https://github.com/data-house/pdf-text-extractor/actions/workflows/ci.yml/badge.svg)](https://github.com/data-house/pdf-text-extractor/actions/workflows/ci.yml) [![Build Docker Image](https://github.com/data-house/pdf-text-extractor/actions/workflows/docker.yml/badge.svg)](https://github.com/data-house/pdf-text-extractor/actions/workflows/docker.yml)
 
-# PDF Text extraction service for Data House
+# PDF Text Extraction Service
 
-Extract text from PDFs keeping page information.
+A FastAPI application to extract text from PDF documents.
 
 ## Getting started
 
@@ -18,18 +18,9 @@ A sample [`docker-compose.yaml` file](./docker-compose.yaml) is available within
 > Please refer to [Releases](https://github.com/data-house/pdf-text-extractor/releases) and [Packages](https://github.com/data-house/pdf-text-extractor/pkgs/container/pdf-text-extractor) for the available tags.
 
 
-**Available environment variables**
-
-| variable | default | description |
-|------|---------|-------------|
-| `GUNICORN_WORKERS` | 2 | The number of [Gunicorn](https://docs.gunicorn.org/en/latest/settings.html#worker-class) sync workers |
-| `GUNICORN_WORKERS_TIMEOUT` | 600 | The timeout, in seconds, of each worker |
-
-
-
 ## Usage
 
-The PDF Text Extract service expose a web application on port `5000`. The available API receive a PDF file via a URL and return the extracted text as a JSON response.
+The PDF Text Extract service exposes a web application. The available API receives a PDF file via a URL and returns the extracted text as a JSON response.
 
 The exposed service is unauthenticated therefore consider exposing it only within a trusted network. If you plan to make it available publicly consider adding a reverse proxy with authentication in front.
 
@@ -38,44 +29,44 @@ The exposed service is unauthenticated therefore consider exposing it only withi
 The service expose only one endpoint `/extract-text` that accepts a `POST` request
 with the following input as a `json` body:
 
-- `url` the URL of the PDF file to process
-- `mime_type` the mime type of the file (it is expected to be `application/pdf`)
+- `url`: the URL of the PDF file to process.
+- `mime_type`: the mime type of the file (it is expected to be `application/pdf`).
+- `driver`: the extraction backend to use. Two drivers are currently implemented: `pymupdf` and `pdfact`.
 
 > **warning** The processing is performed synchronously
 
 
-The response will be a JSON containing:
-
-- `status` the status of the operation. Usually `ok`.
-- `content` a list of objects describing the chunked content with the page reference. Each object contains a `text` property with the part of the PDF text and a `metadata` object with the `page_number` property representing the page of the PDF from which the `text` was extracted.
+The response is a JSON with the extracted text split into chunks. In particular, the structure is as follows:
 
-The following code block shows a possible output:
+- `text`: The list of chunks, each composed of:
+    - `text`: The text extracted from the chunk.
+    - `metadata`: A json with additional information regarding the chunk.
+- `fonts`: The list of fonts used in the document. 
+Each font is represented by `name`, `id`, `is-bold`, `is-type3` and `is-italic`. 
+Available only using `pdfact` driver.
+- `colors`: The list of colors used in the document.
+Each color is represented by `r`, `g`, `b` and `id`.
+Available only using `pdfact` driver.
 
-```json
-{
-  "status": "ok",
-  "content": [
-    {
-      "text": "This is a test PDF to be used as input in unit tests",
-      "metadata": {
-        "page_number": 1
-      }
-    }
-  ]
-}
-```
+The `metadata` of each chunk contains the following information:
+- `page`: The page number from which the chunk has been extracted.
+- `role`: The role of the chunk in the document (e.g., _heading_, _body_, etc.)
+- `positions`: A list of bounding boxes containing the text.
+Each bounding box is identified by 4 coordinates: `minY`, `minX`, `maxY` and `maxX`.
+- `font`: The font of the chunk.
+- `color`: The color of the chunk.
 
 ### Error handling
 
 The service can return the following errors
 
-| code | message | description |
-|------|---------|-------------|
-| `422` | No url found in request | In case the `url` field in the request is missing |
-| `422` | No mime_type found in request | In case the `mime_type` field in the request is missing |
-| `422` | Unsupported file type | In case the file is not a PDF |
-| `500` | Error while saving file | In case it was not possible to download the file from the specified URL |
-| `500` | Error while parsing file | In case it was not possible to open the file after download |
+| code  | message                       | description                                                             |
+|-------|-------------------------------|-------------------------------------------------------------------------|
+| `422` | No url found in request       | In case the `url` field in the request is missing                       |
+| `422` | No mime_type found in request | In case the `mime_type` field in the request is missing                 |
+| `422` | Unsupported file type         | In case the file is not a PDF                                           |
+| `500` | Error while saving file       | In case it was not possible to download the file from the specified URL |
+| `500` | Error while parsing file      | In case it was not possible to open the file after download             |
 
 
 The body of the response can contain a JSON with the following fields:
@@ -94,7 +85,7 @@ The body of the response can contain a JSON with the following fields:
 
 ## Development
 
-The PDF text extract service is built using [Flask](https://flask.palletsprojects.com/) on Python 3.9.
+The PDF text extract service is built using [FastAPI](https://fastapi.tiangolo.com/) and Python 3.9.
 
 Given the selected stack the development requires:
 
@@ -111,7 +102,7 @@ pip install -r requirements.txt
 Run the local development application using:
 
 ```bash
-python -m flask --app parsing_service run
+fastapi dev text_extractor_api/main.py
 ```
 
 
diff --git a/docker-compose.yaml b/docker-compose.yaml
index e55cd2d..3393bbf 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -1,16 +1,21 @@
 version: '3'
 
 networks:
-  web:
+  internal:
     driver: bridge
 
 services:
   app:
-    image: "ghcr.io/data-house/pdf-text-extractor:main"
-    environment:
-      GUNICORN_WORKERS: 2
-      GUNICORN_WORKERS_TIMEOUT: 600
+    build:
+      context: .
     networks:
-      - web
+        - internal
+    env_file:
+      - .env
     ports:
-      - "5200:5000"
\ No newline at end of file
+      - "5002:5000"
+
+  pdfact:
+    image: "ghcr.io/data-house/pdfact:main"
+    networks:
+      - internal
\ No newline at end of file
diff --git a/gunicorn.sh b/gunicorn.sh
deleted file mode 100644
index c880228..0000000
--- a/gunicorn.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/bin/sh
-gunicorn "parsing_service:create_app()" -w "${GUNICORN_WORKERS:-2}" --timeout "${GUNICORN_WORKERS_TIMEOUT:-600}" -b 0.0.0.0:5000
\ No newline at end of file
diff --git a/parsing_service/__init__.py b/parsing_service/__init__.py
deleted file mode 100644
index 28e60a7..0000000
--- a/parsing_service/__init__.py
+++ /dev/null
@@ -1,80 +0,0 @@
-import hashlib
-import logging
-import os
-
-import requests
-from requests.exceptions import HTTPError
-from requests.exceptions import Timeout
-from flask import Flask, request
-
-from parsing_service.implementation.parser_factory import parse_file
-from parsing_service.logger import init_logger
-
-logger = logging.getLogger(__name__)
-
-
-def create_app():
-    init_logger()
-    app = Flask(__name__, instance_relative_config=True)
-    app.resource_path = os.environ.get("RESOURCE_PATH", "/tmp")
-    try:
-        os.mkdir(app.resource_path)
-    except FileExistsError:
-        pass
-
-    @app.route("/extract-text", methods=["POST"])
-    def text_extract_endpoint():
-
-        logger.info("Received parse request")
-
-        if not request.json:
-            logger.warning("No json found in request")
-            return {"message": "No json found in request", "code": 422, "type": "Unprocessable Entity"}, 422
-        if not request.json.get("url"):
-            logger.warning("No file found in request")
-            return {"message": "No url found in request", "code": 422, "type": "Unprocessable Entity"}, 422
-        if not request.json.get("mime_type"):
-            logger.warning("No mime_type found in request")
-            return {"message": "No mime_type found in request", "code": 422, "type": "Unprocessable Entity"}, 422
-        
-        if request.json.get("mime_type") != 'application/pdf':
-            mime = request.json.get("mime_type")
-            logger.warning(f"Unsupported format [{mime}]")
-            return {"message": f"Unsupported mime type. Expecting application/pdf received [{mime}]", "code": 422, "type": "Unprocessable Entity"}, 422
-        
-        filename = hashlib.sha256(request.json.get("url").encode()).hexdigest()
-        extension = request.json.get("mime_type").split("/")[-1]
-        filename = f"{filename}.{extension}"
-        logger.info(f"Parsing {filename}")
-        
-        try:
-            resp = requests.get(request.json.get("url"), allow_redirects=True, timeout=120)
-
-            resp.raise_for_status()
-
-            open(os.path.join(app.resource_path, filename), 'wb').write(resp.content)
-        except HTTPError as http_err:
-            logger.exception("Error while downloading file", exc_info=True)
-            return {"message": f"Error while downloading file [{http_err}]", "code": 500, "type": "Internal Server Error"}, 500
-        except Timeout as http_timeout:
-            logger.exception("Timeout while downloading file", exc_info=True)
-            return {"message": f"File download not completed [{http_timeout}]", "code": 500, "type": "Internal Server Error"}, 500
-        except Exception as requestError:
-            logger.exception("Error while downloading file", exc_info=True)
-            return {"message": "Error while saving file", "code": 500, "type": "Internal Server Error"}, 500
-        
-        try:
-            doc_parsed = parse_file(os.path.join(app.resource_path, filename), extension)
-            os.remove(os.path.join(app.resource_path, filename))
-        except ValueError as ve:
-            logger.exception("Unsupported file type", exc_info=True)
-            return {"message": "Unsupported file type", "code": 422, "type": "Unprocessable Entity"}, 422
-        except Exception as err:
-            logger.exception("Error while parsing file", exc_info=True)
-            return {"message": "Error while parsing file", "code": 500, "type": "Internal Server Error"}, 500
-        
-        logger.info(f"Parse done for file {filename}")
-        
-        return {"status": "ok", "content": [chunk.to_dict() for chunk in doc_parsed]}, 200
-
-    return app
diff --git a/parsing_service/implementation/chunk.py b/parsing_service/implementation/chunk.py
deleted file mode 100644
index 47a4c5f..0000000
--- a/parsing_service/implementation/chunk.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from parsing_service.models.chunck import AChunk
-
-
-class Chunk(AChunk):
-    """
-    A chunk of text.
-    """
-
-    def __init__(self, text: str, metadata: dict = None, embedded_vector: list = None):
-        super().__init__(text, metadata, embedded_vector)
diff --git a/parsing_service/implementation/parser_factory.py b/parsing_service/implementation/parser_factory.py
deleted file mode 100644
index cfc9953..0000000
--- a/parsing_service/implementation/parser_factory.py
+++ /dev/null
@@ -1,19 +0,0 @@
-from typing import List
-
-from parsing_service.implementation.pdf_parser import PDFParser
-from parsing_service.models.chunck import AChunk
-
-
-def parse_file(filename: str, filetype: str) -> List[AChunk]:
-    """
-    Parse the given file and return a list of chunks.
-    :param filename: The name of the file to parse.
-    :param filetype: The type of the file to parse.
-    :return: A list of extracted chunks.
-    """
-    if filetype != "pdf":
-        raise ValueError(f"Invalid filetype {filetype}")
-
-    parser = PDFParser()
-    context = parser.parse(filename)
-    return context
diff --git a/parsing_service/implementation/pdf_parser.py b/parsing_service/implementation/pdf_parser.py
deleted file mode 100644
index e2fdbc8..0000000
--- a/parsing_service/implementation/pdf_parser.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import os
-import re
-from typing import List
-
-import fitz
-
-from parsing_service.implementation.chunk import Chunk
-from parsing_service.models.chunck import AChunk
-from parsing_service.models.parser import Parser
-
-
-class PDFParser(Parser):
-
-    def __init__(self):
-        super().__init__()
-
-    def parse(self, filename: str) -> List[AChunk]:
-        pdf = fitz.open(filename)
-        documents = []
-        skipping = False
-        for page in pdf:
-            text = page.get_text()
-            if os.environ.get("REMOVE_METHODOLOGY_CHAPTER", "True").lower() == "true":
-                if text.startswith("2 EVALUIERUNGSDESIGN UND METHODOLOGIE"):
-                    skipping = True
-                if text.startswith("3 ERGEBNISSE DER EVALUIERUNG"):
-                    skipping = False
-            if skipping:
-                continue
-            # Merge hyphenated words
-            text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
-            # Fix newlines in the middle of sentences
-            text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
-            # Remove multiple newlines
-            text = re.sub(r"\n\s*\n", "\n\n", text)
-            text = re.sub(r'\s+', ' ', text)
-            # Remove repeated special characters
-            text = re.sub(r"([^\w\s])\1+", r"\1", text)
-            documents.append(Chunk(text, {"page_number": page.number + 1}))
-
-        return documents
diff --git a/parsing_service/models/parser.py b/parsing_service/models/parser.py
deleted file mode 100644
index 87867a5..0000000
--- a/parsing_service/models/parser.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from abc import ABC, abstractmethod
-from typing import List
-
-from parsing_service.models.chunck import AChunk
-
-
-class Parser(ABC):
-    """
-    Abstract class to implement a generic document parser (.pdf, .doc, etc.)
-    """
-
-    @abstractmethod
-    def parse(self, filename: str) -> List[AChunk]:
-        """
-        Read and extract the text from a document into a list of chunks.
-
-        :param filename: a string representing the path to access the document.
-        :return: a list of chunk extracted from the document.
-        """
-        pass
diff --git a/requirements.txt b/requirements.txt
index c7410dd..e01fc38 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,8 @@
-Flask==2.3.2
 pandas==2.0.2
 pymupdf==1.22.5
 numpy~=1.24.3
-requests==2.31.0
-gunicorn==20.1.0; platform_system != "Windows"
+requests==2.32.0
+fastapi~=0.111.0
+pydantic~=2.7.1
+pydantic_settings~=2.2.1
+uvicorn==0.22.0
\ No newline at end of file
diff --git a/text_extractor/__init__.py b/text_extractor/__init__.py
new file mode 100644
index 0000000..be05e01
--- /dev/null
+++ b/text_extractor/__init__.py
@@ -0,0 +1,5 @@
+import logging
+
+from text_extractor.logger import init_logger
+
+logger = logging.getLogger(__name__)
diff --git a/parsing_service/logger.py b/text_extractor/logger.py
similarity index 100%
rename from parsing_service/logger.py
rename to text_extractor/logger.py
diff --git a/text_extractor/models/__init__.py b/text_extractor/models/__init__.py
new file mode 100644
index 0000000..df07121
--- /dev/null
+++ b/text_extractor/models/__init__.py
@@ -0,0 +1,5 @@
+from .color import Color
+from .document import Document
+from .font import Font
+from .paragraph import Paragraph, Metadata
+from .position import Position
diff --git a/parsing_service/models/chunck.py b/text_extractor/models/chunk.py
similarity index 70%
rename from parsing_service/models/chunck.py
rename to text_extractor/models/chunk.py
index 1f85ff3..e8b0a59 100644
--- a/parsing_service/models/chunck.py
+++ b/text_extractor/models/chunk.py
@@ -1,22 +1,18 @@
 import json
-from abc import ABC
-from typing import List
 
 
-class AChunk(ABC):
+class Chunk:
     """
-    Abstract class to represent a chunk of a document
+    A chunk of text
     """
 
-    def __init__(self, text: str, metadata: dict = None, embedded_vector: List[float] = None):
+    def __init__(self, text: str, metadata: dict = None):
         """
         :param text: the text contained in the chunk.
         :param metadata: additional data to identify the chunk in a document.
-        :param embedded_vector: the embedding of text.
         """
         self.text = text
         self.metadata = metadata
-        self.embedded_vector = embedded_vector
 
     def __str__(self) -> str:
         """
diff --git a/text_extractor/models/color.py b/text_extractor/models/color.py
new file mode 100644
index 0000000..0217604
--- /dev/null
+++ b/text_extractor/models/color.py
@@ -0,0 +1,8 @@
+from pydantic import BaseModel
+
+
+class Color(BaseModel):
+    r: int
+    b: int
+    g: int
+    id: str
diff --git a/text_extractor/models/document.py b/text_extractor/models/document.py
new file mode 100644
index 0000000..11e4f87
--- /dev/null
+++ b/text_extractor/models/document.py
@@ -0,0 +1,13 @@
+from typing import List, Optional
+
+from pydantic import BaseModel
+
+from text_extractor.models.color import Color
+from text_extractor.models.font import Font
+from text_extractor.models.paragraph import Paragraph
+
+
+class Document(BaseModel):
+    fonts: Optional[List[Font]] = None
+    text: List[Paragraph]
+    colors: Optional[List[Color]] = None
diff --git a/text_extractor/models/font.py b/text_extractor/models/font.py
new file mode 100644
index 0000000..277f6bf
--- /dev/null
+++ b/text_extractor/models/font.py
@@ -0,0 +1,9 @@
+from pydantic import BaseModel, Field
+
+
+class Font(BaseModel):
+    name: str
+    id: str
+    is_bold: bool = Field(False, alias='is-bold')
+    is_type3: bool = Field(False, alias='is-type3')
+    is_italic: bool = Field(False, alias='is-italic')
diff --git a/text_extractor/models/paragraph.py b/text_extractor/models/paragraph.py
new file mode 100644
index 0000000..a4b9c6f
--- /dev/null
+++ b/text_extractor/models/paragraph.py
@@ -0,0 +1,20 @@
+from typing import List, Optional
+
+from pydantic import BaseModel
+
+from text_extractor.models.color import Color
+from text_extractor.models.font import Font
+from text_extractor.models.position import Position
+
+
+class Metadata(BaseModel):
+    role: Optional[str] = None  # role of the chunk in the document, when known
+    color: Optional[Color] = None
+    positions: Optional[List[Position]] = None  # bounding boxes of the text
+    font: Optional[Font] = None
+    page: Optional[int] = None  # None when the parser cannot determine a page
+
+
+class Paragraph(BaseModel):
+    text: str
+    metadata: Metadata
diff --git a/text_extractor/models/position.py b/text_extractor/models/position.py
new file mode 100644
index 0000000..69a4527
--- /dev/null
+++ b/text_extractor/models/position.py
@@ -0,0 +1,10 @@
+from typing import Optional
+
+from pydantic import BaseModel
+
+
+class Position(BaseModel):
+    minY: Optional[float] = None
+    minX: Optional[float] = None
+    maxY: Optional[float] = None
+    maxX: Optional[float] = None
diff --git a/text_extractor/parser/__init__.py b/text_extractor/parser/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/text_extractor/parser/pdf_parser.py b/text_extractor/parser/pdf_parser.py
new file mode 100644
index 0000000..a537532
--- /dev/null
+++ b/text_extractor/parser/pdf_parser.py
@@ -0,0 +1,24 @@
+import re
+from abc import ABC, abstractmethod
+
+from text_extractor.models import Document
+
+
+class PDFParser(ABC):
+    @abstractmethod
+    def parse(self, filename: str, **kwargs) -> Document:
+        pass
+
+
+def clean_text(text: str) -> str:
+    # Merge hyphenated words
+    text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
+    # Fix newlines in the middle of sentences
+    text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
+    # Remove multiple newlines
+    text = re.sub(r"\n\s*\n", "\n\n", text)
+    # Replace multiple whitespaces with a single space
+    text = re.sub(r'\s+', ' ', text)
+    # Remove repeated special characters
+    text = re.sub(r"([^\w\s])\1+", r"\1", text)
+    return text
diff --git a/text_extractor/parser/pdfact_parser.py b/text_extractor/parser/pdfact_parser.py
new file mode 100644
index 0000000..df6ed35
--- /dev/null
+++ b/text_extractor/parser/pdfact_parser.py
@@ -0,0 +1,175 @@
+import logging
+
+import requests
+from fastapi import HTTPException
+from requests.exceptions import RequestException
+
+from text_extractor.models import Document, Metadata, Paragraph, Position, Color, Font
+from text_extractor.parser.pdf_parser import PDFParser
+
+logger = logging.getLogger(__name__)
+
+
+class PdfactParser(PDFParser):
+    def __init__(self, url: str) -> None:
+        self.url = url
+
+    def parse(self, filename: str, **kwargs) -> Document:
+        body = {"url": filename}
+        unit = kwargs.get("unit", None)
+        roles = kwargs.get("roles", None)
+        if unit is not None:
+            body["unit"] = unit
+        if roles is not None:
+            body["roles"] = roles
+        try:
+            response = requests.post(self.url, json=body)
+            response.raise_for_status()
+            res = response.json()
+            if unit == 'paragraph' or unit is None:
+                res = pdfact_formatter(res)
+            document = pdfact_to_document(res)
+            return document
+        except RequestException as e:
+            logger.exception(f"An error occurred while trying to reach the API: {e}", exc_info=True)
+            raise HTTPException(status_code=503, detail="Error while trying to reach the API")
+
+
+def pdfact_to_document(json_data: dict) -> Document:
+    colors = [Color(**color) for color in json_data.get('colors', [])]
+
+    fonts = [Font(**font) for font in json_data.get('fonts', [])]
+
+    paragraphs = []
+    for para in json_data.get('paragraphs', []):
+        paragraph_detail = para['paragraph']
+        color_id = paragraph_detail['color']['id']
+
+        color = next((c for c in colors if c.id == color_id), None)
+
+        font_id = paragraph_detail['font']['id']
+        font = next((f for f in fonts if f.id == font_id), None)
+
+        positions = [
+            Position(
+                minY=pos['minY'],
+                minX=pos['minX'],
+                maxY=pos['maxY'],
+                maxX=pos['maxX']
+            ) for pos in paragraph_detail.get('positions', [])
+        ]
+
+        page = paragraph_detail['positions'][0]['page'] if paragraph_detail.get('positions') else None
+
+        metadata = Metadata(
+            role=paragraph_detail['role'],
+            color=color,
+            positions=positions,
+            font=font,
+            page=page
+        )
+        paragraph = Paragraph(
+            text=paragraph_detail['text'],
+            metadata=metadata
+        )
+
+        paragraphs.append(paragraph)
+
+    document = Document(
+        fonts=fonts,
+        text=paragraphs,
+        colors=colors
+    )
+
+    return document
+
+
+def pdfact_formatter(json_file):
+    previous_length = None
+    current_json = json_file
+    current_length = len(current_json["paragraphs"])
+
+    while previous_length is None or previous_length != current_length:
+        previous_length = current_length
+        current_json = aggregate_paragraphs(current_json)
+        current_length = len(current_json["paragraphs"])
+
+    return current_json
+
+
+def aggregate_paragraphs(json_file):
+    """
+    Run one left-to-right pass that merges adjacent, similar paragraphs.
+
+    Paragraphs are examined in pairs: when ``compare_paragraphs`` accepts a
+    paragraph and its successor, the two are replaced by the result of
+    ``merge_pargraphs`` and the scan resumes after the pair; otherwise the
+    paragraph is copied unchanged. A merged paragraph is not compared again
+    within the same pass.
+
+    :param json_file: dict with the "fonts", "paragraphs" and "colors" keys.
+    :return: a new dict with the same keys and the aggregated paragraphs.
+    """
+    paragraphs = json_file["paragraphs"]
+    total = len(paragraphs)
+    output = []
+    i = 0
+    # Iterate over every paragraph (not only up to the second-last one), so
+    # that a document made of a single paragraph is not silently dropped.
+    while i < total:
+        has_next = i + 1 < total
+        if has_next and compare_paragraphs(paragraphs[i], paragraphs[i + 1]):
+            output.append(merge_pargraphs(paragraphs[i], paragraphs[i + 1]))
+            # Both paragraphs of the pair have been consumed.
+            i += 2
+        else:
+            # Nothing to merge with: keep the paragraph as it is.
+            output.append(paragraphs[i])
+            i += 1
+
+    return {'fonts': json_file["fonts"], 'paragraphs': output, 'colors': json_file["colors"]}
+
+
+def compare_paragraphs(p1, p2, tr=25):
+    if p1["paragraph"]["role"] != p2["paragraph"]["role"]:
+        return False
+    positions1, positions2 = p1["paragraph"]["positions"], p2["paragraph"]["positions"]
+
+    for pos1 in positions1:
+        for pos2 in positions2:
+            # Compare if they are aligned with respect to the x-axis and if their distance is less than a threshold
+            if (pos1["minX"] - pos2["minX"] == 0
+                or pos1["maxX"] - pos2["maxX"] == 0
+                or (pos1["minX"] + pos1["maxX"]) / 2 == (pos2["minX"] + pos2["maxX"]) / 2) \
+                    and (pos1["minY"] - pos2["maxY"] < tr):
+                return True
+            # Compare if they are aligned with respect to the y-axis and if their distance is less than a threshold
+            elif (pos1["minY"] - pos2["minY"] == 0
+                  or pos1["maxY"] - pos2["maxY"] == 0
+                  or (pos1["minY"] + pos1["maxY"]) / 2 == (pos2["minY"] + pos2["maxY"]) / 2) \
+                    and (pos2["minX"] - pos1["maxX"] < tr):
+                return True
+
+    return False
+
+
+def merge_pargraphs(p1, p2):
+    role = p1["paragraph"]["role"]
+    color = p1["paragraph"]["color"]
+    font = p1["paragraph"]["font"]
+    positions1 = p1["paragraph"]["positions"]
+    positions2 = p2["paragraph"]["positions"]
+    text1 = p1["paragraph"]["text"]
+    text2 = p2["paragraph"]["text"]
+
+    paragraph = {
+        "paragraph": {
+            "role": role,
+            "color": color,
+            "positions": positions1 + positions2,
+            "text": text1 + '\n\n' + text2,
+            "font": font
+        }
+    }
+
+    return paragraph
diff --git a/text_extractor/parser/pymupdf_parser.py b/text_extractor/parser/pymupdf_parser.py
new file mode 100644
index 0000000..bde4c89
--- /dev/null
+++ b/text_extractor/parser/pymupdf_parser.py
@@ -0,0 +1,38 @@
+from typing import List
+
+import fitz
+
+from text_extractor.models import Document, Metadata, Paragraph
+from text_extractor.models.chunk import Chunk
+from text_extractor.parser.pdf_parser import PDFParser, clean_text
+
+
+class PymupdfParser(PDFParser):
+    def parse(self, filename: str, **kwargs) -> Document:
+        """Extract the cleaned text of every page with PyMuPDF; extra keyword arguments are ignored."""
+        documents = []
+        with fitz.open(filename) as pdf:  # closes the file handle even when a page fails to parse
+            for page in pdf:
+                text = clean_text(page.get_text())
+                documents.append(Chunk(text, {"page_number": page.number + 1}))
+        return chunks_to_document(documents)
+
+
+def chunks_to_document(doc_parsed: List[Chunk]) -> Document:
+    """Turn a list of chunks into a Document.
+
+    Every chunk yields one Paragraph holding the chunk text and a Metadata whose
+    ``page`` comes from the ``page_number`` entry of the chunk metadata.
+    Paragraphs are produced in the order of the input list, and that list of
+    paragraphs is passed to Document as its ``text`` argument.
+
+    :param doc_parsed: chunks exposing ``text`` and ``metadata['page_number']``.
+    :return: the Document built from the generated paragraphs.
+    """
+    def as_paragraph(chunk: Chunk) -> Paragraph:
+        # Build the metadata first, then the paragraph that carries it.
+        page_info = Metadata(page=chunk.metadata['page_number'])
+        return Paragraph(text=chunk.text, metadata=page_info)
+
+    paragraphs = [as_paragraph(chunk) for chunk in doc_parsed]
+    return Document(text=paragraphs)
diff --git a/text_extractor_api/__init__.py b/text_extractor_api/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/text_extractor_api/__init__.py
@@ -0,0 +1 @@
+
diff --git a/text_extractor_api/config.py b/text_extractor_api/config.py
new file mode 100644
index 0000000..3fde0c8
--- /dev/null
+++ b/text_extractor_api/config.py
@@ -0,0 +1,10 @@
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+class Settings(BaseSettings):
+    pdfact_url: str  # no default is declared, so a value has to be supplied (environment or .env)
+    # Values are additionally read from a ".env" file, per the env_file option below.
+    model_config = SettingsConfigDict(env_file=".env")
+
+
+settings = Settings()  # instantiated at import time; importers of this module share this object
diff --git a/text_extractor_api/main.py b/text_extractor_api/main.py
new file mode 100644
index 0000000..1082a20
--- /dev/null
+++ b/text_extractor_api/main.py
@@ -0,0 +1,17 @@
+import logging
+
+from fastapi import FastAPI
+
+from text_extractor import init_logger
+from text_extractor_api.routers import parser
+
+init_logger()  # runs at import time, before the module logger is requested; presumably configures logging
+logger = logging.getLogger(__name__)
+app = FastAPI()
+app.include_router(parser.router)  # mounts the routes declared in text_extractor_api.routers.parser
+
+
+@app.get("/")
+async def root():  # landing route: logs the greeting and returns it under the "message" key
+    logger.info("Welcome to text extractor!")
+    return {"message": "Welcome to text extractor!"}
diff --git a/text_extractor_api/models/__init__.py b/text_extractor_api/models/__init__.py
new file mode 100644
index 0000000..93a382a
--- /dev/null
+++ b/text_extractor_api/models/__init__.py
@@ -0,0 +1 @@
+from .extract_text_request import ExtractTextRequest
diff --git a/text_extractor_api/models/extract_text_request.py b/text_extractor_api/models/extract_text_request.py
new file mode 100644
index 0000000..75917c8
--- /dev/null
+++ b/text_extractor_api/models/extract_text_request.py
@@ -0,0 +1,11 @@
+from typing import List, Optional
+
+from pydantic import BaseModel
+
+
+class ExtractTextRequest(BaseModel):
+    url: str  # location the /extract-text route downloads the file from
+    mime_type: str  # the /extract-text route rejects anything other than 'application/pdf'
+    driver: str  # "pdfact" or "pymupdf"; the route compares it case-insensitively
+    unit: Optional[str] = None  # forwarded to PdfactParser.parse only; accepted values not visible here
+    roles: Optional[List[str]] = None  # forwarded to PdfactParser.parse only; accepted values not visible here
diff --git a/text_extractor_api/routers/__init__.py b/text_extractor_api/routers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/text_extractor_api/routers/parser.py b/text_extractor_api/routers/parser.py
new file mode 100644
index 0000000..5b45e7a
--- /dev/null
+++ b/text_extractor_api/routers/parser.py
@@ -0,0 +1,76 @@
+import hashlib
+import logging
+import os
+
+import requests
+from fastapi import APIRouter, HTTPException
+from requests.exceptions import HTTPError, Timeout
+
+from text_extractor.models import Document
+from text_extractor.parser.pdfact_parser import PdfactParser
+from text_extractor.parser.pymupdf_parser import PymupdfParser
+from text_extractor_api.config import settings
+from text_extractor_api.models import ExtractTextRequest
+
+router = APIRouter()  # the routes below are registered on this router
+logger = logging.getLogger(__name__)  # module-level logger named after this module
+
+
+@router.post("/extract-text", response_model=Document)
+async def parse_pdf(request: ExtractTextRequest) -> Document:
+    """Download the PDF at ``request.url`` and extract its text with the requested driver.
+
+    The download is saved under ``RESOURCE_PATH`` (default ``/tmp``) and deleted once parsing ends;
+    "pymupdf" parses that copy, while "pdfact" is given the original URL.
+    Raises HTTPException 422 (mime type), 400 (driver), 500/408 (download failed/timed out), 502 (parsing).
+    """
+    logger.info("Received parse request.")
+    resource_path: str = os.environ.get("RESOURCE_PATH", "/tmp")
+    driver = request.driver.lower()
+
+    if request.mime_type != 'application/pdf':
+        raise HTTPException(status_code=422,
+                            detail=f"Unsupported mime type[{request.mime_type}]. Expecting application/pdf.")
+
+    if driver not in ("pdfact", "pymupdf"):
+        raise HTTPException(status_code=400,
+                            detail=f"Unsupported driver. Expecting 'pdfact' or 'pymupdf', received [{request.driver}].")
+
+    # makedirs also creates missing parent directories and tolerates an already existing one.
+    os.makedirs(resource_path, exist_ok=True)
+
+    filename = hashlib.sha256(request.url.encode()).hexdigest()
+    extension = request.mime_type.split("/")[-1]
+    filename = f"{filename}.{extension}"
+    logger.info(f"Parsing {filename}")
+    file_path = os.path.join(resource_path, filename)
+
+    try:
+        # NOTE(review): requests is synchronous, so the event loop is blocked while the file downloads.
+        resp = requests.get(request.url, allow_redirects=True, timeout=120)
+        resp.raise_for_status()
+        with open(file_path, 'wb') as f:
+            f.write(resp.content)
+    except HTTPError as http_err:
+        logger.exception("Error while downloading file.")
+        raise HTTPException(status_code=500, detail=f"Error while downloading file [{http_err}]") from http_err
+    except Timeout as http_timeout:
+        logger.exception("Timeout while downloading file.")
+        raise HTTPException(status_code=408, detail=f"File download not completed [{http_timeout}]") from http_timeout
+    except requests.RequestException as req_err:  # connection errors, invalid URLs, redirect loops, ...
+        logger.exception("Error while downloading file.")
+        raise HTTPException(status_code=500, detail=f"Error while downloading file [{req_err}]") from req_err
+
+    try:
+        if driver == "pdfact":
+            parser = PdfactParser(settings.pdfact_url)
+            document = parser.parse(filename=request.url, unit=request.unit, roles=request.roles)
+        else:  # driver was validated above, so this is "pymupdf"
+            parser = PymupdfParser()
+            document = parser.parse(filename=file_path)
+    except Exception as err:
+        logger.exception(f"Error while parsing file. {str(err)}")
+        raise HTTPException(status_code=502, detail="Error while parsing file") from err
+    finally:
+        if os.path.exists(file_path):
+            os.remove(file_path)
+    return document
diff --git a/uvicorn.sh b/uvicorn.sh
new file mode 100644
index 0000000..357ae69
--- /dev/null
+++ b/uvicorn.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+exec uvicorn "text_extractor_api.main:app" --host 0.0.0.0 --port 5000  # exec: uvicorn replaces the shell and gets signals directly