Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pdf2md #3213

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open

pdf2md #3213

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions python/pdf-marker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
FROM pytorch/pytorch:2.4.1-cuda12.4-cudnn9-devel

ENV DEBIAN_FRONTEND=noninteractive
ENV LANG=C.UTF-8
# System libraries needed by OpenCV (cv2).
RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y

# pip configuration (package index mirror).
RUN mkdir -p /root/.pip
COPY pip.conf /root/.pip/

# Directory used as the Hugging Face model cache.
RUN mkdir -p /root/huggingface

# Hugging Face mirror endpoint and cache locations.
ENV HF_ENDPOINT=https://hf-mirror.com \
    HF_DATASETS_CACHE=/root/huggingface \
    HUGGINGFACE_HUB_CACHE=/root/huggingface \
    HF_HOME=/root/huggingface

# Working directory for the remaining steps and for the running container.
WORKDIR /root

# Install Python dependencies BEFORE copying the application code, so that
# editing api_mp.py does not invalidate the cached dependency layer.
COPY requirements.txt /root/
RUN pip3 install --no-cache-dir -r requirements.txt

# Remove tools and files that are not needed at runtime.
RUN apt-get purge -y vim && apt-get autoremove -y && rm -rf /root/.pip /root/.cache/pip

# Application code is copied last because it changes most often.
COPY api_mp.py /root/

# Port the API server listens on (see uvicorn.run in api_mp.py).
EXPOSE 7231

# Container start command.
CMD ["python3", "api_mp.py"]
43 changes: 43 additions & 0 deletions python/pdf-marker/Readme.md
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

本地怎么开发也补充下。比如模型怎么拉,安装依赖。

Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
## 环境

24G显存的显卡若干

## 打包镜像

在 `pdf-marker` 根目录下执行:

```bash
sudo docker build -t model_pdf -f Dockerfile .
```

## 运行容器

```bash
sudo docker run --gpus all -itd -p 7231:7231 --name model_pdf_v1 model_pdf
```

## 访问示例

用 POST 方法访问端口为 `7231` 的 `/v1/parse/file` 服务

参数:file-->本地文件的地址

示例一:

```bash
curl --location --request POST "http://localhost:7231/v1/parse/file" \
--header "Authorization: Bearer your_access_token" \
--form "file=@./file/chinese_test.pdf"
```

示例二:

```bash
curl --location --request POST "http://localhost:7231/v1/parse/file" \
--header "Authorization: Bearer your_access_token" \
--form "file=@./file/english_test.pdf"
```

## 多文件测试数据

运行 `test` 目录下的 `test.py` 文件,修改里面的 `file_paths` 为自己仓库的 `url` 即可
162 changes: 162 additions & 0 deletions python/pdf-marker/api_mp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
import asyncio
import base64
import multiprocessing
import os
import shutil
import tempfile
import time
from concurrent.futures import ProcessPoolExecutor
from contextlib import asynccontextmanager

import fitz
import torch
import torch.multiprocessing as mp
from fastapi import HTTPException, FastAPI, UploadFile, File
from loguru import logger
from marker.convert import convert_single_pdf
from marker.models import load_all_models
from marker.output import save_markdown
# FastAPI application instance; the route and lifespan hook are attached below.
app = FastAPI()

# Per-process model list; populated inside each pool worker by worker_init().
model_lst = None
model_refs = None

# Directory where uploaded PDFs are staged before conversion.
temp_dir = "./temp"

# Default number of worker processes per GPU. setdefault() keeps a value the
# operator already exported instead of silently overwriting it with 2.
os.environ.setdefault('PROCESSES_PER_GPU', '2')

def worker_init(counter, lock):
    """Initialise one pool worker: choose a device and load the models onto it.

    Args:
        counter: Shared object whose ``.value`` is the next free worker id.
        lock: Lock held while ``counter`` is read and incremented.

    Raises:
        ValueError: If the computed GPU index is not below the number of
            GPUs reported by ``torch.cuda.device_count()``.
    """
    global model_lst

    # Number of visible GPUs and how many workers each one should host.
    num_gpus = torch.cuda.device_count()
    slots_per_gpu = int(os.environ.get('PROCESSES_PER_GPU', 1))

    # Claim the next worker id while holding the lock.
    with lock:
        worker_id = counter.value
        counter.value += 1

    # Consecutive worker ids share a GPU; fall back to CPU when none exist.
    device = 'cpu'
    if num_gpus != 0:
        device_id = worker_id // slots_per_gpu
        if not device_id < num_gpus:
            raise ValueError(f"Worker ID {worker_id} exceeds available GPUs ({num_gpus}).")
        device = f'cuda:{device_id}'

    # Load every model on the selected device.
    model_lst = load_all_models(device=device, dtype=torch.float32)
    print(f"Worker {worker_id}: Models loaded successfully on {device}!")

    # Call share_memory() on each model that was actually loaded.
    for loaded in filter(lambda candidate: candidate is not None, model_lst):
        loaded.share_memory()

def process_file_with_multiprocessing(temp_file_path):
    """Convert one PDF to Markdown whose image links are inlined as base64.

    Uses the module-level ``model_lst`` of the current process.

    Args:
        temp_file_path: Path of the PDF file to convert.

    Returns:
        A ``(markdown, out_meta)`` tuple: the Markdown text after image
        embedding, and the metadata object returned by ``convert_single_pdf``.
    """
    global model_lst
    conversion = convert_single_pdf(temp_file_path, model_lst, batch_multiplier=1)
    full_text, images, out_meta = conversion

    # The Markdown and its images are written under ./result, keyed by file name.
    pdf_name = os.path.basename(temp_file_path)
    output_dir = save_markdown(r'./result', pdf_name, full_text, images, out_meta)

    return embed_images_as_base64(full_text, output_dir), out_meta

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the worker pool on startup and release resources on shutdown.

    Startup: ensures the multiprocessing start method is ``spawn``, creates a
    manager-backed counter and lock for handing out worker ids, and builds the
    global ``my_pool`` whose workers are initialised by ``worker_init``.

    Shutdown: stops the pool and the manager and deletes ``temp_dir``.

    Raises:
        RuntimeError: If a start method other than ``spawn`` is already set.
    """
    global my_pool

    # Workers must be spawned for CUDA (forkserver doesn't work). An already
    # configured 'spawn' is accepted instead of being treated as an error.
    current_method = mp.get_start_method(allow_none=True)
    if current_method != 'spawn':
        try:
            mp.set_start_method('spawn')
        except RuntimeError as err:
            raise RuntimeError(
                f"Multiprocessing start method is already set to {current_method!r}; "
                "'spawn' is required for the CUDA workers."
            ) from err

    # Shared counter and lock used by worker_init to assign worker ids.
    manager = multiprocessing.Manager()
    worker_counter = manager.Value('i', 0)
    worker_lock = manager.Lock()

    processes_per_gpu = int(os.environ.get('PROCESSES_PER_GPU', 1))
    gpu_count = torch.cuda.device_count()
    # worker_init falls back to CPU when no GPU is visible, but
    # ProcessPoolExecutor rejects max_workers=0, so size for at least one device.
    pool_size = max(gpu_count, 1) * processes_per_gpu
    my_pool = ProcessPoolExecutor(
        max_workers=pool_size,
        initializer=worker_init,
        initargs=(worker_counter, worker_lock),
    )

    try:
        # Hand control back to the application.
        yield
    finally:
        # The models live in the worker processes, so stopping the pool is
        # what releases them; the parent process holds no model objects.
        my_pool.shutdown(wait=True)
        manager.shutdown()
        if temp_dir and os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)  # Delete temp directory
        print("Application shutdown, cleaning up...")

# Set up a lifespan context manager
app.router.lifespan_context = lifespan

@app.post("/v1/parse/file")
async def read_file(
        file: UploadFile = File(...)):
    """Convert an uploaded PDF to Markdown with base64-embedded images.

    The upload is staged in a per-request directory under ``temp_dir``,
    converted in the worker pool, and the staging directory is removed again.

    Args:
        file: The uploaded PDF.

    Returns:
        A dict with ``success``, ``message`` and ``data``; ``data`` holds the
        ``markdown`` text, the ``page`` count and the ``duration`` in seconds.

    Raises:
        HTTPException: With status 500 if any step fails.
    """
    # Bound before the try block so the cleanup in ``finally`` is always safe.
    request_dir = None
    try:
        start_time = time.time()
        os.makedirs(temp_dir, exist_ok=True)
        # A private directory per request keeps concurrent uploads that share
        # a file name from overwriting each other.
        request_dir = tempfile.mkdtemp(dir=temp_dir)

        # The file name is client-controlled: keep only its last component so
        # it cannot point outside the staging directory.
        safe_name = os.path.basename((file.filename or "").replace("\\", "/"))
        if safe_name in ("", ".", ".."):
            safe_name = "upload.pdf"
        temp_file_path = os.path.join(request_dir, safe_name)
        with open(temp_file_path, "wb") as temp_file:
            temp_file.write(await file.read())

        # Use fitz to open the PDF and get the number of pages
        pdf_document = fitz.open(temp_file_path)
        try:
            total_pages = pdf_document.page_count
        finally:
            pdf_document.close()

        # Run the conversion in the process pool so the event loop stays free.
        loop = asyncio.get_running_loop()
        md_content_with_base64_images, out_meta = await loop.run_in_executor(
            my_pool, process_file_with_multiprocessing, temp_file_path)

        # Elapsed wall-clock time in seconds.
        duration = time.time() - start_time
        print(f"{file.filename}Total time:", duration)
        return {
            "success": True,
            "message": "",
            "data": {
                "markdown": md_content_with_base64_images,
                "page": total_pages,
                "duration": duration
            }
        }

    except Exception as e:
        logger.exception(e)
        raise HTTPException(status_code=500, detail=f"错误信息: {str(e)}") from e

    finally:
        # Remove the staged upload together with its per-request directory.
        if request_dir and os.path.exists(request_dir):
            shutil.rmtree(request_dir, ignore_errors=True)
def img_to_base64(img_path):
    """Return the bytes of the file at *img_path* as a base64 text string."""
    with open(img_path, "rb") as stream:
        raw_bytes = stream.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
def embed_images_as_base64(md_content, image_dir):
    """Replace image links in Markdown with inline base64 ``data:`` URIs.

    Only lines that start with ``![`` are examined, and only the first link on
    such a line. The link target is reduced to its file name and looked up in
    *image_dir*; lines whose image file is missing are left unchanged.

    Args:
        md_content: Markdown text.
        image_dir: Directory that contains the referenced image files.

    Returns:
        The Markdown text with every resolvable image link embedded.
    """
    new_lines = []
    for line in md_content.split('\n'):
        target_marker = line.find("](") if line.startswith("![") else -1
        if target_marker == -1:
            new_lines.append(line)
            continue

        start_idx = target_marker + 2
        # Look for the closing parenthesis only after the link target begins:
        # a ")" that occurs earlier (e.g. in the alt text) does not close the
        # link, and an unterminated link is passed through untouched.
        end_idx = line.find(")", start_idx)
        if end_idx == -1:
            new_lines.append(line)
            continue

        img_name = os.path.basename(line[start_idx:end_idx])
        img_path = os.path.join(image_dir, img_name)

        # isfile() rather than exists(): an empty target resolves to the
        # directory itself, which cannot be read as an image.
        if os.path.isfile(img_path):
            img_base64 = img_to_base64(img_path)
            new_lines.append(
                f'{line[:start_idx]}data:image/png;base64,{img_base64}{line[end_idx:]}')
        else:
            new_lines.append(line)
    return '\n'.join(new_lines)

if __name__ == "__main__":
    # Serve the API on all interfaces when this module is run as a script.
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 7231}
    uvicorn.run(app, **server_options)

5 changes: 5 additions & 0 deletions python/pdf-marker/pip.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[global]
# Socket timeout in seconds. pip's documented option name is "timeout";
# a key that matches no pip option is not applied.
timeout=60
index-url=https://pypi.tuna.tsinghua.edu.cn/simple
[install]
trusted-host=pypi.tuna.tsinghua.edu.cn
108 changes: 108 additions & 0 deletions python/pdf-marker/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
acres==0.1.0
aiofiles==24.1.0
annotated-types==0.7.0
anyio==4.6.2.post1
certifi==2024.8.30
charset-normalizer==3.4.0
ci-info==0.3.0
click==8.1.7
coloredlogs==15.0.1
configobj==5.0.9
configparser==7.1.0
dol==0.2.83
etelemetry==0.3.1
fastapi==0.115.5
filelock==3.16.1
filetype==1.2.0
flatbuffers==24.3.25
frontend==0.0.3
fsspec==2024.10.0
ftfy==6.3.1
h11==0.14.0
httplib2==0.22.0
huggingface-hub==0.26.2
humanfriendly==10.0
i2==0.1.36
idna==3.10
importlib_resources==6.4.5
isodate==0.6.1
itsdangerous==2.2.0
Jinja2==3.1.4
joblib==1.4.2
loguru==0.7.2
looseversion==1.3.0
lxml==5.3.0
marker-pdf==0.3.10
MarkupSafe==3.0.2
mpmath==1.3.0
networkx==3.4.2
nibabel==5.3.2
nipype==1.9.1
numpy==2.1.3
nvidia-cublas-cu12==12.4.5.8
nvidia-cuda-cupti-cu12==12.4.127
nvidia-cuda-nvrtc-cu12==12.4.127
nvidia-cuda-runtime-cu12==12.4.127
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.2.1.3
nvidia-curand-cu12==10.3.5.147
nvidia-cusolver-cu12==11.6.1.9
nvidia-cusparse-cu12==12.3.1.170
nvidia-nccl-cu12==2.21.5
nvidia-nvjitlink-cu12==12.4.127
nvidia-nvtx-cu12==12.4.127
onnxruntime==1.20.1
opencv-python==4.10.0.84
opencv-python-headless==4.10.0.84
packaging==24.2
pandas==2.2.3
pathlib==1.0.1
pdftext==0.3.19
pillow==10.4.0
pip==24.3.1
protobuf==5.28.3
prov==2.0.1
puremagic==1.28
pydantic==2.10.0
pydantic_core==2.27.0
pydantic-settings==2.6.1
pydot==3.0.2
PyMuPDF==1.24.14
pyparsing==3.2.0
pypdfium2==4.30.0
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-multipart==0.0.17
pytz==2024.2
pyxnat==1.6.2
PyYAML==6.0.2
RapidFuzz==3.10.1
rdflib==6.3.2
regex==2024.11.6
requests==2.32.3
safetensors==0.4.5
scikit-learn==1.5.2
scipy==1.14.1
setuptools==75.6.0
simplejson==3.19.3
six==1.16.0
sniffio==1.3.1
starlette==0.41.3
surya-ocr==0.6.13
sympy==1.13.1
tabled-pdf==0.1.4
tabulate==0.9.0
texify==0.2.1
threadpoolctl==3.5.0
tokenizers==0.20.3
torch==2.5.1
tqdm==4.67.0
traits==6.4.3
transformers==4.46.3
triton==3.1.0
typing_extensions==4.12.2
tzdata==2024.2
urllib3==2.2.3
uvicorn==0.32.1
wcwidth==0.2.13
wheel==0.45.0
34 changes: 34 additions & 0 deletions python/pdf-marker/test/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import json
import os
from io import BytesIO
import requests
from multiprocessing import Process
def request_(file_path, ocr):
    """Download a PDF from a URL and POST it to the local parse service.

    Prints the JSON response on success, or the status code and body when
    either the download or the parse request fails.

    Args:
        file_path: URL of the PDF to download.
        ocr: Not used by this function; kept so existing callers still work.
    """
    url = "http://127.0.0.1:7231/v1/parse/file"  # FastAPI service URL

    # Bounded connect/read timeouts so an unreachable host cannot hang the test.
    response = requests.get(file_path, timeout=(10, 60))
    if response.status_code != 200:
        # Report the failed download instead of silently doing nothing.
        print(f"Download failed with status code: {response.status_code} ({file_path})")
        return

    file_data = BytesIO(response.content)
    pdf_name = os.path.basename(file_path)
    files = {'file': (pdf_name, file_data, 'application/pdf')}

    # Send the POST request. Only the connect phase is bounded: the read
    # timeout stays unlimited because a conversion can take a long time.
    response = requests.post(url, files=files, timeout=(10, None))

    # Handle the response.
    if response.status_code == 200:
        print("Response JSON:", json.dumps(response.json(), indent=4, ensure_ascii=False))
    else:
        print(f"Request failed with status code: {response.status_code}")
        print(response.text)

if __name__ == "__main__":
    # PDFs sent to the service; replace with URLs from your own storage.
    file_paths = ["https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/english_test.pdf", "https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/chinese_test.pdf",
                  "https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/ocr_test.pdf",
                  "https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/english_file/Exploring_the_Applicability_of_Transfer_Learning_and_Feature_Engineering_in_Epilepsy_Prediction_Using_Hybrid_Transformer_Model.pdf",
                  "https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/english_file/3649329.3658477.pdf"]

    # One process per file so the requests reach the service concurrently.
    workers = [Process(target=request_, args=(file_path, True)) for file_path in file_paths]
    for worker in workers:
        worker.start()
    # Wait for every request to finish before the script exits.
    for worker in workers:
        worker.join()