labring · YYH211 · Nov 22, 2024 · c121914yu · Nov 24, 2024
diff --git a/python/pdf-marker/Dockerfile b/python/pdf-marker/Dockerfile
@@ -0,0 +1,38 @@
+FROM pytorch/pytorch:2.4.1-cuda12.4-cudnn9-devel
+
+# Build-time only: keeps apt non-interactive without leaking the setting into the runtime environment.
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Locale, Hugging Face mirror endpoint and model cache locations.
+ENV LANG=C.UTF-8 \
+    HF_ENDPOINT=https://hf-mirror.com \
+    HF_DATASETS_CACHE=/root/huggingface \
+    HUGGINGFACE_HUB_CACHE=/root/huggingface \
+    HF_HOME=/root/huggingface
+
+# Shared libraries needed by cv2. The apt lists are removed in the same layer that
+# created them, because deleting files in a later layer does not shrink the image.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        ffmpeg \
+        libsm6 \
+        libxext6 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Working directory
+WORKDIR /root
+
+# pip mirror configuration, only needed while installing the dependencies below.
+COPY pip.conf /root/.pip/
+
+# Install Python dependencies before copying the application code, so that editing
+# api_mp.py does not invalidate the (slow) dependency layer. The model folder is
+# created here and pip's config/cache are dropped in the same layer.
+COPY requirements.txt /root/
+RUN mkdir -p /root/huggingface \
+    && pip3 install --no-cache-dir -r requirements.txt \
+    && rm -rf /root/.pip /root/.cache/pip
+
+# Application code
+COPY api_mp.py /root/
+
+# Port the API server in api_mp.py listens on (documentation only, does not publish it).
+EXPOSE 7231
+
+# Container start command
+CMD ["python3", "api_mp.py"]
diff --git a/python/pdf-marker/Readme.md b/python/pdf-marker/Readme.md
@@ -0,0 +1,43 @@
+## 环境
+
+24G显存的显卡若干
+
+## 打包镜像
+
+在 `pdf-marker` 根目录下执行：
+
+```bash
+sudo docker build -t model_pdf -f Dockerfile .
+```
+
+## 运行容器
+
+```bash
+sudo docker run --gpus all -itd -p 7231:7231 --name model_pdf_v1 model_pdf
+```
+
+## 访问示例
+
+用 POST 方法访问端口为 `7231` 的 `v1/parse/file` 服务
+
+参数：file-->本地文件的地址
+
+示例一：
+
+```bash
+curl --location --request POST "http://localhost:7231/v1/parse/file" \
+--header "Authorization: Bearer your_access_token" \
+--form "file=@./file/chinese_test.pdf"
+```
+
+示例二：
+
+```bash
+curl --location --request POST "http://localhost:7231/v1/parse/file" \
+--header "Authorization: Bearer your_access_token" \
+--form "file=@./file/english_test.pdf"
+```
+
+## 多文件测试数据
+
+运行 `test` 文件夹下的 `test.py` 文件，运行前将其中的 `file_paths` 修改为自己仓库中文件的 `url` 即可
diff --git a/python/pdf-marker/api_mp.py b/python/pdf-marker/api_mp.py
@@ -0,0 +1,162 @@
+import asyncio
+import base64
+import fitz
+import torch.multiprocessing as mp
+import shutil
+import time
+from contextlib import asynccontextmanager
+from loguru import logger
+from fastapi import HTTPException, FastAPI, UploadFile, File
+import multiprocessing
+from marker.output import save_markdown
+from marker.convert import convert_single_pdf
+from marker.models import load_all_models
+import torch
+from concurrent.futures import ProcessPoolExecutor
+import os
+app = FastAPI()
+model_lst = None
+model_refs = None
+temp_dir = "./temp"
+# 设置环境变量
+os.environ['PROCESSES_PER_GPU'] = str(2)
+
+def worker_init(counter, lock):
+    global model_lst
+
+    # 动态获取可用 GPU 数量
+    num_gpus = torch.cuda.device_count()
+    processes_per_gpu = int(os.environ.get('PROCESSES_PER_GPU', 1))
+
+    with lock:
+        worker_id = counter.value
+        counter.value += 1
+
+    # 根据 worker_id 动态分配设备
+    if num_gpus == 0:
+        device = 'cpu'
+    else:
+        device_id = worker_id // processes_per_gpu
+        if device_id >= num_gpus:
+            raise ValueError(f"Worker ID {worker_id} exceeds available GPUs ({num_gpus}).")
+        device = f'cuda:{device_id}'
+
+    # 加载模型到对应的设备
+    model_lst = load_all_models(device=device, dtype=torch.float32)
+    print(f"Worker {worker_id}: Models loaded successfully on {device}!")
+
+    # 设置模型共享内存
+    for model in model_lst:
+        if model is None:
+            continue
+        model.share_memory()
+
+def process_file_with_multiprocessing(temp_file_path):
+    global model_lst
+    full_text, images, out_meta = convert_single_pdf(temp_file_path, model_lst, batch_multiplier=1)
+    fname = os.path.basename(temp_file_path)
+    subfolder_path = save_markdown(r'./result', fname, full_text, images, out_meta)
+    md_content_with_base64_images = embed_images_as_base64(full_text, subfolder_path)
+    return md_content_with_base64_images, out_meta
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    try:
+        mp.set_start_method('spawn') # Required for CUDA, forkserver doesn't work
+    except RuntimeError:
+        raise RuntimeError("Set start method to spawn twice. This may be a temporary issue with the script. Please try running it again.")
+    # 创建 Manager 对象
+    manager = multiprocessing.Manager()
+    worker_counter = manager.Value('i', 0)
+    worker_lock = manager.Lock()
+    global my_pool
+    gpu_count = torch.cuda.device_count()
+    # my_pool = ProcessPoolExecutor(max_workers=4, initializer=worker_init)
+    my_pool = ProcessPoolExecutor(max_workers=gpu_count*int(os.environ.get('PROCESSES_PER_GPU', 1)), initializer=worker_init, initargs=(worker_counter, worker_lock))
+
+    # 将控制权交还给应用
+    yield
+    global temp_dir
+    # 应用关闭时执行清理工作
+    if temp_dir and os.path.exists(temp_dir):
+        shutil.rmtree(temp_dir)  # Delete temp directory
+    del model_lst
+    del model_refs
+    print("Application shutdown, cleaning up...")
+
+# Install `lifespan` as the router's lifespan context so its startup code (before the
+# `yield`) and shutdown code (after it) wrap the application's lifetime.
+app.router.lifespan_context = lifespan
+
+@app.post("/v1/parse/file")
+async def read_file(
+        file: UploadFile = File(...)):
+    try:
+        start_time = time.time()
+        global temp_dir
+        os.makedirs(temp_dir, exist_ok=True)
+        temp_file_path = os.path.join(temp_dir, file.filename)
+        with open(temp_file_path, "wb") as temp_file:
+            temp_file.write(await file.read())
+
+        # Use fitz to open the PDF and get the number of pages
+        pdf_document = fitz.open(temp_file_path)
+        total_pages = pdf_document.page_count
+        pdf_document.close()
+
+        # print("Is deamon0:", multiprocessing.current_process().daemon)
+        global my_pool
+        loop = asyncio.get_event_loop()
+        md_content_with_base64_images, out_meta = await loop.run_in_executor(my_pool, process_file_with_multiprocessing, temp_file_path)
+        # Record end time
+        end_time = time.time()
+
+        # Calculate duration (milliseconds)
+        duration = end_time - start_time
+        print(file.filename+"Total time:", duration)
+        return {
+                "success": True,
+                "message": "",
+                "data": {
+                    "markdown": md_content_with_base64_images,
+                    "page": total_pages,
+                    "duration": duration
+                }
+            }
+
+    except Exception as e:
+        logger.exception(e)
+        raise HTTPException(status_code=500, detail=f"错误信息: {str(e)}")
+
+    finally:
+
+        if temp_file_path and os.path.exists(temp_file_path):
+            os.remove(temp_file_path)
+def img_to_base64(img_path):
+    with open(img_path, "rb") as img_file:
+        return base64.b64encode(img_file.read()).decode('utf-8')
+def embed_images_as_base64(md_content, image_dir):
+    lines = md_content.split('\n')
+    new_lines = []
+    for line in lines:
+        if line.startswith("![") and "](" in line and ")" in line:
+            start_idx = line.index("](") + 2
+            end_idx = line.index(")", start_idx)
+            img_rel_path = line[start_idx:end_idx]
+
+            img_name = os.path.basename(img_rel_path)
+            img_path = os.path.join(image_dir, img_name)
+
+            if os.path.exists(img_path):
+                img_base64 = img_to_base64(img_path)
+                new_line = f'{line[:start_idx]}data:image/png;base64,{img_base64}{line[end_idx:]}'
+                new_lines.append(new_line)
+            else:
+                new_lines.append(line)
+        else:
+            new_lines.append(line)
+    return '\n'.join(new_lines)
+
+# Script entry point: serve `app` with uvicorn on all interfaces, port 7231.
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7231)
+
diff --git a/python/pdf-marker/pip.conf b/python/pdf-marker/pip.conf
@@ -0,0 +1,5 @@
+[global]
+# Network timeout in seconds. pip derives config keys from its long option names,
+# so the key is `timeout` (from --timeout).
+timeout=60
+# Install packages from the Tsinghua (TUNA) PyPI mirror.
+index-url=https://pypi.tuna.tsinghua.edu.cn/simple
+[install]
+trusted-host=pypi.tuna.tsinghua.edu.cn
diff --git a/python/pdf-marker/requirements.txt b/python/pdf-marker/requirements.txt
@@ -0,0 +1,108 @@
+acres==0.1.0
+aiofiles==24.1.0
+annotated-types==0.7.0
+anyio==4.6.2.post1
+certifi==2024.8.30
+charset-normalizer==3.4.0
+ci-info==0.3.0
+click==8.1.7
+coloredlogs==15.0.1
+configobj==5.0.9
+configparser==7.1.0
+dol==0.2.83
+etelemetry==0.3.1
+fastapi==0.115.5
+filelock==3.16.1
+filetype==1.2.0
+flatbuffers==24.3.25
+frontend==0.0.3
+fsspec==2024.10.0
+ftfy==6.3.1
+h11==0.14.0
+httplib2==0.22.0
+huggingface-hub==0.26.2
+humanfriendly==10.0
+i2==0.1.36
+idna==3.10
+importlib_resources==6.4.5
+isodate==0.6.1
+itsdangerous==2.2.0
+Jinja2==3.1.4
+joblib==1.4.2
+loguru==0.7.2
+looseversion==1.3.0
+lxml==5.3.0
+marker-pdf==0.3.10
+MarkupSafe==3.0.2
+mpmath==1.3.0
+networkx==3.4.2
+nibabel==5.3.2
+nipype==1.9.1
+numpy==2.1.3
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+onnxruntime==1.20.1
+opencv-python==4.10.0.84
+opencv-python-headless==4.10.0.84
+packaging==24.2
+pandas==2.2.3
+pathlib==1.0.1
+pdftext==0.3.19
+pillow==10.4.0
+pip==24.3.1
+protobuf==5.28.3
+prov==2.0.1
+puremagic==1.28
+pydantic==2.10.0
+pydantic_core==2.27.0
+pydantic-settings==2.6.1
+pydot==3.0.2
+PyMuPDF==1.24.14
+pyparsing==3.2.0
+pypdfium2==4.30.0
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-multipart==0.0.17
+pytz==2024.2
+pyxnat==1.6.2
+PyYAML==6.0.2
+RapidFuzz==3.10.1
+rdflib==6.3.2
+regex==2024.11.6
+requests==2.32.3
+safetensors==0.4.5
+scikit-learn==1.5.2
+scipy==1.14.1
+setuptools==75.6.0
+simplejson==3.19.3
+six==1.16.0
+sniffio==1.3.1
+starlette==0.41.3
+surya-ocr==0.6.13
+sympy==1.13.1
+tabled-pdf==0.1.4
+tabulate==0.9.0
+texify==0.2.1
+threadpoolctl==3.5.0
+tokenizers==0.20.3
+torch==2.5.1
+tqdm==4.67.0
+traits==6.4.3
+transformers==4.46.3
+triton==3.1.0
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.2.3
+uvicorn==0.32.1
+wcwidth==0.2.13
+wheel==0.45.0
diff --git a/python/pdf-marker/test/test.py b/python/pdf-marker/test/test.py
@@ -0,0 +1,34 @@
+import json
+import os
+from io import BytesIO
+import requests
+from multiprocessing import Process
+def request_(file_path, ocr):
+    url = "http://127.0.0.1:7231/v1/parse/file"  # FastAPI 服务 URL
+    response = requests.get(file_path)
+    if response.status_code == 200:
+        file_data = BytesIO(response.content)
+        pdf_name = os.path.basename(file_path)
+        files = {'file': (pdf_name, file_data, 'application/pdf')}
+
+        # 发送 POST 请求
+        response = requests.post(url, files=files)
+
+        # 处理响应
+        if response.status_code == 200:
+            print("Response JSON:", json.dumps(response.json(), indent=4, ensure_ascii=False))
+        else:
+            print(f"Request failed with status code: {response.status_code}")
+            print(response.text)
+
+if __name__ == "__main__":
+    file_paths = ["https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/english_test.pdf", "https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/chinese_test.pdf",
+                 "https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/ocr_test.pdf",
+                 "https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/english_file/Exploring_the_Applicability_of_Transfer_Learning_and_Feature_Engineering_in_Epilepsy_Prediction_Using_Hybrid_Transformer_Model.pdf",
+                 "https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/english_file/3649329.3658477.pdf"]
+    file_path = "https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/ocr_test.pdf"
+    file_path2 = "https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/english_file/Exploring_the_Applicability_of_Transfer_Learning_and_Feature_Engineering_in_Epilepsy_Prediction_Using_Hybrid_Transformer_Model.pdf"
+    for file_path in file_paths:
+        p = Process(target=request_, args=(file_path,True))
+        p.start()
+