diff --git a/README.md b/README.md index bcc9428..e719cc0 100644 --- a/README.md +++ b/README.md @@ -258,12 +258,14 @@ pdf2zh example.pdf -t 1

TODO

-- [ ] Parse layout with [PaddleX (17 cls)](https://github.com/PaddlePaddle/PaddleX/blob/17cc27ac3842e7880ca4aad92358d3ef8555429a/paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py#L81), [PaperMage (27 cls)](https://github.com/allenai/papermage/blob/9cd4bb48cbedab45d0f7a455711438f1632abebe/README.md?plain=1#L102), [SAM2](https://github.com/facebookresearch/sam2) +- [ ] Parse layout with [PaddleX](https://github.com/PaddlePaddle/PaddleX/blob/17cc27ac3842e7880ca4aad92358d3ef8555429a/paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py#L81), [PaperMage](https://github.com/allenai/papermage/blob/9cd4bb48cbedab45d0f7a455711438f1632abebe/README.md?plain=1#L102), [SAM2](https://github.com/facebookresearch/sam2) - [ ] Fix page rotation, table of contents, format of list - [ ] Fix pixel formula in old paper +- [ ] Support multiple languages with [Noto Font](https://fonts.google.com/noto), [Ubuntu Font](https://design.ubuntu.com/font) +

Acknowledgements

- Document merging: [PyMuPDF](https://github.com/pymupdf/PyMuPDF) diff --git a/pdf2zh/converter.py b/pdf2zh/converter.py index 69fd9d7..c5a5a83 100644 --- a/pdf2zh/converter.py +++ b/pdf2zh/converter.py @@ -528,10 +528,10 @@ def vflag(font, char): # 匹配公式(和角标)字体 pstk[-1][5] = child.font sstk[-1] += child.get_text() else: # 公式入栈 - if ( # 根据公式右侧的文字修正公式的纵向偏移 + if ( # 根据公式左侧的文字修正公式的纵向偏移 not vstk # 1. 当前字符是公式的第一个字符 and cls == xt_cls # 2. 当前字符与前一个字符属于同一段落 - and child.x0 > xt.x0 # 3. 当前字符在前一个字符右侧 + and child.x0 > xt.x0 # 3. 前一个字符在公式左侧 ): vfix = child.y0 - xt.y0 vstk.append(child)