diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml new file mode 100644 index 0000000000..36c5737550 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -0,0 +1,81 @@ +# PaddleOCR + +name: 🐛 Bug Report +description: Problems with PaddleOCR +labels: [bug] +body: + - type: markdown + attributes: + value: | + Thank you for submitting a PaddleOCR 🐛 Bug Report! + + - type: checkboxes + attributes: + label: Search before asking + description: > + Please search the PaddleOCR [Docs](https://paddlepaddle.github.io/PaddleOCR/), [Issues](https://github.com/PaddlePaddle/PaddleOCR/issues) and [Discussions](https://github.com/PaddlePaddle/PaddleOCR/discussions) to see if a similar bug report already exists. + options: + - label: > + I have searched the PaddleOCR [Docs](https://paddlepaddle.github.io/PaddleOCR/) and found no similar bug report. + required: true + - label: > + I have searched the PaddleOCR [Issues](https://github.com/PaddlePaddle/PaddleOCR/issues) and found no similar bug report. + required: true + - label: > + I have searched the PaddleOCR [Discussions](https://github.com/PaddlePaddle/PaddleOCR/discussions) and found no similar bug report. + required: true + + - type: textarea + attributes: + label: Bug + description: Provide console output with error messages and/or screenshots of the bug. + placeholder: | + 💡 ProTip! Include as much information as possible (screenshots, logs, tracebacks etc.) to receive the most helpful response. + validations: + required: true + + - type: textarea + attributes: + label: Environment + description: Please specify the software and hardware you used to produce the bug. + placeholder: | + + ``` + OS macOS-13.5.2 + Environment Jupyter + Python 3.11.2 + PaddleOCR 2.8.1 + Install git + RAM 16.00 GB + CPU Apple M2 + CUDA None + ``` + validations: + required: false + + - type: textarea + attributes: + label: Minimal Reproducible Example + description: > + When asking a question, people will be better able to provide help if you provide code that they can easily understand and use to **reproduce** the problem. + This is referred to by community members as creating a [minimal reproducible example](https://stackoverflow.com/help/minimal-reproducible-example). + placeholder: | + ``` + # Code to reproduce your issue here + ``` + validations: + required: false + + - type: textarea + attributes: + label: Additional + description: Anything else you would like to share? + + - type: checkboxes + attributes: + label: Are you willing to submit a PR? + description: > + (Optional) We encourage you to submit a [Pull Request](https://github.com/PaddlePaddle/PaddleOCR/pulls) (PR) to help improve PaddleOCR for everyone, especially if you have a good understanding of how to implement a fix or feature. + See the PaddleOCR [community_contribution](https://paddlepaddle.github.io/PaddleOCR/community/community_contribution.html#2) to get started. + options: + - label: Yes I'd like to help by submitting a PR! 
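As a reference for the "Minimal Reproducible Example" field defined above, the following is a minimal sketch of the kind of self-contained snippet that makes a bug report actionable. It is an assumption-based illustration, not part of the template itself: it presumes PaddleOCR is installed via pip and uses a placeholder image path (`./test_image.jpg`) that should be replaced with the actual failing input.

```python
# Minimal reproducible example sketch for a bug report.
# Assumes `pip install paddleocr` and a placeholder image `./test_image.jpg`
# that triggers the problem being reported.
from paddleocr import PaddleOCR

ocr = PaddleOCR(use_angle_cls=True, lang="ch")   # detection + angle classification + recognition
result = ocr.ocr("./test_image.jpg", cls=True)   # run the full pipeline on a single image

# Print the raw output so the failing behaviour is visible in the report.
for line in result[0]:
    box, (text, score) = line
    print(box, text, score)
```

Together with the environment block requested above, a snippet of this shape usually lets maintainers reproduce the issue without guessing.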
diff --git a/.github/ISSUE_TEMPLATE/bug.md b/.github/ISSUE_TEMPLATE/bug.md deleted file mode 100644 index 83d6ae1b64..0000000000 --- a/.github/ISSUE_TEMPLATE/bug.md +++ /dev/null @@ -1,28 +0,0 @@ ---- -name: 🐞 Bug -about: Bug -title: '' -labels: 'Bug' -assignees: '' - ---- - - -#### 问题描述 / Problem Description - - -#### 运行环境 / Runtime Environment -- OS: -- Paddle: -- PaddleOCR: - -#### 复现代码 / Reproduction Code - - -#### 完整报错 / Complete Error Message - - -#### 可能解决方案 / Possible solutions - - -#### 附件 / Appendix diff --git a/.github/workflows/documents.yml b/.github/workflows/documents.yml new file mode 100644 index 0000000000..b98a4b8e1e --- /dev/null +++ b/.github/workflows/documents.yml @@ -0,0 +1,29 @@ +name: build_document_site +on: + push: + branches: + - master + - main +permissions: + contents: write +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Configure Git Credentials + run: | + git config user.name github-actions[bot] + git config user.email 41898282+github-actions[bot]@users.noreply.github.com + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV + - uses: actions/cache@v4 + with: + key: mkdocs-material-${{ env.cache_id }} + path: .cache + restore-keys: | + mkdocs-material- + - run: pip install mkdocs-material jieba mkdocs-git-revision-date-localized-plugin mkdocs-git-committers-plugin-2 mkdocs-static-i18n + - run: mkdocs gh-deploy --force diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4121e4a654..3dc5bc0b15 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,13 +10,13 @@ repos: - id: detect-private-key - id: end-of-file-fixer - id: trailing-whitespace - files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|py|md)$ + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|py)$ - repo: https://github.com/Lucas-C/pre-commit-hooks rev: v1.5.1 hooks: - id: remove-crlf - id: remove-tabs - files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|py|md)$ + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|py)$ - repo: local hooks: - id: clang-format diff --git a/README.md b/README.md index 2a4eeb03bf..8b5556823b 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,17 @@ PaddleOCR 由 [PMC](https://github.com/PaddlePaddle/PaddleOCR/issues/12122) 监 ⚠️注意:[Issues](https://github.com/PaddlePaddle/PaddleOCR/issues)模块仅用来报告程序🐞Bug,其余提问请移步[Discussions](https://github.com/PaddlePaddle/PaddleOCR/discussions)模块提问。如所提Issue不是Bug,会被移到Discussions模块,敬请谅解。 ## 📣 近期更新 + +- **🔥2024.7 添加 PaddleOCR 算法模型挑战赛冠军方案**: + - 赛题一:OCR 端到端识别任务冠军方案——[场景文本识别算法-SVTRv2](doc/doc_ch/algorithm_rec_svtrv2.md); + - 赛题二:通用表格识别任务冠军方案——[表格识别算法-SLANet-LCNetV2](doc/doc_ch/algorithm_table_slanet.md)。 + +- **💥2024.6.27 飞桨低代码开发工具 [PaddleX 3.0](https://github.com/paddlepaddle/paddlex) 重磅更新!** + - 低代码开发范式:支持 OCR 模型全流程低代码开发,提供 Python API,支持用户自定义串联模型; + - 多硬件训推支持:支持英伟达 GPU、昆仑芯、昇腾和寒武纪等多种硬件进行模型训练与推理。PaddleOCR支持的模型见 [模型列表](doc/doc_ch/hardware/supported_models.md) + - **📚直播和OCR实战打卡营预告**:《PP-ChatOCRv2赋能金融报告信息智能化抽取,新金融效率再升级》课程上线,破解复杂版面、表格识别、信息抽取OCR解析难题,直播时间:6月6日(周四)19:00。并于6月11日启动【政务采购合同信息抽取】实战打卡营。报名链接:https://www.wjx.top/vm/eBcYmqO.aspx?udsid=197406 + - **🔥2024.5.10 上线星河零代码产线(OCR 相关)**:全面覆盖了以下四大 OCR 核心任务,提供极便捷的 Badcase 分析和实用的在线体验: - [通用 OCR](https://aistudio.baidu.com/community/app/91660) (PP-OCRv4)。 - [通用表格识别](https://aistudio.baidu.com/community/app/91661) (SLANet)。 diff --git a/README_en.md b/README_en.md index 0580b28a21..6dcbe067e6 100644 --- a/README_en.md +++ b/README_en.md @@ -31,6 +31,11 @@ PaddleOCR is being oversight by a 
[PMC](https://github.com/PaddlePaddle/PaddleOC ⚠️ Note: The [Issues](https://github.com/PaddlePaddle/PaddleOCR/issues) module is only for reporting program 🐞 bugs, for the rest of the questions, please move to the [Discussions](https://github.com/PaddlePaddle/PaddleOCR/discussions). Please note that if the Issue mentioned is not a bug, it will be moved to the Discussions module. ## 📣 Recent updates + +- **🔥2024.7 Added PaddleOCR Algorithm Model Challenge Champion Solutions**: + - Challenge One, OCR End-to-End Recognition Task Champion Solution: [Scene Text Recognition Algorithm-SVTRv2](doc/doc_ch/algorithm_rec_svtrv2.md); + - Challenge Two, General Table Recognition Task Champion Solution: [Table Recognition Algorithm-SLANet-LCNetV2](doc/doc_ch/algorithm_table_slanet.md). + - **🔥2023.8.7 Release PaddleOCR[release/2.7](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.7)** - Release [PP-OCRv4](./doc/doc_ch/PP-OCRv4_introduction.md), support mobile version and server version - PP-OCRv4-mobile:When the speed is comparable, the effect of the Chinese scene is improved by 4.5% compared with PP-OCRv3, the English scene is improved by 10%, and the average recognition accuracy of the 80-language multilingual model is increased by more than 8%. diff --git a/VERSION_NUMBER b/VERSION_NUMBER index 834f262953..dbe5900654 100644 --- a/VERSION_NUMBER +++ b/VERSION_NUMBER @@ -1 +1 @@ -2.8.0 +2.8.1 diff --git a/__init__.py b/__init__.py index 7233b3c0ed..e57c0c764b 100644 --- a/__init__.py +++ b/__init__.py @@ -11,13 +11,24 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .paddleocr import * +from .paddleocr import ( + PaddleOCR, + PPStructure, + draw_ocr, + draw_structure_result, + save_structure_res, + download_with_progressbar, + sorted_layout_boxes, + convert_info_docx, + to_excel, +) import importlib.metadata as importlib_metadata try: __version__ = importlib_metadata.version(__package__ or __name__) except importlib_metadata.PackageNotFoundError: __version__ = "0.0.0" + __all__ = [ "PaddleOCR", "PPStructure", diff --git a/configs/rec/SVTRv2/rec_repsvtr_ch.yml b/configs/rec/SVTRv2/rec_repsvtr_ch.yml new file mode 100644 index 0000000000..4fd643bc98 --- /dev/null +++ b/configs/rec/SVTRv2/rec_repsvtr_ch.yml @@ -0,0 +1,134 @@ +Global: + debug: false + use_gpu: true + epoch_num: 200 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_repsvtr_ch + save_epoch_step: 10 + eval_batch_step: [0, 1000] + cal_metric_during_train: False + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_repsvtr.txt + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1.e-8 + weight_decay: 0.025 + no_weight_decay_name: norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.001 # 8gpus 192bs + warmup_epoch: 5 + + +Architecture: + model_type: rec + algorithm: SVTR_HGNet + Transform: + Backbone: + name: RepSVTR + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 256 + depth: 2 + hidden_dims: 256 + kernel_size: [1, 3] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: *max_text_length + 
num_decoder_layers: 2 + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - NRTRLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + + +Train: + dataset: + name: MultiScaleDataSet + ds_width: false + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecAug: + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + sampler: + name: MultiScaleSampler + scales: [[320, 32], [320, 48], [320, 64]] + first_bs: &bs 192 + fix_bs: false + divided_factor: [8, 16] # w, h + is_training: True + loader: + shuffle: true + batch_size_per_card: *bs + drop_last: true + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/SVTRv2/rec_svtrv2_ch.yml b/configs/rec/SVTRv2/rec_svtrv2_ch.yml new file mode 100644 index 0000000000..70efe10934 --- /dev/null +++ b/configs/rec/SVTRv2/rec_svtrv2_ch.yml @@ -0,0 +1,143 @@ +Global: + debug: false + use_gpu: true + epoch_num: 200 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_svtrv2_ch + save_epoch_step: 10 + eval_batch_step: [0, 1000] + cal_metric_during_train: False + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_svrtv2.txt + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1.e-8 + weight_decay: 0.05 + no_weight_decay_name: norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.001 # 8gpus 192bs + warmup_epoch: 5 + + +Architecture: + model_type: rec + algorithm: SVTR_HGNet + Transform: + Backbone: + name: SVTRv2 + use_pos_embed: False + dims: [128, 256, 384] + depths: [6, 6, 6] + num_heads: [4, 8, 12] + mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','Global','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']] + local_k: [[5, 5], [5, 5], [-1, -1]] + sub_k: [[2, 1], [2, 1], [-1, -1]] + last_stage: False + use_pool: True + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 256 + depth: 2 + hidden_dims: 256 + kernel_size: [1, 3] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: *max_text_length + num_decoder_layers: 2 + +Loss: + name: MultiLoss + loss_config_list: + - CTCLoss: + - NRTRLoss: + +PostProcess: + name: CTCLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: MultiScaleDataSet + ds_width: false + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecAug: + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - KeepKeys: + keep_keys: + - image 
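# Annotation (not part of the original config, inferred from this file): label_ctc and
# label_gtc below are the targets produced by MultiLabelEncode for the CTC head and the
# NRTR guided-training (GTC) head of MultiHead; since PostProcess is CTCLabelDecode,
# only the CTC branch is decoded at evaluation and prediction time.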
+ - label_ctc + - label_gtc + - length + - valid_ratio + sampler: + name: MultiScaleSampler + scales: [[320, 32], [320, 48], [320, 64]] + first_bs: &bs 192 + fix_bs: false + divided_factor: [8, 16] # w, h + is_training: True + loader: + shuffle: true + batch_size_per_card: *bs + drop_last: true + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/SVTRv2/rec_svtrv2_ch_distillation.yml b/configs/rec/SVTRv2/rec_svtrv2_ch_distillation.yml new file mode 100644 index 0000000000..071070ed83 --- /dev/null +++ b/configs/rec/SVTRv2/rec_svtrv2_ch_distillation.yml @@ -0,0 +1,208 @@ +Global: + debug: false + use_gpu: true + epoch_num: 100 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec_svtrv2_ch_distill_lr00002/ + save_epoch_step: 5 + eval_batch_step: + - 0 + - 1000 + cal_metric_during_train: False + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: false + infer_img: doc/imgs_words/ch/word_1.jpg + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + max_text_length: &max_text_length 25 + infer_mode: false + use_space_char: true + distributed: true + save_res_path: ./output/rec/predicts_svtrv2_ch_distill.txt +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.99 + epsilon: 1.e-8 + weight_decay: 0.05 + no_weight_decay_name: norm pos_embed patch_embed downsample + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.0002 # 8gpus 192bs + warmup_epoch: 5 +Architecture: + model_type: rec + name: DistillationModel + algorithm: Distillation + Models: + Teacher: + pretrained: ./output/rec_svtrv2_ch/best_accuracy + freeze_params: true + return_all_feats: true + model_type: rec + algorithm: SVTR_LCNet + Transform: null + Backbone: + name: SVTRv2 + use_pos_embed: False + dims: [128, 256, 384] + depths: [6, 6, 6] + num_heads: [4, 8, 12] + mixer: [['Conv','Conv','Conv','Conv','Conv','Conv'],['Conv','Conv','Global','Global','Global','Global'],['Global','Global','Global','Global','Global','Global']] + local_k: [[5, 5], [5, 5], [-1, -1]] + sub_k: [[2, 1], [2, 1], [-1, -1]] + last_stage: False + use_pool: True + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 256 + depth: 2 + hidden_dims: 256 + kernel_size: [1, 3] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + num_decoder_layers: 2 + max_text_length: *max_text_length + Student: + pretrained: ./output/rec_repsvtr_ch/best_accuracy + freeze_params: false + return_all_feats: true + model_type: rec + algorithm: SVTR_LCNet + Transform: null + Backbone: + name: RepSVTR + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 256 + depth: 2 + hidden_dims: 256 + kernel_size: [1, 3] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + num_decoder_layers: 2 + max_text_length: *max_text_length +Loss: + name: CombinedLoss + loss_config_list: + - DistillationDKDLoss: + weight: 0.1 + model_name_pairs: + - - Student + - Teacher + key: head_out + multi_head: true + alpha: 1.0 + beta: 2.0 + dis_head: gtc + name: dkd + - DistillationCTCLoss: + weight: 1.0 + 
model_name_list: + - Student + key: head_out + multi_head: true + - DistillationNRTRLoss: + weight: 1.0 + smoothing: false + model_name_list: + - Student + key: head_out + multi_head: true + - DistillCTCLogits: + weight: 1.0 + reduction: mean + model_name_pairs: + - - Student + - Teacher + key: head_out +PostProcess: + name: DistillationCTCLabelDecode + model_name: + - Student + key: head_out + multi_head: true +Metric: + name: DistillationMetric + base_metric_name: RecMetric + main_indicator: acc + key: Student + + +Train: + dataset: + name: MultiScaleDataSet + ds_width: false + data_dir: ./train_data/ + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/train_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - RecAug: + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + sampler: + name: MultiScaleSampler + scales: [[320, 32], [320, 48], [320, 64]] + first_bs: &bs 192 + fix_bs: false + divided_factor: [8, 16] # w, h + is_training: True + loader: + shuffle: true + batch_size_per_card: *bs + drop_last: true + num_workers: 8 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data + label_file_list: + - ./train_data/val_list.txt + transforms: + - DecodeImage: + img_mode: BGR + channel_first: false + - MultiLabelEncode: + gtc_encode: NRTRLabelEncode + - RecResizeImg: + image_shape: [3, 48, 320] + - KeepKeys: + keep_keys: + - image + - label_ctc + - label_gtc + - length + - valid_ratio + loader: + shuffle: false + drop_last: false + batch_size_per_card: 128 + num_workers: 4 diff --git a/configs/rec/rec_latex_ocr.yml b/configs/rec/rec_latex_ocr.yml new file mode 100644 index 0000000000..cde3449076 --- /dev/null +++ b/configs/rec/rec_latex_ocr.yml @@ -0,0 +1,126 @@ +Global: + use_gpu: True + epoch_num: 500 + log_smooth_window: 20 + print_batch_step: 100 + save_model_dir: ./output/rec/latex_ocr/ + save_epoch_step: 5 + max_seq_len: 512 + # evaluation is run every 60000 iterations (22 epoch)(batch_size = 56) + eval_batch_step: [0, 60000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/datasets/pme_demo/0000013.png + infer_mode: False + use_space_char: False + rec_char_dict_path: ppocr/utils/dict/latex_ocr_tokenizer.json + save_res_path: ./output/rec/predicts_latexocr.txt + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + lr: + name: Const + learning_rate: 0.0001 + +Architecture: + model_type: rec + algorithm: LaTeXOCR + in_channels: 1 + Transform: + Backbone: + name: HybridTransformer + img_size: [192, 672] + patch_size: 16 + num_classes: 0 + embed_dim: 256 + depth: 4 + num_heads: 8 + input_channel: 1 + is_predict: False + is_export: False + Head: + name: LaTeXOCRHead + pad_value: 0 + is_export: False + decoder_args: + attn_on_attn: True + cross_attend: True + ff_glu: True + rel_pos_bias: False + use_scalenorm: False + +Loss: + name: LaTeXOCRLoss + +PostProcess: + name: LaTeXOCRDecode + rec_char_dict_path: ppocr/utils/dict/latex_ocr_tokenizer.json + +Metric: + name: LaTeXOCRMetric + main_indicator: exp_rate + cal_blue_score: False + +Train: + dataset: + name: LaTeXOCRDataSet + data: ./train_data/LaTeXOCR/latexocr_train.pkl + min_dimensions: [32, 32] + max_dimensions: [672, 192] + batch_size_per_pair: 56 + keep_smaller_batches: False + transforms: + - DecodeImage: + channel_first: False + - MinMaxResize: + min_dimensions: [32, 32] + max_dimensions: [672, 192] + - 
LatexTrainTransform: + bitmap_prob: .04 + - NormalizeImage: + mean: [0.7931, 0.7931, 0.7931] + std: [0.1738, 0.1738, 0.1738] + order: 'hwc' + - LatexImageFormat: + - KeepKeys: + keep_keys: ['image'] + loader: + shuffle: True + batch_size_per_card: 1 + drop_last: False + num_workers: 0 + collate_fn: LaTeXOCRCollator + +Eval: + dataset: + name: LaTeXOCRDataSet + data: ./train_data/LaTeXOCR/latexocr_val.pkl + min_dimensions: [32, 32] + max_dimensions: [672, 192] + batch_size_per_pair: 10 + keep_smaller_batches: True + transforms: + - DecodeImage: + channel_first: False + - MinMaxResize: + min_dimensions: [32, 32] + max_dimensions: [672, 192] + - LatexTestTransform: + - NormalizeImage: + mean: [0.7931, 0.7931, 0.7931] + std: [0.1738, 0.1738, 0.1738] + order: 'hwc' + - LatexImageFormat: + - KeepKeys: + keep_keys: ['image'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 + num_workers: 0 + collate_fn: LaTeXOCRCollator diff --git a/doc/datasets/pme_demo/0000013.png b/doc/datasets/pme_demo/0000013.png new file mode 100644 index 0000000000..9f8b11b580 Binary files /dev/null and b/doc/datasets/pme_demo/0000013.png differ diff --git a/doc/datasets/pme_demo/0000295.png b/doc/datasets/pme_demo/0000295.png new file mode 100644 index 0000000000..26a271abf6 Binary files /dev/null and b/doc/datasets/pme_demo/0000295.png differ diff --git a/doc/datasets/pme_demo/0000562.png b/doc/datasets/pme_demo/0000562.png new file mode 100644 index 0000000000..121eda8e2a Binary files /dev/null and b/doc/datasets/pme_demo/0000562.png differ diff --git a/doc/doc_ch/algorithm_overview.md b/doc/doc_ch/algorithm_overview.md index 6fb277a1f4..90b4b62013 100755 --- a/doc/doc_ch/algorithm_overview.md +++ b/doc/doc_ch/algorithm_overview.md @@ -137,6 +137,7 @@ PaddleOCR将**持续新增**支持OCR领域前沿算法与模型,**欢迎广 已支持的公式识别算法列表(戳链接获取使用教程): - [x] [CAN](./algorithm_rec_can.md) +- [x] [LaTeX-OCR](./algorithm_rec_latex_ocr.md) 在CROHME手写公式数据集上,算法效果如下: @@ -144,6 +145,13 @@ PaddleOCR将**持续新增**支持OCR领域前沿算法与模型,**欢迎广 | ----- | ----- | ----- | ----- | ----- | |CAN|DenseNet|[rec_d28_can.yml](../../configs/rec/rec_d28_can.yml)|51.72%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_d28_can_train.tar)| +在LaTeX-OCR印刷公式数据集上,算法效果如下: + +| 模型 | 骨干网络 |配置文件 | BLEU score | normed edit distance | ExpRate |下载链接| +|-----------|------------| ----- |:-----------:|:---------------------:|:---------:| ----- | +| LaTeX-OCR | Hybrid ViT |[rec_latex_ocr.yml](../../configs/rec/rec_latex_ocr.yml)| 0.8821 | 0.0823 | 40.01% |[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_latex_ocr_train.tar)| + + ## 2. 端到端算法 diff --git a/doc/doc_ch/algorithm_rec_latex_ocr.md b/doc/doc_ch/algorithm_rec_latex_ocr.md new file mode 100644 index 0000000000..9acc861828 --- /dev/null +++ b/doc/doc_ch/algorithm_rec_latex_ocr.md @@ -0,0 +1,176 @@ +# 印刷数学公式识别算法-LaTeX-OCR + +- [1. 算法简介](#1) +- [2. 环境配置](#2) +- [3. 模型训练、评估、预测](#3) + - [3.1 pickle 标签文件生成](#3-1) + - [3.2 训练](#3-2) + - [3.3 评估](#3-3) + - [3.4 预测](#3-4) +- [4. 推理部署](#4) + - [4.1 Python推理](#4-1) + - [4.2 C++推理](#4-2) + - [4.3 Serving服务化部署](#4-3) + - [4.4 更多推理部署](#4-4) +- [5. FAQ](#5) + + +## 1. 
算法简介 + +原始项目: +> [https://github.com/lukas-blecher/LaTeX-OCR](https://github.com/lukas-blecher/LaTeX-OCR) + + + + +`LaTeX-OCR`使用[`LaTeX-OCR印刷公式数据集`](https://drive.google.com/drive/folders/13CA4vAmOmD_I_dSbvLp-Lf0s6KiaNfuO)进行训练,在对应测试集上的精度如下: + +| 模型 | 骨干网络 |配置文件 | BLEU score | normed edit distance | ExpRate |下载链接| +|-----------|------------| ----- |:-----------:|:---------------------:|:---------:| ----- | +| LaTeX-OCR | Hybrid ViT |[rec_latex_ocr.yml](../../configs/rec/rec_latex_ocr.yml)| 0.8821 | 0.0823 | 40.01% |[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_latex_ocr_train.tar)| + + +## 2. 环境配置 +请先参考[《运行环境准备》](./environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](./clone.md)克隆项目代码。 + +此外,需要安装额外的依赖: +```shell +pip install "tokenizers==0.19.1" "imagesize" +``` + + +## 3. 模型训练、评估、预测 + + + +### 3.1 pickle 标签文件生成 +从[谷歌云盘](https://drive.google.com/drive/folders/13CA4vAmOmD_I_dSbvLp-Lf0s6KiaNfuO)中下载 formulae.zip 和 math.txt,之后,使用如下命令,生成 pickle 标签文件。 + +```shell +# 创建 LaTeX-OCR 数据集目录 +mkdir -p train_data/LaTeXOCR +# 解压formulae.zip ,并拷贝math.txt +unzip -d train_data/LaTeXOCR path/formulae.zip +cp path/math.txt train_data/LaTeXOCR +# 将原始的 .txt 文件转换为 .pkl 文件,从而对不同尺度的图像进行分组 +# 训练集转换 +python ppocr/utils/formula_utils/math_txt2pkl.py --image_dir=train_data/LaTeXOCR/train --mathtxt_path=train_data/LaTeXOCR/math.txt --output_dir=train_data/LaTeXOCR/ +# 验证集转换 +python ppocr/utils/formula_utils/math_txt2pkl.py --image_dir=train_data/LaTeXOCR/val --mathtxt_path=train_data/LaTeXOCR/math.txt --output_dir=train_data/LaTeXOCR/ +# 测试集转换 +python ppocr/utils/formula_utils/math_txt2pkl.py --image_dir=train_data/LaTeXOCR/test --mathtxt_path=train_data/LaTeXOCR/math.txt --output_dir=train_data/LaTeXOCR/ +``` + +### 3.2 模型训练 + +请参考[文本识别训练教程](./recognition.md)。PaddleOCR对代码进行了模块化,训练`LaTeX-OCR`识别模型时需要**更换配置文件**为`LaTeX-OCR`的[配置文件](../../configs/rec/rec_latex_ocr.yml)。 + +#### 启动训练 + + +具体地,在完成数据准备后,便可以启动训练,训练命令如下: +```shell +#单卡训练 (默认训练方式) +python3 tools/train.py -c configs/rec/rec_latex_ocr.yml +#多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_latex_ocr.yml +``` + +**注意:** + +- 默认每训练22个epoch(60000次iteration)进行1次评估,若您更改训练的batch_size,或更换数据集,请在训练时作出如下修改 +``` +python3 tools/train.py -c configs/rec/rec_latex_ocr.yml -o Global.eval_batch_step=[0,{length_of_dataset//batch_size*22}] +``` + + +### 3.3 评估 + +可下载已训练完成的[模型文件](https://paddleocr.bj.bcebos.com/contribution/rec_latex_ocr_train.tar),使用如下命令进行评估: + +```shell +# 注意将pretrained_model的路径设置为本地路径。若使用自行训练保存的模型,请注意修改路径和文件名为{path/to/weights}/{model_name}。 +# 验证集评估 +python3 tools/eval.py -c configs/rec/rec_latex_ocr.yml -o Global.pretrained_model=./rec_latex_ocr_train/best_accuracy.pdparams Metric.cal_blue_score=True +# 测试集评估 +python3 tools/eval.py -c configs/rec/rec_latex_ocr.yml -o Global.pretrained_model=./rec_latex_ocr_train/best_accuracy.pdparams Metric.cal_blue_score=True Eval.dataset.data=./train_data/LaTeXOCR/latexocr_test.pkl +``` + + +### 3.4 预测 + +使用如下命令进行单张图片预测: +```shell +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/infer_rec.py -c configs/rec/rec_latex_ocr.yml -o Architecture.Backbone.is_predict=True Architecture.Backbone.is_export=True Architecture.Head.is_export=True Global.infer_img='./doc/datasets/pme_demo/0000013.png' Global.pretrained_model=./rec_latex_ocr_train/best_accuracy.pdparams +# 预测文件夹下所有图像时,可修改infer_img为文件夹,如 Global.infer_img='./doc/datasets/pme_demo/'。 +``` + + +## 4. 
推理部署 + + +### 4.1 Python推理 +首先将训练得到best模型,转换成inference model。这里以训练完成的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/contribution/rec_latex_ocr_train.tar) ),可以使用如下命令进行转换: + +```shell +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/export_model.py -c configs/rec/rec_latex_ocr.yml -o Global.pretrained_model=./rec_latex_ocr_train/best_accuracy.pdparams Global.save_inference_dir=./inference/rec_latex_ocr_infer/ Architecture.Backbone.is_predict=True Architecture.Backbone.is_export=True Architecture.Head.is_export=True + +# 目前的静态图模型支持的最大输出长度为512 +``` +**注意:** +- 如果您是在自己的数据集上训练的模型,并且调整了字典文件,请检查配置文件中的`rec_char_dict_path`是否为所需要的字典文件。 +- [转换后模型下载地址](https://paddleocr.bj.bcebos.com/contribution/rec_latex_ocr_infer.tar) + +转换成功后,在目录下有三个文件: +``` +/inference/rec_latex_ocr_infer/ + ├── inference.pdiparams # 识别inference模型的参数文件 + ├── inference.pdiparams.info # 识别inference模型的参数信息,可忽略 + └── inference.pdmodel # 识别inference模型的program文件 +``` + +执行如下命令进行模型推理: + +```shell +python3 tools/infer/predict_rec.py --image_dir='./doc/datasets/pme_demo/0000295.png' --rec_algorithm="LaTeXOCR" --rec_batch_num=1 --rec_model_dir="./inference/rec_latex_ocr_infer/" --rec_char_dict_path="./ppocr/utils/dict/latex_ocr_tokenizer.json" + +# 预测文件夹下所有图像时,可修改image_dir为文件夹,如 --image_dir='./doc/datasets/pme_demo/'。 +``` +  + +![测试图片样例](../datasets/pme_demo/0000295.png) + +执行命令后,上面图像的预测结果(识别的文本)会打印到屏幕上,示例如下: +```shell +Predicts of ./doc/datasets/pme_demo/0000295.png:\zeta_{0}(\nu)=-{\frac{\nu\varrho^{-2\nu}}{\pi}}\int_{\mu}^{\infty}d\omega\int_{C_{+}}d z{\frac{2z^{2}}{(z^{2}+\omega^{2})^{\nu+1}}}{\tilde{\Psi}}(\omega;z)e^{i\epsilon z}~~~, +``` + + +**注意**: + +- 需要注意预测图像为**白底黑字**,即手写公式部分为黑色,背景为白色的图片。 +- 在推理时需要设置参数`rec_char_dict_path`指定字典,如果您修改了字典,请修改该参数为您的字典文件。 +- 如果您修改了预处理方法,需修改`tools/infer/predict_rec.py`中 LaTeX-OCR 的预处理为您的预处理方法。 + + + +### 4.2 C++推理部署 + +由于C++预处理后处理还未支持 LaTeX-OCR,所以暂未支持 + + +### 4.3 Serving服务化部署 + +暂不支持 + + +### 4.4 更多推理部署 + +暂不支持 + + +## 5. FAQ + +1. LaTeX-OCR 数据集来自于[LaTeXOCR源repo](https://github.com/lukas-blecher/LaTeX-OCR) 。 diff --git a/doc/doc_ch/algorithm_rec_svtrv2.md b/doc/doc_ch/algorithm_rec_svtrv2.md index a508b4f02c..1391cc790c 100644 --- a/doc/doc_ch/algorithm_rec_svtrv2.md +++ b/doc/doc_ch/algorithm_rec_svtrv2.md @@ -19,8 +19,15 @@ ### SVTRv2算法简介 -[PaddleOCR 算法模型挑战赛 - 赛题一:OCR 端到端识别任务](https://aistudio.baidu.com/competition/detail/1131/0/introduction)排行榜第一算法。主要思路:1、检测和识别模型的Backbone升级为RepSVTR;2、识别教师模型升级为SVTRv2,可识别长文本。 +🔥 该算法由来自复旦大学视觉与学习实验室([FVL](https://fvl.fudan.edu.cn))的[OpenOCR](https://github.com/Topdu/OpenOCR)团队研发,其在[PaddleOCR算法模型挑战赛 - 赛题一:OCR端到端识别任务](https://aistudio.baidu.com/competition/detail/1131/0/introduction)中荣获一等奖,B榜端到端识别精度相比PP-OCRv4提升2.5%,推理速度持平。主要思路:1、检测和识别模型的Backbone升级为RepSVTR;2、识别教师模型升级为SVTRv2,可识别长文本。 +|模型|配置文件|端到端|下载链接| +| --- | --- | --- | --- | +|PP-OCRv4| |A榜 62.77%
B榜 62.51%| [Model List](../../doc/doc_ch/models_list.md) | +|SVTRv2(Rec Server)|[configs/rec/SVTRv2/rec_svtrv2_ch.yml](../../configs/rec/SVTRv2/rec_svtrv2_ch.yml)|A榜 68.81% (使用PP-OCRv4检测模型)| [训练模型](https://paddleocr.bj.bcebos.com/openatom/openatom_rec_svtrv2_ch_train.tar) / [推理模型](https://paddleocr.bj.bcebos.com/openatom/openatom_rec_svtrv2_ch_infer.tar) | +|RepSVTR(Mobile)|[识别](../../configs/rec/SVTRv2/rec_repsvtr_ch.yml)<br>[识别蒸馏](../../configs/rec/SVTRv2/rec_svtrv2_ch_distillation.yml)<br>[检测](../../configs/det/det_repsvtr_db.yml)|B榜 65.07%| 识别: [训练模型](https://paddleocr.bj.bcebos.com/openatom/openatom_rec_repsvtr_ch_train.tar) / [推理模型](https://paddleocr.bj.bcebos.com/openatom/openatom_rec_repsvtr_ch_infer.tar)<br>识别蒸馏: [训练模型](https://paddleocr.bj.bcebos.com/openatom/openatom_rec_svtrv2_distill_ch_train.tar) / [推理模型](https://paddleocr.bj.bcebos.com/openatom/openatom_rec_svtrv2_distill_ch_infer.tar)
检测: [训练模型](https://paddleocr.bj.bcebos.com/openatom/openatom_det_repsvtr_ch_train.tar) / [推理模型](https://paddleocr.bj.bcebos.com/openatom/openatom_det_repsvtr_ch_infer.tar) | + +🚀 快速使用:参考PP-OCR推理[说明文档](../../doc/doc_ch/inference_ppocr.md),将检测和识别模型替换为上表中对应的RepSVTR或SVTRv2推理模型即可使用。 ## 2. 环境配置 @@ -115,7 +122,7 @@ Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9999998807907104) ### 4.2 C++推理部署 -由于C++预处理后处理还未支持SVTRv2 +准备好推理模型后,参考[cpp infer](../../deploy/cpp_infer/)教程进行操作即可。 ### 4.3 Serving服务化部署 @@ -125,7 +132,7 @@ Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9999998807907104) ### 4.4 更多推理部署 -暂不支持 +- Paddle2ONNX推理:准备好推理模型后,参考[paddle2onnx](../../deploy/paddle2onnx/)教程操作。 ## 5. FAQ diff --git a/doc/doc_ch/algorithm_table_slanet.md b/doc/doc_ch/algorithm_table_slanet.md index 4aba89b415..ea2f770914 100644 --- a/doc/doc_ch/algorithm_table_slanet.md +++ b/doc/doc_ch/algorithm_table_slanet.md @@ -13,7 +13,7 @@ ## 1. 算法简介 -PaddleOCR 算法模型挑战赛 - 赛题二:通用表格识别任务排行榜第一算法。核心思路: +该算法由来自北京交通大学机器学习与认识计算研究团队的ocr识别队研发,其在PaddleOCR算法模型挑战赛 - 赛题二:通用表格识别任务中排行榜荣获一等奖,排行榜精度相比PP-Structure表格识别模型提升0.8%,推理速度提升3倍。优化思路如下: - 1. 改善推理过程,至EOS停止,速度提升3倍 - 2. 升级Backbone为LCNetV2(SSLD版本) @@ -23,9 +23,9 @@ PaddleOCR 算法模型挑战赛 - 赛题二:通用表格识别任务排行榜 在PubTabNet表格识别公开数据集上,算法复现效果如下: -|模型|骨干网络|配置文件|acc| -| --- | --- | --- | --- | -|SLANet|LCNetV2|[configs/table/SLANet_lcnetv2.yml](../../configs/table/SLANet_lcnetv2.yml)|76.67%| +|模型|骨干网络|配置文件|acc|下载链接| +| --- | --- | --- | --- | --- | +|SLANet|LCNetV2|[configs/table/SLANet_lcnetv2.yml](../../configs/table/SLANet_lcnetv2.yml)|76.67%| [训练模型](https://paddleocr.bj.bcebos.com/openatom/ch_ppstructure_openatom_SLANetv2_train.tar) /[推理模型](https://paddleocr.bj.bcebos.com/openatom/ch_ppstructure_openatom_SLANetv2_infer.tar) | @@ -78,7 +78,7 @@ python3 tools/export_model.py -c configs/table/SLANet_lcnetv2.yml -o Global.pret ```shell cd ppstructure/ -python3.7 table/predict_structure.py --table_model_dir=../inference/slanet_lcnetv2_infer/ --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt --image_dir=docs/table/table.jpg --output=../output/table_slanet_lcnetv2 --use_gpu=False --benchmark=True --enable_mkldnn=True +python table/predict_structure.py --table_model_dir=../inference/slanet_lcnetv2_infer/ --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --image_dir=docs/table/table.jpg --output=../output/table_slanet_lcnetv2 --use_gpu=False --benchmark=True --enable_mkldnn=True --table_max_len=512 # 预测文件夹下所有图像时,可修改image_dir为文件夹,如 --image_dir='docs/table'。 ``` diff --git a/doc/doc_ch/detection.md b/doc/doc_ch/detection.md index eba5213501..8271f100a9 100644 --- a/doc/doc_ch/detection.md +++ b/doc/doc_ch/detection.md @@ -141,6 +141,8 @@ python3 tools/train.py -c configs/det/det_mv3_db.yml \ -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained \ Global.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True ``` +**注意** +文本检测模型使用AMP时可能遇到训练不收敛问题,可以参考[discussions](https://github.com/PaddlePaddle/PaddleOCR/discussions/12445)中的临时解决方案进行使用。 diff --git a/doc/doc_ch/hardware/install_other_devices.md b/doc/doc_ch/hardware/install_other_devices.md new file mode 100644 index 0000000000..ccb1fbadfb --- /dev/null +++ b/doc/doc_ch/hardware/install_other_devices.md @@ -0,0 +1,44 @@ +# 多硬件安装飞桨 +本文档主要针对昇腾 NPU 硬件平台,介绍如何安装飞桨。 +## 1. 昇腾 NPU 飞桨安装 +### 1.1 环境准备 +当前 PaddleOCR 支持昇腾 910B 芯片,昇腾驱动版本为 23.0.3。考虑到环境差异性,我们推荐使用飞桨官方提供的标准镜像完成环境准备。 +- 1. 
拉取镜像,此镜像仅为开发环境,镜像中不包含预编译的飞桨安装包,镜像中已经默认安装了昇腾算子库 CANN-8.0.RC1。 + +``` +# 适用于 X86 架构,暂时不提供 Arch64 架构镜像 +docker pull registry.baidubce.com/device/paddle-npu:cann80RC1-ubuntu20-x86_64-gcc84-py39 +``` + +- 2. 参考如下命令启动容器,ASCEND_RT_VISIBLE_DEVICES 指定可见的 NPU 卡号 +``` +docker run -it --name paddle-npu-dev -v $(pwd):/work \ + --privileged --network=host --shm-size=128G -w=/work \ + -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ + -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ + -v /usr/local/dcmi:/usr/local/dcmi \ + -e ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ + registry.baidubce.com/device/paddle-npu:cann80RC1-ubuntu20-x86_64-gcc84-py39 /bin/bash +``` +### 1.2 安装 paddle 包 +当前提供 Python3.9 的 wheel 安装包。如有其他 Python 版本需求,可以参考[飞桨官方文档](https://www.paddlepaddle.org.cn/install/quick)自行编译安装。 + +- 1. 下载安装 Python3.9 的 wheel 安装包 + +``` +# 注意需要先安装飞桨 cpu 版本 +pip install https://paddle-model-ecology.bj.bcebos.com/paddlex/whl/paddle-device/npu/paddlepaddle-0.0.0-cp39-cp39-linux_x86_64.whl +pip install https://paddle-model-ecology.bj.bcebos.com/paddlex/whl/paddle-device/npu/paddle_custom_npu-0.0.0-cp39-cp39-linux_x86_64.whl +``` +- 2. 验证安装包 +安装完成之后,运行如下命令。 +``` +python -c "import paddle; paddle.utils.run_check()" +``` +预期得到如下输出结果 +``` +Running verify PaddlePaddle program ... +PaddlePaddle works well on 1 npu. +PaddlePaddle works well on 8 npus. +PaddlePaddle is installed successfully! Let's start deep learning with PaddlePaddle now. +``` diff --git a/doc/doc_ch/hardware/supported_models.md b/doc/doc_ch/hardware/supported_models.md new file mode 100644 index 0000000000..10b5d240d5 --- /dev/null +++ b/doc/doc_ch/hardware/supported_models.md @@ -0,0 +1,7 @@ +# PaddleOCR模型列表 + +*多硬件安装方式请参考[多硬件安装文档](install_other_devices.md)* + +| 模型名称 | 昇腾NPU | +| ---------------- | -------- | +| PP-OCRv4 | √ | diff --git a/doc/doc_ch/installation.md b/doc/doc_ch/installation.md index 7e7523b999..962a926911 100644 --- a/doc/doc_ch/installation.md +++ b/doc/doc_ch/installation.md @@ -60,6 +60,3 @@ git clone https://gitee.com/paddlepaddle/PaddleOCR cd PaddleOCR pip3 install -r requirements.txt ``` - -注意,windows环境下,建议从[这里](https://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely)下载shapely安装包完成安装, -直接通过pip安装的shapely库可能出现`[winRrror 126] 找不到指定模块的问题`。 diff --git a/doc/doc_ch/quickstart.md b/doc/doc_ch/quickstart.md index 9e1912364e..65b29b1879 100644 --- a/doc/doc_ch/quickstart.md +++ b/doc/doc_ch/quickstart.md @@ -43,9 +43,6 @@ pip install paddleocr ``` -- 对于Windows环境用户:直接通过pip安装的shapely库可能出现`[winRrror 126] 找不到指定模块的问题`。建议从[这里](https://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely)下载shapely安装包完成安装。 - - ## 2. 
便捷使用 diff --git a/doc/doc_en/algorithm_overview_en.md b/doc/doc_en/algorithm_overview_en.md index 4cffcfd419..4c893ddcf4 100755 --- a/doc/doc_en/algorithm_overview_en.md +++ b/doc/doc_en/algorithm_overview_en.md @@ -137,6 +137,8 @@ On the TextZoom public dataset, the effect of the algorithm is as follows: Supported formula recognition algorithms (Click the link to get the tutorial): - [x] [CAN](./algorithm_rec_can_en.md) +- [x] [LaTeX-OCR](./algorithm_rec_latex_ocr_en.md) + On the CROHME handwritten formula dataset, the effect of the algorithm is as follows: @@ -145,6 +147,13 @@ On the CROHME handwritten formula dataset, the effect of the algorithm is as fol |CAN|DenseNet|[rec_d28_can.yml](../../configs/rec/rec_d28_can.yml)|51.72%|[trained model](https://paddleocr.bj.bcebos.com/contribution/rec_d28_can_train.tar)| +On the LaTeX-OCR printed formula dataset, the effect of the algorithm is as follows: + +| Model | Backbone |config| BLEU score | normed edit distance | ExpRate |Download link| +|-----------|----------| ---- |:-----------:|:---------------------:|:---------:| ----- | +| LaTeX-OCR | Hybrid ViT |[rec_latex_ocr.yml](../../configs/rec/rec_latex_ocr.yml)| 0.8821 | 0.0823 | 40.01% |[trained model](https://paddleocr.bj.bcebos.com/contribution/rec_latex_ocr_train.tar)| + + ## 2. End-to-end OCR Algorithms diff --git a/doc/doc_en/algorithm_rec_latex_ocr_en.md b/doc/doc_en/algorithm_rec_latex_ocr_en.md new file mode 100644 index 0000000000..fcb8863d30 --- /dev/null +++ b/doc/doc_en/algorithm_rec_latex_ocr_en.md @@ -0,0 +1,131 @@ +# LaTeX-OCR + +- [1. Introduction](#1) +- [2. Environment](#2) +- [3. Model Training / Evaluation / Prediction](#3) + - [3.1 Pickle File Generation](#3-1) + - [3.2 Training](#3-2) + - [3.3 Evaluation](#3-3) + - [3.4 Prediction](#3-4) +- [4. Inference and Deployment](#4) + - [4.1 Python Inference](#4-1) + - [4.2 C++ Inference](#4-2) + - [4.3 Serving](#4-3) + - [4.4 More](#4-4) +- [5. FAQ](#5) + + +## 1. Introduction + +Original Project: +> [https://github.com/lukas-blecher/LaTeX-OCR](https://github.com/lukas-blecher/LaTeX-OCR) + + +Using LaTeX-OCR printed mathematical expression recognition datasets for training, and evaluating on its test sets, the algorithm reproduction effect is as follows: + +| Model | Backbone |config| BLEU score | normed edit distance | ExpRate |Download link| +|-----------|----------| ---- |:-----------:|:---------------------:|:---------:| ----- | +| LaTeX-OCR | Hybrid ViT |[rec_latex_ocr.yml](../../configs/rec/rec_latex_ocr.yml)| 0.8821 | 0.0823 | 40.01% |[trained model](https://paddleocr.bj.bcebos.com/contribution/rec_latex_ocr_train.tar)| + + +## 2. Environment +Please refer to ["Environment Preparation"](./environment_en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](./clone_en.md) to clone the project code. + +Furthermore, additional dependencies need to be installed: +```shell +pip install "tokenizers==0.19.1" "imagesize" +``` + + +## 3. Model Training / Evaluation / Prediction + +Please refer to [Text Recognition Tutorial](./recognition_en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. + +Pickle File Generation: + +Download formulae.zip and math.txt in [Google Drive](https://drive.google.com/drive/folders/13CA4vAmOmD_I_dSbvLp-Lf0s6KiaNfuO), and then use the following command to generate the pickle file. 
+ +```shell +# Create a LaTeX-OCR dataset directory +mkdir -p train_data/LaTeXOCR +# Unzip formulae.zip and copy math.txt +unzip -d train_data/LaTeXOCR path/formulae.zip +cp path/math.txt train_data/LaTeXOCR +# Convert the original .txt file to a .pkl file to group images of different scales +# Training set conversion +python ppocr/utils/formula_utils/math_txt2pkl.py --image_dir=train_data/LaTeXOCR/train --mathtxt_path=train_data/LaTeXOCR/math.txt --output_dir=train_data/LaTeXOCR/ +# Validation set conversion +python ppocr/utils/formula_utils/math_txt2pkl.py --image_dir=train_data/LaTeXOCR/val --mathtxt_path=train_data/LaTeXOCR/math.txt --output_dir=train_data/LaTeXOCR/ +# Test set conversion +python ppocr/utils/formula_utils/math_txt2pkl.py --image_dir=train_data/LaTeXOCR/test --mathtxt_path=train_data/LaTeXOCR/math.txt --output_dir=train_data/LaTeXOCR/ +``` + + +Training: + +Specifically, after the data preparation is completed, the training can be started. The training command is as follows: + +``` +#Single GPU training (Default training method) +python3 tools/train.py -c configs/rec/rec_latex_ocr.yml + +#Multi GPU training, specify the gpu number through the --gpus parameter +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_latex_ocr.yml +``` + +Evaluation: + +``` +# GPU evaluation +# Validation set evaluation +python3 tools/eval.py -c configs/rec/rec_latex_ocr.yml -o Global.pretrained_model=./rec_latex_ocr_train/best_accuracy.pdparams Metric.cal_blue_score=True +# Test set evaluation +python3 tools/eval.py -c configs/rec/rec_latex_ocr.yml -o Global.pretrained_model=./rec_latex_ocr_train/best_accuracy.pdparams Metric.cal_blue_score=True Eval.dataset.data=./train_data/LaTeXOCR/latexocr_test.pkl +``` + +Prediction: + +``` +# The configuration file used for prediction must match the training +python3 tools/infer_rec.py -c configs/rec/rec_latex_ocr.yml -o Architecture.Backbone.is_predict=True Architecture.Backbone.is_export=True Architecture.Head.is_export=True Global.infer_img='./doc/datasets/pme_demo/0000013.png' Global.pretrained_model=./rec_latex_ocr_train/best_accuracy.pdparams +``` + + +## 4. Inference and Deployment + + +### 4.1 Python Inference +First, the model saved during the LaTeX-OCR printed mathematical expression recognition training process is converted into an inference model. you can use the following command to convert: + +``` +python3 tools/export_model.py -c configs/rec/rec_latex_ocr.yml -o Global.pretrained_model=./rec_latex_ocr_train/best_accuracy.pdparams Global.save_inference_dir=./inference/rec_latex_ocr_infer/ Architecture.Backbone.is_predict=True Architecture.Backbone.is_export=True Architecture.Head.is_export=True + +# The default output max length of the model is 512. +``` + +For LaTeX-OCR printed mathematical expression recognition model inference, the following commands can be executed: + +``` +python3 tools/infer/predict_rec.py --image_dir='./doc/datasets/pme_demo/0000295.png' --rec_algorithm="LaTeXOCR" --rec_batch_num=1 --rec_model_dir="./inference/rec_latex_ocr_infer/" --rec_char_dict_path="./ppocr/utils/dict/latex_ocr_tokenizer.json" +``` + + +### 4.2 C++ Inference + +Not supported + + +### 4.3 Serving + +Not supported + + +### 4.4 More + +Not supported + + +## 5. 
FAQ + + +``` diff --git a/doc/doc_en/clone_en.md b/doc/doc_en/clone_en.md index 9594d9a0b4..f9af283ce7 100644 --- a/doc/doc_en/clone_en.md +++ b/doc/doc_en/clone_en.md @@ -19,9 +19,3 @@ git clone https://gitee.com/paddlepaddle/PaddleOCR cd PaddleOCR pip3 install -r requirements.txt ``` - -If you getting this error `OSError: [WinError 126] The specified module could not be found` when you install shapely on windows. - -Please try to download Shapely whl file from [http://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely](http://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely). - -Reference: [Solve shapely installation on windows](https://stackoverflow.com/questions/44398265/install-shapely-oserror-winerror-126-the-specified-module-could-not-be-found) diff --git a/doc/doc_en/installation_en.md b/doc/doc_en/installation_en.md index bb499f5cd4..5d1ffb007e 100644 --- a/doc/doc_en/installation_en.md +++ b/doc/doc_en/installation_en.md @@ -63,9 +63,3 @@ git clone https://gitee.com/paddlepaddle/PaddleOCR cd PaddleOCR pip3 install -r requirements.txt ``` - -If you getting this error `OSError: [WinError 126] The specified module could not be found` when you install shapely on windows. - -Please try to download Shapely whl file from [http://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely](http://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely). - -Reference: [Solve shapely installation on windows](https://stackoverflow.com/questions/44398265/install-shapely-oserror-winerror-126-the-specified-module-could-not-be-found) diff --git a/doc/doc_en/quickstart_en.md b/doc/doc_en/quickstart_en.md index 5b4a97f6c9..dc2becd167 100644 --- a/doc/doc_en/quickstart_en.md +++ b/doc/doc_en/quickstart_en.md @@ -47,10 +47,6 @@ For more software version requirements, please refer to the instructions in [Ins pip install "paddleocr>=2.0.1" # Recommend to use version 2.0.1+ ``` -- **For windows users:** If you getting this error `OSError: [WinError 126] The specified module could not be found` when you install shapely on windows. Please try to download Shapely whl file [here](http://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely). - - Reference: [Solve shapely installation on windows](https://stackoverflow.com/questions/44398265/install-shapely-oserror-winerror-126-the-specified-module-could-not-be-found) - ## 2. Easy-to-Use diff --git a/docs/FAQ.en.md b/docs/FAQ.en.md new file mode 100644 index 0000000000..f395d9987e --- /dev/null +++ b/docs/FAQ.en.md @@ -0,0 +1,62 @@ +--- +comments: true +hide: + - navigation +--- + +1. **Prediction error: got an unexpected keyword argument 'gradient_clip'** +The installed version of paddle is incorrect. Currently, this project only supports Paddle 1.7, which will be adapted to 1.8 in the near future. + +2. **Error when converting attention recognition model: KeyError: 'predict'** +Solved. Please update to the latest version of the code. + +3. **About inference speed** +When there are many words in the picture, the prediction time will increase. You can use `--rec_batch_num` to set a smaller prediction batch num. The default value is 30, which can be changed to 10 or other values. + +4. **Service deployment and mobile deployment** +It is expected that the service deployment based on Serving and the mobile deployment based on Paddle Lite will be released successively in mid-to-late June. Stay tuned for more updates. + +5. **Release time of self-developed algorithm** +Baidu Self-developed algorithms such as SAST, SRN and end2end PSL will be released in June or July. Please be patient. + +6. 
**How to run on Windows or Mac?** +PaddleOCR has completed the adaptation to Windows and MAC systems. Two points should be noted during operation: + 1. In [Quick installation](./installation_en.md), if you do not want to install docker, you can skip the first step and start with the second step. + 2. When downloading the inference model, if wget is not installed, you can directly click the model link or copy the link address to the browser to download, then extract and place it in the corresponding directory. + +7. **The difference between ultra-lightweight model and General OCR model** +At present, PaddleOCR has opensourced two Chinese models, namely 8.6M ultra-lightweight Chinese model and general Chinese OCR model. The comparison information between the two is as follows: + - Similarities: Both use the same **algorithm** and **training data**; + - Differences: The difference lies in **backbone network** and **channel parameters**, the ultra-lightweight model uses MobileNetV3 as the backbone network, the general model uses Resnet50_vd as the detection model backbone, and Resnet34_vd as the recognition model backbone. You can compare the two model training configuration files to see the differences in parameters. + +|Model|Backbone|Detection configuration file|Recognition configuration file| +|-|-|-|-| +|8.6M ultra-lightweight Chinese OCR model|MobileNetV3+MobileNetV3|det_mv3_db.yml|rec_chinese_lite_train.yml| +|General Chinese OCR model|Resnet50_vd+Resnet34_vd|det_r50_vd_db.yml|rec_chinese_common_train.yml| + +8. **Is there a plan to opensource a model that only recognizes numbers or only English + numbers?** +It is not planned to opensource numbers only, numbers + English only, or other vertical text models. PaddleOCR has opensourced a variety of detection and recognition algorithms for customized training. The two Chinese models are also based on the training output of the open-source algorithm library. You can prepare the data according to the tutorial, choose the appropriate configuration file, train yourselves, and we believe that you can get good result. If you have any questions during the training, you are welcome to open issues or ask in the communication group. We will answer them in time. + +9. **What is the training data used by the open-source model? Can it be opensourced?** +At present, the open source model, dataset and magnitude are as follows: + - Detection: + English dataset: ICDAR2015 + Chinese dataset: LSVT street view dataset with 3w pictures + - Recognition: + English dataset: MJSynth and SynthText synthetic dataset, the amount of data is tens of millions. + Chinese dataset: LSVT street view dataset with cropped text area, a total of 30w images. In addition, the synthesized data based on LSVT corpus is 500w. + + Among them, the public datasets are opensourced, users can search and download by themselves, or refer to [Chinese data set](dataset/datasets_en.md), synthetic data is not opensourced, users can use open-source synthesis tools to synthesize data themselves. Current available synthesis tools include [text_renderer](https://github.com/Sanster/text_renderer), [SynthText](https://github.com/ankush-me/SynthText), [TextRecognitionDataGenerator](https://github.com/Belval/TextRecognitionDataGenerator), etc. + +10. **Error in using the model with TPS module for prediction** +Error message: Input(X) dims[3] and Input(Grid) dims[2] should be equal, but received X dimension[3]\(108) != Grid dimension[2]\(100) +Solution: TPS does not support variable shape. 
Please set --rec_image_shape='3,32,100' and --rec_char_type='en' + +11. **Custom dictionary used during training, the recognition results show that words do not appear in the dictionary** +The used custom dictionary path is not set when making prediction. The solution is setting parameter `rec_char_dict_path` to the corresponding dictionary file. + +12. **Results of cpp_infer and python_inference are very different** +Versions of exported inference model and inference library should be same. For example, on Windows platform, version of the inference library that PaddlePaddle provides is 1.8, but version of the inference model that PaddleOCR provides is 1.7, you should export model yourself(`tools/export_model.py`) on PaddlePaddle 1.8 and then use the exported model for inference. + +13. **How to identify artistic fonts in signs or advertising images** +Recognizing artistic fonts in signs or advertising images is a very challenging task because the variation in individual characters is much greater compared to standard fonts. If the artistic font to be identified is within a dictionary list, each word in the dictionary can be treated as a template for recognition using a general image retrieval system. You can try using PaddleClas image recognition system. diff --git a/docs/FAQ.md b/docs/FAQ.md new file mode 100644 index 0000000000..eab3279a46 --- /dev/null +++ b/docs/FAQ.md @@ -0,0 +1,779 @@ +--- +comments: true +hide: + - navigation +--- + +> 恭喜你发现宝藏! + +PaddleOCR收集整理了自从开源以来在issues和用户群中的常见问题并且给出了简要解答,旨在为OCR的开发者提供一些参考,也希望帮助大家少走一些弯路。 + +其中[通用问题](#1)一般是初次接触OCR相关算法时用户会提出的问题,在[1.5 垂类场景实现思路](#15)中总结了如何在一些具体的场景中确定技术路线进行优化。[PaddleOCR常见问题](#2)是开发者在使用PaddleOCR之后可能会遇到的问题也是PaddleOCR实践过程中的避坑指南。 + +同时PaddleOCR也会在review issue的过程中添加 `good issue`、 `good first issue` 标签,但这些问题可能不会被立刻补充在FAQ文档里,开发者也可对应查看。我们也非常希望开发者能够帮助我们将这些内容补充在FAQ中。 + +OCR领域大佬众多,本文档回答主要依赖有限的项目实践,难免挂一漏万,如有遗漏和不足,也**希望有识之士帮忙补充和修正**,万分感谢。 + +## 1. 通用问题 + +### 1.1 检测 + +#### Q: 基于深度学习的文字检测方法有哪几种?各有什么优缺点? + +**A**:常用的基于深度学习的文字检测方法一般可以分为基于回归的、基于分割的两大类,当然还有一些将两者进行结合的方法。 + +(1)基于回归的方法分为box回归和像素值回归。a. 采用box回归的方法主要有CTPN、Textbox系列和EAST,这类算法对规则形状文本检测效果较好,但无法准确检测不规则形状文本。 b. 像素值回归的方法主要有CRAFT和SA-Text,这类算法能够检测弯曲文本且对小文本效果优秀但是实时性能不够。 + +(2)基于分割的算法,如PSENet,这类算法不受文本形状的限制,对各种形状的文本都能取得较好的效果,但是往往后处理比较复杂,导致耗时严重。目前也有一些算法专门针对这个问题进行改进,如DB,将二值化进行近似,使其可导,融入训练,从而获取更准确的边界,大大降低了后处理的耗时。 + +### 1.2 识别 + +#### Q: PaddleOCR提供的文本识别算法包括哪些? + +A: PaddleOCR主要提供五种文本识别算法,包括CRNN\StarNet\RARE\Rosetta和SRN, 其中CRNN\StarNet和Rosetta是基于ctc的文字识别算法,RARE是基于attention的文字识别算法;SRN为百度自研的文本识别算法,引入了语义信息,显著提升了准确率。 详情可参照如下页面: 文本识别算法 + +#### Q: 文本识别方法CRNN关键技术有哪些? + +A: CRNN 关键技术包括三部分。(1)CNN提取图像卷积特征。(2)深层双向LSTM网络,在卷积特征的基础上继续提取文字序列特征。(3)Connectionist Temporal Classification(CTC),解决训练时字符无法对齐的问题。 + +#### Q: 对于中文行文本识别,CTC和Attention哪种更优? + +**A**:(1)从效果上来看,通用OCR场景CTC的识别效果优于Attention,因为带识别的字典中的字符比较多,常用中文汉字三千字以上,如果训练样本不足的情况下,对于这些字符的序列关系挖掘比较困难。中文场景下Attention模型的优势无法体现。而且Attention适合短语句识别,对长句子识别比较差。 + +(2)从训练和预测速度上,Attention的串行解码结构限制了预测速度,而CTC网络结构更高效,预测速度上更有优势。 + +#### Q: 弯曲形变的文字识别需要怎么处理?TPS应用场景是什么,是否好用? + +**A**:(1)在大多数情况下,如果遇到的场景弯曲形变不是太严重,检测4个顶点,然后直接通过仿射变换转正识别就足够了。 + +(2)如果不能满足需求,可以尝试使用TPS(Thin Plate Spline),即薄板样条插值。TPS是一种插值算法,经常用于图像变形等,通过少量的控制点就可以驱动图像进行变化。一般用在有弯曲形变的文本识别中,当检测到不规则的/弯曲的(如,使用基于分割的方法检测算法)文本区域,往往先使用TPS算法对文本区域矫正成矩形再进行识别,如,STAR-Net、RARE等识别算法中引入了TPS模块。 + +> **Warning**:TPS看起来美好,在实际应用时经常发现并不够鲁棒,并且会增加耗时,需要谨慎使用。 + +### 1.3 端到端 + +#### Q: 请问端到端的pgnet相比于DB+CRNN在准确率上有优势吗?或者是pgnet最擅长的场景是什么场景呢? 
+ +A: pgnet是端到端算法,检测识别一步到位,不用分开训练2个模型,也支持弯曲文本的识别,但是在中文上的效果还没有充分验证;db+crnn的验证更充分,应用相对成熟,常规非弯曲的文本都能解的不错。 + +#### Q: 目前OCR普遍是二阶段,端到端的方案在业界落地情况如何? + +**A**:端到端在文字分布密集的业务场景,效率会比较有保证,精度的话看自己业务数据积累情况,如果行级别的识别数据积累比较多的话two-stage会比较好。百度的落地场景,比如工业仪表识别、车牌识别都用到端到端解决方案。 + +#### Q: 二阶段的端到端的场景文本识别方法的不足有哪些? + +A: 这类方法一般需要设计针对ROI提取特征的方法,而ROI操作一般比较耗时。 + +#### Q: AAAI 2021最新的端到端场景文本识别PGNet算法有什么特点? + +A: PGNet不需要字符级别的标注,NMS操作以及ROI操作。同时提出预测文本行内的阅读顺序模块和基于图的修正模块来提升文本识别效果。该算法是百度自研,近期会在PaddleOCR开源。 + +### 1.4 评估方法 + +#### Q: OCR领域常用的评估指标是什么? + +**A**:对于两阶段的可以分开来看,分别是检测和识别阶段 + +(1)检测阶段:先按照检测框和标注框的IOU评估,IOU大于某个阈值判断为检测准确。这里检测框和标注框不同于一般的通用目标检测框,是采用多边形进行表示。检测准确率:正确的检测框个数在全部检测框的占比,主要是判断检测指标。检测召回率:正确的检测框个数在全部标注框的占比,主要是判断漏检的指标。 + +(2)识别阶段: +字符识别准确率,即正确识别的文本行占标注的文本行数量的比例,只有整行文本识别对才算正确识别。 + +(3)端到端统计: +端对端召回率:准确检测并正确识别文本行在全部标注文本行的占比; +端到端准确率:准确检测并正确识别文本行在 检测到的文本行数量 的占比; +准确检测的标准是检测框与标注框的IOU大于某个阈值,正确识别的检测框中的文本与标注的文本相同。 + +### 1.5 垂类场景实现思路 + +#### Q:背景干扰的文字(如印章盖到落款上,需要识别落款或者印章中的文字),如何识别? + +**A**:(1)在人眼确认可识别的条件下,对于背景有干扰的文字,首先要保证检测框足够准确,如果检测框不准确,需要考虑是否可以通过过滤颜色等方式对图像预处理并且增加更多相关的训练数据;在识别的部分,注意在训练数据中加入背景干扰类的扩增图像。 + +(2)如果MobileNet模型不能满足需求,可以尝试ResNet系列大模型来获得更好的效果。 + +#### Q:请问对于图片中的密集文字,有什么好的处理办法吗? + +A:可以先试用预训练模型测试一下,例如DB+CRNN,判断下密集文字图片中是检测还是识别的问题,然后针对性的改善。还有一种是如果图象中密集文字较小,可以尝试增大图像分辨率,对图像进行一定范围内的拉伸,将文字稀疏化,提高识别效果。 + +#### Q: 文本行较紧密的情况下如何准确检测? + +**A**:使用基于分割的方法,如DB,检测密集文本行时,最好收集一批数据进行训练,并且在训练时,并将生成二值图像的shrink_ratio参数调小一些。 + +#### Q:对于一些在识别时稍微模糊的文本,有没有一些图像增强的方式? + +A:在人类肉眼可以识别的前提下,可以考虑图像处理中的均值滤波、中值滤波或者高斯滤波等模糊算子尝试。也可以尝试从数据扩增扰动来强化模型鲁棒性,另外新的思路有对抗性训练和超分SR思路,可以尝试借鉴。但目前业界尚无普遍认可的最优方案,建议优先在数据采集阶段增加一些限制提升图片质量。 + +#### Q:低像素文字或者字号比较小的文字有什么超分辨率方法吗 + +A:超分辨率方法分为传统方法和基于深度学习的方法。基于深度学习的方法中,比较经典的有SRCNN,另外CVPR2020也有一篇超分辨率的工作可以参考文章:Unpaired Image Super-Resolution using Pseudo-Supervision,但是没有充分的实践验证过,需要看实际场景下的效果。 + +#### Q:对于一些尺寸较大的文档类图片,在检测时会有较多的漏检,怎么避免这种漏检的问题呢? + +A:PaddleOCR中在图像最长边大于960时,将图像等比例缩放为长边960的图像再进行预测,对于这种图像,可以通过修改det_limit_side_len,增大检测的最长边:tools/infer/utility.py#L42 + +#### Q:文档场景中,使用DB模型会出现整行漏检的情况应该怎么解决? + +A:可以在预测时调小 det_db_box_thresh 阈值,默认为0.5, 可调小至0.3观察效果。 + +#### Q: 弯曲文本(如略微形变的文档图像)漏检问题 + +**A**: db后处理中计算文本框平均得分时,是求rectangle区域的平均分数,容易造成弯曲文本漏检,已新增求polygon区域的平均分数,会更准确,但速度有所降低,可按需选择,在相关pr中可查看[可视化对比效果](https://github.com/PaddlePaddle/PaddleOCR/pull/2604)。该功能通过参数 [det_db_score_mode](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/tools/infer/utility.py#L51)进行选择,参数值可选[`fast`(默认)、`slow`],`fast`对应原始的rectangle方式,`slow`对应polygon方式。感谢用户[buptlihang](https://github.com/buptlihang)提[pr](https://github.com/PaddlePaddle/PaddleOCR/pull/2574)帮助解决该问题🌹。 + +#### Q:如何识别文字比较长的文本? + +**A**:在中文识别模型训练时,并不是采用直接将训练样本缩放到[3,32,320]进行训练,而是先等比例缩放图像,保证图像高度为32,宽度不足320的部分补0,宽高比大于10的样本直接丢弃。预测时,如果是单张图像预测,则按上述操作直接对图像缩放,不做宽度320的限制。如果是多张图预测,则采用batch方式预测,每个batch的宽度动态变换,采用这个batch中最长宽度。 + +#### Q:如何识别带空格的英文行文本图像? + +**A**:空格识别可以考虑以下两种方案: + +(1)优化文本检测算法。检测结果在空格处将文本断开。这种方案在检测数据标注时,需要将含有空格的文本行分成好多段。 + +(2)优化文本识别算法。在识别字典里面引入空格字符,然后在识别的训练数据中,如果用空行,进行标注。此外,合成数据时,通过拼接训练数据,生成含有空格的文本。 + +#### Q:弯曲文本有试过opencv的TPS进行弯曲校正吗? + +**A**:opencv的tps需要标出上下边界对应的点,这个点很难通过传统方法或者深度学习方法获取。PaddleOCR里StarNet网络中的tps模块实现了自动学点,自动校正,可以直接尝试这个。 + +#### Q: 如何识别招牌或者广告图中的艺术字? + +A: 招牌或者广告图中的艺术字是文本识别一个非常有挑战性的难题,因为艺术字中的单字和印刷体相比,变化非常大。如果需要识别的艺术字是在一个词典列表内,可以将改每个词典认为是一个待识别图像模板,通过通用图像检索识别系统解决识别问题。可以尝试使用PaddleClas的图像识别系统。 + +#### Q: 印章如何识别 + +A:1. 使用带tps的识别网络或abcnet,2.使用极坐标变换将图片拉平之后使用crnn + +#### Q: 使用预训练模型进行预测,对于特定字符识别识别效果较差,怎么解决? 
+ +A: 由于我们所提供的识别模型是基于通用大规模数据集进行训练的,部分字符可能在训练集中包含较少,因此您可以构建特定场景的数据集,基于我们提供的预训练模型进行微调。建议用于微调的数据集中,每个字符出现的样本数量不低于300,但同时需要注意不同字符的数量均衡。具体可以参考:微调。 + +#### Q: 在使用训练好的识别模型进行预测的时候,发现有很多重复的字,这个怎么解决呢? + +A:可以看下训练的尺度和预测的尺度是否相同,如果训练的尺度为[3, 32, 320],预测的尺度为[3, 64, 640],则会有比较多的重复识别现象。 + +#### Q: 图像正常识别出来的文字是OK的,旋转90度后识别出来的结果就比较差,有什么方法可以优化? + +**A**: 整图旋转90之后效果变差是有可能的,因为目前PPOCR默认输入的图片是正向的; 可以自己训练一个整图的方向分类器,放在预测的最前端(可以参照现有方向分类器的方式),或者可以基于规则做一些预处理,比如判断长宽等等。 + +#### Q: 如何识别竹简上的古文? + +**A**:对于字符都是普通的汉字字符的情况,只要标注足够的数据,finetune模型就可以了。如果数据量不足,您可以尝试[StyleText](https://github.com/PFCCLab/StyleText)工具。 +而如果使用的字符是特殊的古文字、甲骨文、象形文字等,那么首先需要构建一个古文字的字典,之后再进行训练。 + +#### Q: 只想要识别票据中的部分片段,重新训练它的话,只需要训练文本检测模型就可以了吗?问文本识别,方向分类还是用原来的模型这样可以吗? + +**A**:可以的。PaddleOCR的检测、识别、方向分类器三个模型是独立的,在实际使用中可以优化和替换其中任何一个模型。 + +#### Q: 如何用PaddleOCR识别视频中的文字? + +**A**: 目前PaddleOCR主要针对图像做处理,如果需要视频识别,可以先对视频抽帧,然后用PPOCR识别。 + +#### Q: 相机采集的图像为四通道,应该如何处理? + +**A**: 有两种方式处理: + +- 如果没有其他需要,可以在解码数据的时候指定模式为三通道,例如如果使用opencv,可以使用cv::imread(img_path, cv::IMREAD_COLOR)。 +- 如果其他模块需要处理四通道的图像,那也可以在输入PaddleOCR模块之前进行转换,例如使用cvCvtColor(&img,img3chan,CV_RGBA2RGB)。 + +#### Q: 遇到中英文识别模型不支持的字符,该如何对模型做微调? + +**A**:如果希望识别中英文识别模型中不支持的字符,需要更新识别的字典,并完成微调过程。比如说如果希望模型能够进一步识别罗马数字,可以按照以下步骤完成模型微调过程。 + +1. 准备中英文识别数据以及罗马数字的识别数据,用于训练,同时保证罗马数字和中英文识别数字的效果; +2. 修改默认的字典文件,在后面添加罗马数字的字符; +3. 下载PaddleOCR提供的预训练模型,配置预训练模型和数据的路径,开始训练。 + +#### Q:特殊字符(例如一些标点符号)识别效果不好怎么办? + +**A**:首先请您确认要识别的特殊字符是否在字典中。 +如果字符在已经字典中但效果依然不好,可能是由于识别数据较少导致的,您可以增加相应数据finetune模型。 + +--- + +#### Q:单张图上多语种并存识别(如单张图印刷体和手写文字并存),应该如何处理? + +**A**:单张图像中存在多种类型文本的情况很常见,典型的以学生的试卷为代表,一张图像同时存在手写体和印刷体两种文本,这类情况下,可以尝试”1个检测模型+1个N分类模型+N个识别模型”的解决方案。 +其中不同类型文本共用同一个检测模型,N分类模型指额外训练一个分类器,将检测到的文本进行分类,如手写+印刷的情况就是二分类,N种语言就是N分类,在识别的部分,针对每个类型的文本单独训练一个识别模型,如手写+印刷的场景,就需要训练一个手写体识别模型,一个印刷体识别模型,如果一个文本框的分类结果是手写体,那么就传给手写体识别模型进行识别,其他情况同理。 + +#### Q: 多语言的字典里是混合了不同的语种,这个是有什么讲究吗?统一到一个字典里会对精度造成多大的损失? + +**A**:统一到一个字典里,会造成最后一层FC过大,增加模型大小。如果有特殊需求的话,可以把需要的几种语言合并字典训练模型,合并字典之后如果引入过多的形近字,可能会造成精度损失,字符平衡的问题可能也需要考虑一下。在PaddleOCR里暂时将语言字典分开。 + +#### Q:类似泰语这样的小语种,部分字会占用两个字符甚至三个字符,请问如何制作字典 + +**A**:处理字符的时候,把多字符的当作一个字就行,字典中每行是一个字。 + +--- + +#### Q: 想把简历上的文字识别出来后,能够把关系一一对应起来,比如姓名和它后面的名字组成一对,籍贯、邮箱、学历等等都和各自的内容关联起来,这个应该如何处理,PPOCR目前支持吗? + +**A**: 这样的需求在企业应用中确实比较常见,但往往都是个性化的需求,没有非常规整统一的处理方式。常见的处理方式有如下两种: + +1. 对于单一版式、或者版式差异不大的应用场景,可以基于识别场景的一些先验信息,将识别内容进行配对; 比如运用表单结构信息:常见表单"姓名"关键字的后面,往往紧跟的就是名字信息 +2. 对于版式多样,或者无固定版式的场景, 需要借助于NLP中的NER技术,给识别内容中的某些字段,赋予key值 + +由于这部分需求和业务场景强相关,难以用一个统一的模型去处理,目前PPOCR暂不支持。 如果需要用到NER技术,可以参照Paddle团队的另一个开源套件: [PaddlePaddle/ERNIE](https://github.com/PaddlePaddle/ERNIE), 其提供的预训练模型ERNIE, 可以帮助提升NER任务的准确率。 + +### 1.6 训练过程与模型调优 + +#### Q: 增大batch_size模型训练速度没有明显提升 + +A:如果batch_size打得太大,加速效果不明显的话,可以试一下增大初始化内存的值,运行代码前设置环境变量: +export FLAGS_initial_cpu_memory_in_mb=2000 # 设置初始化内存约2G左右 + +#### Q: 预测时提示图像过大,显存、内存溢出了,应该如何处理? + +A: 可以按照这个PR的修改来缓解显存、内存占用 #2230 + +#### Q: 识别训练时,训练集精度已经到达90了,但验证集精度一直在70,涨不上去怎么办? + +A:训练集精度90,测试集70多的话,应该是过拟合了,有两个可尝试的方法:(1)加入更多的增广方式或者调大增广prob的概率,默认为0.4。(2)调大系统的l2 decay值 + +### 1.7 补充资料 + +#### Q: 对于小白如何快速入门中文OCR项目实践? + +A:建议可以先了解OCR方向的基础知识,大概了解基础的检测和识别模型算法。然后在Github上可以查看OCR方向相关的repo。目前来看,从内容的完备性来看,PaddleOCR的中英文双语教程文档是有明显优势的,在数据集、模型训练、预测部署文档详实,可以快速入手。而且还有微信用户群答疑,非常适合学习实践。项目地址:PaddleOCR + +AI 快车道课程: + +## 2. PaddleOCR实战问题 + +### 2.1 PaddleOCR repo + +#### Q: PaddleOCR develop分支和dygraph分支的区别? 
+ +**A**:目前PaddleOCR有四个分支,分别是: + +- develop:基于Paddle静态图开发的分支,推荐使用paddle1.8 或者2.0版本,该分支具备完善的模型训练、预测、推理部署、量化裁剪等功能,领先于release/1.1分支。 +- release/1.1:PaddleOCR 发布的第一个稳定版本,基于静态图开发,具备完善的训练、预测、推理部署、量化裁剪等功能。 +- dygraph:基于Paddle动态图开发的分支,目前仍在开发中,未来将作为主要开发分支,运行要求使用Paddle2.0.0版本。 +- release/2.0-rc1-0:PaddleOCR发布的第二个稳定版本,基于动态图和paddle2.0版本开发,动态图开发的工程更易于调试,目前支,支持模型训练、预测,暂不支持移动端部署。 + +如果您已经上手过PaddleOCR,并且希望在各种环境上部署PaddleOCR,目前建议使用静态图分支,develop或者release/1.1分支。如果您是初学者,想快速训练,调试PaddleOCR中的算法,建议尝鲜PaddleOCR dygraph分支。 + +**注意**:develop和dygraph分支要求的Paddle版本、本地环境有差别,请注意不同分支环境安装部分的差异。 + +#### Q:PaddleOCR与百度的其他OCR产品有什么区别? + +**A**:PaddleOCR主要聚焦通用ocr,如果有垂类需求,您可以用PaddleOCR+垂类数据自己训练; +如果缺少带标注的数据,或者不想投入研发成本,建议直接调用开放的API,开放的API覆盖了目前比较常见的一些垂类。 + +### 2.2 安装环境 + +#### Q:OSError: [WinError 126] 找不到指定的模块。mac pro python 3.4 shapely import 问题 + +A:这个问题是因为shapely库安装有误,可以参考 #212 这个issue重新安装一下 + +#### Q:PaddlePaddle怎么指定GPU运行 os.environ["CUDA_VISIBLE_DEVICES"]这种不生效 + +A:通过设置 export CUDA_VISIBLE_DEVICES='0'环境变量 + +#### Q:PaddleOCR是否支持在Windows或Mac系统上运行? + +A:PaddleOCR已完成Windows和Mac系统适配,运行时注意两点: +(1)在快速安装时,如果不想安装docker,可跳过第一步,直接从第二步安装paddle开始。 +(2)inference模型下载时,如果没有安装wget,可直接点击模型链接或将链接地址复制到浏览器进行下载,并解压放置到相应目录。 + +### 2.3 数据量说明 + +#### Q:简单的对于精度要求不高的OCR任务,数据集需要准备多少张呢? + +**A**:(1)训练数据的数量和需要解决问题的复杂度有关系。难度越大,精度要求越高,则数据集需求越大,而且一般情况实际中的训练数据越多效果越好。 + +(2)对于精度要求不高的场景,检测任务和识别任务需要的数据量是不一样的。对于检测任务,500张图像可以保证基本的检测效果。对于识别任务,需要保证识别字典中每个字符出现在不同场景的行文本图像数目需要大于200张(举例,如果有字典中有5个字,每个字都需要出现在200张图片以上,那么最少要求的图像数量应该在200-1000张之间),这样可以保证基本的识别效果。 + +#### Q:请问PaddleOCR项目中的中文超轻量和通用模型用了哪些数据集?训练多少样本,gpu什么配置,跑了多少个epoch,大概跑了多久? + +**A**: +(1)检测的话,LSVT街景数据集共3W张图像,超轻量模型,150epoch左右,2卡V100 跑了不到2天;通用模型:2卡V100 150epoch 不到4天。 +(2)识别的话,520W左右的数据集(真实数据26W+合成数据500W)训练,超轻量模型:4卡V100,总共训练了5天左右。通用模型:4卡V100,共训练6天。 + +超轻量模型训练分为2个阶段: +(1)全量数据训练50epoch,耗时3天 +(2)合成数据+真实数据按照1:1数据采样,进行finetune训练200epoch,耗时2天 + +通用模型训练: +真实数据+合成数据,动态采样(1:1)训练,200epoch,耗时 6天左右。 + +#### Q:训练文字识别模型,真实数据有30w,合成数据有500w,需要做样本均衡吗? + +A:需要,一般需要保证一个batch中真实数据样本和合成数据样本的比例是5:1~10:1左右效果比较理想。如果合成数据过大,会过拟合到合成数据,预测效果往往不佳。还有一种启发性的尝试是可以先用大量合成数据训练一个base模型,然后再用真实数据微调,在一些简单场景效果也是会有提升的。 + +#### Q: 当训练数据量少时,如何获取更多的数据? + +A:当训练数据量少时,可以尝试以下三种方式获取更多的数据:(1)人工采集更多的训练数据,最直接也是最有效的方式。(2)基于PIL和opencv基本图像处理或者变换。例如PIL中ImageFont, Image, ImageDraw三个模块将文字写到背景中,opencv的旋转仿射变换,高斯滤波等。(3)利用数据生成算法合成数据,例如pix2pix等算法。 + +### 2.4 数据标注与生成 + +> [!NOTE] +> StyleText 已经移动到 [PFCCLab/StyleText](https://github.com/PFCCLab/StyleText) + +#### Q: Style-Text 如何不文字风格迁移,就像普通文本生成程序一样默认字体直接输出到分割的背景图? + +**A**:使用image_synth模式会输出fake_bg.jpg,即为背景图。如果想要批量提取背景,可以稍微修改一下代码,将fake_bg保存下来即可。要修改的位置: + + +#### Q: 能否修改StyleText配置文件中的分辨率? + +**A**:StyleText目前的训练数据主要是高度32的图片,建议不要改变高度。未来我们会支持更丰富的分辨率。 + +#### Q: StyleText是否可以更换字体文件? + +**A**:StyleText项目中的字体文件为标准字体,主要用作模型的输入部分,不能够修改。 +StyleText的用途主要是:提取style_image中的字体、背景等style信息,根据语料生成同样style的图片。 + +#### Q: StyleText批量生成图片为什么没有输出? + +**A**:需要检查以下您配置文件中的路径是否都存在。尤其要注意的是[label_file配置](https://github.com/PFCCLab/StyleText/blob/main/README_ch.md#%E4%B8%89%E5%BF%AB%E9%80%9F%E4%B8%8A%E6%89%8B)。 +如果您使用的style_image输入没有label信息,您依然需要提供一个图片文件列表。 + +#### Q:使用StyleText进行数据合成时,文本(TextInput)的长度远超StyleInput的长度,该怎么处理与合成呢? + +**A**:在使用StyleText进行数据合成的时候,建议StyleInput的长度长于TextInput的长度。有2种方法可以处理上述问题: + +1. 将StyleInput按列的方向进行复制与扩充,直到其超过TextInput的长度。 +2. 将TextInput进行裁剪,保证每段TextInput都稍短于StyleInput,分别合成之后,再拼接在一起。 + +实际使用中发现,使用第2种方法的效果在长文本合成的场景中的合成效果更好,StyleText中提供的也是第2种数据合成的逻辑。 + +#### Q: StyleText 合成数据效果不好? 
+ +**A**:StyleText模型生成的数据主要用于OCR识别模型的训练。PaddleOCR目前识别模型的输入为32 x N,因此当前版本模型主要适用高度为32的数据。 +建议要合成的数据尺寸设置为32 x N。尺寸相差不多的数据也可以生成,尺寸很大或很小的数据效果确实不佳。 + +### 2.5 预训练模型与微调 + +#### Q:如何更换文本检测/识别的backbone? + +A:无论是文字检测,还是文字识别,骨干网络的选择是预测效果和预测效率的权衡。一般,选择更大规模的骨干网络,例如ResNet101_vd,则检测或识别更准确,但预测耗时相应也会增加。而选择更小规模的骨干网络,例如MobileNetV3_small_x0_35,则预测更快,但检测或识别的准确率会大打折扣。幸运的是不同骨干网络的检测或识别效果与在ImageNet数据集图像1000分类任务效果正相关。飞桨图像分类套件PaddleClas汇总了ResNet_vd、Res2Net、HRNet、MobileNetV3、GhostNet等23种系列的分类网络结构,在上述图像分类任务的top1识别准确率,GPU(V100和T4)和CPU(骁龙855)的预测耗时以及相应的117个预训练模型下载地址。 + +(1)文字检测骨干网络的替换,主要是确定类似于ResNet的4个stages,以方便集成后续的类似FPN的检测头。此外,对于文字检测问题,使用ImageNet训练的分类预训练模型,可以加速收敛和效果提升。 + +(2)文字识别的骨干网络的替换,需要注意网络宽高stride的下降位置。由于文本识别一般宽高比例很大,因此高度下降频率少一些,宽度下降频率多一些。可以参考PaddleOCR中MobileNetV3骨干网络的改动。 + +#### Q: 参照文档做实际项目时,是重新训练还是在官方训练的基础上进行训练?具体如何操作? + +**A**: 基于官方提供的模型,进行finetune的话,收敛会更快一些。 具体操作上,以识别模型训练为例:如果修改了字符文件,可以设置pretraind_model为官方提供的预训练模型 + +#### Q: 下载的识别模型解压后缺失文件,没有期望的inference.pdiparams, inference.pdmodel等文件 + +A:用解压软件解压可能会出现这个问题,建议二次解压下或者用命令行解压tar xf + +#### Q: 为什么在checkpoints中load下载的预训练模型会报错? + +A: 这里有两个不同的概念: + +pretrained_model:指预训练模型,是已经训练完成的模型。这时会load预训练模型的参数,但并不会load学习率、优化器以及训练状态等。如果需要finetune,应该使用pretrained。 +checkpoints:指之前训练的中间结果,例如前一次训练到了100个epoch,想接着训练。这时会load尝试所有信息,包括模型的参数,之前的状态等。 + +#### Q: 如何对检测模型finetune,比如冻结前面的层或某些层使用小的学习率学习? + +**A**:如果是冻结某些层,可以将变量的stop_gradient属性设置为True,这样计算这个变量之前的所有参数都不会更新了,参考: + +如果对某些层使用更小的学习率学习,静态图里还不是很方便,一个方法是在参数初始化的时候,给权重的属性设置固定的学习率,参考: + +实际上我们实验发现,直接加载模型去fine-tune,不设置某些层不同学习率,效果也都不错 + +### 2.6 模型超参调整 + +#### Q: DB检测训练输入尺寸640,可以改大一些吗? + +A:不建议改大。检测模型训练输入尺寸是预处理中random crop后的尺寸,并非直接将原图进行resize,多数场景下这个尺寸并不小了,改大后可能反而并不合适,而且训练会变慢。另外,代码里可能有的地方参数按照预设输入尺寸适配的,改大后可能有隐藏风险。 + +#### Q: 预处理部分,图片的长和宽为什么要处理成32的倍数? + +A:以检测中的resnet骨干网络为例,图像输入网络之后,需要经过5次2倍降采样,共32倍,因此建议输入的图像尺寸为32的倍数。 + +#### Q: 在识别模型中,为什么降采样残差结构的stride为(2, 1)? + +**A**: stride为(2, 1),表示在图像y方向(高度方向)上stride为2,x方向(宽度方向)上为1。由于待识别的文本图像通常为长方形,这样只在高度方向做下采样,尽量保留宽度方向的序列信息,避免宽度方向下采样后丢失过多的文字信息。 + +#### Q:训练识别时,如何选择合适的网络输入shape? + +**A**:一般高度采用32,最长宽度的选择,有两种方法: + +(1)统计训练样本图像的宽高比分布。最大宽高比的选取考虑满足80%的训练样本。 + +(2)统计训练样本文字数目。最长字符数目的选取考虑满足80%的训练样本。然后中文字符长宽比近似认为是1,英文认为3:1,预估一个最长宽度。 + +#### Q:识别模型框出来的位置太紧凑,会丢失边缘的文字信息,导致识别错误 + +A:可以在命令中加入 --det_db_unclip_ratio ,参数定义位置,这个参数是检测后处理时控制文本框大小的,默认1.6,可以尝试改成2.5或者更大,反之,如果觉得文本框不够紧凑,也可以把该参数调小。 + +### 2.7 模型结构 + +#### Q:文本识别训练不加LSTM是否可以收敛? + +**A**:理论上是可以收敛的,加上LSTM模块主要是为了挖掘文字之间的序列关系,提升识别效果。对于有明显上下文语义的场景效果会比较明显。 + +#### Q:文本识别中LSTM和GRU如何选择? + +**A**:从项目实践经验来看,序列模块采用LSTM的识别效果优于GRU,但是LSTM的计算量比GRU大一些,可以根据自己实际情况选择。 + +#### Q:对于CRNN模型,backbone采用DenseNet和ResNet_vd,哪种网络结构更好? + +**A**:Backbone的识别效果在CRNN模型上的效果,与Imagenet 1000 图像分类任务上识别效果和效率一致。在图像分类任务上ResnNet_vd(79%+)的识别精度明显优于DenseNet(77%+),此外对于GPU,Nvidia针对ResNet系列模型做了优化,预测效率更高,所以相对而言,resnet_vd是较好选择。如果是移动端,可以优先考虑MobileNetV3系列。 + +#### Q: 如何根据不同的硬件平台选用不同的backbone? + +**A**:在不同的硬件上,不同的backbone的速度优势不同,可以根据不同平台的速度-精度图来确定backbone,这里可以参考[PaddleClas模型速度-精度图](https://github.com/PaddlePaddle/PaddleClas/tree/release/2.0/docs/zh_CN/models)。 + +### 2.8 PP-OCR系统 + +#### Q: 在PP-OCR系统中,文本检测的骨干网络为什么没有使用SE模块? + +**A**:SE模块是MobileNetV3网络一个重要模块,目的是估计特征图每个特征通道重要性,给特征图每个特征分配权重,提高网络的表达能力。但是,对于文本检测,输入网络的分辨率比较大,一般是640\*640,利用SE模块估计特征图每个特征通道重要性比较困难,网络提升能力有限,但是该模块又比较耗时,因此在PP-OCR系统中,文本检测的骨干网络没有使用SE模块。实验也表明,当去掉SE模块,超轻量模型大小可以减小40%,文本检测效果基本不受影响。详细可以参考PP-OCR技术文章,. + +#### Q: PP-OCR系统中,文本检测的结果有置信度吗? 
+ +**A**:文本检测的结果有置信度,由于推理过程中没有使用,所以没有显示的返回到最终结果中。如果需要文本检测结果的置信度,可以在[文本检测DB的后处理代码](../../ppocr/postprocess/db_postprocess.py)的155行,添加scores信息。这样,在[检测预测代码](../../tools/infer/predict_det.py)的197行,就可以拿到文本检测的scores信息。 + +#### Q: DB文本检测,特征提取网络金字塔构建的部分代码在哪儿? + +**A**:特征提取网络金字塔构建的部分:[代码位置](../../ppocr/modeling/necks/db_fpn.py)。ppocr/modeling文件夹里面是组网相关的代码,其中architectures是文本检测或者文本识别整体流程代码;backbones是骨干网络相关代码;necks是类似与FPN的颈函数代码;heads是提取文本检测或者文本识别预测结果相关的头函数;transforms是类似于TPS特征预处理模块。更多的信息可以参考[代码组织结构](./tree.md)。 + +#### Q:PaddleOCR如何做到横排和竖排同时支持的? + +**A**:合成了一批竖排文字,逆时针旋转90度后加入训练集与横排一起训练。预测时根据图片长宽比判断是否为竖排,若为竖排则将crop出的文本逆时针旋转90度后送入识别网络。 + +#### Q: 目前知识蒸馏有哪些主要的实践思路? + +**A**:知识蒸馏即利用教师模型指导学生模型的训练,目前有3种主要的蒸馏思路: + +1. 基于输出结果的蒸馏,即让学生模型学习教师模型的软标签(分类或者OCR识别等任务中)或者概率热度图(分割等任务中)。 +2. 基于特征图的蒸馏,即让学生模型学习教师模型中间层的特征图,拟合中间层的一些特征。 +3. 基于关系的蒸馏,针对不同的样本(假设个数为N),教师模型会有不同的输出,那么可以基于不同样本的输出,计算一个NxN的相关性矩阵,可以让学生模型去学习教师模型关于不同样本的相关性矩阵。 + +当然,知识蒸馏方法日新月异,也欢迎大家提出更多的总结与建议。 + +#### Q: 文字识别模型模型的输出矩阵需要进行解码才能得到识别的文本。代码中实现为preds_idx = preds.argmax(axis=2),也就是最佳路径解码法。这是一种贪心算法,是每一个时间步只将最大概率的字符作为当前时间步的预测输出,但得到的结果不一定是最好的。为什么不使用beam search这种方式进行解码呢? + +**A**:实验发现,使用贪心的方法去做解码,识别精度影响不大,但是速度方面的优势比较明显,因此PaddleOCR中使用贪心算法去做识别的解码。 + +### 2.9 端到端 + +#### Q: 端到端算法PGNet是否支持中文识别,速度会很慢嘛? + +**A**:目前开源的PGNet算法模型主要是用于检测英文数字,对于中文的识别需要自己训练,大家可以使用开源的端到端中文数据集,而对于复杂文本(弯曲文本)的识别,也可以自己构造一批数据集针对进行训练,对于推理速度,可以先将模型转换为inference再进行预测,速度应该会相当可观。 + +#### Q: 端到端算法PGNet提供了两种后处理方式,两者之间有什么区别呢? + +**A**: 两种后处理的区别主要在于速度的推理,config中PostProcess有fast/slow两种模式,slow模式的后处理速度慢,精度相对较高,fast模式的后处理速度快,精度也在可接受的范围之内。建议使用速度快的后处理方式。 + +#### Q: 使用PGNet进行eval报错? + +**A**: 需要注意,我们目前在release/2.1更新了评测代码,目前支持A,B两种评测模式: + +- A模式:该模式主要为了方便用户使用,与训练集一样的标注文件就可以正常进行eval操作, 代码中默认是A模式。 +- B模式:该模式主要为了保证我们的评测代码可以和Total Text官方的评测方式对齐,该模式下直接加载官方提供的mat文件进行eval。 + +#### Q: PGNet有中文预训练模型吗? + +**A**: 目前我们尚未提供针对中文的预训练模型,如有需要,可以尝试自己训练。具体需要修改的地方有: + + 1. [config文件中](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/configs/e2e/e2e_r50_vd_pg.yml#L23-L24),字典文件路径及语种设置; + 1. [网络结构中](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/ppocr/modeling/heads/e2e_pg_head.py#L181),`out_channels`修改为字典中的字符数目+1(考虑到空格); + 1. [loss中](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/ppocr/losses/e2e_pg_loss.py#L93),修改`37`为字典中的字符数目+1(考虑到空格); + +#### Q: 用于PGNet的训练集,文本框的标注有要求吗? + +**A**: PGNet支持多点标注,比如4点、8点、14点等。但需要注意的是,标注点尽可能分布均匀(相邻标注点间隔距离均匀一致),且label文件中的标注点需要从标注框的左上角开始,按标注点顺时针顺序依次编写,以上问题都可能对训练精度造成影响。 +我们提供的,基于Total Text数据集的PGNet预训练模型使用了14点标注方式。 + +#### Q: 用PGNet做进行端到端训练时,数据集标注的点的个数必须都是统一一样的吗? 能不能随意标点数,只要能够按顺时针从左上角开始标这样? + +**A**: 目前代码要求标注为统一的点数。 + +### 2.10 模型效果与效果不一致 + +#### Q: PP-OCR检测效果不好,该如何优化? + +A: 具体问题具体分析: +如果在你的场景上检测效果不可用,首选是在你的数据上做finetune训练; +如果图像过大,文字过于密集,建议不要过度压缩图像,可以尝试修改检测预处理的resize逻辑,防止图像被过度压缩; +检测框大小过于紧贴文字或检测框过大,可以调整db_unclip_ratio这个参数,加大参数可以扩大检测框,减小参数可以减小检测框大小; +检测框存在很多漏检问题,可以减小DB检测后处理的阈值参数det_db_box_thresh,防止一些检测框被过滤掉,也可以尝试设置det_db_score_mode为'slow'; +其他方法可以选择use_dilation为True,对检测输出的feature map做膨胀处理,一般情况下,会有效果改善; + +#### Q:同一张图通用检测出21个条目,轻量级检测出26个 ,难道不是轻量级的好吗? + +**A**:可以主要参考可视化效果,通用模型更倾向于检测一整行文字,轻量级可能会有一行文字被分成两段检测的情况,不是数量越多,效果就越好。 + +#### Q: DB有些框太贴文本了反而去掉了一些文本的边角影响识别,这个问题有什么办法可以缓解吗? + +**A**:可以把后处理的参数unclip_ratio适当调大一点。 + +#### Q: 使用合成数据精调小模型后,效果可以,但是还没开源的小infer模型效果好,这是为什么呢? + +**A**:(1)要保证使用的配置文件和pretrain weights是对应的; + +(2)在微调时,一般都需要真实数据,如果使用合成数据,效果反而可能会有下降,PaddleOCR中放出的识别inference模型也是基于预训练模型在真实数据上微调得到的,效果提升比较明显; + +(3)在训练的时候,文本长度超过25的训练图像都会被丢弃,因此需要看下真正参与训练的图像有多少,太少的话也容易过拟合。 + +#### Q: 表格识别中,如何提高单字的识别结果? 
+ +**A**: 首先需要确认一下检测模型有没有有效的检测出单个字符,如果没有的话,需要在训练集当中添加相应的单字数据集。 + +#### Q: 动态图分支(dygraph,release/2.0),训练模型和推理模型效果不一致 + +A:当前问题表现为:使用训练完的模型直接测试结果较好,但是转换为inference model后,预测结果不一致;出现这个问题一般是两个原因: + +1. 预处理函数设置的不一致 +2. 后处理参数不一致 repo中config.yml文件的前后处理参数和inference预测默认的超参数有不一致的地方,建议排查下训练模型预测和inference预测的前后处理, 参考issue。 + +#### Q: 自己训练的det模型,在同一张图片上,inference模型与eval模型结果差别很大,为什么? + +A:这是由于图片预处理不同造成的。如果训练的det模型图片输入并不是默认的shape[600, 600],eval的程序中图片预处理方式与train时一致 (由xxx_reader.yml中的test_image_shape参数决定缩放大小,但predict_eval.py中的图片预处理方式由程序里的preprocess_params决定, 最好不要传入max_side_len,而是传入和训练时一样大小的test_image_shape。 + +#### Q: 训练模型和测试模型的检测结果差距较大 + +**A**:1. 检查两个模型使用的后处理参数是否是一样的,训练的后处理参数在配置文件中的PostProcess部分,测试模型的后处理参数在tools/infer/utility.py中,最新代码中两个后处理参数已保持一致。 + +#### Q: PaddleOCR模型Python端预测和C++预测结果不一致? + +A:正常来说,python端预测和C++预测文本是一致的,如果预测结果差异较大, 建议首先排查diff出现在检测模型还是识别模型,或者尝试换其他模型是否有类似的问题。 其次,检查python端和C++端数据处理部分是否存在差异,建议保存环境,更新PaddleOCR代码再试下。 如果更新代码或者更新代码都没能解决,建议在PaddleOCR微信群里或者issue中抛出您的问题。 + +用户总结的排查步骤: + +### 2.11 训练调试与配置文件 + +#### Q: 某个类别的样本比较少,通过增加训练的迭代次数或者是epoch,变相增加小样本的数目,这样能缓解这个问题么? + +A: 尽量保证类别均衡, 某些类别样本少,可以通过补充合成数据的方式处理;实验证明训练集中出现频次较少的字符,识别效果会比较差,增加迭代次数不能改变样本量少的问题。 + +#### Q:文本检测换成自己的数据没法训练,有一些”###”是什么意思? + +**A**:数据格式有问题,”###” 表示要被忽略的文本区域,所以你的数据都被跳过了,可以换成其他任意字符或者就写个空的。 + +#### Q:如何调试数据读取程序? + +A:tools/train.py中有一个test_reader()函数用于调试数据读取。 + +#### Q:中文文本检测、文本识别构建训练集的话,大概需要多少数据量 + +A:检测需要的数据相对较少,在PaddleOCR模型的基础上进行Fine-tune,一般需要500张可达到不错的效果。 识别分英文和中文,一般英文场景需要几十万数据可达到不错的效果,中文则需要几百万甚至更多。 + +#### Q: config yml文件中的ratio_list参数的作用是什么? + +**A**: 在动态图中,ratio_list在有多个数据源的情况下使用,ratio_list中的每个值是每个epoch从对应数据源采样数据的比例。如ratio_list=[0.3,0.2],label_file_list=['data1','data2'],代表每个epoch的训练数据包含data1 30%的数据,和data2里 20%的数据,ratio_list中数值的和不需要等于1。ratio_list和label_file_list的长度必须一致。 + +静态图检测数据采样的逻辑与动态图不同,但基本不影响训练精度。 + +在静态图中,使用 检测 dataloader读取数据时,会先设置每个epoch的数据量,比如这里设置为1000,ratio_list中的值表示在1000中的占比,比如ratio_list是[0.3, 0.7],则表示使用两个数据源,每个epoch从第一个数据源采样1000*0.3=300张图,从第二个数据源采样700张图。ratio_list的值的和也不需要等于1。 + +#### Q: iaa里面添加的数据增强方式,是每张图像训练都会做增强还是随机的?如何添加一个数据增强方法? + +**A**:iaa增强的训练配置参考:[这里](https://github.com/PaddlePaddle/PaddleOCR/blob/0ccc1720c252beb277b9e522a1b228eb6abffb8a/configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml#L82)。其中{ 'type': Fliplr, 'args': { 'p': 0.5 } } p是概率。新增数据增强,可以参考[这个方法](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.1/doc/doc_ch/add_new_algorithm.md#%E6%95%B0%E6%8D%AE%E5%8A%A0%E8%BD%BD%E5%92%8C%E5%A4%84%E7%90%86) + +#### Q: 怎么加速训练过程呢? + +**A**:OCR模型训练过程中一般包含大量的数据增广,这些数据增广是比较耗时的,因此可以离线生成大量增广后的图像,直接送入网络进行训练,机器资源充足的情况下,也可以使用分布式训练的方法,可以参考[分布式训练教程文档](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/distributed_training.md)。 + +#### Q: 一些特殊场景的数据识别效果差,但是数据量很少,不够用来finetune怎么办? + +**A**:您可以合成一些接近使用场景的数据用于训练。 +我们计划推出基于特定场景的文本数据合成工具,请您持续关注PaddleOCR的近期更新。 + +#### Q: PaddleOCR可以识别灰度图吗? + +**A**:PaddleOCR的模型均为三通道输入。如果您想使用灰度图作为输入,建议直接用3通道的模式读入灰度图, +或者将单通道图像转换为三通道图像再识别。例如,opencv的cvtColor函数就可以将灰度图转换为RGB三通道模式。 + +#### Q: 如何合成手写中文数据集? + +**A**: 手写数据集可以通过手写单字数据集合成得到。随机选取一定数量的单字图片和对应的label,将图片高度resize为随机的统一高度后拼接在一起,即可得到合成数据集。对于需要添加文字背景的情况,建议使用阈值化将单字图片的白色背景处理为透明背景,再与真实背景图进行合成。具体可以参考文档[手写数据集](https://github.com/PaddlePaddle/PaddleOCR/blob/a72d6f23be9979e0c103d911a9dca3e4613e8ccf/doc/doc_ch/handwritten_datasets.md)。 + +#### Q:PaddleOCR默认不是200个step保存一次模型吗?为啥文件夹下面都没有生成 + +**A**:因为默认保存的起始点不是0,而是4000,将eval_batch_step [4000, 5000]改为[0, 2000] 就是从第0次迭代开始,每2000迭代保存一次模型 + +#### Q: PaddleOCR在训练的时候一直使用cosine_decay的学习率下降策略,这是为什么呢? 
+ +**A**:cosine_decay表示在训练的过程中,学习率按照cosine的变化趋势逐渐下降至0,在迭代轮数更长的情况下,比常量的学习率变化策略会有更好的收敛效果,因此在实际训练的时候,均采用了cosine_decay,来获得精度更高的模型。 + +#### Q: Cosine学习率的更新策略是怎样的?训练过程中为什么会在一个值上停很久? + +**A**: Cosine学习率的说明可以参考[这里](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/optimizer/lr/CosineAnnealingDecay_cn.html#cosineannealingdecay) + +在PaddleOCR中,为了让学习率更加平缓,我们将其中的epoch调整成了iter。 +学习率的更新会和总的iter数量有关。当iter比较大时,会经过较多iter才能看出学习率的值有变化。 + +#### Q: 之前的CosineWarmup方法为什么不见了? + +**A**: 我们对代码结构进行了调整,目前的Cosine可以覆盖原有的CosineWarmup的功能,只需要在配置文件中增加相应配置即可。 +例如下面的代码,可以设置warmup为2个epoch: + +``` +lr: + name: Cosine + learning_rate: 0.001 + warmup_epoch: 2 +``` + +#### Q: 训练识别和检测时学习率要加上warmup,目的是什么? + +**A**: Warmup机制先使学习率从一个较小的值逐步升到一个较大的值,而不是直接就使用较大的学习率,这样有助于模型的稳定收敛。在OCR检测和OCR识别中,一般会带来精度~0.5%的提升。 + +#### Q: 关于dygraph分支中,文本识别模型训练,要使用数据增强应该如何设置? + +**A**:可以参考[配置文件](../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)在`Train['dataset']['transforms']`添加RecAug字段,使数据增强生效。可以通过添加对aug_prob设置,表示每种数据增强采用的概率。aug_prob默认是0.4。详细设置可以参考[ISSUE 1744](https://github.com/PaddlePaddle/PaddleOCR/issues/1744)。 + +#### Q: 训练过程中,训练程序意外退出/挂起,应该如何解决? + +**A**: 考虑内存,显存(使用GPU训练的话)是否不足,可在配置文件中,将训练和评估的batch size调小一些。需要注意,训练batch size调小时,学习率learning rate也要调小,一般可按等比例调整。 + +#### Q: 训练程序启动后直到结束,看不到训练过程log? + +**A**: 可以从以下三方面考虑: + 1. 检查训练进程是否正常退出、显存占用是否释放、是否有残留进程,如果确定是训练程序卡死,可以检查环境配置,遇到环境问题建议使用docker,可以参考说明文档[安装](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_ch/installation.md)。 + 2. 检查数据集的数据量是否太小,可调小batch size从而增加一个epoch中的训练step数量,或在训练config文件中,将参数print_batch_step改为1,即每一个step打印一次log信息。 + 3. 如果使用私有数据集训练,可先用PaddleOCR提供/推荐的数据集进行训练,排查私有数据集是否存在问题。 + +#### Q: 配置文件中的参数num workers是什么意思,应该如何设置? + +**A**: 训练数据的读取需要硬盘IO,而硬盘IO速度远小于GPU运算速度,为了避免数据读取成为训练速度瓶颈,可以使用多进程读取数据,num workers表示数据读取的进程数量,0表示不使用多进程读取。在Linux系统下,多进程读取数据时,进程间通信需要基于共享内存,因此使用多进程读取数据时,建议设置共享内存不低于2GB,最好可以达到8GB,此时,num workers可以设置为CPU核心数。如果机器硬件配置较低,或训练进程卡死、dataloader报错,可以将num workers设置为0,即不使用多进程读取数据。 + +### 2.12 预测 + +#### Q: 为什么PaddleOCR检测预测是只支持一张图片测试?即test_batch_size_per_card=1 + +A:测试的时候,对图像等比例缩放,最长边960,不同图像等比例缩放后长宽不一致,无法组成batch,所以设置为test_batch_size为1。 + +#### Q: PaddleOCR支持tensorrt推理吗? + +A: 支持的,需要在编译的时候将CMakeLists.txt文件当中,将相关代码option(WITH_TENSORRT "Compile demo with TensorRT." OFF)的OFF改成ON。关于服务器端部署的更多设置,可以参考飞桨官网 + +#### Q: 如何使用TensorRT加速PaddleOCR预测? + +**A**: 目前paddle的dygraph分支已经支持了python和C++ TensorRT预测的代码,python端inference预测时把参数[--use_tensorrt=True](https://github.com/PaddlePaddle/PaddleOCR/blob/3ec57e8df9263de6fa897e33d2d91bc5d0849ef3/tools/infer/utility.py#L37)即可, +C++TensorRT预测需要使用支持TRT的预测库并在编译时打开[-DWITH_TENSORRT=ON](https://github.com/PaddlePaddle/PaddleOCR/blob/3ec57e8df9263de6fa897e33d2d91bc5d0849ef3/deploy/cpp_infer/tools/build.sh#L15)。 +如果想修改其他分支代码支持TensorRT预测,可以参考[PR](https://github.com/PaddlePaddle/PaddleOCR/pull/2921)。 + +注:建议使用TensorRT大于等于6.1.0.5以上的版本。 + +#### Q: 为什么识别模型做预测的时候,预测图片的数量数量还会影响预测的精度 + +**A**: 推理时识别模型默认的batch_size=6, 如预测图片长度变化大,可能影响预测效果。如果出现上述问题可在推理的时候设置识别bs=1,命令如下: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/ch/word_4.jpg" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --rec_batch_num=1 +``` + +### 2.13 推理部署 + +#### Q:PaddleOCR模型推理方式有几种?各自的优缺点是什么 + +**A**:目前推理方式支持基于训练引擎推理和基于预测引擎推理。 + +(1)基于训练引擎推理不需要转换模型,但是需要先组网再load参数,语言只支持python,不适合系统集成。 + +(2)基于预测引擎的推理需要先转换模型为inference格式,然后可以进行不需要组网的推理,语言支持c++和python,适合系统集成。 + +#### Q:PaddleOCR中,对于模型预测加速,CPU加速的途径有哪些?基于TenorRT加速GPU对输入有什么要求? 
+ +**A**:(1)CPU可以使用mkldnn进行加速;对于python inference的话,可以把enable_mkldnn改为true,[参考代码](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/tools/infer/utility.py#L99),对于cpp inference的话,可参考[文档](https://github.com/PaddlePaddle/PaddleOCR/tree/dygraph/deploy/cpp_infer) + +(2)GPU需要注意变长输入问题等,TRT6 之后才支持变长输入 + +#### Q:hubserving、pdserving这两种部署方式区别是什么? + +A:hubserving原本是paddlehub的配套服务部署工具,可以很方便的将paddlehub内置的模型部署为服务,paddleocr使用了这个功能,并将模型路径等参数暴露出来方便用户自定义修改。paddle serving是面向所有paddle模型的部署工具,文档中可以看到我们提供了快速版和标准版,其中快速版和hubserving的本质是一样的,而标准版基于rpc,更稳定,更适合分布式部署。 + +#### Q: 目前paddle hub serving 只支持 imgpath,如果我想用imgurl 去哪里改呢? + +A:图片是在[这里](https://github.com/PaddlePaddle/PaddleOCR/blob/67ef25d593c4eabfaaceb22daade4577f53bed81/deploy/hubserving/ocr_system/module.py#L55)读取的, 可以参考下面的写法,将url path转化为np array + +``` +response = request.urlopen('http://i1.whymtj.com/uploads/tu/201902/9999/52491ae4ba.jpg') +img_array = np.array(bytearray(response.read()), dtype=np.uint8) +img = cv.imdecode(img_array, -1) +``` + +#### Q: C++ 端侧部署可以只对OCR的检测部署吗? + +A:可以的,识别和检测模块是解耦的。如果想对检测部署,需要自己修改一下main函数, 只保留检测相关就可以: [参考](https://github.com/PaddlePaddle/PaddleOCR/blob/de3e2e7cd3b8b65ee02d7a41e570fa5b511a3c1d/deploy/cpp_infer/src/main.cpp#L72) + +#### Q:服务部署可以只发布文本识别,而不带文本检测模型么? + +A:可以的。默认的服务部署是检测和识别串联预测的。也支持单独发布文本检测或文本识别模型,比如使用PaddleHUBPaddleOCR 模型时,deploy下有三个文件夹,分别是 +ocr_det:检测预测 +ocr_rec: 识别预测 +ocr_system: 检测识别串联预测 + +#### Q: lite预测库和nb模型版本不匹配,该如何解决? + +**A**: 如果可以正常预测就不用管,如果这个问题导致无法正常预测,可以尝试使用同一个commit的Paddle Lite代码编译预测库和opt文件,可以参考[移动端部署教程](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.1/deploy/lite/readme.md)。 + +#### Q:如何将PaddleOCR预测模型封装成SDK + +**A**:如果是Python的话,可以使用tools/infer/predict_system.py中的TextSystem进行sdk封装,如果是c++的话,可以使用deploy/cpp_infer/src下面的DBDetector和CRNNRecognizer完成封装 + +#### Q:为什么PaddleOCR检测预测是只支持一张图片测试?即test_batch_size_per_card=1 + +**A**:测试的时候,对图像等比例缩放,最长边960,不同图像等比例缩放后长宽不一致,无法组成batch,所以设置为test_batch_size为1。 + +#### Q:为什么第一张张图预测时间很长,第二张之后预测时间会降低? + +**A**:第一张图需要显存资源初始化,耗时较多。完成模型加载后,之后的预测时间会明显缩短。 + +#### Q: 采用Paddle-Lite进行端侧部署,出现问题,环境没问题 + +**A**:如果你的预测库是自己编译的,那么你的nb文件也要自己编译,用同一个lite版本。不能直接用下载的nb文件,因为版本不同。 + +#### Q: 如何多进程运行paddleocr? + +**A**:实例化多个paddleocr服务,然后将服务注册到注册中心,之后通过注册中心统一调度即可,关于注册中心,可以搜索eureka了解一下具体使用,其他的注册中心也行。 + +#### Q: 如何多进程预测? + +**A**: 近期PaddleOCR新增了[多进程预测控制参数](https://github.com/PaddlePaddle/PaddleOCR/blob/a312647be716776c1aac33ff939ae358a39e8188/tools/infer/utility.py#L103),`use_mp`表示是否使用多进程,`total_process_num`表示在使用多进程时的进程数。具体使用方式请参考[文档](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_ch/inference.md#1-%E8%B6%85%E8%BD%BB%E9%87%8F%E4%B8%AD%E6%96%87ocr%E6%A8%A1%E5%9E%8B%E6%8E%A8%E7%90%86)。 + +#### Q: 怎么解决paddleOCR在T4卡上有越预测越慢的情况? + +**A**: + +1. T4 GPU没有主动散热,因此在测试的时候需要在每次infer之后需要sleep 30ms,否则机器容易因为过热而降频(inference速度会变慢),温度过高也有可能会导致宕机。 +2. T4在不使用的时候,也有可能会降频,因此在做benchmark的时候需要锁频,下面这两条命令可以进行锁频。 + +``` +nvidia-smi -i 0 -pm ENABLED +nvidia-smi --lock-gpu-clocks=1590 -i 0 +``` + +#### Q: 在windows上进行cpp inference的部署时,总是提示找不到`paddle_fluid.dll`和`opencv_world346.dll` + +**A**:有2种方法可以解决这个问题: + +1. 将paddle预测库和opencv库的地址添加到系统环境变量中。 +2. 将提示缺失的dll文件拷贝到编译产出的`ocr_system.exe`文件夹中。 + +#### Q: win下C++部署中文识别乱码的解决方法 + +**A**: win下编码格式不是utf8,而ppocr_keys_v1.txt的编码格式的utf8,将ppocr_keys_v1.txt 的编码从utf-8修改为 Ansi 编码格式就行了。 + +#### Q: windows 3060显卡GPU模式启动 加载模型慢 + +**A**: 30系列的显卡需要使用cuda11。 + +#### Q:想在Mac上部署,从哪里下载预测库呢? 
+ +**A**:Mac上的Paddle预测库可以从这里下载:[https://paddle-inference-lib.bj.bcebos.com/mac/2.0.0/cpu_avx_openblas/paddle_inference.tgz](https://paddle-inference-lib.bj.bcebos.com/mac/2.0.0/cpu_avx_openblas/paddle_inference.tgz) + +#### Q:内网环境如何进行服务化部署呢? + +**A**:仍然可以使用PaddleServing或者HubServing进行服务化部署,保证内网地址可以访问即可。 + +#### Q: 使用hub_serving部署,延时较高,可能的原因是什么呀? + +**A**: 首先,测试的时候第一张图延时较高,可以多测试几张然后观察后几张图的速度;其次,如果是在cpu端部署serving端模型(如backbone为ResNet34),耗时较慢,建议在cpu端部署mobile(如backbone为MobileNetV3)模型。 + +#### Q: 在使用PaddleLite进行预测部署时,启动预测后卡死/手机死机? + +**A**: 请检查模型转换时所用PaddleLite的版本,和预测库的版本是否对齐。即PaddleLite版本为2.8,则预测库版本也要为2.8。 + +#### Q: 预测时显存爆炸、内存泄漏问题? + +**A**: 打开显存/内存优化开关`enable_memory_optim`可以解决该问题,相关代码已合入,[查看详情](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/tools/infer/utility.py#L153)。 diff --git a/docs/algorithm/add_new_algorithm.en.md b/docs/algorithm/add_new_algorithm.en.md new file mode 100644 index 0000000000..f22d785bd5 --- /dev/null +++ b/docs/algorithm/add_new_algorithm.en.md @@ -0,0 +1,307 @@ +--- +comments: true +--- + +# Add New Algorithm + +PaddleOCR decomposes an algorithm into the following parts, and modularizes each part to make it more convenient to develop new algorithms. + +* Data loading and processing +* Network +* Post-processing +* Loss +* Metric +* Optimizer + +The following will introduce each part separately, and introduce how to add the modules required for the new algorithm. + +## Data loading and processing + +Data loading and processing are composed of different modules, which complete the image reading, data augment and label production. This part is under [ppocr/data](../../ppocr/data). The explanation of each file and folder are as follows: + +```bash linenums="1" +ppocr/data/ +├── imaug # Scripts for image reading, data augment and label production +│ ├── label_ops.py # Modules that transform the label +│ ├── operators.py # Modules that transform the image +│ ├──..... +├── __init__.py +├── lmdb_dataset.py # The dataset that reads the lmdb +└── simple_dataset.py # Read the dataset saved in the form of `image_path\tgt` +``` + +PaddleOCR has a large number of built-in image operation related modules. For modules that are not built-in, you can add them through the following steps: + +1. Create a new file under the [ppocr/data/imaug](../../ppocr/data/imaug) folder, such as my_module.py. +2. Add code in the my_module.py file, the sample code is as follows: + + ```python linenums="1" + class MyModule: + def __init__(self, *args, **kwargs): + # your init code + pass + + def __call__(self, data): + img = data['image'] + label = data['label'] + # your process code + + data['image'] = img + data['label'] = label + return data + ``` + +3. Import the added module in the [ppocr/data/imaug/\__init\__.py](../../ppocr/data/imaug/__init__.py) file. + +All different modules of data processing are executed by sequence, combined and executed in the form of a list in the config file. Such as: + +```yaml linenums="1" +# angle class data process +transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - MyModule: + args1: args1 + args2: args2 + - KeepKeys: + keep_keys: [ 'image', 'label' ] # dataloader will return list in this order +``` + +## Network + +The network part completes the construction of the network, and PaddleOCR divides the network into four parts, which are under [ppocr/modeling](../../ppocr/modeling). The data entering the network will pass through these four parts in sequence(transforms->backbones-> +necks->heads). 
+ +```bash linenums="1" +├── architectures # Code for building network +├── transforms # Image Transformation Module +├── backbones # Feature extraction module +├── necks # Feature enhancement module +└── heads # Output module +``` + +PaddleOCR has built-in commonly used modules related to algorithms such as DB, EAST, SAST, CRNN and Attention. For modules that do not have built-in, you can add them through the following steps, the four parts are added in the same steps, take backbones as an example: + +1. Create a new file under the [ppocr/modeling/backbones](../../ppocr/modeling/backbones) folder, such as my_backbone.py. +2. Add code in the my_backbone.py file, the sample code is as follows: + + ```python linenums="1" + import paddle + import paddle.nn as nn + import paddle.nn.functional as F + + + class MyBackbone(nn.Layer): + def __init__(self, *args, **kwargs): + super(MyBackbone, self).__init__() + # your init code + self.conv = nn.xxxx + + def forward(self, inputs): + # your network forward + y = self.conv(inputs) + return y + ``` + +3. Import the added module in the [ppocr/modeling/backbones/\__init\__.py](../../ppocr/modeling/backbones/__init__.py) file. + +After adding the four-part modules of the network, you only need to configure them in the configuration file to use, such as: + +```yaml linenums="1" +Architecture: +model_type: rec +algorithm: CRNN +Transform: + name: MyTransform + args1: args1 + args2: args2 +Backbone: + name: MyBackbone + args1: args1 +Neck: + name: MyNeck + args1: args1 +Head: + name: MyHead + args1: args1 +``` + +## Post-processing + +Post-processing realizes decoding network output to obtain text box or recognized text. This part is under [ppocr/postprocess](../../ppocr/postprocess). +PaddleOCR has built-in post-processing modules related to algorithms such as DB, EAST, SAST, CRNN and Attention. For components that are not built-in, they can be added through the following steps: + +1. Create a new file under the [ppocr/postprocess](../../ppocr/postprocess) folder, such as my_postprocess.py. +2. Add code in the my_postprocess.py file, the sample code is as follows: + + ```python linenums="1" + import paddle + + + class MyPostProcess: + def __init__(self, *args, **kwargs): + # your init code + pass + + def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + # you preds decode code + preds = self.decode_preds(preds) + if label is None: + return preds + # you label decode code + label = self.decode_label(label) + return preds, label + + def decode_preds(self, preds): + # you preds decode code + pass + + def decode_label(self, preds): + # you label decode code + pass + ``` + +3. Import the added module in the [ppocr/postprocess/\__init\__.py](../../ppocr/postprocess/__init__.py) file. + +After the post-processing module is added, you only need to configure it in the configuration file to use, such as: + +```yaml linenums="1" +PostProcess: +name: MyPostProcess +args1: args1 +args2: args2 +``` + +## Loss + +The loss function is used to calculate the distance between the network output and the label. This part is under [ppocr/losses](../../ppocr/losses). +PaddleOCR has built-in loss function modules related to algorithms such as DB, EAST, SAST, CRNN and Attention. For modules that do not have built-in modules, you can add them through the following steps: + +1. Create a new file in the [ppocr/losses](../../ppocr/losses) folder, such as my_loss.py. +2. 
Add code in the my_loss.py file, the sample code is as follows: + + ```python linenums="1" + import paddle + from paddle import nn + + + class MyLoss(nn.Layer): + def __init__(self, **kwargs): + super(MyLoss, self).__init__() + # you init code + pass + + def __call__(self, predicts, batch): + label = batch[1] + # your loss code + loss = self.loss(input=predicts, label=label) + return {'loss': loss} + ``` + +3. Import the added module in the [ppocr/losses/\__init\__.py](../../ppocr/losses/__init__.py) file. + +After the loss function module is added, you only need to configure it in the configuration file to use it, such as: + +```yaml linenums="1" +Loss: + name: MyLoss + args1: args1 + args2: args2 +``` + +## Metric + +Metric is used to calculate the performance of the network on the current batch. This part is under [ppocr/metrics](../../ppocr/metrics). PaddleOCR has built-in evaluation modules related to algorithms such as detection, classification and recognition. For modules that do not have built-in modules, you can add them through the following steps: + +1. Create a new file under the [ppocr/metrics](../../ppocr/metrics) folder, such as my_metric.py. +2. Add code in the my_metric.py file, the sample code is as follows: + + ```python linenums="1" + + class MyMetric(object): + def __init__(self, main_indicator='acc', **kwargs): + # main_indicator is used for select best model + self.main_indicator = main_indicator + self.reset() + + def __call__(self, preds, batch, *args, **kwargs): + # preds is out of postprocess + # batch is out of dataloader + labels = batch[1] + cur_correct_num = 0 + cur_all_num = 0 + # you metric code + self.correct_num += cur_correct_num + self.all_num += cur_all_num + return {'acc': cur_correct_num / cur_all_num, } + + def get_metric(self): + """ + return metrics { + 'acc': 0, + 'norm_edit_dis': 0, + } + """ + acc = self.correct_num / self.all_num + self.reset() + return {'acc': acc} + + def reset(self): + # reset metric + self.correct_num = 0 + self.all_num = 0 + + ``` + +3. Import the added module in the [ppocr/metrics/\__init\__.py](../../ppocr/metrics/__init__.py) file. + +After the metric module is added, you only need to configure it in the configuration file to use it, such as: + +```yaml linenums="1" +Metric: + name: MyMetric + main_indicator: acc +``` + +## Optimizer + +The optimizer is used to train the network. The optimizer also contains network regularization and learning rate decay modules. This part is under [ppocr/optimizer](../../ppocr/optimizer). PaddleOCR has built-in +Commonly used optimizer modules such as `Momentum`, `Adam` and `RMSProp`, common regularization modules such as `Linear`, `Cosine`, `Step` and `Piecewise`, and common learning rate decay modules such as `L1Decay` and `L2Decay`. +Modules without built-in can be added through the following steps, take `optimizer` as an example: + +1. 
Create your own optimizer in the [ppocr/optimizer/optimizer.py](../../ppocr/optimizer/optimizer.py) file, the sample code is as follows: + + ```python linenums="1" + from paddle import optimizer as optim + + + class MyOptim(object): + def __init__(self, learning_rate=0.001, *args, **kwargs): + self.learning_rate = learning_rate + + def __call__(self, parameters): + # It is recommended to wrap the built-in optimizer of paddle + opt = optim.XXX( + learning_rate=self.learning_rate, + parameters=parameters) + return opt + + ``` + +After the optimizer module is added, you only need to configure it in the configuration file to use, such as: + +```yaml linenums="1" +Optimizer: + name: MyOptim + args1: args1 + args2: args2 + lr: + name: Cosine + learning_rate: 0.001 + regularizer: + name: 'L2' + factor: 0 +``` diff --git a/docs/algorithm/add_new_algorithm.md b/docs/algorithm/add_new_algorithm.md new file mode 100644 index 0000000000..4a0fef1936 --- /dev/null +++ b/docs/algorithm/add_new_algorithm.md @@ -0,0 +1,300 @@ +--- +comments: true +--- + +# 添加新算法 + +PaddleOCR将一个算法分解为以下几个部分,并对各部分进行模块化处理,方便快速组合出新的算法。 + +下面将分别对每个部分进行介绍,并介绍如何在该部分里添加新算法所需模块。 + +## 1. 数据加载和处理 + +数据加载和处理由不同的模块(module)组成,其完成了图片的读取、数据增强和label的制作。这一部分在[ppocr/data](../../ppocr/data)下。 各个文件及文件夹作用说明如下: + +```bash linenums="1" +ppocr/data/ +├── imaug # 图片的读取、数据增强和label制作相关的文件 +│ ├── label_ops.py # 对label进行变换的modules +│ ├── operators.py # 对image进行变换的modules +│ ├──..... +├── __init__.py +├── lmdb_dataset.py # 读取lmdb的数据集的dataset +└── simple_dataset.py # 读取以`image_path\tgt`形式保存的数据集的dataset +``` + +PaddleOCR内置了大量图像操作相关模块,对于没有没有内置的模块可通过如下步骤添加: + +1. 在 [ppocr/data/imaug](../../ppocr/data/imaug) 文件夹下新建文件,如my_module.py。 +2. 在 my_module.py 文件内添加相关代码,示例代码如下: + + ```python linenums="1" + class MyModule: + def __init__(self, *args, **kwargs): + # your init code + pass + + def __call__(self, data): + img = data['image'] + label = data['label'] + # your process code + + data['image'] = img + data['label'] = label + return data + ``` + +3. 在 [ppocr/data/imaug/\__init\__.py](../../ppocr/data/imaug/__init__.py) 文件内导入添加的模块。 + +数据处理的所有处理步骤由不同的模块顺序执行而成,在config文件中按照列表的形式组合并执行。如: + +```yaml linenums="1" +# angle class data process +transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - MyModule: + args1: args1 + args2: args2 + - KeepKeys: + keep_keys: [ 'image', 'label' ] # dataloader will return list in this order +``` + +## 2. 网络 + +网络部分完成了网络的组网操作,PaddleOCR将网络划分为四部分,这一部分在[ppocr/modeling](../../ppocr/modeling)下。 进入网络的数据将按照顺序(transforms->backbones-> +necks->heads)依次通过这四个部分。 + +```bash linenums="1" +├── architectures # 网络的组网代码 +├── transforms # 网络的图像变换模块 +├── backbones # 网络的特征提取模块 +├── necks # 网络的特征增强模块 +└── heads # 网络的输出模块 +``` + +PaddleOCR内置了DB,EAST,SAST,CRNN和Attention等算法相关的常用模块,对于没有内置的模块可通过如下步骤添加,四个部分添加步骤一致,以backbones为例: + +1. 在 [ppocr/modeling/backbones](../../ppocr/modeling/backbones) 文件夹下新建文件,如my_backbone.py。 +2. 在 my_backbone.py 文件内添加相关代码,示例代码如下: + + ```python linenums="1" + import paddle + import paddle.nn as nn + import paddle.nn.functional as F + + + class MyBackbone(nn.Layer): + def __init__(self, *args, **kwargs): + super(MyBackbone, self).__init__() + # your init code + self.conv = nn.xxxx + + def forward(self, inputs): + # your network forward + y = self.conv(inputs) + return y + ``` + +3. 
在 [ppocr/modeling/backbones/\__init\__.py](../../ppocr/modeling/backbones/__init__.py)文件内导入添加的模块。 + +在完成网络的四部分模块添加之后,只需要配置文件中进行配置即可使用,如: + +```yaml linenums="1" +Architecture: + model_type: rec + algorithm: CRNN + Transform: + name: MyTransform + args1: args1 + args2: args2 + Backbone: + name: MyBackbone + args1: args1 + Neck: + name: MyNeck + args1: args1 + Head: + name: MyHead + args1: args1 +``` + +## 3. 后处理 + +后处理实现解码网络输出获得文本框或者识别到的文字。这一部分在[ppocr/postprocess](../../ppocr/postprocess)下。 +PaddleOCR内置了DB,EAST,SAST,CRNN和Attention等算法相关的后处理模块,对于没有内置的组件可通过如下步骤添加: + +1. 在 [ppocr/postprocess](../../ppocr/postprocess) 文件夹下新建文件,如 my_postprocess.py。 +2. 在 my_postprocess.py 文件内添加相关代码,示例代码如下: + + ```python linenums="1" + import paddle + + + class MyPostProcess: + def __init__(self, *args, **kwargs): + # your init code + pass + + def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + # you preds decode code + preds = self.decode_preds(preds) + if label is None: + return preds + # you label decode code + label = self.decode_label(label) + return preds, label + + def decode_preds(self, preds): + # you preds decode code + pass + + def decode_label(self, preds): + # you label decode code + pass + ``` + +3. 在 [ppocr/postprocess/\__init\__.py](../../ppocr/postprocess/__init__.py)文件内导入添加的模块。 + +在后处理模块添加之后,只需要配置文件中进行配置即可使用,如: + +```yaml linenums="1" +PostProcess: + name: MyPostProcess + args1: args1 + args2: args2 +``` + +## 4. 损失函数 + +损失函数用于计算网络输出和label之间的距离。这一部分在[ppocr/losses](../../ppocr/losses)下。 +PaddleOCR内置了DB,EAST,SAST,CRNN和Attention等算法相关的损失函数模块,对于没有内置的模块可通过如下步骤添加: + +1. 在 [ppocr/losses](../../ppocr/losses) 文件夹下新建文件,如 my_loss.py。 +2. 在 my_loss.py 文件内添加相关代码,示例代码如下: + + ```python linenums="1" + import paddle + from paddle import nn + + + class MyLoss(nn.Layer): + def __init__(self, **kwargs): + super(MyLoss, self).__init__() + # you init code + pass + + def __call__(self, predicts, batch): + label = batch[1] + # your loss code + loss = self.loss(input=predicts, label=label) + return {'loss': loss} + ``` + +3. 在 [ppocr/losses/\__init\__.py](../../ppocr/losses/__init__.py)文件内导入添加的模块。 + +在损失函数添加之后,只需要配置文件中进行配置即可使用,如: + +```yaml linenums="1" +Loss: + name: MyLoss + args1: args1 + args2: args2 +``` + +## 5. 指标评估 + +指标评估用于计算网络在当前batch上的性能。这一部分在[ppocr/metrics](../../ppocr/metrics)下。 PaddleOCR内置了检测,分类和识别等算法相关的指标评估模块,对于没有内置的模块可通过如下步骤添加: + +1. 在 [ppocr/metrics](../../ppocr/metrics) 文件夹下新建文件,如my_metric.py。 +2. 在 my_metric.py 文件内添加相关代码,示例代码如下: + + ```python linenums="1" + + class MyMetric(object): + def __init__(self, main_indicator='acc', **kwargs): + # main_indicator is used for select best model + self.main_indicator = main_indicator + self.reset() + + def __call__(self, preds, batch, *args, **kwargs): + # preds is out of postprocess + # batch is out of dataloader + labels = batch[1] + cur_correct_num = 0 + cur_all_num = 0 + # you metric code + self.correct_num += cur_correct_num + self.all_num += cur_all_num + return {'acc': cur_correct_num / cur_all_num, } + + def get_metric(self): + """ + return metrics { + 'acc': 0, + 'norm_edit_dis': 0, + } + """ + acc = self.correct_num / self.all_num + self.reset() + return {'acc': acc} + + def reset(self): + # reset metric + self.correct_num = 0 + self.all_num = 0 + + ``` + +3. 在 [ppocr/metrics/\__init\__.py](../../ppocr/metrics/__init__.py)文件内导入添加的模块。 + +在指标评估模块添加之后,只需要配置文件中进行配置即可使用,如: + +```yaml linenums="1" +Metric: + name: MyMetric + main_indicator: acc +``` + +## 6. 
优化器 + +优化器用于训练网络。优化器内部还包含了网络正则化和学习率衰减模块。 这一部分在[ppocr/optimizer](../../ppocr/optimizer)下。 PaddleOCR内置了`Momentum`,`Adam` +和`RMSProp`等常用的优化器模块,`Linear`,`Cosine`,`Step`和`Piecewise`等常用的正则化模块与`L1Decay`和`L2Decay`等常用的学习率衰减模块。 +对于没有内置的模块可通过如下步骤添加,以`optimizer`为例: + +1. 在 [ppocr/optimizer/optimizer.py](../../ppocr/optimizer/optimizer.py) 文件内创建自己的优化器,示例代码如下: + + ```python linenums="1" + from paddle import optimizer as optim + + + class MyOptim(object): + def __init__(self, learning_rate=0.001, *args, **kwargs): + self.learning_rate = learning_rate + + def __call__(self, parameters): + # It is recommended to wrap the built-in optimizer of paddle + opt = optim.XXX( + learning_rate=self.learning_rate, + parameters=parameters) + return opt + + ``` + +在优化器模块添加之后,只需要配置文件中进行配置即可使用,如: + +```yaml linenums="1" +Optimizer: + name: MyOptim + args1: args1 + args2: args2 + lr: + name: Cosine + learning_rate: 0.001 + regularizer: + name: 'L2' + factor: 0 +``` diff --git a/docs/algorithm/end_to_end/algorithm_e2e_pgnet.en.md b/docs/algorithm/end_to_end/algorithm_e2e_pgnet.en.md new file mode 100644 index 0000000000..f3328ced59 --- /dev/null +++ b/docs/algorithm/end_to_end/algorithm_e2e_pgnet.en.md @@ -0,0 +1,224 @@ +--- +comments: true +--- + +## 1. Brief Introduction + +OCR algorithms can be divided into two categories: two-stage algorithm and end-to-end algorithm. The two-stage OCR algorithm is generally divided into two parts, text detection and text recognition algorithm. The text detection algorithm locates the box of the text line from the image, and then the recognition algorithm identifies the content of the text box. The end-to-end OCR algorithm combines text detection and recognition in one algorithm. Its basic idea is to design a model with both detection unit and recognition module, share the CNN features of both and train them together. Because one algorithm can complete character recognition, the end-to-end model is smaller and faster. + +### Introduction Of PGNet Algorithm + +During the recent years, the end-to-end OCR algorithm has been well developed, including MaskTextSpotter series, TextSnake, TextDragon, PGNet series and so on. Among these algorithms, PGNet algorithm has some advantages over the other algorithms. + +- PGNet loss is designed to guide training, and no character-level annotations is needed. +- NMS and ROI related operations are not needed. It can accelerate the prediction +- The reading order prediction module is proposed +- A graph based modification module (GRM) is proposed to further improve the performance of model recognition +- Higher accuracy and faster prediction speed + +For details of PGNet algorithm, please refer to [paper](https://www.aaai.org/AAAI21Papers/AAAI-2885.WangP.pdf). The schematic diagram of the algorithm is as follows: + +![](./images/pgnet_framework.png) + +After feature extraction, the input image is sent to four branches: TBO module for text edge offset prediction, TCL module for text center-line prediction, TDO module for text direction offset prediction, and TCC module for text character classification graph prediction. +The output of TBO and TCL can get text detection results after post-processing, and TCL, TDO and TCC are responsible for text recognition. 
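+
+The snippet below is a minimal, illustrative sketch of such a four-branch head, not the PaddleOCR implementation. A shared feature map is mapped by four parallel branches (reduced here to single 1x1 convolutions) to the TCL, TBO, TDO and TCC outputs; the class name `FourBranchHead`, the output key names and the channel counts are assumptions chosen only to make the structure concrete.
+
+```python linenums="1"
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+
+class FourBranchHead(nn.Layer):
+    """Illustrative four-branch head producing TCL / TBO / TDO / TCC maps."""
+
+    def __init__(self, in_channels=128, num_characters=37):
+        super().__init__()
+        # text center-line score map (1 channel)
+        self.tcl_conv = nn.Conv2D(in_channels, 1, kernel_size=1)
+        # text border offsets (assumed 4 channels: upper/lower edge offsets)
+        self.tbo_conv = nn.Conv2D(in_channels, 4, kernel_size=1)
+        # text direction offsets along the reading order (assumed 2 channels)
+        self.tdo_conv = nn.Conv2D(in_channels, 2, kernel_size=1)
+        # per-pixel character classification map
+        self.tcc_conv = nn.Conv2D(in_channels, num_characters, kernel_size=1)
+
+    def forward(self, feats):
+        return {
+            "f_score": F.sigmoid(self.tcl_conv(feats)),  # with f_border -> detection
+            "f_border": self.tbo_conv(feats),
+            "f_direction": self.tdo_conv(feats),          # with f_char -> recognition
+            "f_char": self.tcc_conv(feats),
+        }
+
+
+if __name__ == "__main__":
+    head = FourBranchHead()
+    shared_feats = paddle.rand([1, 128, 160, 160])  # N, C, H, W
+    for name, out in head(shared_feats).items():
+        print(name, out.shape)
+```
+
+Running the sketch simply prints the shape of each output map, which makes the division of labour between the detection branches and the recognition branches easier to see.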
+ +The results of detection and recognition are as follows: + +![](./images/e2e_res_img293_pgnet.png) + +![](./images/e2e_res_img295_pgnet.png) + +### Performance + +#### Test set: Total Text + +#### Test environment: NVIDIA Tesla V100-SXM2-16GB + +|PGNetA|det_precision|det_recall|det_f_score|e2e_precision|e2e_recall|e2e_f_score|FPS|download| +| --- | --- | --- | --- | --- | --- | --- | --- | --- | +|Paper|85.30|86.80|86.10|-|-|61.70|38.20 (size=640)|-| +|Ours|87.03|82.48|84.69|61.71|58.43|60.03|48.73 (size=768)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/pgnet/en_server_pgnetA.tar)| + +*note:PGNet in PaddleOCR optimizes the prediction speed, and can significantly improve the end-to-end prediction speed within the acceptable range of accuracy reduction* + +## 2. Environment Configuration + +Please refer to [Operation Environment Preparation](../../ppocr/environment.en.md) to configure PaddleOCR operating environment first, refer to [Project Clone](../../ppocr/blog/clone.en.md) to clone the project + +## 3. Quick Use + +### Inference model download + +This section takes the trained end-to-end model as an example to quickly use the model prediction. First, download the trained end-to-end inference model [download address](https://paddleocr.bj.bcebos.com/dygraph_v2.0/pgnet/e2e_server_pgnetA_infer.tar) + +```bash linenums="1" +mkdir inference && cd inference +# Download the English end-to-end model and unzip it +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/pgnet/e2e_server_pgnetA_infer.tar && tar xf e2e_server_pgnetA_infer.tar +``` + +- In Windows environment, if 'wget' is not installed, the link can be copied to the browser when downloading the model, and decompressed and placed in the corresponding directory + +After decompression, there should be the following file structure: + +```text linenums="1" +├── e2e_server_pgnetA_infer +│ ├── inference.pdiparams +│ ├── inference.pdiparams.info +│ └── inference.pdmodel +``` + +### Single image or image set prediction + +```bash linenums="1" +# Prediction single image specified by image_dir +python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_valid_set="totaltext" + +# Prediction the collection of images specified by image_dir +python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_valid_set="totaltext" + +# If you want to use CPU for prediction, you need to set use_gpu parameter is false +python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --use_gpu=False --e2e_pgnet_valid_set="totaltext" +``` + +### Visualization results + +The visualized end-to-end results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with 'e2e_res'. Examples of results are as follows: + +![](./images//e2e_res_img623_pgnet.jpg) + +## 4. Model Training,Evaluation And Inference + +This section takes the totaltext dataset as an example to introduce the training, evaluation and testing of the end-to-end model in PaddleOCR. 
+ +### Data Preparation + +Download and unzip [totaltext](https://paddleocr.bj.bcebos.com/dataset/total_text.tar) dataset to PaddleOCR/train_data/, dataset organization structure is as follow: + +```text linenums="1" +/PaddleOCR/train_data/total_text/train/ + |- rgb/ # total_text training data of dataset + |- img11.png + | ... + |- train.txt # total_text training annotation of dataset +``` + +total_text.txt: the format of dimension file is as follows,the file name and annotation information are separated by "\t": + +```text linenums="1" +" Image file name Image annotation information encoded by json.dumps" +rgb/img11.jpg [{"transcription": "ASRAMA", "points": [[214.0, 325.0], [235.0, 308.0], [259.0, 296.0], [286.0, 291.0], [313.0, 295.0], [338.0, 305.0], [362.0, 320.0], [349.0, 347.0], [330.0, 337.0], [310.0, 329.0], [290.0, 324.0], [269.0, 328.0], [249.0, 336.0], [231.0, 346.0]]}, {...}] +``` + +The image annotation after **json.dumps()** encoding is a list containing multiple dictionaries. + +The `points` in the dictionary represent the coordinates (x, y) of the four points of the text box, arranged clockwise from the point at the upper left corner. + +`transcription` represents the text of the current text box. **When its content is "###" it means that the text box is invalid and will be skipped during training.** + +If you want to train PaddleOCR on other datasets, please build the annotation file according to the above format. + +### Start Training + +PGNet training is divided into two steps: Step 1: training on the synthetic data to get the pretrain_model, and the accuracy of the model is still low; step 2: loading the pretrain_model and training on the totaltext data set; for fast training, we directly provide the pre training model of step 1[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/pgnet/train_step1.tar). + +```bash linenums="1" +cd PaddleOCR/ + +# download step1 pretrain_models +wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/pgnet/train_step1.tar + +# You can get the following file format +./pretrain_models/train_step1/ + └─ best_accuracy.pdopt + └─ best_accuracy.states + └─ best_accuracy.pdparams +``` + +*If CPU version installed, please set the parameter `use_gpu` to `false` in the configuration.* + +```bash linenums="1" +# single GPU training +python3 tools/train.py -c configs/e2e/e2e_r50_vd_pg.yml -o Global.pretrained_model=./pretrain_models/train_step1/best_accuracy Global.load_static_weights=False +# multi-GPU training +# Set the GPU ID used by the '--gpus' parameter. +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/e2e/e2e_r50_vd_pg.yml -o Global.pretrained_model=./pretrain_models/train_step1/best_accuracy Global.load_static_weights=False +``` + +In the above instruction, use `-c` to select the training to use the `configs/e2e/e2e_r50_vd_pg.yml` configuration file. +For a detailed explanation of the configuration file, please refer to [config](../../ppocr/blog/config.en.md). + +You can also use `-o` to change the training parameters without modifying the yml file. For example, adjust the training learning rate to 0.0001 + +```bash linenums="1" +python3 tools/train.py -c configs/e2e/e2e_r50_vd_pg.yml -o Optimizer.base_lr=0.0001 +``` + +#### Load trained model and continue training + +If you would like to load trained model and continue the training again, you can specify the parameter `Global.checkpoints` as the model path to be loaded. 
+ +```bash linenums="1" +python3 tools/train.py -c configs/e2e/e2e_r50_vd_pg.yml -o Global.checkpoints=./your/trained/model +``` + +**Note**: The priority of `Global.checkpoints` is higher than that of `Global.pretrain_weights`, that is, when two parameters are specified at the same time, the model specified by `Global.checkpoints` will be loaded first. If the model path specified by `Global.checkpoints` is wrong, the one specified by `Global.pretrain_weights` will be loaded. + +PaddleOCR calculates three indicators for evaluating performance of OCR end-to-end task: Precision, Recall, and Hmean. + +Run the following code to calculate the evaluation indicators. The result will be saved in the test result file specified by `save_res_path` in the configuration file `e2e_r50_vd_pg.yml` +When evaluating, set post-processing parameters `max_side_len=768`. If you use different datasets, different models for training. +The model parameters during training are saved in the `Global.save_model_dir` directory by default. When evaluating indicators, you need to set `Global.checkpoints` to point to the saved parameter file. + +```bash linenums="1" +python3 tools/eval.py -c configs/e2e/e2e_r50_vd_pg.yml -o Global.checkpoints="{path/to/weights}/best_accuracy" +``` + +### Model Test + +Test the end-to-end result on a single image: + +```bash linenums="1" +python3 tools/infer_e2e.py -c configs/e2e/e2e_r50_vd_pg.yml -o Global.infer_img="./doc/imgs_en/img_10.jpg" Global.pretrained_model="./output/e2e_pgnet/best_accuracy" Global.load_static_weights=false +``` + +Test the end-to-end result on all images in the folder: + +```bash linenums="1" +python3 tools/infer_e2e.py -c configs/e2e/e2e_r50_vd_pg.yml -o Global.infer_img="./doc/imgs_en/" Global.pretrained_model="./output/e2e_pgnet/best_accuracy" Global.load_static_weights=false +``` + +### Model inference + +#### (1).Quadrangle text detection model (ICDAR2015) + +First, convert the model saved in the PGNet end-to-end training process into an inference model. In the first stage of training based on composite dataset, the model of English data set training is taken as an example[model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/pgnet/en_server_pgnetA.tar), you can use the following command to convert: + +```bash linenums="1" +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/pgnet/en_server_pgnetA.tar && tar xf en_server_pgnetA.tar +python3 tools/export_model.py -c configs/e2e/e2e_r50_vd_pg.yml -o Global.pretrained_model=./en_server_pgnetA/best_accuracy Global.load_static_weights=False Global.save_inference_dir=./inference/e2e +``` + +**For PGNet quadrangle end-to-end model inference, you need to set the parameter `--e2e_algorithm="PGNet"` and `--e2e_pgnet_valid_set="partvgg"`**, run the following command: + +```bash linenums="1" +python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img_10.jpg" --e2e_model_dir="./inference/e2e/" --e2e_pgnet_valid_set="partvgg" +``` + +The visualized text detection results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with 'e2e_res'. Examples of results are as follows: + +![](./images//e2e_res_img_10_pgnet.jpg) + +#### (2). 
Curved text detection model (Total-Text) + +For the curved text example, we use the same model as the quadrilateral +**For PGNet end-to-end curved text detection model inference, you need to set the parameter `--e2e_algorithm="PGNet"` and `--e2e_pgnet_valid_set="totaltext"`**, run the following command: + +```bash linenums="1" +python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e/" --e2e_pgnet_valid_set="totaltext" +``` + +The visualized text detection results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with 'e2e_res'. Examples of results are as follows: + +![](./images//e2e_res_img623_pgnet.jpg) diff --git a/docs/algorithm/end_to_end/algorithm_e2e_pgnet.md b/docs/algorithm/end_to_end/algorithm_e2e_pgnet.md new file mode 100644 index 0000000000..0989796984 --- /dev/null +++ b/docs/algorithm/end_to_end/algorithm_e2e_pgnet.md @@ -0,0 +1,220 @@ +--- +comments: true +--- + +## 一、简介 + +OCR算法可以分为两阶段算法和端对端的算法。二阶段OCR算法一般分为两个部分,文本检测和文本识别算法,文件检测算法从图像中得到文本行的检测框,然后识别算法去识别文本框中的内容。而端对端OCR算法可以在一个算法中完成文字检测和文字识别,其基本思想是设计一个同时具有检测单元和识别模块的模型,共享其中两者的CNN特征,并联合训练。由于一个算法即可完成文字识别,端对端模型更小,速度更快。 + +### PGNet算法介绍 + +近些年来,端对端OCR算法得到了良好的发展,包括MaskTextSpotter系列、TextSnake、TextDragon、PGNet系列等算法。在这些算法中,PGNet算法具备其他算法不具备的优势,包括: + +- 设计PGNet loss指导训练,不需要字符级别的标注 +- 不需要NMS和ROI相关操作,加速预测 +- 提出预测文本行内的阅读顺序模块; +- 提出基于图的修正模块(GRM)来进一步提高模型识别性能 +- 精度更高,预测速度更快 + +PGNet算法细节详见[论文](https://www.aaai.org/AAAI21Papers/AAAI-2885.WangP.pdf) ,算法原理图如下所示: + +![](./images/pgnet_framework.png) + +输入图像经过特征提取送入四个分支,分别是:文本边缘偏移量预测TBO模块,文本中心线预测TCL模块,文本方向偏移量预测TDO模块,以及文本字符分类图预测TCC模块。 +其中TBO以及TCL的输出经过后处理后可以得到文本的检测结果,TCL、TDO、TCC负责文本识别。 + +其检测识别效果图如下: + +![](./images//e2e_res_img293_pgnet.png) + +![](./images//e2e_res_img295_pgnet.png) + +### 性能指标 + +#### 测试集: Total Text + +#### 测试环境: NVIDIA Tesla V100-SXM2-16GB + +|PGNetA|det_precision|det_recall|det_f_score|e2e_precision|e2e_recall|e2e_f_score|FPS|下载| +| --- | --- | --- | --- | --- | --- | --- | --- | --- | +|Paper|85.30|86.80|86.10|-|-|61.70|38.20 (size=640)|-| +|Ours|87.03|82.48|84.69|61.71|58.43|60.03|48.73 (size=768)|[下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.0/pgnet/en_server_pgnetA.tar)| + +*note:PaddleOCR里的PGNet实现针对预测速度做了优化,在精度下降可接受范围内,可以显著提升端对端预测速度* + +## 二、环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目 + +## 三、快速使用 + +### inference模型下载 + +本节以训练好的端到端模型为例,快速使用模型预测,首先下载训练好的端到端inference模型[下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/pgnet/e2e_server_pgnetA_infer.tar) + +```bash linenums="1" +mkdir inference && cd inference +# 下载英文端到端模型并解压 +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/pgnet/e2e_server_pgnetA_infer.tar && tar xf e2e_server_pgnetA_infer.tar +``` + +- windows 环境下如果没有安装wget,下载模型时可将链接复制到浏览器中下载,并解压放置在相应目录下 + +解压完毕后应有如下文件结构: + +```text linenums="1" +├── e2e_server_pgnetA_infer +│ ├── inference.pdiparams +│ ├── inference.pdiparams.info +│ └── inference.pdmodel +``` + +### 单张图像或者图像集合预测 + +```bash linenums="1" +# 预测image_dir指定的单张图像 +python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_valid_set="totaltext" + +# 预测image_dir指定的图像集合 +python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_valid_set="totaltext" + +# 如果想使用CPU进行预测,需设置use_gpu参数为False +python3 tools/infer/predict_e2e.py 
--e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e_server_pgnetA_infer/" --e2e_pgnet_valid_set="totaltext" --use_gpu=False +``` + +### 可视化结果 + +可视化文本检测结果默认保存到./inference_results文件夹里面,结果文件的名称前缀为'e2e_res'。结果示例如下: + +![](./images//e2e_res_img623_pgnet.jpg) + +## 四、模型训练、评估、推理 + +本节以totaltext数据集为例,介绍PaddleOCR中端到端模型的训练、评估与测试。 + +### 准备数据 + +下载解压[totaltext](https://paddleocr.bj.bcebos.com/dataset/total_text.tar) 数据集到PaddleOCR/train_data/目录,数据集组织结构: + +```text linenums="1" +/PaddleOCR/train_data/total_text/train/ + |- rgb/ # total_text数据集的训练数据 + |- img11.jpg + | ... + |- train.txt # total_text数据集的训练标注 +``` + +train.txt标注文件格式如下,文件名和标注信息中间用"\t"分隔: + +```text linenums="1" +" 图像文件名 json.dumps编码的图像标注信息" +rgb/img11.jpg [{"transcription": "ASRAMA", "points": [[214.0, 325.0], [235.0, 308.0], [259.0, 296.0], [286.0, 291.0], [313.0, 295.0], [338.0, 305.0], [362.0, 320.0], [349.0, 347.0], [330.0, 337.0], [310.0, 329.0], [290.0, 324.0], [269.0, 328.0], [249.0, 336.0], [231.0, 346.0]]}, {...}] +``` + +json.dumps编码前的图像标注信息是包含多个字典的list,字典中的 `points` 表示文本框的四个点的坐标(x, y),从左上角的点开始顺时针排列。 +`transcription` 表示当前文本框的文字,**当其内容为“###”时,表示该文本框无效,在训练时会跳过。** +如果您想在其他数据集上训练,可以按照上述形式构建标注文件。 + +### 启动训练 + +PGNet训练分为两个步骤:step1: 在合成数据上训练,得到预训练模型,此时模型精度依然较低;step2: 加载预训练模型,在totaltext数据集上训练;为快速训练,我们直接提供了step1的预训练模型。 + +```bash linenums="1" +cd PaddleOCR/ +# 下载step1 预训练模型 +wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/pgnet/train_step1.tar + +# 可以得到以下的文件格式 +./pretrain_models/train_step1/ + └─ best_accuracy.pdopt + └─ best_accuracy.states + └─ best_accuracy.pdparams +``` + +*如果您安装的是cpu版本,请将配置文件中的 `use_gpu` 字段修改为false* + +```bash linenums="1" +# 单机单卡训练 e2e 模型 +python3 tools/train.py -c configs/e2e/e2e_r50_vd_pg.yml -o Global.pretrained_model=./pretrain_models/train_step1/best_accuracy Global.load_static_weights=False +# 单机多卡训练,通过 --gpus 参数设置使用的GPU ID +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/e2e/e2e_r50_vd_pg.yml -o Global.pretrained_model=./pretrain_models/train_step1/best_accuracy Global.load_static_weights=False +``` + +上述指令中,通过-c 选择训练使用configs/e2e/e2e_r50_vd_pg.yml配置文件。 +有关配置文件的详细解释,请参考[链接](../../ppocr/blog/config.md)。 + +您也可以通过-o参数在不需要修改yml文件的情况下,改变训练的参数,比如,调整训练的学习率为0.0001 + +```bash linenums="1" +python3 tools/train.py -c configs/e2e/e2e_r50_vd_pg.yml -o Optimizer.base_lr=0.0001 +``` + +#### 断点训练 + +如果训练程序中断,如果希望加载训练中断的模型从而恢复训练,可以通过指定Global.checkpoints指定要加载的模型路径: + +```bash linenums="1" +python3 tools/train.py -c configs/e2e/e2e_r50_vd_pg.yml -o Global.checkpoints=./your/trained/model +``` + +**注意**:`Global.checkpoints`的优先级高于`Global.pretrain_weights`的优先级,即同时指定两个参数时,优先加载`Global.checkpoints`指定的模型,如果`Global.checkpoints`指定的模型路径有误,会加载`Global.pretrain_weights`指定的模型。 + +PaddleOCR计算三个OCR端到端相关的指标,分别是:Precision、Recall、Hmean。 + +运行如下代码,根据配置文件`e2e_r50_vd_pg.yml`中`save_res_path`指定的测试集检测结果文件,计算评估指标。 + +评估时设置后处理参数`max_side_len=768`,使用不同数据集、不同模型训练,可调整参数进行优化 +训练中模型参数默认保存在`Global.save_model_dir`目录下。在评估指标时,需要设置`Global.checkpoints`指向保存的参数文件。 + +```bash linenums="1" +python3 tools/eval.py -c configs/e2e/e2e_r50_vd_pg.yml -o Global.checkpoints="{path/to/weights}/best_accuracy" +``` + +### 模型预测 + +测试单张图像的端到端识别效果 + +```bash linenums="1" +python3 tools/infer_e2e.py -c configs/e2e/e2e_r50_vd_pg.yml -o Global.infer_img="./doc/imgs_en/img_10.jpg" Global.pretrained_model="./output/e2e_pgnet/best_accuracy" Global.load_static_weights=false +``` + +测试文件夹下所有图像的端到端识别效果 + +```bash linenums="1" +python3 tools/infer_e2e.py -c configs/e2e/e2e_r50_vd_pg.yml -o 
Global.infer_img="./doc/imgs_en/" Global.pretrained_model="./output/e2e_pgnet/best_accuracy" Global.load_static_weights=false +``` + +### 预测推理 + +#### (1). 四边形文本检测模型(ICDAR2015) + +首先将PGNet端到端训练过程中保存的模型,转换成inference model。以基于Resnet50_vd骨干网络,以英文数据集训练的模型为例[模型下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/pgnet/en_server_pgnetA.tar) ,可以使用如下命令进行转换: + +```bash linenums="1" +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/pgnet/en_server_pgnetA.tar && tar xf en_server_pgnetA.tar +python3 tools/export_model.py -c configs/e2e/e2e_r50_vd_pg.yml -o Global.pretrained_model=./en_server_pgnetA/best_accuracy Global.load_static_weights=False Global.save_inference_dir=./inference/e2e +``` + +**PGNet端到端模型推理,需要设置参数`--e2e_algorithm="PGNet"` and `--e2e_pgnet_valid_set="partvgg"`**,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img_10.jpg" --e2e_model_dir="./inference/e2e/" --e2e_pgnet_valid_set="partvgg" --e2e_pgnet_valid_set="totaltext" +``` + +可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为'e2e_res'。结果示例如下: + +![](./images//e2e_res_img_10_pgnet.jpg) + +#### (2). 弯曲文本检测模型(Total-Text) + +对于弯曲文本样例 + +**PGNet端到端模型推理,需要设置参数`--e2e_algorithm="PGNet"`,同时,还需要增加参数`--e2e_pgnet_valid_set="totaltext"`,**可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_e2e.py --e2e_algorithm="PGNet" --image_dir="./doc/imgs_en/img623.jpg" --e2e_model_dir="./inference/e2e/" --e2e_pgnet_valid_set="totaltext" +``` + +可视化文本端到端结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为'e2e_res'。结果示例如下: + +![](./images//e2e_res_img623_pgnet.jpg) diff --git a/docs/algorithm/end_to_end/images/e2e_res_img293_pgnet.png b/docs/algorithm/end_to_end/images/e2e_res_img293_pgnet.png new file mode 100644 index 0000000000..232f8293ad Binary files /dev/null and b/docs/algorithm/end_to_end/images/e2e_res_img293_pgnet.png differ diff --git a/docs/algorithm/end_to_end/images/e2e_res_img295_pgnet.png b/docs/algorithm/end_to_end/images/e2e_res_img295_pgnet.png new file mode 100644 index 0000000000..69337e3adf Binary files /dev/null and b/docs/algorithm/end_to_end/images/e2e_res_img295_pgnet.png differ diff --git a/docs/algorithm/end_to_end/images/e2e_res_img623_pgnet.jpg b/docs/algorithm/end_to_end/images/e2e_res_img623_pgnet.jpg new file mode 100644 index 0000000000..b45dc05f7b Binary files /dev/null and b/docs/algorithm/end_to_end/images/e2e_res_img623_pgnet.jpg differ diff --git a/docs/algorithm/end_to_end/images/e2e_res_img_10_pgnet.jpg b/docs/algorithm/end_to_end/images/e2e_res_img_10_pgnet.jpg new file mode 100644 index 0000000000..a0962993f8 Binary files /dev/null and b/docs/algorithm/end_to_end/images/e2e_res_img_10_pgnet.jpg differ diff --git a/docs/algorithm/end_to_end/images/pgnet_framework.png b/docs/algorithm/end_to_end/images/pgnet_framework.png new file mode 100644 index 0000000000..88fbca3947 Binary files /dev/null and b/docs/algorithm/end_to_end/images/pgnet_framework.png differ diff --git a/docs/algorithm/formula_recognition/algorithm_rec_can.en.md b/docs/algorithm/formula_recognition/algorithm_rec_can.en.md new file mode 100644 index 0000000000..9683c183d5 --- /dev/null +++ b/docs/algorithm/formula_recognition/algorithm_rec_can.en.md @@ -0,0 +1,99 @@ +--- +comments: true +--- + +## 1. 
Introduction
+
+Paper:
+> [When Counting Meets HMER: Counting-Aware Network for Handwritten Mathematical Expression Recognition](https://arxiv.org/abs/2207.11463)
+> Bohan Li, Ye Yuan, Dingkang Liang, Xiao Liu, Zhilong Ji, Jinfeng Bai, Wenyu Liu, Xiang Bai
+> ECCV, 2022
+
+Using the CROHME handwritten mathematical expression recognition dataset for training and evaluating on its test set, the algorithm reproduction effect is as follows:
+
+|Model|Backbone|Config|ExpRate|Download link|
+| --- | --- | --- | --- | --- |
+|CAN|DenseNet|[rec_d28_can.yml](../../configs/rec/rec_d28_can.yml)|51.72%|[trained model](https://paddleocr.bj.bcebos.com/contribution/rec_d28_can_train.tar)|
+
+## 2. Environment
+
+Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md) to clone the project code.
+
+## 3. Model Training / Evaluation / Prediction
+
+Please refer to the [Text Recognition Tutorial](../../ppocr/model_train/recognition.en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**.
+
+### Training
+
+Specifically, after the data preparation is completed, the training can be started. The training command is as follows:
+
+```bash linenums="1"
+# Single GPU training (long training period, not recommended)
+python3 tools/train.py -c configs/rec/rec_d28_can.yml
+
+# Multi GPU training, specify the gpu number through the --gpus parameter
+python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_d28_can.yml
+```
+
+### Evaluation
+
+```bash linenums="1"
+# GPU evaluation
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_d28_can.yml -o Global.pretrained_model=./rec_d28_can_train/best_accuracy.pdparams
+```
+
+Prediction:
+
+```bash linenums="1"
+# The configuration file used for prediction must match the one used for training
+python3 tools/infer_rec.py -c configs/rec/rec_d28_can.yml -o Architecture.Head.attdecoder.is_train=False Global.infer_img='./doc/crohme_demo/hme_00.jpg' Global.pretrained_model=./rec_d28_can_train/best_accuracy.pdparams
+```
+
+## 4. Inference and Deployment
+
+### 4.1 Python Inference
+
+First, the model saved during CAN handwritten mathematical expression recognition training is converted into an inference model. You can use the following command to convert it:
+
+```bash linenums="1"
+python3 tools/export_model.py -c configs/rec/rec_d28_can.yml -o Global.pretrained_model=./rec_d28_can_train/best_accuracy.pdparams Global.save_inference_dir=./inference/rec_d28_can/ Architecture.Head.attdecoder.is_train=False
+
+# The default output max length of the model is 36. If you need to predict a longer sequence, specify an appropriate output length when exporting the model, e.g. Architecture.Head.max_text_length=72
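+
+# A minimal sketch (not from the original instructions): the full export command with a maximum output length of 72 would look like
+# python3 tools/export_model.py -c configs/rec/rec_d28_can.yml -o Global.pretrained_model=./rec_d28_can_train/best_accuracy.pdparams Global.save_inference_dir=./inference/rec_d28_can/ Architecture.Head.attdecoder.is_train=False Architecture.Head.max_text_length=72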
+```
+
+For CAN handwritten mathematical expression recognition model inference, the following commands can be executed:
+
+```bash linenums="1"
+python3 tools/infer/predict_rec.py --image_dir="./doc/datasets/crohme_demo/hme_00.jpg" --rec_algorithm="CAN" --rec_batch_num=1 --rec_model_dir="./inference/rec_d28_can/" --rec_char_dict_path="./ppocr/utils/dict/latex_symbol_dict.txt"
+
+# If you need to predict on a picture with black characters on a white background, please set: --rec_image_inverse=False
+```
+
+### 4.2 C++ Inference
+
+Not supported
+
+### 4.3 Serving
+
+Not supported
+
+### 4.4 More
+
+Not supported
+
+## 5. FAQ
+
+## Citation
+
+```bibtex
+@misc{https://doi.org/10.48550/arxiv.2207.11463,
+  doi = {10.48550/ARXIV.2207.11463},
+  url = {https://arxiv.org/abs/2207.11463},
+  author = {Li, Bohan and Yuan, Ye and Liang, Dingkang and Liu, Xiao and Ji, Zhilong and Bai, Jinfeng and Liu, Wenyu and Bai, Xiang},
+  keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences, FOS: Computer and information sciences},
+  title = {When Counting Meets HMER: Counting-Aware Network for Handwritten Mathematical Expression Recognition},
+  publisher = {arXiv},
+  year = {2022},
+  copyright = {arXiv.org perpetual, non-exclusive license}
+}
+```
diff --git a/docs/algorithm/formula_recognition/algorithm_rec_can.md b/docs/algorithm/formula_recognition/algorithm_rec_can.md
new file mode 100644
index 0000000000..6067fd9b26
--- /dev/null
+++ b/docs/algorithm/formula_recognition/algorithm_rec_can.md
@@ -0,0 +1,155 @@
+---
+typora-copy-images-to: images
+comments: true
+---
+
+# 手写数学公式识别算法-CAN
+
+## 1. 算法简介
+
+论文信息:
+> [When Counting Meets HMER: Counting-Aware Network for Handwritten Mathematical Expression Recognition](https://arxiv.org/abs/2207.11463)
+> Bohan Li, Ye Yuan, Dingkang Liang, Xiao Liu, Zhilong Ji, Jinfeng Bai, Wenyu Liu, Xiang Bai
+> ECCV, 2022
+
+`CAN`使用CROHME手写公式数据集进行训练,在对应测试集上的精度如下:
+
+|模型 |骨干网络|配置文件|ExpRate|下载链接|
+| ----- | ----- | ----- | ----- | ----- |
+|CAN|DenseNet|[rec_d28_can.yml](../../configs/rec/rec_d28_can.yml)|51.72%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_d28_can_train.tar)|
+
+## 2. 环境配置
+
+请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。
+
+## 3. 
模型训练、评估、预测 + +### 3.1 模型训练 + +请参考[文本识别训练教程](../../ppocr/model_train/recognition.md)。PaddleOCR对代码进行了模块化,训练`CAN`识别模型时需要**更换配置文件**为`CAN`的[配置文件](../../configs/rec/rec_d28_can.yml)。 + +#### 启动训练 + +具体地,在完成数据准备后,便可以启动训练,训练命令如下: + +```bash linenums="1" +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_d28_can.yml + +# 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_d28_can.yml +``` + +**注意:** + +- 我们提供的数据集,即[`CROHME数据集`](https://paddleocr.bj.bcebos.com/dataset/CROHME.tar)将手写公式存储为黑底白字的格式,若您自行准备的数据集与之相反,即以白底黑字模式存储,请在训练时做出如下修改 + + ```bash linenums="1" + python3 tools/train.py -c configs/rec/rec_d28_can.yml -o Train.dataset.transforms.GrayImageChannelFormat.inverse=False + ``` + +- 默认每训练1个epoch(1105次iteration)进行1次评估,若您更改训练的batch_size,或更换数据集,请在训练时作出如下修改 + + ```bash linenums="1" + python3 tools/train.py -c configs/rec/rec_d28_can.yml -o Global.eval_batch_step=[0, {length_of_dataset//batch_size}] + ``` + +### 3.2 评估 + +可下载已训练完成的[模型文件](https://paddleocr.bj.bcebos.com/contribution/rec_d28_can_train.tar),使用如下命令进行评估: + +```bash linenums="1" +# 注意将pretrained_model的路径设置为本地路径。若使用自行训练保存的模型,请注意修改路径和文件名为{path/to/weights}/{model_name}。 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_d28_can.yml -o Global.pretrained_model=./rec_d28_can_train/best_accuracy.pdparams +``` + +### 3.3 预测 + +使用如下命令进行单张图片预测: + +```bash linenums="1" +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/infer_rec.py -c configs/rec/rec_d28_can.yml -o Architecture.Head.attdecoder.is_train=False Global.infer_img='./doc/datasets/crohme_demo/hme_00.jpg' Global.pretrained_model=./rec_d28_can_train/best_accuracy.pdparams + +# 预测文件夹下所有图像时,可修改infer_img为文件夹,如 Global.infer_img='./doc/datasets/crohme_demo/'。 +``` + +## 4. 
推理部署 + +### 4.1 Python推理 + +首先将训练得到best模型,转换成inference model。这里以训练完成的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/contribution/rec_d28_can_train.tar) ),可以使用如下命令进行转换: + +```bash linenums="1" +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/export_model.py -c configs/rec/rec_d28_can.yml -o Global.pretrained_model=./rec_d28_can_train/best_accuracy.pdparams Global.save_inference_dir=./inference/rec_d28_can/ Architecture.Head.attdecoder.is_train=False + +# 目前的静态图模型默认的输出长度最大为36,如果您需要预测更长的序列,请在导出模型时指定其输出序列为合适的值,例如 Architecture.Head.max_text_length=72 +``` + +**注意:** +如果您是在自己的数据集上训练的模型,并且调整了字典文件,请注意修改配置文件中的`character_dict_path`是否是所需要的字典文件。 + +转换成功后,在目录下有三个文件: + +```text linenums="1" +/inference/rec_d28_can/ + ├── inference.pdiparams # 识别inference模型的参数文件 + ├── inference.pdiparams.info # 识别inference模型的参数信息,可忽略 + └── inference.pdmodel # 识别inference模型的program文件 +``` + +执行如下命令进行模型推理: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="./doc/datasets/crohme_demo/hme_00.jpg" --rec_algorithm="CAN" --rec_batch_num=1 --rec_model_dir="./inference/rec_d28_can/" --rec_char_dict_path="./ppocr/utils/dict/latex_symbol_dict.txt" + +# 预测文件夹下所有图像时,可修改image_dir为文件夹,如 --image_dir='./doc/datasets/crohme_demo/'。 + +# 如果您需要在白底黑字的图片上进行预测,请设置 --rec_image_inverse=False +``` + +![测试图片样例](./images/hme_00.jpg) + +执行命令后,上面图像的预测结果(识别的文本)会打印到屏幕上,示例如下: + +```bash linenums="1" +Predicts of ./doc/imgs_hme/hme_00.jpg:['x _ { k } x x _ { k } + y _ { k } y x _ { k }', []] +``` + +**注意**: + +- 需要注意预测图像为**黑底白字**,即手写公式部分为白色,背景为黑色的图片。 +- 在推理时需要设置参数`rec_char_dict_path`指定字典,如果您修改了字典,请修改该参数为您的字典文件。 +- 如果您修改了预处理方法,需修改`tools/infer/predict_rec.py`中CAN的预处理为您的预处理方法。 + +### 4.2 C++推理部署 + +由于C++预处理后处理还未支持CAN,所以暂未支持 + +### 4.3 Serving服务化部署 + +暂不支持 + +### 4.4 更多推理部署 + +暂不支持 + +## 5. FAQ + +1. CROHME数据集来自于[CAN源repo](https://github.com/LBH1024/CAN) 。 + +## 引用 + +```bibtex +@misc{https://doi.org/10.48550/arxiv.2207.11463, + doi = {10.48550/ARXIV.2207.11463}, + url = {https://arxiv.org/abs/2207.11463}, + author = {Li, Bohan and Yuan, Ye and Liang, Dingkang and Liu, Xiao and Ji, Zhilong and Bai, Jinfeng and Liu, Wenyu and Bai, Xiang}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {When Counting Meets HMER: Counting-Aware Network for Handwritten Mathematical Expression Recognition}, + publisher = {arXiv}, + year = {2022}, + copyright = {arXiv.org perpetual, non-exclusive license} +} +``` diff --git a/docs/algorithm/formula_recognition/images/hme_00.jpg b/docs/algorithm/formula_recognition/images/hme_00.jpg new file mode 100644 index 0000000000..66ff27db26 Binary files /dev/null and b/docs/algorithm/formula_recognition/images/hme_00.jpg differ diff --git a/docs/algorithm/kie/algorithm_kie_layoutxlm.en.md b/docs/algorithm/kie/algorithm_kie_layoutxlm.en.md new file mode 100644 index 0000000000..a19a9f23b1 --- /dev/null +++ b/docs/algorithm/kie/algorithm_kie_layoutxlm.en.md @@ -0,0 +1,171 @@ +--- +comments: true +--- + +# KIE Algorithm - LayoutXLM + +## 1. Introduction + +Paper: + +> [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) +> +> Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei +> +> 2021 + +On XFUND_zh dataset, the algorithm reproduction Hmean is as follows. 
+
+|Model|Backbone|Task |Config|Hmean|Download link|
+| --- | --- |--|--- | --- | --- |
+|LayoutXLM|LayoutXLM-base|SER |[ser_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml)|90.38%|[trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar)/[inference model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh_infer.tar)|
+|LayoutXLM|LayoutXLM-base|RE | [re_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutxlm_xfund_zh.yml)|74.83%|[trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar)/[inference model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh_infer.tar)|
+
+## 2. Environment
+
+Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md) to clone the project code.
+
+## 3. Model Training / Evaluation / Prediction
+
+Please refer to the [KIE tutorial](../../ppocr/model_train/kie.en.md). PaddleOCR has modularized the code structure, so that you only need to **replace the configuration file** to train different models.
+
+## 4. Inference and Deployment
+
+### 4.1 Python Inference
+
+#### SER
+
+First, we need to export the trained model into an inference model. Take the LayoutXLM model trained on XFUND_zh as an example ([trained model download link](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar)). Use the following command to export it.
+
+``` bash
+wget https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar
+tar -xf ser_LayoutXLM_xfun_zh.tar
+python3 tools/export_model.py -c configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./ser_LayoutXLM_xfun_zh Global.save_inference_dir=./inference/ser_layoutxlm_infer
+```
+
+Use the following command to infer using the LayoutXLM SER model:
+
+```bash linenums="1"
+cd ppstructure
+python3 kie/predict_kie_token_ser.py \
+    --kie_algorithm=LayoutXLM \
+    --ser_model_dir=../inference/ser_layoutxlm_infer \
+    --image_dir=./docs/kie/input/zh_val_42.jpg \
+    --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \
+    --vis_font_path=../doc/fonts/simfang.ttf
+```
+
+The SER visualization results are saved in the `./output` directory by default. The results are as follows.
+
+![](./images/zh_val_42_ser.jpg)
+
+#### RE
+
+First, we need to export the trained model into an inference model. Take the LayoutXLM model trained on XFUND_zh as an example ([trained model download link](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar)). Use the following command to export it.
+
+``` bash
+wget https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar
+tar -xf re_LayoutXLM_xfun_zh.tar
+python3 tools/export_model.py -c configs/kie/layoutlm_series/re_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./re_LayoutXLM_xfun_zh Global.save_inference_dir=./inference/re_layoutxlm_infer
+```
+
+Use the following command to infer using the LayoutXLM RE model:
+
+```bash linenums="1"
+cd ppstructure
+python3 kie/predict_kie_token_ser_re.py \
+    --kie_algorithm=LayoutXLM \
+    --re_model_dir=../inference/re_layoutxlm_infer \
+    --ser_model_dir=../inference/ser_layoutxlm_infer \
+    --image_dir=./docs/kie/input/zh_val_42.jpg \
+    --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \
+    --vis_font_path=../doc/fonts/simfang.ttf
+```
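+
+If no GPU is available, the same script can also be run on the CPU. The following is a minimal sketch, not part of the original instructions; it assumes the common `--use_gpu` flag provided by PaddleOCR's inference tools, and CPU inference is noticeably slower:
+
+```bash linenums="1"
+# CPU-only variant of the RE inference command above (run from the ppstructure directory)
+python3 kie/predict_kie_token_ser_re.py \
+    --kie_algorithm=LayoutXLM \
+    --re_model_dir=../inference/re_layoutxlm_infer \
+    --ser_model_dir=../inference/ser_layoutxlm_infer \
+    --image_dir=./docs/kie/input/zh_val_42.jpg \
+    --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \
+    --vis_font_path=../doc/fonts/simfang.ttf \
+    --use_gpu=False
+```
+
+The RE visualization results are saved in the `./output` directory by default. The results are as follows. 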
+ +![](./images/zh_val_42_re.jpg) + +### 4.2 C++ Inference + +Not supported + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. FAQ + +## Citation + +```bibtex +@article{DBLP:journals/corr/abs-2104-08836, + author = {Yiheng Xu and + Tengchao Lv and + Lei Cui and + Guoxin Wang and + Yijuan Lu and + Dinei Flor{\^{e}}ncio and + Cha Zhang and + Furu Wei}, + title = {LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich + Document Understanding}, + journal = {CoRR}, + volume = {abs/2104.08836}, + year = {2021}, + url = {https://arxiv.org/abs/2104.08836}, + eprinttype = {arXiv}, + eprint = {2104.08836}, + timestamp = {Thu, 14 Oct 2021 09:17:23 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-2104-08836.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{DBLP:journals/corr/abs-1912-13318, + author = {Yiheng Xu and + Minghao Li and + Lei Cui and + Shaohan Huang and + Furu Wei and + Ming Zhou}, + title = {LayoutLM: Pre-training of Text and Layout for Document Image Understanding}, + journal = {CoRR}, + volume = {abs/1912.13318}, + year = {2019}, + url = {http://arxiv.org/abs/1912.13318}, + eprinttype = {arXiv}, + eprint = {1912.13318}, + timestamp = {Mon, 01 Jun 2020 16:20:46 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-1912-13318.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{DBLP:journals/corr/abs-2012-14740, + author = {Yang Xu and + Yiheng Xu and + Tengchao Lv and + Lei Cui and + Furu Wei and + Guoxin Wang and + Yijuan Lu and + Dinei A. F. Flor{\^{e}}ncio and + Cha Zhang and + Wanxiang Che and + Min Zhang and + Lidong Zhou}, + title = {LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding}, + journal = {CoRR}, + volume = {abs/2012.14740}, + year = {2020}, + url = {https://arxiv.org/abs/2012.14740}, + eprinttype = {arXiv}, + eprint = {2012.14740}, + timestamp = {Tue, 27 Jul 2021 09:53:52 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-2012-14740.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` diff --git a/docs/algorithm/kie/algorithm_kie_layoutxlm.md b/docs/algorithm/kie/algorithm_kie_layoutxlm.md new file mode 100644 index 0000000000..225b3c58cd --- /dev/null +++ b/docs/algorithm/kie/algorithm_kie_layoutxlm.md @@ -0,0 +1,171 @@ +--- +comments: true +--- + +# 关键信息抽取算法-LayoutXLM + +## 1. 算法简介 + +论文信息: + +> [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) +> +> Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei +> +> 2021 + +在XFUND_zh数据集上,算法复现效果如下: + +|模型|骨干网络|任务|配置文件|hmean|下载链接| +| --- | --- |--|--- | --- | --- | +|LayoutXLM|LayoutXLM-base|SER |[ser_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml)|90.38%|[训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar)/[推理模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh_infer.tar)| +|LayoutXLM|LayoutXLM-base|RE | [re_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutxlm_xfund_zh.yml)|74.83%|[训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar)/[推理模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh_infer.tar)| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 
模型训练、评估、预测
+
+请参考[关键信息抽取教程](../../ppocr/model_train/kie.md)。PaddleOCR对代码进行了模块化,训练不同的关键信息抽取模型只需要**更换配置文件**即可。
+
+## 4. 推理部署
+
+### 4.1 Python推理
+
+#### SER
+
+首先将训练得到的模型转换成inference model。以LayoutXLM模型在XFUND_zh数据集上训练的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar)),可以使用下面的命令进行转换。
+
+``` bash
+wget https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar
+tar -xf ser_LayoutXLM_xfun_zh.tar
+python3 tools/export_model.py -c configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./ser_LayoutXLM_xfun_zh Global.save_inference_dir=./inference/ser_layoutxlm_infer
+```
+
+LayoutXLM模型基于SER任务进行推理,可以执行如下命令:
+
+```bash linenums="1"
+cd ppstructure
+python3 kie/predict_kie_token_ser.py \
+    --kie_algorithm=LayoutXLM \
+    --ser_model_dir=../inference/ser_layoutxlm_infer \
+    --image_dir=./docs/kie/input/zh_val_42.jpg \
+    --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \
+    --vis_font_path=../doc/fonts/simfang.ttf
+```
+
+SER可视化结果默认保存到`./output`文件夹里面,结果示例如下:
+
+![](./images/zh_val_42_ser.jpg)
+
+#### RE
+
+首先将训练得到的模型转换成inference model。以LayoutXLM模型在XFUND_zh数据集上训练的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar)),可以使用下面的命令进行转换。
+
+``` bash
+wget https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar
+tar -xf re_LayoutXLM_xfun_zh.tar
+python3 tools/export_model.py -c configs/kie/layoutlm_series/re_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./re_LayoutXLM_xfun_zh Global.save_inference_dir=./inference/re_layoutxlm_infer
+```
+
+LayoutXLM模型基于RE任务进行推理,可以执行如下命令:
+
+```bash linenums="1"
+cd ppstructure
+python3 kie/predict_kie_token_ser_re.py \
+    --kie_algorithm=LayoutXLM \
+    --re_model_dir=../inference/re_layoutxlm_infer \
+    --ser_model_dir=../inference/ser_layoutxlm_infer \
+    --image_dir=./docs/kie/input/zh_val_42.jpg \
+    --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \
+    --vis_font_path=../doc/fonts/simfang.ttf
+```
+
+RE可视化结果默认保存到`./output`文件夹里面,结果示例如下:
+
+![](./images/zh_val_42_re.jpg)
+
+### 4.2 C++推理部署
+
+暂不支持
+
+### 4.3 Serving服务化部署
+
+暂不支持
+
+### 4.4 更多推理部署
+
+暂不支持
+
+## 5. 
FAQ + +## 引用 + +```bibtex +@article{DBLP:journals/corr/abs-2104-08836, + author = {Yiheng Xu and + Tengchao Lv and + Lei Cui and + Guoxin Wang and + Yijuan Lu and + Dinei Flor{\^{e}}ncio and + Cha Zhang and + Furu Wei}, + title = {LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich + Document Understanding}, + journal = {CoRR}, + volume = {abs/2104.08836}, + year = {2021}, + url = {https://arxiv.org/abs/2104.08836}, + eprinttype = {arXiv}, + eprint = {2104.08836}, + timestamp = {Thu, 14 Oct 2021 09:17:23 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-2104-08836.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{DBLP:journals/corr/abs-1912-13318, + author = {Yiheng Xu and + Minghao Li and + Lei Cui and + Shaohan Huang and + Furu Wei and + Ming Zhou}, + title = {LayoutLM: Pre-training of Text and Layout for Document Image Understanding}, + journal = {CoRR}, + volume = {abs/1912.13318}, + year = {2019}, + url = {http://arxiv.org/abs/1912.13318}, + eprinttype = {arXiv}, + eprint = {1912.13318}, + timestamp = {Mon, 01 Jun 2020 16:20:46 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-1912-13318.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{DBLP:journals/corr/abs-2012-14740, + author = {Yang Xu and + Yiheng Xu and + Tengchao Lv and + Lei Cui and + Furu Wei and + Guoxin Wang and + Yijuan Lu and + Dinei A. F. Flor{\^{e}}ncio and + Cha Zhang and + Wanxiang Che and + Min Zhang and + Lidong Zhou}, + title = {LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding}, + journal = {CoRR}, + volume = {abs/2012.14740}, + year = {2020}, + url = {https://arxiv.org/abs/2012.14740}, + eprinttype = {arXiv}, + eprint = {2012.14740}, + timestamp = {Tue, 27 Jul 2021 09:53:52 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-2012-14740.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` diff --git a/docs/algorithm/kie/algorithm_kie_sdmgr.en.md b/docs/algorithm/kie/algorithm_kie_sdmgr.en.md new file mode 100644 index 0000000000..2eb7139709 --- /dev/null +++ b/docs/algorithm/kie/algorithm_kie_sdmgr.en.md @@ -0,0 +1,113 @@ +--- +comments: true +--- + +# KIE Algorithm - SDMGR + +## 1. Introduction + +Paper: + +> [Spatial Dual-Modality Graph Reasoning for Key Information Extraction](https://arxiv.org/abs/2103.14470) +> +> Hongbin Sun and Zhanghui Kuang and Xiaoyu Yue and Chenhao Lin and Wayne Zhang +> +> 2021 + +On wildreceipt dataset, the algorithm reproduction Hmean is as follows. + +|Model|Backbone |Cnnfig|Hmean|Download link| +| --- | --- | --- | --- | --- | +|SDMGR|VGG6|[configs/kie/sdmgr/kie_unet_sdmgr.yml](../../configs/kie/sdmgr/kie_unet_sdmgr.yml)|86.70%|[trained model]( https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)/[inference model(coming soon)]()| + +## 2. 环境配置 + +Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md)to clone the project code. + +## 3. Model Training / Evaluation / Prediction + +SDMGR is a key information extraction algorithm that classifies each detected textline into predefined categories, such as order ID, invoice number, amount, etc. + +The training and test data are collected in the wildreceipt dataset, use following command to downloaded the dataset. 
+ +```bash linenums="1" +wget https://paddleocr.bj.bcebos.com/ppstructure/dataset/wildreceipt.tar && tar xf wildreceipt.tar +``` + +Create dataset soft link to `PaddleOCR/train_data` directory. + +```bash linenums="1" +cd PaddleOCR/ && mkdir train_data && cd train_data +ln -s ../../wildreceipt ./ +``` + +### 3.1 Model training + +The config file is `configs/kie/sdmgr/kie_unet_sdmgr.yml`, the default dataset path is `train_data/wildreceipt`. + +Use the following command to train the model. + +```bash linenums="1" +python3 tools/train.py -c configs/kie/sdmgr/kie_unet_sdmgr.yml -o Global.save_model_dir=./output/kie/ +``` + +### 3.2 Model evaluation + +Use the following command to evaluate the model: + +```bash linenums="1" +python3 tools/eval.py -c configs/kie/sdmgr/kie_unet_sdmgr.yml -o Global.checkpoints=./output/kie/best_accuracy +``` + +An example of output information is shown below. + +```bash linenums="1" +[2022/08/10 05:22:23] ppocr INFO: metric eval *************** +[2022/08/10 05:22:23] ppocr INFO: hmean:0.8670120239257812 +[2022/08/10 05:22:23] ppocr INFO: fps:10.18816520530961 +``` + +### 3.3 Model prediction + +Use the following command to load the model and predict. During the prediction, the text file storing the image path and OCR information needs to be loaded in advance. Use `Global.infer_img` to assign. + +```bash linenums="1" +python3 tools/infer_kie.py -c configs/kie/kie_unet_sdmgr.yml -o Global.checkpoints=kie_vgg16/best_accuracy Global.infer_img=./train_data/wildreceipt/1.txt +``` + +The visualization results and texts are saved in the `./output/sdmgr_kie/` directory by default. The results are as follows. + +![img](./images/sdmgr_result.png) + +## 4. Inference and Deployment + +### 4.1 Python Inference + +Not supported + +### 4.2 C++ Inference + +Not supported + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. FAQ + +## Citation + +```bibtex +@misc{sun2021spatial, + title={Spatial Dual-Modality Graph Reasoning for Key Information Extraction}, + author={Hongbin Sun and Zhanghui Kuang and Xiaoyu Yue and Chenhao Lin and Wayne Zhang}, + year={2021}, + eprint={2103.14470}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/docs/algorithm/kie/algorithm_kie_sdmgr.md b/docs/algorithm/kie/algorithm_kie_sdmgr.md new file mode 100644 index 0000000000..04ac586cec --- /dev/null +++ b/docs/algorithm/kie/algorithm_kie_sdmgr.md @@ -0,0 +1,114 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# 关键信息抽取算法-SDMGR + +## 1. 算法简介 + +论文信息: + +> [Spatial Dual-Modality Graph Reasoning for Key Information Extraction](https://arxiv.org/abs/2103.14470) +> +> Hongbin Sun and Zhanghui Kuang and Xiaoyu Yue and Chenhao Lin and Wayne Zhang +> +> 2021 + +在wildreceipt发票公开数据集上,算法复现效果如下: + +|模型|骨干网络|配置文件|hmean|下载链接| +| --- | --- | --- | --- | --- | +|SDMGR|VGG6|[configs/kie/sdmgr/kie_unet_sdmgr.yml](../../configs/kie/sdmgr/kie_unet_sdmgr.yml)|86.70%|[训练模型]( https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)/[推理模型(coming soon)]()| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 
模型训练、评估、预测 + +SDMGR是一个关键信息提取算法,将每个检测到的文本区域分类为预定义的类别,如订单ID、发票号码,金额等。 + +训练和测试的数据采用wildreceipt数据集,通过如下指令下载数据集: + +```bash linenums="1" +wget https://paddleocr.bj.bcebos.com/ppstructure/dataset/wildreceipt.tar && tar xf wildreceipt.tar +``` + +创建数据集软链到PaddleOCR/train_data目录下: + +```bash linenums="1" +cd PaddleOCR/ && mkdir train_data && cd train_data +ln -s ../../wildreceipt ./ +``` + +### 3.1 模型训练 + +训练采用的配置文件是`configs/kie/sdmgr/kie_unet_sdmgr.yml`,配置文件中默认训练数据路径是`train_data/wildreceipt`,准备好数据后,可以通过如下指令执行训练: + +```bash linenums="1" +python3 tools/train.py -c configs/kie/sdmgr/kie_unet_sdmgr.yml -o Global.save_model_dir=./output/kie/ +``` + +### 3.2 模型评估 + +执行下面的命令进行模型评估 + +```bash linenums="1" +python3 tools/eval.py -c configs/kie/sdmgr/kie_unet_sdmgr.yml -o Global.checkpoints=./output/kie/best_accuracy +``` + +输出信息示例如下所示: + +```bash linenums="1" +[2022/08/10 05:22:23] ppocr INFO: metric eval *************** +[2022/08/10 05:22:23] ppocr INFO: hmean:0.8670120239257812 +[2022/08/10 05:22:23] ppocr INFO: fps:10.18816520530961 +``` + +### 3.3 模型预测 + +执行下面的命令进行模型预测,预测的时候需要预先加载存储图片路径以及OCR信息的文本文件,使用`Global.infer_img`进行指定。 + +```bash linenums="1" +python3 tools/infer_kie.py -c configs/kie/kie_unet_sdmgr.yml -o Global.checkpoints=kie_vgg16/best_accuracy Global.infer_img=./train_data/wildreceipt/1.txt +``` + +执行预测后的结果保存在`./output/sdmgr_kie/predicts_kie.txt`文件中,可视化结果保存在`/output/sdmgr_kie/kie_results/`目录下。 + +可视化结果如下图所示: + +![img](./images/sdmgr_result.png) + +## 4. 推理部署 + +### 4.1 Python推理 + +暂不支持 + +### 4.2 C++推理部署 + +暂不支持 + +### 4.3 Serving服务化部署 + +暂不支持 + +### 4.4 更多推理部署 + +暂不支持 + +## 5. FAQ + +## 引用 + +```bibtex +@misc{sun2021spatial, + title={Spatial Dual-Modality Graph Reasoning for Key Information Extraction}, + author={Hongbin Sun and Zhanghui Kuang and Xiaoyu Yue and Chenhao Lin and Wayne Zhang}, + year={2021}, + eprint={2103.14470}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/docs/algorithm/kie/algorithm_kie_vi_layoutxlm.en.md b/docs/algorithm/kie/algorithm_kie_vi_layoutxlm.en.md new file mode 100644 index 0000000000..50b3c8e7ea --- /dev/null +++ b/docs/algorithm/kie/algorithm_kie_vi_layoutxlm.en.md @@ -0,0 +1,168 @@ +--- +comments: true +--- + +# KIE Algorithm - VI-LayoutXLM + +## 1. Introduction + +VI-LayoutXLM is improved based on LayoutXLM. In the process of downstream finetuning, the visual backbone network module is removed, and the model infernce speed is further improved on the basis of almost lossless accuracy. + +On XFUND_zh dataset, the algorithm reproduction Hmean is as follows. + +|Model|Backbone|Task |Config|Hmean|Download link| +| --- | --- |---| --- | --- | --- | +|VI-LayoutXLM |VI-LayoutXLM-base | SER |[ser_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml)|93.19%|[trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar)/[inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar)| +|VI-LayoutXLM |VI-LayoutXLM-base |RE | [re_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh_udml.yml)|83.92%|[trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar)/[inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_infer.tar)| + +## 2. 
Environment + +Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md)to clone the project code. + +## 3. Model Training / Evaluation / Prediction + +Please refer to [KIE tutorial](../../ppocr/model_train/kie.en.md). PaddleOCR has modularized the code structure, so that you only need to **replace the configuration file** to train different models. + +## 4. Inference and Deployment + +### 4.1 Python Inference + +#### SER + +First, we need to export the trained model into inference model. Take VI-LayoutXLM model trained on XFUND_zh as an example ([trained model download link](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar)). Use the following command to export. + +``` bash +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar +tar -xf ser_vi_layoutxlm_xfund_pretrained.tar +python3 tools/export_model.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./ser_vi_layoutxlm_xfund_pretrained/best_accuracy Global.save_inference_dir=./inference/ser_vi_layoutxlm_infer +``` + +Use the following command to infer using VI-LayoutXLM SER model. + +```bash linenums="1" +cd ppstructure +python3 kie/predict_kie_token_ser.py \ + --kie_algorithm=LayoutXLM \ + --ser_model_dir=../inference/ser_vi_layoutxlm_infer \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" +``` + +The SER visualization results are saved in the `./output` folder by default. The results are as follows. + +![](./images/zh_val_42_ser.jpg) + +#### RE + +First, we need to export the trained model into inference model. Take VI-LayoutXLM model trained on XFUND_zh as an example ([trained model download link](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar)). Use the following command to export. + +``` bash +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar +tar -xf re_vi_layoutxlm_xfund_pretrained.tar +python3 tools/export_model.py -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./re_vi_layoutxlm_xfund_pretrained/best_accuracy Global.save_inference_dir=./inference/re_vi_layoutxlm_infer +``` + +Use the following command to infer using VI-LayoutXLM RE model. + +```bash linenums="1" +cd ppstructure +python3 kie/predict_kie_token_ser_re.py \ + --kie_algorithm=LayoutXLM \ + --re_model_dir=../inference/re_vi_layoutxlm_infer \ + --ser_model_dir=../inference/ser_vi_layoutxlm_infer \ + --use_visual_backbone=False \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" +``` + +The RE visualization results are saved in the `./output` folder by default. The results are as follows. + +![](./images/zh_val_42_re.jpg) + +### 4.2 C++ Inference + +Not supported + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. 
FAQ + +## Citation + +```bibtex +@article{DBLP:journals/corr/abs-2104-08836, + author = {Yiheng Xu and + Tengchao Lv and + Lei Cui and + Guoxin Wang and + Yijuan Lu and + Dinei Flor{\^{e}}ncio and + Cha Zhang and + Furu Wei}, + title = {LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich + Document Understanding}, + journal = {CoRR}, + volume = {abs/2104.08836}, + year = {2021}, + url = {https://arxiv.org/abs/2104.08836}, + eprinttype = {arXiv}, + eprint = {2104.08836}, + timestamp = {Thu, 14 Oct 2021 09:17:23 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-2104-08836.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{DBLP:journals/corr/abs-1912-13318, + author = {Yiheng Xu and + Minghao Li and + Lei Cui and + Shaohan Huang and + Furu Wei and + Ming Zhou}, + title = {LayoutLM: Pre-training of Text and Layout for Document Image Understanding}, + journal = {CoRR}, + volume = {abs/1912.13318}, + year = {2019}, + url = {http://arxiv.org/abs/1912.13318}, + eprinttype = {arXiv}, + eprint = {1912.13318}, + timestamp = {Mon, 01 Jun 2020 16:20:46 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-1912-13318.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{DBLP:journals/corr/abs-2012-14740, + author = {Yang Xu and + Yiheng Xu and + Tengchao Lv and + Lei Cui and + Furu Wei and + Guoxin Wang and + Yijuan Lu and + Dinei A. F. Flor{\^{e}}ncio and + Cha Zhang and + Wanxiang Che and + Min Zhang and + Lidong Zhou}, + title = {LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding}, + journal = {CoRR}, + volume = {abs/2012.14740}, + year = {2020}, + url = {https://arxiv.org/abs/2012.14740}, + eprinttype = {arXiv}, + eprint = {2012.14740}, + timestamp = {Tue, 27 Jul 2021 09:53:52 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-2012-14740.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` diff --git a/docs/algorithm/kie/algorithm_kie_vi_layoutxlm.md b/docs/algorithm/kie/algorithm_kie_vi_layoutxlm.md new file mode 100644 index 0000000000..859de15834 --- /dev/null +++ b/docs/algorithm/kie/algorithm_kie_vi_layoutxlm.md @@ -0,0 +1,169 @@ +--- +comments: true +--- + + +# 关键信息抽取算法-VI-LayoutXLM + +## 1. 算法简介 + +VI-LayoutXLM基于LayoutXLM进行改进,在下游任务训练过程中,去除视觉骨干网络模块,最终精度基本无损的情况下,模型推理速度进一步提升。 + +在XFUND_zh数据集上,算法复现效果如下: + +|模型|骨干网络|任务|配置文件|hmean|下载链接| +| --- | --- |---| --- | --- | --- | +|VI-LayoutXLM |VI-LayoutXLM-base | SER |[ser_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml)|93.19%|[训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar)/[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar)| +|VI-LayoutXLM |VI-LayoutXLM-base |RE | [re_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh_udml.yml)|83.92%|[训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar)/[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_infer.tar)| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 模型训练、评估、预测 + +请参考[关键信息抽取教程](../../ppocr/model_train/kie.md)。PaddleOCR对代码进行了模块化,训练不同的关键信息抽取模型只需要**更换配置文件**即可。 + +## 4. 
推理部署 + +### 4.1 Python推理 + +#### SER + +首先将训练得到的模型转换成inference model。以VI-LayoutXLM模型在XFUND_zh数据集上训练的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar)),可以使用下面的命令进行转换。 + +``` bash +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar +tar -xf ser_vi_layoutxlm_xfund_pretrained.tar +python3 tools/export_model.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./ser_vi_layoutxlm_xfund_pretrained/best_accuracy Global.save_inference_dir=./inference/ser_vi_layoutxlm_infer +``` + +VI-LayoutXLM模型基于SER任务进行推理,可以执行如下命令: + +```bash linenums="1" +cd ppstructure +python3 kie/predict_kie_token_ser.py \ + --kie_algorithm=LayoutXLM \ + --ser_model_dir=../inference/ser_vi_layoutxlm_infer \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" +``` + +SER可视化结果默认保存到`./output`文件夹里面,结果示例如下: + +![](./images/zh_val_42_ser.jpg) + +#### RE + +首先将训练得到的模型转换成inference model。以VI-LayoutXLM模型在XFUND_zh数据集上训练的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar)),可以使用下面的命令进行转换。 + +``` bash +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar +tar -xf re_vi_layoutxlm_xfund_pretrained.tar +python3 tools/export_model.py -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./re_vi_layoutxlm_xfund_pretrained/best_accuracy Global.save_inference_dir=./inference/re_vi_layoutxlm_infer +``` + +VI-LayoutXLM模型基于RE任务进行推理,可以执行如下命令: + +```bash linenums="1" +cd ppstructure +python3 kie/predict_kie_token_ser_re.py \ + --kie_algorithm=LayoutXLM \ + --re_model_dir=../inference/re_vi_layoutxlm_infer \ + --ser_model_dir=../inference/ser_vi_layoutxlm_infer \ + --use_visual_backbone=False \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" +``` + +RE可视化结果默认保存到`./output`文件夹里面,结果示例如下: + +![](images/zh_val_42_re.jpg) + +### 4.2 C++推理部署 + +暂不支持 + +### 4.3 Serving服务化部署 + +暂不支持 + +### 4.4 更多推理部署 + +暂不支持 + +## 5. 
FAQ + +## 引用 + +```bibtex +@article{DBLP:journals/corr/abs-2104-08836, + author = {Yiheng Xu and + Tengchao Lv and + Lei Cui and + Guoxin Wang and + Yijuan Lu and + Dinei Flor{\^{e}}ncio and + Cha Zhang and + Furu Wei}, + title = {LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich + Document Understanding}, + journal = {CoRR}, + volume = {abs/2104.08836}, + year = {2021}, + url = {https://arxiv.org/abs/2104.08836}, + eprinttype = {arXiv}, + eprint = {2104.08836}, + timestamp = {Thu, 14 Oct 2021 09:17:23 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-2104-08836.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{DBLP:journals/corr/abs-1912-13318, + author = {Yiheng Xu and + Minghao Li and + Lei Cui and + Shaohan Huang and + Furu Wei and + Ming Zhou}, + title = {LayoutLM: Pre-training of Text and Layout for Document Image Understanding}, + journal = {CoRR}, + volume = {abs/1912.13318}, + year = {2019}, + url = {http://arxiv.org/abs/1912.13318}, + eprinttype = {arXiv}, + eprint = {1912.13318}, + timestamp = {Mon, 01 Jun 2020 16:20:46 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-1912-13318.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{DBLP:journals/corr/abs-2012-14740, + author = {Yang Xu and + Yiheng Xu and + Tengchao Lv and + Lei Cui and + Furu Wei and + Guoxin Wang and + Yijuan Lu and + Dinei A. F. Flor{\^{e}}ncio and + Cha Zhang and + Wanxiang Che and + Min Zhang and + Lidong Zhou}, + title = {LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding}, + journal = {CoRR}, + volume = {abs/2012.14740}, + year = {2020}, + url = {https://arxiv.org/abs/2012.14740}, + eprinttype = {arXiv}, + eprint = {2012.14740}, + timestamp = {Tue, 27 Jul 2021 09:53:52 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-2012-14740.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` diff --git a/docs/algorithm/kie/images/sdmgr_result.png b/docs/algorithm/kie/images/sdmgr_result.png new file mode 100644 index 0000000000..6fa4fe8be7 Binary files /dev/null and b/docs/algorithm/kie/images/sdmgr_result.png differ diff --git a/docs/algorithm/kie/images/zh_val_0_ser.jpg b/docs/algorithm/kie/images/zh_val_0_ser.jpg new file mode 100644 index 0000000000..4605c3a7f3 Binary files /dev/null and b/docs/algorithm/kie/images/zh_val_0_ser.jpg differ diff --git a/docs/algorithm/kie/images/zh_val_21_re.jpg b/docs/algorithm/kie/images/zh_val_21_re.jpg new file mode 100644 index 0000000000..7bf248dd0e Binary files /dev/null and b/docs/algorithm/kie/images/zh_val_21_re.jpg differ diff --git a/docs/algorithm/kie/images/zh_val_40_re.jpg b/docs/algorithm/kie/images/zh_val_40_re.jpg new file mode 100644 index 0000000000..242f9d6e80 Binary files /dev/null and b/docs/algorithm/kie/images/zh_val_40_re.jpg differ diff --git a/docs/algorithm/kie/images/zh_val_42_re.jpg b/docs/algorithm/kie/images/zh_val_42_re.jpg new file mode 100644 index 0000000000..49a0fad352 Binary files /dev/null and b/docs/algorithm/kie/images/zh_val_42_re.jpg differ diff --git a/docs/algorithm/kie/images/zh_val_42_ser.jpg b/docs/algorithm/kie/images/zh_val_42_ser.jpg new file mode 100644 index 0000000000..d69d83569b Binary files /dev/null and b/docs/algorithm/kie/images/zh_val_42_ser.jpg differ diff --git a/docs/algorithm/overview.en.md b/docs/algorithm/overview.en.md new file mode 100755 index 0000000000..9e78ea6ec6 --- /dev/null +++ b/docs/algorithm/overview.en.md @@ -0,0 +1,178 @@ +--- 
+comments: true +--- + +# Algorithms + +This tutorial lists the OCR algorithms supported by PaddleOCR, as well as the models and metrics of each algorithm on **English public datasets**. It is mainly used for algorithm introduction and algorithm performance comparison. For more models on other datasets including Chinese, please refer to [PP-OCRv3 models list](../ppocr/model_list.en.md). + +> +Developers are welcome to contribute more algorithms! Please refer to [add new algorithm](./add_new_algorithm.en.md) guideline. + +## 1. Two-stage OCR Algorithms + +### 1.1 Text Detection Algorithms + +Supported text detection algorithms (Click the link to get the tutorial): + +- [x] [DB && DB++](./text_detection/algorithm_det_db.en.md) +- [x] [EAST](./text_detection/algorithm_det_east.en.md) +- [x] [SAST](./text_detection/algorithm_det_sast.en.md) +- [x] [PSENet](./text_detection/algorithm_det_psenet.en.md) +- [x] [FCENet](./text_detection/algorithm_det_fcenet.en.md) +- [x] [DRRG](./text_detection/algorithm_det_drrg.en.md) +- [x] [CT](./text_detection/algorithm_det_ct.en.md) + +On the ICDAR2015 dataset, the text detection result is as follows: + +|Model|Backbone|Precision|Recall|Hmean|Download link| +| --- | --- | --- | --- | --- | --- | +|EAST|ResNet50_vd|88.71%|81.36%|84.88%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)| +|EAST|MobileNetV3|78.20%|79.10%|78.65%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_east_v2.0_train.tar)| +|DB|ResNet50_vd|86.41%|78.72%|82.38%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_db_v2.0_train.tar)| +|DB|MobileNetV3|77.29%|73.08%|75.12%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_db_v2.0_train.tar)| +|SAST|ResNet50_vd|91.39%|83.77%|87.42%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_icdar15_v2.0_train.tar)| +|PSE|ResNet50_vd|85.81%|79.53%|82.55%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_r50_vd_pse_v2.0_train.tar)| +|PSE|MobileNetV3|82.20%|70.48%|75.89%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_mv3_pse_v2.0_train.tar)| +|DB++|ResNet50|90.89%|82.66%|86.58%|[pretrained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/ResNet50_dcn_asf_synthtext_pretrained.pdparams)/[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_r50_db%2B%2B_icdar15_train.tar)| + +On Total-Text dataset, the text detection result is as follows: + +|Model|Backbone|Precision|Recall|Hmean|Download link| +| --- | --- | --- | --- | --- | --- | +|SAST|ResNet50_vd|89.63%|78.44%|83.66%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_totaltext_v2.0_train.tar)| +|CT|ResNet18_vd|88.68%|81.70%|85.05%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r18_ct_train.tar)| + +On CTW1500 dataset, the text detection result is as follows: + +|Model|Backbone|Precision|Recall|Hmean| Download link| +| --- | --- | --- | --- | --- |---| +|FCE|ResNet50_dcn|88.39%|82.18%|85.27%| [trained model](https://paddleocr.bj.bcebos.com/contribution/det_r50_dcn_fce_ctw_v2.0_train.tar) | +|DRRG|ResNet50_vd|89.92%|80.91%|85.18%|[trained model](https://paddleocr.bj.bcebos.com/contribution/det_r50_drrg_ctw_train.tar)| + +**Note:** Additional data, like icdar2013, icdar2017, COCO-Text, ArT, was added to the model training of SAST. 
Download English public dataset in organized format used by PaddleOCR from: + +- [Baidu Drive](https://pan.baidu.com/s/12cPnZcVuV1zn5DOd4mqjVw) (download code: 2bpi). +- [Google Drive](https://drive.google.com/drive/folders/1ll2-XEVyCQLpJjawLDiRlvo_i4BqHCJe?usp=sharing) + +### 1.2 Text Recognition Algorithms + +Supported text recognition algorithms (Click the link to get the tutorial): + +- [x] [CRNN](./text_recognition/algorithm_rec_crnn.en.md) +- [x] [Rosetta](./text_recognition/algorithm_rec_rosetta.en.md) +- [x] [STAR-Net](./text_recognition/algorithm_rec_starnet.en.md) +- [x] [RARE](./text_recognition/algorithm_rec_rare.en.md) +- [x] [SRN](./text_recognition/algorithm_rec_srn.en.md) +- [x] [NRTR](./text_recognition/algorithm_rec_nrtr.en.md) +- [x] [SAR](./text_recognition/algorithm_rec_sar.en.md) +- [x] [SEED](./text_recognition/algorithm_rec_seed.en.md) +- [x] [SVTR](./text_recognition/algorithm_rec_svtr.en.md) +- [x] [ViTSTR](./text_recognition/algorithm_rec_vitstr.en.md) +- [x] [ABINet](./text_recognition/algorithm_rec_abinet.en.md) +- [x] [VisionLAN](./text_recognition/algorithm_rec_visionlan.en.md) +- [x] [SPIN](./text_recognition/algorithm_rec_spin.en.md) +- [x] [RobustScanner](./text_recognition/algorithm_rec_robustscanner.en.md) +- [x] [RFL](./text_recognition/algorithm_rec_rfl.en.md) +- [x] [ParseQ](./text_recognition/algorithm_rec_parseq.md) +- [x] [CPPD](./text_recognition/algorithm_rec_cppd.en.md) +- [x] [SATRN](./text_recognition/algorithm_rec_satrn.en.md) + +Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation result of these above text recognition (using MJSynth and SynthText for training, evaluate on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE) is as follow: + +|Model|Backbone|Avg Accuracy|Module combination|Download link| +|---|---|---|---|---| +|Rosetta|Resnet34_vd|79.11%|rec_r34_vd_none_none_ctc|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_none_ctc_v2.0_train.tar)| +|Rosetta|MobileNetV3|75.80%|rec_mv3_none_none_ctc|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_none_none_ctc_v2.0_train.tar)| +|CRNN|Resnet34_vd|81.04%|rec_r34_vd_none_bilstm_ctc|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_bilstm_ctc_v2.0_train.tar)| +|CRNN|MobileNetV3|77.95%|rec_mv3_none_bilstm_ctc|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_none_bilstm_ctc_v2.0_train.tar)| +|StarNet|Resnet34_vd|82.85%|rec_r34_vd_tps_bilstm_ctc|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_ctc_v2.0_train.tar)| +|StarNet|MobileNetV3|79.28%|rec_mv3_tps_bilstm_ctc|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_ctc_v2.0_train.tar)| +|RARE|Resnet34_vd|83.98%|rec_r34_vd_tps_bilstm_att |[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_att_v2.0_train.tar)| +|RARE|MobileNetV3|81.76%|rec_mv3_tps_bilstm_att |[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_att_v2.0_train.tar)| +|SRN|Resnet50_vd_fpn| 86.31% | rec_r50fpn_vd_none_srn |[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r50_vd_srn_train.tar)| +|NRTR|NRTR_MTB| 84.21% | rec_mtb_nrtr | [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mtb_nrtr_train.tar) | +|SAR|Resnet31| 87.20% | rec_r31_sar | [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_r31_sar_train.tar) | +|SEED|Aster_Resnet| 85.35% | rec_resnet_stn_bilstm_att | [trained 
model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_resnet_stn_bilstm_att.tar) | +|SVTR|SVTR-Tiny| 89.25% | rec_svtr_tiny_none_ctc_en | [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar) | +|ViTSTR|ViTSTR| 79.82% | rec_vitstr_none_ce | [trained model](https://paddleocr.bj.bcebos.com/rec_vitstr_none_none_train.tar) | +|ABINet|Resnet45| 90.75% | rec_r45_abinet | [trained model](https://paddleocr.bj.bcebos.com/rec_r45_abinet_train.tar) | +|VisionLAN|Resnet45| 90.30% | rec_r45_visionlan | [trained model](https://paddleocr.bj.bcebos.com/VisionLAN/rec_r45_visionlan_train.tar) | +|SPIN|ResNet32| 90.00% | rec_r32_gaspin_bilstm_att | [trained model](https://paddleocr.bj.bcebos.com/contribution/rec_r32_gaspin_bilstm_att.tar) | +|RobustScanner|ResNet31| 87.77% | rec_r31_robustscanner | [trained model](https://paddleocr.bj.bcebos.com/contribution/rec_r31_robustscanner.tar)| +|RFL|ResNetRFL| 88.63% | rec_resnet_rfl_att | [trained model](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl_att_train.tar) | +|ParseQ|VIT| 91.24% | rec_vit_parseq_synth | [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/parseq/rec_vit_parseq_synth.tgz) | +|CPPD|SVTR-Base| 93.8% | rec_svtrnet_cppd_base_en | [trained model](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_en_train.tar) | +|SATRN|ShallowCNN| 88.05% | rec_satrn | [trained model](https://pan.baidu.com/s/10J-Bsd881bimKaclKszlaQ?pwd=lk8a) | + +### 1.3 Text Super-Resolution Algorithms + +Supported text super-resolution algorithms (Click the link to get the tutorial): + +- [x] [Text Gestalt](./super_resolution/algorithm_sr_gestalt.en.md) +- [x] [Text Telescope](./super_resolution/algorithm_sr_telescope.en.md) + +On the TextZoom public dataset, the effect of the algorithm is as follows: + +|Model|Backbone|PSNR_Avg|SSIM_Avg|Config|Download link| +|---|---|---|---|---|---| +|Text Gestalt|tsrn|19.28|0.6560| [configs/sr/sr_tsrn_transformer_strock.yml](../../configs/sr/sr_tsrn_transformer_strock.yml)|[trained model](https://paddleocr.bj.bcebos.com/sr_tsrn_transformer_strock_train.tar)| +|Text Telescope|tbsrn|21.56|0.7411| [configs/sr/sr_telescope.yml](../../configs/sr/sr_telescope.yml)|[trained model](https://paddleocr.bj.bcebos.com/contribution/sr_telescope_train.tar)| + +### 1.4 Formula Recognition Algorithm + +Supported formula recognition algorithms (Click the link to get the tutorial): + +- [x] [CAN](./formula_recognition/algorithm_rec_can.en.md) + +On the CROHME handwritten formula dataset, the effect of the algorithm is as follows: + +|Model |Backbone|Config|ExpRate|Download link| +| ----- | ----- | ----- | ----- | ----- | +|CAN|DenseNet|[rec_d28_can.yml](../../configs/rec/rec_d28_can.yml)|51.72%|[trained model](https://paddleocr.bj.bcebos.com/contribution/rec_d28_can_train.tar)| + +## 2. End-to-end OCR Algorithms + +Supported end-to-end algorithms (Click the link to get the tutorial): + +- [x] [PGNet](./end_to_end/algorithm_e2e_pgnet.en.md) + +## 3. 
Table Recognition Algorithms + +Supported table recognition algorithms (Click the link to get the tutorial): + +- [x] [TableMaster](./table_recognition/algorithm_table_master.en.md) + +On the PubTabNet dataset, the algorithm result is as follows: + +|Model|Backbone|Config|Acc|Download link| +|---|---|---|---|---| +|TableMaster|TableResNetExtra|[configs/table/table_master.yml](../../configs/table/table_master.yml)|77.47%|[trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/tablemaster/table_structure_tablemaster_train.tar) / [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/tablemaster/table_structure_tablemaster_infer.tar)| + +## 4. Key Information Extraction Algorithms + +Supported KIE algorithms (Click the link to get the tutorial): + +- [x] [VI-LayoutXLM](./kie/algorithm_kie_vi_layoutxlm.en.md) +- [x] [LayoutLM](./kie/algorithm_kie_layoutxlm.en.md) +- [x] [LayoutLMv2](./kie/algorithm_kie_layoutxlm.en.md) +- [x] [LayoutXLM](./kie/algorithm_kie_layoutxlm.en.md) +- [x] [SDMGR](./kie/algorithm_kie_sdmgr.en.md) + +On wildreceipt dataset, the algorithm result is as follows: + +|Model|Backbone|Config|Hmean|Download link| +| --- | --- | --- | --- | --- | +|SDMGR|VGG6|[configs/kie/sdmgr/kie_unet_sdmgr.yml](../../configs/kie/sdmgr/kie_unet_sdmgr.yml)|86.70%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)| + +On XFUND_zh dataset, the algorithm result is as follows: + +|Model|Backbone|Task|Config|Hmean|Download link| +| --- | --- | --- | --- | --- | --- | +|VI-LayoutXLM| VI-LayoutXLM-base | SER | [ser_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml)|**93.19%**|[trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar)| +|LayoutXLM| LayoutXLM-base | SER | [ser_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml)|90.38%|[trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar)| +|LayoutLM| LayoutLM-base | SER | [ser_layoutlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutlm_xfund_zh.yml)|77.31%|[trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar)| +|LayoutLMv2| LayoutLMv2-base | SER | [ser_layoutlmv2_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutlmv2_xfund_zh.yml)|85.44%|[trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar)| +|VI-LayoutXLM| VI-LayoutXLM-base | RE | [re_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh_udml.yml)|**83.92%**|[trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar)| +|LayoutXLM| LayoutXLM-base | RE | [re_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutxlm_xfund_zh.yml)|74.83%|[trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar)| +|LayoutLMv2| LayoutLMv2-base | RE | [re_layoutlmv2_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutlmv2_xfund_zh.yml)|67.77%|[trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar)| diff --git a/docs/algorithm/overview.md b/docs/algorithm/overview.md new file mode 100755 index 0000000000..007667ab04 --- /dev/null +++ b/docs/algorithm/overview.md @@ -0,0 +1,180 @@ +--- +comments: true +--- + +# 前沿算法与模型 + +本文给出了PaddleOCR已支持的OCR算法列表,以及每个算法在**英文公开数据集**上的模型和指标,主要用于算法简介和算法性能对比,更多包括中文在内的其他数据集上的模型请参考[PP-OCRv3 系列模型下载](../ppocr/model_list.md)。 + +> 
+PaddleOCR将**持续新增**支持OCR领域前沿算法与模型,**欢迎广大开发者合作共建,贡献更多算法,合入有奖🎁!具体可查看[社区常规赛](https://github.com/PaddlePaddle/PaddleOCR/issues/4982)。** +> +新增算法可参考教程:[使用PaddleOCR架构添加新算法](./add_new_algorithm.md) + +## 1. 两阶段算法 + +### 1.1 文本检测算法 + +已支持的文本检测算法列表(戳链接获取使用教程): + +- [x] [DB与DB++](./text_detection/algorithm_det_db.md) +- [x] [EAST](./text_detection/algorithm_det_east.md) +- [x] [SAST](./text_detection/algorithm_det_sast.md) +- [x] [PSENet](./text_detection/algorithm_det_psenet.md) +- [x] [FCENet](./text_detection/algorithm_det_fcenet.md) +- [x] [DRRG](./text_detection/algorithm_det_drrg.md) +- [x] [CT](./text_detection/algorithm_det_ct.md) + +在ICDAR2015文本检测公开数据集上,算法效果如下: + +|模型|骨干网络|precision|recall|Hmean|下载链接| +| --- | --- | --- | --- | --- | --- | +|EAST|ResNet50_vd|88.71%|81.36%|84.88%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)| +|EAST|MobileNetV3|78.20%|79.10%|78.65%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_east_v2.0_train.tar)| +|DB|ResNet50_vd|86.41%|78.72%|82.38%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_db_v2.0_train.tar)| +|DB|MobileNetV3|77.29%|73.08%|75.12%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_db_v2.0_train.tar)| +|SAST|ResNet50_vd|91.39%|83.77%|87.42%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_icdar15_v2.0_train.tar)| +|PSE|ResNet50_vd|85.81%|79.53%|82.55%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_r50_vd_pse_v2.0_train.tar)| +|PSE|MobileNetV3|82.20%|70.48%|75.89%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_mv3_pse_v2.0_train.tar)| +|DB++|ResNet50|90.89%|82.66%|86.58%|[合成数据预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/ResNet50_dcn_asf_synthtext_pretrained.pdparams)/[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_r50_db%2B%2B_icdar15_train.tar)| + +在Total-text文本检测公开数据集上,算法效果如下: + +|模型|骨干网络|precision|recall|Hmean|下载链接| +| --- | --- | --- | --- | --- | --- | +|SAST|ResNet50_vd|89.63%|78.44%|83.66%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_totaltext_v2.0_train.tar)| +|CT|ResNet18_vd|88.68%|81.70%|85.05%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r18_ct_train.tar)| + +在CTW1500文本检测公开数据集上,算法效果如下: + +|模型|骨干网络|precision|recall|Hmean|下载链接| +| --- | --- | --- | --- | --- | --- | +|FCE|ResNet50_dcn|88.39%|82.18%|85.27%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/det_r50_dcn_fce_ctw_v2.0_train.tar)| +|DRRG|ResNet50_vd|89.92%|80.91%|85.18%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/det_r50_drrg_ctw_train.tar)| + +**说明:** SAST模型训练额外加入了icdar2013、icdar2017、COCO-Text、ArT等公开数据集进行调优。PaddleOCR用到的经过整理格式的英文公开数据集下载: + +- [百度云地址](https://pan.baidu.com/s/12cPnZcVuV1zn5DOd4mqjVw) (提取码: 2bpi) +- [Google Drive下载地址](https://drive.google.com/drive/folders/1ll2-XEVyCQLpJjawLDiRlvo_i4BqHCJe?usp=sharing) + +### 1.2 文本识别算法 + +已支持的文本识别算法列表(戳链接获取使用教程): + +- [x] [CRNN](./text_recognition/algorithm_rec_crnn.md) +- [x] [Rosetta](./text_recognition/algorithm_rec_rosetta.md) +- [x] [STAR-Net](./text_recognition/algorithm_rec_starnet.md) +- [x] [RARE](./text_recognition/algorithm_rec_rare.md) +- [x] [SRN](./text_recognition/algorithm_rec_srn.md) +- [x] [NRTR](./text_recognition/algorithm_rec_nrtr.md) +- [x] [SAR](./text_recognition/algorithm_rec_sar.md) +- [x] [SEED](./text_recognition/algorithm_rec_seed.md) +- [x] [SVTR](./text_recognition/algorithm_rec_svtr.md) +- [x] [ViTSTR](./text_recognition/algorithm_rec_vitstr.md) +- [x] 
[ABINet](./text_recognition/algorithm_rec_abinet.md) +- [x] [VisionLAN](./text_recognition/algorithm_rec_visionlan.md) +- [x] [SPIN](./text_recognition/algorithm_rec_spin.md) +- [x] [RobustScanner](./text_recognition/algorithm_rec_robustscanner.md) +- [x] [RFL](./text_recognition/algorithm_rec_rfl.md) +- [x] [ParseQ](./text_recognition/algorithm_rec_parseq.md) +- [x] [CPPD](./text_recognition/algorithm_rec_cppd.md) +- [x] [SATRN](./text_recognition/algorithm_rec_satrn.md) + +参考[DTRB](https://arxiv.org/abs/1904.01906) (3)文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下: + +|模型|骨干网络|Avg Accuracy|模型存储命名|下载链接| +|---|---|---|---|---| +|Rosetta|Resnet34_vd|79.11%|rec_r34_vd_none_none_ctc|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_none_ctc_v2.0_train.tar)| +|Rosetta|MobileNetV3|75.80%|rec_mv3_none_none_ctc|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_none_none_ctc_v2.0_train.tar)| +|CRNN|Resnet34_vd|81.04%|rec_r34_vd_none_bilstm_ctc|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_bilstm_ctc_v2.0_train.tar)| +|CRNN|MobileNetV3|77.95%|rec_mv3_none_bilstm_ctc|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_none_bilstm_ctc_v2.0_train.tar)| +|StarNet|Resnet34_vd|82.85%|rec_r34_vd_tps_bilstm_ctc|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_ctc_v2.0_train.tar)| +|StarNet|MobileNetV3|79.28%|rec_mv3_tps_bilstm_ctc|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_ctc_v2.0_train.tar)| +|RARE|Resnet34_vd|83.98%|rec_r34_vd_tps_bilstm_att |[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_att_v2.0_train.tar)| +|RARE|MobileNetV3|81.76%|rec_mv3_tps_bilstm_att |[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_att_v2.0_train.tar)| +|SRN|Resnet50_vd_fpn| 86.31% | rec_r50fpn_vd_none_srn | [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r50_vd_srn_train.tar) | +|NRTR|NRTR_MTB| 84.21% | rec_mtb_nrtr | [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mtb_nrtr_train.tar) | +|SAR|Resnet31| 87.20% | rec_r31_sar | [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_r31_sar_train.tar) | +|SEED|Aster_Resnet| 85.35% | rec_resnet_stn_bilstm_att | [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_resnet_stn_bilstm_att.tar) | +|SVTR|SVTR-Tiny| 89.25% | rec_svtr_tiny_none_ctc_en | [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar) | +|ViTSTR|ViTSTR| 79.82% | rec_vitstr_none_ce | [训练模型](https://paddleocr.bj.bcebos.com/rec_vitstr_none_ce_train.tar) | +|ABINet|Resnet45| 90.75% | rec_r45_abinet | [训练模型](https://paddleocr.bj.bcebos.com/rec_r45_abinet_train.tar) | +|VisionLAN|Resnet45| 90.30% | rec_r45_visionlan | [训练模型](https://paddleocr.bj.bcebos.com/VisionLAN/rec_r45_visionlan_train.tar) | +|SPIN|ResNet32| 90.00% | rec_r32_gaspin_bilstm_att | [训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_r32_gaspin_bilstm_att.tar) | +|RobustScanner|ResNet31| 87.77% | rec_r31_robustscanner | [训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_r31_robustscanner.tar)| +|RFL|ResNetRFL| 88.63% | rec_resnet_rfl_att | [训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl_att_train.tar) | +|ParseQ|VIT| 91.24% | rec_vit_parseq_synth | [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/parseq/rec_vit_parseq_synth.tgz) | +|CPPD|SVTR-Base| 93.8% | rec_svtrnet_cppd_base_en | 
[训练模型](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_en_train.tar) | +|SATRN|ShallowCNN| 88.05% | rec_satrn | [训练模型](https://pan.baidu.com/s/10J-Bsd881bimKaclKszlaQ?pwd=lk8a) | + +### 1.3 文本超分辨率算法 + +已支持的文本超分辨率算法列表(戳链接获取使用教程): + +- [x] [Text Gestalt](./super_resolution/algorithm_sr_gestalt.md) +- [x] [Text Telescope](./super_resolution/algorithm_sr_telescope.md) + +在TextZoom公开数据集上,算法效果如下: + +|模型|骨干网络|PSNR_Avg|SSIM_Avg|配置文件|下载链接| +|---|---|---|---|---|---| +|Text Gestalt|tsrn|19.28|0.6560| [configs/sr/sr_tsrn_transformer_strock.yml](../../configs/sr/sr_tsrn_transformer_strock.yml)|[训练模型](https://paddleocr.bj.bcebos.com/sr_tsrn_transformer_strock_train.tar)| +|Text Telescope|tbsrn|21.56|0.7411| [configs/sr/sr_telescope.yml](../../configs/sr/sr_telescope.yml)|[训练模型](https://paddleocr.bj.bcebos.com/contribution/sr_telescope_train.tar)| + +### 1.4 公式识别算法 + +已支持的公式识别算法列表(戳链接获取使用教程): + +- [x] [CAN](./formula_recognition/algorithm_rec_can.md) + +在CROHME手写公式数据集上,算法效果如下: + +|模型 |骨干网络|配置文件|ExpRate|下载链接| +| ----- | ----- | ----- | ----- | ----- | +|CAN|DenseNet|[rec_d28_can.yml](../../configs/rec/rec_d28_can.yml)|51.72%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_d28_can_train.tar)| + +## 2. 端到端算法 + +已支持的端到端OCR算法列表(戳链接获取使用教程): + +- [x] [PGNet](./end_to_end/algorithm_e2e_pgnet.md) + +## 3. 表格识别算法 + +已支持的表格识别算法列表(戳链接获取使用教程): + +- [x] [TableMaster](./table_recognition/algorithm_table_master.md) + +在PubTabNet表格识别公开数据集上,算法效果如下: + +|模型|骨干网络|配置文件|acc|下载链接| +|---|---|---|---|---| +|TableMaster|TableResNetExtra|[configs/table/table_master.yml](../../configs/table/table_master.yml)|77.47%|[训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/tablemaster/table_structure_tablemaster_train.tar) / [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/tablemaster/table_structure_tablemaster_infer.tar)| + +## 4. 
关键信息抽取算法 + +已支持的关键信息抽取算法列表(戳链接获取使用教程): + +- [x] [VI-LayoutXLM](./kie/algorithm_kie_vi_layoutxlm.md) +- [x] [LayoutLM](./kie/algorithm_kie_layoutxlm.md) +- [x] [LayoutLMv2](./kie/algorithm_kie_layoutxlm.md) +- [x] [LayoutXLM](./kie/algorithm_kie_layoutxlm.md) +- [x] [SDMGR](./kie/algorithm_kie_sdmgr.md) + +在wildreceipt发票公开数据集上,算法复现效果如下: + +|模型|骨干网络|配置文件|hmean|下载链接| +| --- | --- | --- | --- | --- | +|SDMGR|VGG6|[configs/kie/sdmgr/kie_unet_sdmgr.yml](../../configs/kie/sdmgr/kie_unet_sdmgr.yml)|86.70%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)| + +在XFUND_zh公开数据集上,算法效果如下: + +|模型|骨干网络|任务|配置文件|hmean|下载链接| +| --- | --- | --- | --- | --- | --- | +|VI-LayoutXLM| VI-LayoutXLM-base | SER | [ser_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml)|**93.19%**|[训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar)| +|LayoutXLM| LayoutXLM-base | SER | [ser_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml)|90.38%|[训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar)| +|LayoutLM| LayoutLM-base | SER | [ser_layoutlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutlm_xfund_zh.yml)|77.31%|[训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar)| +|LayoutLMv2| LayoutLMv2-base | SER | [ser_layoutlmv2_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutlmv2_xfund_zh.yml)|85.44%|[训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar)| +|VI-LayoutXLM| VI-LayoutXLM-base | RE | [re_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh_udml.yml)|**83.92%**|[训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar)| +|LayoutXLM| LayoutXLM-base | RE | [re_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutxlm_xfund_zh.yml)|74.83%|[训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar)| +|LayoutLMv2| LayoutLMv2-base | RE | [re_layoutlmv2_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutlmv2_xfund_zh.yml)|67.77%|[训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar)| diff --git a/docs/algorithm/super_resolution/algorithm_sr_gestalt.en.md b/docs/algorithm/super_resolution/algorithm_sr_gestalt.en.md new file mode 100644 index 0000000000..7f29b4a943 --- /dev/null +++ b/docs/algorithm/super_resolution/algorithm_sr_gestalt.en.md @@ -0,0 +1,107 @@ +--- +comments: true +--- + +# Text Gestalt + +## 1. Introduction + +Paper: +> [Text Gestalt: Stroke-Aware Scene Text Image Super-Resolution](https://arxiv.org/pdf/2112.08171.pdf) +> Chen, Jingye and Yu, Haiyang and Ma, Jianqi and Li, Bin and Xue, Xiangyang +> AAAI, 2022 + +Referring to the [FudanOCR](https://github.com/FudanVI/FudanOCR/tree/main/text-gestalt) data download instructions, the effect of the super-score algorithm on the TextZoom test set is as follows: + +|Model | Backbone|config|Acc|Download link| +|---|---|---|---|---| +|Text Gestalt|tsrn|19.28|0.6560| [configs/sr/sr_tsrn_transformer_strock.yml](../../configs/sr/sr_tsrn_transformer_strock.yml)|[train model](https://paddleocr.bj.bcebos.com/sr_tsrn_transformer_strock_train.tar)| + +## 2. Environment + +Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md)to clone the project code. + +## 3. 
Model Training / Evaluation / Prediction + +Please refer to [Text Recognition Tutorial](../../ppocr/model_train/recognition.en.md). PaddleOCR modularizes the code, and training different models only requires **changing the configuration file**. + +### Training + +Specifically, after the data preparation is completed, the training can be started. The training command is as follows: + +```bash linenums="1" +# Single GPU training (long training period, not recommended) +python3 tools/train.py -c configs/sr/sr_tsrn_transformer_strock.yml + +# Multi GPU training, specify the gpu number through the --gpus parameter +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/sr/sr_tsrn_transformer_strock.yml +``` + +### Evaluation + +```bash linenums="1" +# GPU evaluation +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/sr/sr_tsrn_transformer_strock.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### Prediction + +```bash linenums="1" +# The configuration file used for prediction must match the training + +python3 tools/infer_sr.py -c configs/sr/sr_tsrn_transformer_strock.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words_en/word_52.png +``` + +![img](./images/word_52.png) + +After executing the command, the super-resolution result of the above image is as follows: + +![img](./images/sr_word_52.png) + +## 4. Inference and Deployment + +### 4.1 Python Inference + +First, the model saved during the training process is converted into an inference model. ( [Model download link](https://paddleocr.bj.bcebos.com/sr_tsrn_transformer_strock_train.tar) ), you can use the following command to convert: + +```bash linenums="1" +python3 tools/export_model.py -c configs/sr/sr_tsrn_transformer_strock.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.save_inference_dir=./inference/sr_out +``` + +For Text-Gestalt super-resolution model inference, the following commands can be executed: + +```bash linenums="1" +python3 tools/infer/predict_sr.py --sr_model_dir=./inference/sr_out --image_dir=doc/imgs_words_en/word_52.png --sr_image_shape=3,32,128 +``` + +After executing the command, the super-resolution result of the above image is as follows: + +![img](./images/sr_word_52-20240704093810101.png) + +### 4.2 C++ Inference + +Not supported + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. FAQ + +## Citation + +```bibtex +@inproceedings{chen2022text, + title={Text gestalt: Stroke-aware scene text image super-resolution}, + author={Chen, Jingye and Yu, Haiyang and Ma, Jianqi and Li, Bin and Xue, Xiangyang}, + booktitle={Proceedings of the AAAI Conference on Artificial Intelligence}, + volume={36}, + number={1}, + pages={285--293}, + year={2022} +} +``` diff --git a/docs/algorithm/super_resolution/algorithm_sr_gestalt.md b/docs/algorithm/super_resolution/algorithm_sr_gestalt.md new file mode 100644 index 0000000000..83c7004a6a --- /dev/null +++ b/docs/algorithm/super_resolution/algorithm_sr_gestalt.md @@ -0,0 +1,107 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# Text Gestalt + +## 1. 
算法简介 + +论文信息: +> [Text Gestalt: Stroke-Aware Scene Text Image Super-Resolution](https://arxiv.org/pdf/2112.08171.pdf) +> Chen, Jingye and Yu, Haiyang and Ma, Jianqi and Li, Bin and Xue, Xiangyang +> AAAI, 2022 + +参考[FudanOCR](https://github.com/FudanVI/FudanOCR/tree/main/text-gestalt) 数据下载说明,在TextZoom测试集合上超分算法效果如下: + +|模型|骨干网络|PSNR_Avg|SSIM_Avg|配置文件|下载链接| +|---|---|---|---|---|---| +|Text Gestalt|tsrn|19.28|0.6560| [configs/sr/sr_tsrn_transformer_strock.yml](../../configs/sr/sr_tsrn_transformer_strock.yml)|[训练模型](https://paddleocr.bj.bcebos.com/sr_tsrn_transformer_strock_train.tar)| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 模型训练、评估、预测 + +请参考[文本识别训练教程](../../ppocr/model_train/recognition.md)。PaddleOCR对代码进行了模块化,训练不同的识别模型只需要**更换配置文件**即可。 + +### 训练 + +在完成数据准备后,便可以启动训练,训练命令如下: + +```bash linenums="1" +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/sr/sr_tsrn_transformer_strock.yml + +# 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/sr/sr_tsrn_transformer_strock.yml +``` + +### 评估 + +```bash linenums="1" +# GPU 评估, Global.pretrained_model 为待测权重 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/sr/sr_tsrn_transformer_strock.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### 预测 + +```bash linenums="1" +# 预测使用的配置文件必须与训练一致 +python3 tools/infer_sr.py -c configs/sr/sr_tsrn_transformer_strock.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words_en/word_52.png +``` + +![img](./images/word_52.png) + +执行命令后,上面图像的超分结果如下: + +![img](./images/sr_word_52.png) + +## 4. 推理部署 + +### 4.1 Python推理 + +首先将文本超分训练过程中保存的模型,转换成inference model。以 Text-Gestalt 训练的[模型](https://paddleocr.bj.bcebos.com/sr_tsrn_transformer_strock_train.tar) 为例,可以使用如下命令进行转换: + +```bash linenums="1" +python3 tools/export_model.py -c configs/sr/sr_tsrn_transformer_strock.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.save_inference_dir=./inference/sr_out +``` + +Text-Gestalt 文本超分模型推理,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_sr.py --sr_model_dir=./inference/sr_out --image_dir=doc/imgs_words_en/word_52.png --sr_image_shape=3,32,128 +``` + +执行命令后,图像的超分结果如下: + +![img](./images/sr_word_52-20240704093810101.png) + +### 4.2 C++推理 + +暂未支持 + +### 4.3 Serving服务化部署 + +暂未支持 + +### 4.4 更多推理部署 + +暂未支持 + +## 5. FAQ + +## 引用 + +```bibtex +@inproceedings{chen2022text, + title={Text gestalt: Stroke-aware scene text image super-resolution}, + author={Chen, Jingye and Yu, Haiyang and Ma, Jianqi and Li, Bin and Xue, Xiangyang}, + booktitle={Proceedings of the AAAI Conference on Artificial Intelligence}, + volume={36}, + number={1}, + pages={285--293}, + year={2022} +} +``` diff --git a/docs/algorithm/super_resolution/algorithm_sr_telescope.en.md b/docs/algorithm/super_resolution/algorithm_sr_telescope.en.md new file mode 100644 index 0000000000..edfe3baeb0 --- /dev/null +++ b/docs/algorithm/super_resolution/algorithm_sr_telescope.en.md @@ -0,0 +1,110 @@ +--- +comments: true +--- + +# Text Gestalt + +## 1. 
Introduction
+
+Paper:
+> [Scene Text Telescope: Text-Focused Scene Image Super-Resolution](https://openaccess.thecvf.com/content/CVPR2021/papers/Chen_Scene_Text_Telescope_Text-Focused_Scene_Image_Super-Resolution_CVPR_2021_paper.pdf)
+> Chen, Jingye, Bin Li, and Xiangyang Xue
+> CVPR, 2021
+
+Referring to the [FudanOCR](https://github.com/FudanVI/FudanOCR/tree/main/scene-text-telescope) data download instructions, the results of the super-resolution algorithm on the TextZoom test set are as follows:
+
+|Model|Backbone|PSNR_Avg|SSIM_Avg|Config|Download link|
+|---|---|---|---|---|---|
+|Text Telescope|tbsrn|21.56|0.7411| [configs/sr/sr_telescope.yml](../../configs/sr/sr_telescope.yml)|[trained model](https://paddleocr.bj.bcebos.com/contribution/sr_telescope_train.tar)|
+
+The [TextZoom dataset](https://paddleocr.bj.bcebos.com/dataset/TextZoom.tar) comes from two super-resolution datasets, RealSR and SR-RAW, both of which contain LR-HR pairs. TextZoom contains 17,367 training pairs and 4,373 test pairs.
+
+## 2. Environment
+
+Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md) to clone the project code.
+
+## 3. Model Training / Evaluation / Prediction
+
+Please refer to the [Text Recognition Tutorial](../../ppocr/model_train/recognition.en.md). PaddleOCR modularizes the code, and training different models only requires **changing the configuration file**.
+
+### Training
+
+After the data preparation is completed, training can be started. The training command is as follows:
+
+```bash linenums="1"
+# Single GPU training (long training period, not recommended)
+python3 tools/train.py -c configs/sr/sr_telescope.yml
+
+# Multi GPU training, specify the GPU ids through the --gpus parameter
+python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/sr/sr_telescope.yml
+```
+
+### Evaluation
+
+```bash linenums="1"
+# GPU evaluation
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/sr/sr_telescope.yml -o Global.pretrained_model={path/to/weights}/best_accuracy
+```
+
+### Prediction
+
+```bash linenums="1"
+# The configuration file used for prediction must match the one used for training
+python3 tools/infer_sr.py -c configs/sr/sr_telescope.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words_en/word_52.png
+```
+
+![img](./images/word_52-20240704094304807.png)
+
+After executing the command, the super-resolution result of the above image is as follows:
+
+![img](./images/sr_word_52-20240704094309205.png)
+
+## 4. Inference and Deployment
+
+### 4.1 Python Inference
+
+First, the model saved during the training process is converted into an inference model.
( [Model download link](https://paddleocr.bj.bcebos.com/contribution/Telescope_train.tar.gz) ), you can use the following command to convert: + +```bash linenums="1" +python3 tools/export_model.py -c configs/sr/sr_telescope.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.save_inference_dir=./inference/sr_out +``` + +For Text-Telescope super-resolution model inference, the following commands can be executed: + +```bash linenums="1" +python3 tools/infer/predict_sr.py --sr_model_dir=./inference/sr_out --image_dir=doc/imgs_words_en/word_52.png --sr_image_shape=3,32,128 + +``` + +After executing the command, the super-resolution result of the above image is as follows: + +![img](./images/sr_word_52-20240704094309205.png) + +### 4.2 C++ Inference + +Not supported + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. FAQ + +## Citation + +```bibtex +@INPROCEEDINGS{9578891, + author={Chen, Jingye and Li, Bin and Xue, Xiangyang}, + booktitle={2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + title={Scene Text Telescope: Text-Focused Scene Image Super-Resolution}, + year={2021}, + volume={}, + number={}, + pages={12021-12030}, + doi={10.1109/CVPR46437.2021.01185}} +``` diff --git a/docs/algorithm/super_resolution/algorithm_sr_telescope.md b/docs/algorithm/super_resolution/algorithm_sr_telescope.md new file mode 100644 index 0000000000..5bb99b07fd --- /dev/null +++ b/docs/algorithm/super_resolution/algorithm_sr_telescope.md @@ -0,0 +1,109 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# Text Telescope + +## 1. 算法简介 + +论文信息: +> [Scene Text Telescope: Text-Focused Scene Image Super-Resolution](https://openaccess.thecvf.com/content/CVPR2021/papers/Chen_Scene_Text_Telescope_Text-Focused_Scene_Image_Super-Resolution_CVPR_2021_paper.pdf) +> Chen, Jingye, Bin Li, and Xiangyang Xue +> CVPR, 2021 + +参考[FudanOCR](https://github.com/FudanVI/FudanOCR/tree/main/scene-text-telescope) 数据下载说明,在TextZoom测试集合上超分算法效果如下: + +|模型|骨干网络|PSNR_Avg|SSIM_Avg|配置文件|下载链接| +|---|---|---|---|---|---| +|Text Telescope|tbsrn|21.56|0.7411| [configs/sr/sr_telescope.yml](../../configs/sr/sr_telescope.yml)|[训练模型](https://paddleocr.bj.bcebos.com/contribution/sr_telescope_train.tar)| + +[TextZoom数据集](https://paddleocr.bj.bcebos.com/dataset/TextZoom.tar) 来自两个超分数据集RealSR和SR-RAW,两个数据集都包含LR-HR对,TextZoom有17367对训数据和4373对测试数据。 + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 模型训练、评估、预测 + +请参考[文本识别训练教程](../../ppocr/model_train/recognition.md)。PaddleOCR对代码进行了模块化,训练不同的识别模型只需要**更换配置文件**即可。 + +### 训练 + +在完成数据准备后,便可以启动训练,训练命令如下: + +```bash linenums="1" +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/sr/sr_telescope.yml + +# 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/sr/sr_telescope.yml +``` + +### 评估 + +```bash linenums="1" +# GPU 评估, Global.pretrained_model 为待测权重 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/sr/sr_telescope.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### 预测 + +```bash linenums="1" +# 预测使用的配置文件必须与训练一致 +python3 tools/infer_sr.py -c configs/sr/sr_telescope.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words_en/word_52.png +``` + +![img](./images/word_52-20240704094304807.png) + +执行命令后,上面图像的超分结果如下: + +![img](./images/sr_word_52-20240704094309205.png) + +## 4. 
推理部署 + +### 4.1 Python推理 + +首先将文本超分训练过程中保存的模型,转换成inference model。以 Text-Telescope 训练的[模型](https://paddleocr.bj.bcebos.com/contribution/Telescope_train.tar.gz) 为例,可以使用如下命令进行转换: + +```bash linenums="1" +python3 tools/export_model.py -c configs/sr/sr_telescope.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.save_inference_dir=./inference/sr_out +``` + +Text-Telescope 文本超分模型推理,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_sr.py --sr_model_dir=./inference/sr_out --image_dir=doc/imgs_words_en/word_52.png --sr_image_shape=3,32,128 +``` + +执行命令后,图像的超分结果如下: + +![img](./images/sr_word_52-20240704094309205.png) + +### 4.2 C++推理 + +暂未支持 + +### 4.3 Serving服务化部署 + +暂未支持 + +### 4.4 更多推理部署 + +暂未支持 + +## 5. FAQ + +## 引用 + +```bibtex +@INPROCEEDINGS{9578891, + author={Chen, Jingye and Li, Bin and Xue, Xiangyang}, + booktitle={2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + title={Scene Text Telescope: Text-Focused Scene Image Super-Resolution}, + year={2021}, + volume={}, + number={}, + pages={12021-12030}, + doi={10.1109/CVPR46437.2021.01185}} +``` diff --git a/docs/algorithm/super_resolution/images/sr_word_52-20240704093810101.png b/docs/algorithm/super_resolution/images/sr_word_52-20240704093810101.png new file mode 100644 index 0000000000..c983e9ad7a Binary files /dev/null and b/docs/algorithm/super_resolution/images/sr_word_52-20240704093810101.png differ diff --git a/docs/algorithm/super_resolution/images/sr_word_52-20240704094309205.png b/docs/algorithm/super_resolution/images/sr_word_52-20240704094309205.png new file mode 100644 index 0000000000..c983e9ad7a Binary files /dev/null and b/docs/algorithm/super_resolution/images/sr_word_52-20240704094309205.png differ diff --git a/docs/algorithm/super_resolution/images/sr_word_52.png b/docs/algorithm/super_resolution/images/sr_word_52.png new file mode 100644 index 0000000000..c983e9ad7a Binary files /dev/null and b/docs/algorithm/super_resolution/images/sr_word_52.png differ diff --git a/docs/algorithm/super_resolution/images/word_52-20240704094304807.png b/docs/algorithm/super_resolution/images/word_52-20240704094304807.png new file mode 100644 index 0000000000..493c590183 Binary files /dev/null and b/docs/algorithm/super_resolution/images/word_52-20240704094304807.png differ diff --git a/docs/algorithm/super_resolution/images/word_52.png b/docs/algorithm/super_resolution/images/word_52.png new file mode 100644 index 0000000000..493c590183 Binary files /dev/null and b/docs/algorithm/super_resolution/images/word_52.png differ diff --git a/docs/algorithm/table_recognition/algorithm_table_master.en.md b/docs/algorithm/table_recognition/algorithm_table_master.en.md new file mode 100644 index 0000000000..77e8adeb09 --- /dev/null +++ b/docs/algorithm/table_recognition/algorithm_table_master.en.md @@ -0,0 +1,93 @@ +--- +typora-copy-images-to: images +comments: true +--- + + +# Table Recognition Algorithm-TableMASTER + +## 1. 
Introduction
+
+Paper:
+> [TableMaster: PINGAN-VCGROUP’S SOLUTION FOR ICDAR 2021 COMPETITION ON SCIENTIFIC LITERATURE PARSING TASK B: TABLE RECOGNITION TO HTML](https://arxiv.org/pdf/2105.01848.pdf)
+> Ye, Jiaquan and Qi, Xianbiao and He, Yelin and Chen, Yihao and Gu, Dengyi and Gao, Peng and Xiao, Rong
+> 2021
+
+On the PubTabNet table recognition public dataset, the reproduced accuracy of the algorithm is as follows:
+
+|Model|Backbone|Config|Acc|Download link|
+| --- | --- | --- | --- | --- |
+|TableMaster|TableResNetExtra|[configs/table/table_master.yml](../../configs/table/table_master.yml)|77.47%|[trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/tablemaster/table_structure_tablemaster_train.tar)/[inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/tablemaster/table_structure_tablemaster_infer.tar)|
+
+## 2. Environment
+
+Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md) to clone the project code.
+
+## 3. Model Training / Evaluation / Prediction
+
+The above TableMaster model is trained using the PubTabNet table recognition public dataset. For the download of the dataset, please refer to [table_datasets](../../datasets/table_datasets.en.md).
+
+After the data download is complete, please refer to the [Text Recognition Training Tutorial](../../ppocr/model_train/recognition.en.md) for training. PaddleOCR has modularized the code structure, so that you only need to **replace the configuration file** to train different models.
+
+## 4. Inference and Deployment
+
+### 4.1 Python Inference
+
+First, convert the model saved during TableMaster table recognition training into an inference model. Taking the model based on the TableResNetExtra backbone network and trained on the PubTabNet dataset as an example ([model download link](https://paddleocr.bj.bcebos.com/contribution/table_master.tar)), you can use the following command to convert it:
+
+```bash linenums="1"
+python3 tools/export_model.py -c configs/table/table_master.yml -o Global.pretrained_model=output/table_master/best_accuracy Global.save_inference_dir=./inference/table_master
+```
+
+**Note:**
+
+- If you trained the model on your own dataset and adjusted the dictionary file, please make sure that `character_dict_path` in the modified configuration file points to the correct dictionary file.
+
+Execute the following command for model inference:
+
+```bash linenums="1"
+cd ppstructure/
+# When predicting all images in a folder, you can modify image_dir to a folder, such as --image_dir='docs/table'.
+python3.7 table/predict_structure.py --table_model_dir=../output/table_master/table_structure_tablemaster_infer/ --table_algorithm=TableMaster --table_char_dict_path=../ppocr/utils/dict/table_master_structure_dict.txt --table_max_len=480 --image_dir=docs/table/table.jpg
+```
+
+After executing the command, the prediction results for the above image (the structure information and the coordinates of each cell in the table) are printed to the screen, and a visualization of the cell coordinates is also saved.
An example is as follows: + +result: + +```bash linenums="1" +[2022/06/16 13:06:54] ppocr INFO: result: ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '
', '', ''], [[72.17591094970703, 10.759100914001465, 60.29658508300781, 16.6805362701416], [161.85562133789062, 10.884308815002441, 14.9495210647583, 16.727018356323242], [277.79876708984375, 29.54340362548828, 31.490320205688477, 18.143272399902344], +... +[336.11724853515625, 280.3601989746094, 39.456939697265625, 18.121286392211914]] +[2022/06/16 13:06:54] ppocr INFO: save vis result to ./output/table.jpg +[2022/06/16 13:06:54] ppocr INFO: Predict time of docs/table/table.jpg: 17.36806297302246 +``` + +**Note**: + +- TableMaster is relatively slow during inference, and it is recommended to use GPU for use. + +### 4.2 C++ Inference + +Since the post-processing is not written in CPP, the TableMaster does not support CPP inference. + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. FAQ + +## Citation + +```bibtex +@article{ye2021pingan, + title={PingAn-VCGroup's Solution for ICDAR 2021 Competition on Scientific Literature Parsing Task B: Table Recognition to HTML}, + author={Ye, Jiaquan and Qi, Xianbiao and He, Yelin and Chen, Yihao and Gu, Dengyi and Gao, Peng and Xiao, Rong}, + journal={arXiv preprint arXiv:2105.01848}, + year={2021} +} +``` diff --git a/docs/algorithm/table_recognition/algorithm_table_master.md b/docs/algorithm/table_recognition/algorithm_table_master.md new file mode 100644 index 0000000000..7621d017f3 --- /dev/null +++ b/docs/algorithm/table_recognition/algorithm_table_master.md @@ -0,0 +1,100 @@ +--- +typora-copy-images-to: images +comments: true +--- + + +# 表格识别算法-TableMASTER + +## 1. 算法简介 + +论文信息: +> [TableMaster: PINGAN-VCGROUP’S SOLUTION FOR ICDAR 2021 COMPETITION ON SCIENTIFIC LITERATURE PARSING TASK B: TABLE RECOGNITION TO HTML](https://arxiv.org/pdf/2105.01848.pdf) +> Ye, Jiaquan and Qi, Xianbiao and He, Yelin and Chen, Yihao and Gu, Dengyi and Gao, Peng and Xiao, Rong +> 2021 + +在PubTabNet表格识别公开数据集上,算法复现效果如下: + +|模型|骨干网络|配置文件|acc|下载链接| +| --- | --- | --- | --- | --- | +|TableMaster|TableResNetExtra|[configs/table/table_master.yml](../../configs/table/table_master.yml)|77.47%|[训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/tablemaster/table_structure_tablemaster_train.tar)/[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/tablemaster/table_structure_tablemaster_infer.tar)| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 模型训练、评估、预测 + +上述TableMaster模型使用PubTabNet表格识别公开数据集训练得到,数据集下载可参考 [table_datasets](../../datasets/table_datasets.md)。 + +数据下载完成后,请参考[文本识别教程](../../ppocr/model_train/recognition.md)进行训练。PaddleOCR对代码进行了模块化,训练不同的模型只需要**更换配置文件**即可。 + +## 4. 
推理部署 + +### 4.1 Python推理 + +首先将训练得到best模型,转换成inference model。以基于TableResNetExtra骨干网络,在PubTabNet数据集训练的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/contribution/table_master.tar)),可以使用如下命令进行转换: + +```bash linenums="1" +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/export_model.py -c configs/table/table_master.yml -o Global.pretrained_model=output/table_master/best_accuracy Global.save_inference_dir=./inference/table_master +``` + +**注意:** 如果您是在自己的数据集上训练的模型,并且调整了字典文件,请注意修改配置文件中的`character_dict_path`是否为所正确的字典文件。 + +转换成功后,在目录下有三个文件: + +```text linenums="1" +./inference/table_master/ + ├── inference.pdiparams # 识别inference模型的参数文件 + ├── inference.pdiparams.info # 识别inference模型的参数信息,可忽略 + └── inference.pdmodel # 识别inference模型的program文件 +``` + +执行如下命令进行模型推理: + +```bash linenums="1" +cd ppstructure/ +python3.7 table/predict_structure.py --table_model_dir=../output/table_master/table_structure_tablemaster_infer/ --table_algorithm=TableMaster --table_char_dict_path=../ppocr/utils/dict/table_master_structure_dict.txt --table_max_len=480 --image_dir=docs/table/table.jpg +# 预测文件夹下所有图像时,可修改image_dir为文件夹,如 --image_dir='docs/table'。 +``` + +执行命令后,上面图像的预测结果(结构信息和表格中每个单元格的坐标)会打印到屏幕上,同时会保存单元格坐标的可视化结果。示例如下: +结果如下: + +```bash linenums="1" +[2022/06/16 13:06:54] ppocr INFO: result: ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '
', '', ''], [[72.17591094970703, 10.759100914001465, 60.29658508300781, 16.6805362701416], [161.85562133789062, 10.884308815002441, 14.9495210647583, 16.727018356323242], [277.79876708984375, 29.54340362548828, 31.490320205688477, 18.143272399902344], +... +[336.11724853515625, 280.3601989746094, 39.456939697265625, 18.121286392211914]] +[2022/06/16 13:06:54] ppocr INFO: save vis result to ./output/table.jpg +[2022/06/16 13:06:54] ppocr INFO: Predict time of docs/table/table.jpg: 17.36806297302246 +``` + +**注意**: + +- TableMaster在推理时比较慢,建议使用GPU进行使用。 + +### 4.2 C++推理部署 + +由于C++预处理后处理还未支持TableMaster,所以暂未支持 + +### 4.3 Serving服务化部署 + +暂不支持 + +### 4.4 更多推理部署 + +暂不支持 + +## 5. FAQ + +## 引用 + +```bibtex +@article{ye2021pingan, + title={PingAn-VCGroup's Solution for ICDAR 2021 Competition on Scientific Literature Parsing Task B: Table Recognition to HTML}, + author={Ye, Jiaquan and Qi, Xianbiao and He, Yelin and Chen, Yihao and Gu, Dengyi and Gao, Peng and Xiao, Rong}, + journal={arXiv preprint arXiv:2105.01848}, + year={2021} +} +``` diff --git a/docs/algorithm/table_recognition/algorithm_table_slanet.md b/docs/algorithm/table_recognition/algorithm_table_slanet.md new file mode 100644 index 0000000000..0c79a25582 --- /dev/null +++ b/docs/algorithm/table_recognition/algorithm_table_slanet.md @@ -0,0 +1,97 @@ +--- +comments: true +--- + +# 表格识别算法-SLANet-LCNetV2 + +## 1. 算法简介 + +PaddleOCR 算法模型挑战赛 - 赛题二:通用表格识别任务排行榜第一算法。核心思路: + +- 1. 改善推理过程,至EOS停止,速度提升3倍 +- 2. 升级Backbone为LCNetV2(SSLD版本) +- 3. 行列特征增强模块 +- 4. 提升分辨率488至512 +- 5. 三阶段训练策略 + +在PubTabNet表格识别公开数据集上,算法复现效果如下: + +|模型|骨干网络|配置文件|acc| +| --- | --- | --- | --- | +|SLANet|LCNetV2|[configs/table/SLANet_lcnetv2.yml](../../configs/table/SLANet_lcnetv2.yml)|76.67%| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 模型训练、评估、预测 + +上述SLANet_LCNetv2模型使用PubTabNet表格识别公开数据集训练得到,数据集下载可参考 [table_datasets](../../datasets/table_datasets.md)。 + +### 启动训练 + +数据下载完成后,请参考[文本识别教程](../../ppocr/model_train/recognition.md)进行训练。PaddleOCR对代码进行了模块化,训练不同的模型只需要**更换配置文件**即可。 + +训练命令如下: + +```bash linenums="1" +# stage1 +python3 -m paddle.distributed.launch --gpus '0,1,2,3,4,5,6,7' tools/train.py -c configs/table/SLANet_lcnetv2.yml +# stage2 加载stage1的best model作为预训练模型,学习率调整为0.0001; +# stage3 加载stage2的best model作为预训练模型,不调整学习率,将配置文件中所有的488修改为512. +``` + +## 4. 
推理部署 + +### 4.1 Python推理 + +将训练得到best模型,转换成inference model,可以使用如下命令进行转换: + +```bash linenums="1" +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/export_model.py -c configs/table/SLANet_lcnetv2.yml -o Global.pretrained_model=path/best_accuracy Global.save_inference_dir=./inference/slanet_lcnetv2_infer +``` + +**注意:** 如果您是在自己的数据集上训练的模型,并且调整了字典文件,请注意修改配置文件中的`character_dict_path`是否为所正确的字典文件。 + +转换成功后,在目录下有三个文件: + +```text linenums="1" +./inference/slanet_lcnetv2_infer/ + ├── inference.pdiparams # 识别inference模型的参数文件 + ├── inference.pdiparams.info # 识别inference模型的参数信息,可忽略 + └── inference.pdmodel # 识别inference模型的program文件 +``` + +执行如下命令进行模型推理: + +```bash linenums="1" +cd ppstructure/ +python3.7 table/predict_structure.py --table_model_dir=../inference/slanet_lcnetv2_infer/ --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --image_dir=docs/table/table.jpg --output=../output/table_slanet_lcnetv2 --use_gpu=False --benchmark=True --enable_mkldnn=True +# 预测文件夹下所有图像时,可修改image_dir为文件夹,如 --image_dir='docs/table'。 +``` + +执行命令后,上面图像的预测结果(结构信息和表格中每个单元格的坐标)会打印到屏幕上,同时会保存单元格坐标的可视化结果。示例如下: +结果如下: + +```bash linenums="1" +[2022/06/16 13:06:54] ppocr INFO: result: ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '
', '', ''], [[72.17591094970703, 10.759100914001465, 60.29658508300781, 16.6805362701416], [161.85562133789062, 10.884308815002441, 14.9495210647583, 16.727018356323242], [277.79876708984375, 29.54340362548828, 31.490320205688477, 18.143272399902344], +... +[336.11724853515625, 280.3601989746094, 39.456939697265625, 18.121286392211914]] +[2022/06/16 13:06:54] ppocr INFO: save vis result to ./output/table.jpg +[2022/06/16 13:06:54] ppocr INFO: Predict time of docs/table/table.jpg: 17.36806297302246 +``` + +### 4.2 C++推理部署 + +由于C++预处理后处理还未支持SLANet + +### 4.3 Serving服务化部署 + +暂不支持 + +### 4.4 更多推理部署 + +暂不支持 + +## 5. FAQ diff --git a/docs/algorithm/text_detection/algorithm_det_ct.en.md b/docs/algorithm/text_detection/algorithm_det_ct.en.md new file mode 100644 index 0000000000..0181e17e25 --- /dev/null +++ b/docs/algorithm/text_detection/algorithm_det_ct.en.md @@ -0,0 +1,74 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# CT + +## 1. Introduction + +Paper: +> [CentripetalText: An Efficient Text Instance Representation for Scene Text Detection](https://arxiv.org/abs/2107.05945) +> Tao Sheng, Jie Chen, Zhouhui Lian +> NeurIPS, 2021 + +On the Total-Text dataset, the text detection result is as follows: + +|Model|Backbone|Configuration|Precision|Recall|Hmean|Download| +| --- | --- | --- | --- | --- | --- | --- | +|CT|ResNet18_vd|[configs/det/det_r18_vd_ct.yml](../../configs/det/det_r18_vd_ct.yml)|88.68%|81.70%|85.05%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r18_ct_train.tar)| + +## 2. Environment + +Please prepare your environment referring to [prepare the environment](../../ppocr/environment.en.md) and [clone the repo](../../ppocr/blog/clone.en.md). + +## 3. Model Training / Evaluation / Prediction + +The above CT model is trained using the Total-Text text detection public dataset. For the download of the dataset, please refer to [Total-Text-Dataset](https://github.com/cs-chan/Total-Text-Dataset/tree/master/Dataset). PaddleOCR format annotation download link [train.txt](https://paddleocr.bj.bcebos.com/dataset/ct_tipc/train.txt), [test.txt](https://paddleocr.bj.bcebos.com/dataset/ct_tipc/test.txt). + +Please refer to [text detection training tutorial](../../ppocr/model_train/detection.en.md). PaddleOCR has modularized the code structure, so that you only need to **replace the configuration file** to train different detection models. + +## 4. Inference and Deployment + +### 4.1 Python Inference + +First, convert the model saved in the CT text detection training process into an inference model. Taking the model based on the Resnet18_vd backbone network and trained on the Total Text English dataset as example ([model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r18_ct_train.tar)), you can use the following command to convert: + +```bash linenums="1" +python3 tools/export_model.py -c configs/det/det_r18_vd_ct.yml -o Global.pretrained_model=./det_r18_ct_train/best_accuracy Global.save_inference_dir=./inference/det_ct +``` + +CT text detection model inference, you can execute the following command: + +```bash linenums="1" +python3 tools/infer/predict_det.py --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_ct/" --det_algorithm="CT" +``` + +The visualized text detection results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with `det_res`. 
Examples of results are as follows: + +![img](./images/det_res_img623_ct.jpg) + +### 4.2 C++ Inference + +Not supported + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. FAQ + +## Citation + +```bibtex +@inproceedings{sheng2021centripetaltext, + title={CentripetalText: An Efficient Text Instance Representation for Scene Text Detection}, + author={Tao Sheng and Jie Chen and Zhouhui Lian}, + booktitle={Thirty-Fifth Conference on Neural Information Processing Systems}, + year={2021} +} +``` diff --git a/docs/algorithm/text_detection/algorithm_det_ct.md b/docs/algorithm/text_detection/algorithm_det_ct.md new file mode 100644 index 0000000000..d55efb02c4 --- /dev/null +++ b/docs/algorithm/text_detection/algorithm_det_ct.md @@ -0,0 +1,74 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# CT + +## 1. 算法简介 + +论文信息: +> [CentripetalText: An Efficient Text Instance Representation for Scene Text Detection](https://arxiv.org/abs/2107.05945) +> Tao Sheng, Jie Chen, Zhouhui Lian +> NeurIPS, 2021 + +在Total-Text文本检测公开数据集上,算法复现效果如下: + +|模型|骨干网络|配置文件|precision|recall|Hmean|下载链接| +| --- | --- | --- | --- | --- | --- | --- | +|CT|ResNet18_vd|[configs/det/det_r18_vd_ct.yml](../../configs/det/det_r18_vd_ct.yml)|88.68%|81.70%|85.05%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r18_ct_train.tar)| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 模型训练、评估、预测 + +CT模型使用Total-Text文本检测公开数据集训练得到,数据集下载可参考 [Total-Text-Dataset](https://github.com/cs-chan/Total-Text-Dataset/tree/master/Dataset), 我们将标签文件转成了paddleocr格式,转换好的标签文件下载参考[train.txt](https://paddleocr.bj.bcebos.com/dataset/ct_tipc/train.txt), [text.txt](https://paddleocr.bj.bcebos.com/dataset/ct_tipc/test.txt)。 + +请参考[文本检测训练教程](../../ppocr/model_train/detection.md)。PaddleOCR对代码进行了模块化,训练不同的检测模型只需要**更换配置文件**即可。 + +## 4. 推理部署 + +### 4.1 Python推理 + +首先将CT文本检测训练过程中保存的模型,转换成inference model。以基于Resnet18_vd骨干网络,在Total-Text英文数据集训练的模型为例( [模型下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r18_ct_train.tar) ),可以使用如下命令进行转换: + +```bash linenums="1" +python3 tools/export_model.py -c configs/det/det_r18_vd_ct.yml -o Global.pretrained_model=./det_r18_ct_train/best_accuracy Global.save_inference_dir=./inference/det_ct +``` + +CT文本检测模型推理,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_det.py --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_ct/" --det_algorithm="CT" +``` + +可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为`det_res`。结果示例如下: + +![img](./images/det_res_img623_ct.jpg) + +### 4.2 C++推理 + +暂不支持 + +### 4.3 Serving服务化部署 + +暂不支持 + +### 4.4 更多推理部署 + +暂不支持 + +## 5. FAQ + +## 引用 + +```bibtex +@inproceedings{sheng2021centripetaltext, + title={CentripetalText: An Efficient Text Instance Representation for Scene Text Detection}, + author={Tao Sheng and Jie Chen and Zhouhui Lian}, + booktitle={Thirty-Fifth Conference on Neural Information Processing Systems}, + year={2021} +} +``` diff --git a/docs/algorithm/text_detection/algorithm_det_db.en.md b/docs/algorithm/text_detection/algorithm_det_db.en.md new file mode 100644 index 0000000000..eb8e403e67 --- /dev/null +++ b/docs/algorithm/text_detection/algorithm_det_db.en.md @@ -0,0 +1,98 @@ +--- +comments: true +--- + +# DB && DB++ + +## 1. 
Introduction + +Paper: +> [Real-time Scene Text Detection with Differentiable Binarization](https://arxiv.org/abs/1911.08947) +> Liao, Minghui and Wan, Zhaoyi and Yao, Cong and Chen, Kai and Bai, Xiang +> AAAI, 2020 + +> [Real-Time Scene Text Detection with Differentiable Binarization and Adaptive Scale Fusion](https://arxiv.org/abs/2202.10304) +> Liao, Minghui and Zou, Zhisheng and Wan, Zhaoyi and Yao, Cong and Bai, Xiang +> TPAMI, 2022 + +On the ICDAR2015 dataset, the text detection result is as follows: + +|Model|Backbone|Configuration|Precision|Recall|Hmean|Download| +| --- | --- | --- | --- | --- | --- | --- | +|DB|ResNet50_vd|[configs/det/det_r50_vd_db.yml](../../configs/det/det_r50_vd_db.yml)|86.41%|78.72%|82.38%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_db_v2.0_train.tar)| +|DB|MobileNetV3|[configs/det/det_mv3_db.yml](../../configs/det/det_mv3_db.yml)|77.29%|73.08%|75.12%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_db_v2.0_train.tar)| +|DB++|ResNet50|[configs/det/det_r50_db++_ic15.yml](../../configs/det/det_r50_db++_ic15.yml)|90.89%|82.66%|86.58%|[pretrained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/ResNet50_dcn_asf_synthtext_pretrained.pdparams)/[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_r50_db%2B%2B_icdar15_train.tar)| + +On the TD_TR dataset, the text detection result is as follows: + +|Model|Backbone|Configuration|Precision|Recall|Hmean|Download| +| --- | --- | --- | --- | --- | --- | --- | +|DB++|ResNet50|[configs/det/det_r50_db++_td_tr.yml](../../configs/det/det_r50_db++_td_tr.yml)|92.92%|86.48%|89.58%|[pretrained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/ResNet50_dcn_asf_synthtext_pretrained.pdparams)/[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_r50_db%2B%2B_td_tr_train.tar)| + +## 2. Environment + +Please prepare your environment referring to [prepare the environment](../../ppocr/environment.en.md) and [clone the repo](../../ppocr/blog/clone.en.md). + +## 3. Model Training / Evaluation / Prediction + +Please refer to [text detection training tutorial](../../ppocr/model_train/detection.en.md). PaddleOCR has modularized the code structure, so that you only need to **replace the configuration file** to train different detection models. + +## 4. Inference and Deployment + +### 4.1 Python Inference + +First, convert the model saved in the DB text detection training process into an inference model. Taking the model based on the Resnet50_vd backbone network and trained on the ICDAR2015 English dataset as example ([model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_db_v2.0_train.tar)), you can use the following command to convert: + +```bash linenums="1" +python3 tools/export_model.py -c configs/det/det_r50_vd_db.yml -o Global.pretrained_model=./det_r50_vd_db_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_db +``` + +DB text detection model inference, you can execute the following command: + +```bash linenums="1" +python3 tools/infer/predict_det.py --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_db/" +``` + +The visualized text detection results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with `det_res`. 
Examples of results are as follows: + +![img](./images/det_res_img_10_db.jpg) + +**Note**: Since the ICDAR2015 dataset has only 1,000 training images, mainly for English scenes, the above model has very poor detection result on Chinese text images. + +### 4.2 C++ Inference + +With the inference model prepared, refer to the [cpp infer](../../ppocr/infer_deploy/cpp_infer.en.md) tutorial for C++ inference. + +### 4.3 Serving + +With the inference model prepared, refer to the [pdserving](../../ppocr/infer_deploy/paddle_server.en.md) tutorial for service deployment by Paddle Serving. + +### 4.4 More + +More deployment schemes supported for DB: + +- Paddle2ONNX: with the inference model prepared, please refer to the [paddle2onnx](../../ppocr/infer_deploy/paddle2onnx.en.md) tutorial. + +## 5. FAQ + +## Citation + +```bibtex +@inproceedings{liao2020real, + title={Real-time scene text detection with differentiable binarization}, + author={Liao, Minghui and Wan, Zhaoyi and Yao, Cong and Chen, Kai and Bai, Xiang}, + booktitle={Proceedings of the AAAI Conference on Artificial Intelligence}, + volume={34}, + number={07}, + pages={11474--11481}, + year={2020} +} + +@article{liao2022real, + title={Real-Time Scene Text Detection with Differentiable Binarization and Adaptive Scale Fusion}, + author={Liao, Minghui and Zou, Zhisheng and Wan, Zhaoyi and Yao, Cong and Bai, Xiang}, + journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, + year={2022}, + publisher={IEEE} +} +``` diff --git a/docs/algorithm/text_detection/algorithm_det_db.md b/docs/algorithm/text_detection/algorithm_det_db.md new file mode 100644 index 0000000000..3efcabda0b --- /dev/null +++ b/docs/algorithm/text_detection/algorithm_det_db.md @@ -0,0 +1,99 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# DB与DB++ + +## 1. 算法简介 + +论文信息: +> [Real-time Scene Text Detection with Differentiable Binarization](https://arxiv.org/abs/1911.08947) +> Liao, Minghui and Wan, Zhaoyi and Yao, Cong and Chen, Kai and Bai, Xiang +> AAAI, 2020 + +> [Real-Time Scene Text Detection with Differentiable Binarization and Adaptive Scale Fusion](https://arxiv.org/abs/2202.10304) +> Liao, Minghui and Zou, Zhisheng and Wan, Zhaoyi and Yao, Cong and Bai, Xiang +> TPAMI, 2022 + +在ICDAR2015文本检测公开数据集上,算法复现效果如下: + +|模型|骨干网络|配置文件|precision|recall|Hmean|下载链接| +| --- | --- | --- | --- | --- | --- | --- | +|DB|ResNet50_vd|[configs/det/det_r50_vd_db.yml](../../configs/det/det_r50_vd_db.yml)|86.41%|78.72%|82.38%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_db_v2.0_train.tar)| +|DB|MobileNetV3|[configs/det/det_mv3_db.yml](../../configs/det/det_mv3_db.yml)|77.29%|73.08%|75.12%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_db_v2.0_train.tar)| +|DB++|ResNet50|[configs/det/det_r50_db++_icdar15.yml](../../configs/det/det_r50_db++_icdar15.yml)|90.89%|82.66%|86.58%|[合成数据预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/ResNet50_dcn_asf_synthtext_pretrained.pdparams)/[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_r50_db%2B%2B_icdar15_train.tar)| + +在TD_TR文本检测公开数据集上,算法复现效果如下: + +|模型|骨干网络|配置文件|precision|recall|Hmean|下载链接| +| --- | --- | --- | --- | --- | --- | --- | +|DB++|ResNet50|[configs/det/det_r50_db++_td_tr.yml](../../configs/det/det_r50_db++_td_tr.yml)|92.92%|86.48%|89.58%|[合成数据预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/ResNet50_dcn_asf_synthtext_pretrained.pdparams)/[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_r50_db%2B%2B_td_tr_train.tar)| + +## 2. 
环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 模型训练、评估、预测 + +请参考[文本检测训练教程](../../ppocr/model_train/detection.md)。PaddleOCR对代码进行了模块化,训练不同的检测模型只需要**更换配置文件**即可。 + +## 4. 推理部署 + +### 4.1 Python推理 + +首先将DB文本检测训练过程中保存的模型,转换成inference model。以基于Resnet50_vd骨干网络,在ICDAR2015英文数据集训练的模型为例( [模型下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_db_v2.0_train.tar) ),可以使用如下命令进行转换: + +```bash linenums="1" +python3 tools/export_model.py -c configs/det/det_r50_vd_db.yml -o Global.pretrained_model=./det_r50_vd_db_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_db +``` + +DB文本检测模型推理,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_det.py --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_db/" --det_algorithm="DB" +``` + +可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为`det_res`。结果示例如下: + +![img](./images/det_res_img_10_db.jpg) + +**注意**:由于ICDAR2015数据集只有1000张训练图像,且主要针对英文场景,所以上述模型对中文文本图像检测效果会比较差。 + +### 4.2 C++推理 + +准备好推理模型后,参考[cpp infer](../../ppocr/infer_deploy/cpp_infer.md)教程进行操作即可。 + +### 4.3 Serving服务化部署 + +准备好推理模型后,参考[pdserving](../../ppocr/infer_deploy/paddle_server.md)教程进行Serving服务化部署,包括Python Serving和C++ Serving两种模式。 + +### 4.4 更多推理部署 + +DB模型还支持以下推理部署方式: + +- Paddle2ONNX推理:准备好推理模型后,参考[paddle2onnx](../../ppocr/infer_deploy/paddle2onnx.md)教程操作。 + +## 5. FAQ + +## 引用 + +```bibtex +@inproceedings{liao2020real, + title={Real-time scene text detection with differentiable binarization}, + author={Liao, Minghui and Wan, Zhaoyi and Yao, Cong and Chen, Kai and Bai, Xiang}, + booktitle={Proceedings of the AAAI Conference on Artificial Intelligence}, + volume={34}, + number={07}, + pages={11474--11481}, + year={2020} +} + +@article{liao2022real, + title={Real-Time Scene Text Detection with Differentiable Binarization and Adaptive Scale Fusion}, + author={Liao, Minghui and Zou, Zhisheng and Wan, Zhaoyi and Yao, Cong and Bai, Xiang}, + journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, + year={2022}, + publisher={IEEE} +} +``` diff --git a/docs/algorithm/text_detection/algorithm_det_drrg.en.md b/docs/algorithm/text_detection/algorithm_det_drrg.en.md new file mode 100644 index 0000000000..8949c0ce73 --- /dev/null +++ b/docs/algorithm/text_detection/algorithm_det_drrg.en.md @@ -0,0 +1,61 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# DRRG + +## 1. Introduction + +Paper: +> [Deep Relational Reasoning Graph Network for Arbitrary Shape Text Detection](https://arxiv.org/abs/2003.07493) +> Zhang, Shi-Xue and Zhu, Xiaobin and Hou, Jie-Bo and Liu, Chang and Yang, Chun and Wang, Hongfa and Yin, Xu-Cheng +> CVPR, 2020 + +On the CTW1500 dataset, the text detection result is as follows: + +|Model|Backbone|Configuration|Precision|Recall|Hmean|Download| +| --- | --- | --- | --- | --- | --- | --- | +| DRRG | ResNet50_vd | [configs/det/det_r50_drrg_ctw.yml](../../configs/det/det_r50_drrg_ctw.yml)| 89.92%|80.91%|85.18%|[trained model](https://paddleocr.bj.bcebos.com/contribution/det_r50_drrg_ctw_train.tar)| + +## 2. Environment + +Please prepare your environment referring to [prepare the environment](../../ppocr/environment.en.md) and [clone the repo](../../ppocr/blog/clone.en.md). + +## 3. Model Training / Evaluation / Prediction + +The above DRRG model is trained using the CTW1500 text detection public dataset. For the download of the dataset, please refer to [ocr_datasets](./dataset/ocr_datasets_en.md). 
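+
+For reference, PaddleOCR detection annotations are plain-text label files in which each line holds an image path and a JSON list of text polygons, separated by a tab (`\t`); the entry below is only an illustrative sketch (the file name, transcription and coordinates are made up):
+
+```text linenums="1"
+train_images/img_1.jpg	[{"transcription": "hello", "points": [[10, 20], [120, 20], [120, 60], [10, 60]]}]
+```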
+ +After the data download is complete, please refer to [Text Detection Training Tutorial](../../ppocr/model_train/detection.en.md) for training. PaddleOCR has modularized the code structure, so that you only need to **replace the configuration file** to train different detection models. + +## 4. Inference and Deployment + +### 4.1 Python Inference + +Since the model needs to be converted to Numpy data for many times in the forward, DRRG dynamic graph to static graph is not supported. + +### 4.2 C++ Inference + +Not supported + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. FAQ + +## Citation + +```bibtex +@inproceedings{zhang2020deep, + title={Deep relational reasoning graph network for arbitrary shape text detection}, + author={Zhang, Shi-Xue and Zhu, Xiaobin and Hou, Jie-Bo and Liu, Chang and Yang, Chun and Wang, Hongfa and Yin, Xu-Cheng}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={9699--9708}, + year={2020} +} +``` diff --git a/docs/algorithm/text_detection/algorithm_det_drrg.md b/docs/algorithm/text_detection/algorithm_det_drrg.md new file mode 100644 index 0000000000..ab3b4aad51 --- /dev/null +++ b/docs/algorithm/text_detection/algorithm_det_drrg.md @@ -0,0 +1,61 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# DRRG + +## 1. 算法简介 + +论文信息: +> [Deep Relational Reasoning Graph Network for Arbitrary Shape Text Detection](https://arxiv.org/abs/2003.07493) +> Zhang, Shi-Xue and Zhu, Xiaobin and Hou, Jie-Bo and Liu, Chang and Yang, Chun and Wang, Hongfa and Yin, Xu-Cheng +> CVPR, 2020 + +在CTW1500文本检测公开数据集上,算法复现效果如下: + +| 模型 |骨干网络|配置文件|precision|recall|Hmean|下载链接| +|-----| --- | --- | --- | --- | --- | --- | +| DRRG | ResNet50_vd | [configs/det/det_r50_drrg_ctw.yml](../../configs/det/det_r50_drrg_ctw.yml)| 89.92%|80.91%|85.18%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/det_r50_drrg_ctw_train.tar)| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 模型训练、评估、预测 + +上述DRRG模型使用CTW1500文本检测公开数据集训练得到,数据集下载可参考 [ocr_datasets](../../datasets/ocr_datasets.md)。 + +数据下载完成后,请参考[文本检测训练教程](../../ppocr/model_train/detection.md)进行训练。PaddleOCR对代码进行了模块化,训练不同的检测模型只需要**更换配置文件**即可。 + +## 4. 推理部署 + +### 4.1 Python推理 + +由于模型前向运行时需要多次转换为Numpy数据进行运算,因此DRRG的动态图转静态图暂未支持。 + +### 4.2 C++推理 + +暂未支持 + +### 4.3 Serving服务化部署 + +暂未支持 + +### 4.4 更多推理部署 + +暂未支持 + +## 5. FAQ + +## 引用 + +```bibtex +@inproceedings{zhang2020deep, + title={Deep relational reasoning graph network for arbitrary shape text detection}, + author={Zhang, Shi-Xue and Zhu, Xiaobin and Hou, Jie-Bo and Liu, Chang and Yang, Chun and Wang, Hongfa and Yin, Xu-Cheng}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={9699--9708}, + year={2020} +} +``` diff --git a/docs/algorithm/text_detection/algorithm_det_east.en.md b/docs/algorithm/text_detection/algorithm_det_east.en.md new file mode 100644 index 0000000000..5a420fdfb1 --- /dev/null +++ b/docs/algorithm/text_detection/algorithm_det_east.en.md @@ -0,0 +1,76 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# EAST + +## 1. 
Introduction + +Paper: +> [EAST: An Efficient and Accurate Scene Text Detector](https://arxiv.org/abs/1704.03155) +> Xinyu Zhou, Cong Yao, He Wen, Yuzhi Wang, Shuchang Zhou, Weiran He, Jiajun Liang +> CVPR, 2017 + +On the ICDAR2015 dataset, the text detection result is as follows: + +|Model|Backbone|Configuration|Precision|Recall|Hmean|Download| +| --- | --- | --- | --- | --- | --- | --- | +|EAST|ResNet50_vd| [det_r50_vd_east.yml](../../configs/det/det_r50_vd_east.yml)|88.71%| 81.36%| 84.88%| [model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)| +|EAST|MobileNetV3|[det_mv3_east.yml](../../configs/det/det_mv3_east.yml) | 78.20%| 79.10%| 78.65%| [model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_east_v2.0_train.tar)| + +## 2. Environment + +Please prepare your environment referring to [prepare the environment](../../ppocr/environment.en.md) and [clone the repo](../../ppocr/blog/clone.en.md). + +## 3. Model Training / Evaluation / Prediction + +The above EAST model is trained using the ICDAR2015 text detection public dataset. For the download of the dataset, please refer to [ocr_datasets](./dataset/ocr_datasets_en.md). + +After the data download is complete, please refer to [Text Detection Training Tutorial](../../ppocr/model_train/detection.en.md) for training. PaddleOCR has modularized the code structure, so that you only need to **replace the configuration file** to train different detection models. + +## 4. Inference and Deployment + +### 4.1 Python Inference + +First, convert the model saved in the EAST text detection training process into an inference model. Taking the model based on the Resnet50_vd backbone network and trained on the ICDAR2015 English dataset as example ([model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)), you can use the following command to convert: + +```bash linenums="1" +python3 tools/export_model.py -c configs/det/det_r50_vd_east.yml -o Global.pretrained_model=./det_r50_vd_east_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_r50_east/ +``` + +For EAST text detection model inference, you need to set the parameter --det_algorithm="EAST", run the following command: + +```bash linenums="1" +python3 tools/infer/predict_det.py --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_r50_east/" --det_algorithm="EAST" +``` + +The visualized text detection results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with `det_res`. + +![img](./images/det_res_img_10_east.jpg) + +### 4.2 C++ Inference + +Since the post-processing is not written in CPP, the EAST text detection model does not support CPP inference. + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. FAQ + +## Citation + +```bibtex +@inproceedings{zhou2017east, + title={East: an efficient and accurate scene text detector}, + author={Zhou, Xinyu and Yao, Cong and Wen, He and Wang, Yuzhi and Zhou, Shuchang and He, Weiran and Liang, Jiajun}, + booktitle={Proceedings of the IEEE conference on Computer Vision and Pattern Recognition}, + pages={5551--5560}, + year={2017} +} +``` diff --git a/docs/algorithm/text_detection/algorithm_det_east.md b/docs/algorithm/text_detection/algorithm_det_east.md new file mode 100644 index 0000000000..01bca2dd16 --- /dev/null +++ b/docs/algorithm/text_detection/algorithm_det_east.md @@ -0,0 +1,76 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# EAST + +## 1. 
算法简介 + +论文信息: +> [EAST: An Efficient and Accurate Scene Text Detector](https://arxiv.org/abs/1704.03155) +> Xinyu Zhou, Cong Yao, He Wen, Yuzhi Wang, Shuchang Zhou, Weiran He, Jiajun Liang +> CVPR, 2017 + +在ICDAR2015文本检测公开数据集上,算法复现效果如下: + +|模型|骨干网络|配置文件|precision|recall|Hmean|下载链接| +| --- | --- | --- | --- | --- | --- | --- | +|EAST|ResNet50_vd| [det_r50_vd_east.yml](../../configs/det/det_r50_vd_east.yml)|88.71%| 81.36%| 84.88%| [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)| +|EAST|MobileNetV3|[det_mv3_east.yml](../../configs/det/det_mv3_east.yml) | 78.20%| 79.10%| 78.65%| [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_east_v2.0_train.tar)| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 模型训练、评估、预测 + +上表中的EAST训练模型使用ICDAR2015文本检测公开数据集训练得到,数据集下载可参考 [ocr_datasets](../../datasets/ocr_datasets.md)。 + +数据下载完成后,请参考[文本检测训练教程](../../ppocr/model_train/detection.md)进行训练。PaddleOCR对代码进行了模块化,训练不同的检测模型只需要**更换配置文件**即可。 + +## 4. 推理部署 + +### 4.1 Python推理 + +首先将EAST文本检测训练过程中保存的模型,转换成inference model。以基于Resnet50_vd骨干网络,在ICDAR2015英文数据集训练的模型为例([训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar)),可以使用如下命令进行转换: + +```bash linenums="1" +python3 tools/export_model.py -c configs/det/det_r50_vd_east.yml -o Global.pretrained_model=./det_r50_vd_east_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_r50_east/ +``` + +EAST文本检测模型推理,需要设置参数--det_algorithm="EAST",执行预测: + +```bash linenums="1" +python3 tools/infer/predict_det.py --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_r50_east/" --det_algorithm="EAST" +``` + +可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为`det_res`。 + +![img](./images/det_res_img_10_east.jpg) + +### 4.2 C++推理 + +由于后处理暂未使用CPP编写,EAST文本检测模型暂不支持CPP推理。 + +### 4.3 Serving服务化部署 + +暂未支持 + +### 4.4 更多推理部署 + +暂未支持 + +## 5. FAQ + +## 引用 + +```bibtex +@inproceedings{zhou2017east, + title={East: an efficient and accurate scene text detector}, + author={Zhou, Xinyu and Yao, Cong and Wen, He and Wang, Yuzhi and Zhou, Shuchang and He, Weiran and Liang, Jiajun}, + booktitle={Proceedings of the IEEE conference on Computer Vision and Pattern Recognition}, + pages={5551--5560}, + year={2017} +} +``` diff --git a/docs/algorithm/text_detection/algorithm_det_fcenet.en.md b/docs/algorithm/text_detection/algorithm_det_fcenet.en.md new file mode 100644 index 0000000000..2231094d5b --- /dev/null +++ b/docs/algorithm/text_detection/algorithm_det_fcenet.en.md @@ -0,0 +1,86 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# FCENet + +## 1. Introduction + +Paper: +> [Fourier Contour Embedding for Arbitrary-Shaped Text Detection](https://arxiv.org/abs/2104.10442) +> Yiqin Zhu and Jianyong Chen and Lingyu Liang and Zhanghui Kuang and Lianwen Jin and Wayne Zhang +> CVPR, 2021 + +On the CTW1500 dataset, the text detection result is as follows: + +|Model|Backbone|Configuration|Precision|Recall|Hmean|Download| +| --- | --- | --- | --- | --- | --- | --- | +| FCE | ResNet50_dcn | [configs/det/det_r50_vd_dcn_fce_ctw.yml](../../configs/det/det_r50_vd_dcn_fce_ctw.yml)| 88.39%|82.18%|85.27%|[trained model](https://paddleocr.bj.bcebos.com/contribution/det_r50_dcn_fce_ctw_v2.0_train.tar)| + +## 2. Environment + +Please prepare your environment referring to [prepare the environment](../../ppocr/environment.en.md) and [clone the repo](../../ppocr/blog/clone.en.md). + +## 3. 
Model Training / Evaluation / Prediction + +The above FCE model is trained using the CTW1500 text detection public dataset. For the download of the dataset, please refer to [ocr_datasets](./dataset/ocr_datasets_en.md). + +After the data download is complete, please refer to [Text Detection Training Tutorial](../../ppocr/model_train/detection.en.md) for training. PaddleOCR has modularized the code structure, so that you only need to **replace the configuration file** to train different detection models. + +## 4. Inference and Deployment + +### 4.1 Python Inference + +First, convert the model saved in the FCE text detection training process into an inference model. Taking the model based on the Resnet50_vd_dcn backbone network and trained on the CTW1500 English dataset as example ([model download link](https://paddleocr.bj.bcebos.com/contribution/det_r50_dcn_fce_ctw_v2.0_train.tar)), you can use the following command to convert: + +```bash linenums="1" +python3 tools/export_model.py -c configs/det/det_r50_vd_dcn_fce_ctw.yml -o Global.pretrained_model=./det_r50_dcn_fce_ctw_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_fce +``` + +FCE text detection model inference, to perform non-curved text detection, you can run the following commands: + +```bash linenums="1" +python3 tools/infer/predict_det.py --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_fce/" --det_algorithm="FCE" --det_fce_box_type=quad +``` + +The visualized text detection results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with 'det_res'. Examples of results are as follows: + +![img](./images/det_res_img_10_fce.jpg) + +If you want to perform curved text detection, you can execute the following command: + +```bash linenums="1" +python3 tools/infer/predict_det.py --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_fce/" --det_algorithm="FCE" --det_fce_box_type=poly +``` + +The visualized text detection results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with 'det_res'. Examples of results are as follows: + +![img](./images/det_res_img623_fce.jpg) + +**Note**: Since the CTW1500 dataset has only 1,000 training images, mainly for English scenes, the above model has very poor detection result on Chinese or curved text images. + +### 4.2 C++ Inference + +Since the post-processing is not written in CPP, the FCE text detection model does not support CPP inference. + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. FAQ + +## Citation + +```bibtex +@InProceedings{zhu2021fourier, + title={Fourier Contour Embedding for Arbitrary-Shaped Text Detection}, + author={Yiqin Zhu and Jianyong Chen and Lingyu Liang and Zhanghui Kuang and Lianwen Jin and Wayne Zhang}, + year={2021}, + booktitle = {CVPR} +} +``` diff --git a/docs/algorithm/text_detection/algorithm_det_fcenet.md b/docs/algorithm/text_detection/algorithm_det_fcenet.md new file mode 100644 index 0000000000..21e5eab492 --- /dev/null +++ b/docs/algorithm/text_detection/algorithm_det_fcenet.md @@ -0,0 +1,86 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# FCENet + +## 1. 
算法简介 + +论文信息: +> [Fourier Contour Embedding for Arbitrary-Shaped Text Detection](https://arxiv.org/abs/2104.10442) +> Yiqin Zhu and Jianyong Chen and Lingyu Liang and Zhanghui Kuang and Lianwen Jin and Wayne Zhang +> CVPR, 2021 + +在CTW1500文本检测公开数据集上,算法复现效果如下: + +| 模型 |骨干网络|配置文件|precision|recall|Hmean|下载链接| +|-----| --- | --- | --- | --- | --- | --- | +| FCE | ResNet50_dcn | [configs/det/det_r50_vd_dcn_fce_ctw.yml](../../configs/det/det_r50_vd_dcn_fce_ctw.yml)| 88.39%|82.18%|85.27%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/det_r50_dcn_fce_ctw_v2.0_train.tar)| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 模型训练、评估、预测 + +上述FCE模型使用CTW1500文本检测公开数据集训练得到,数据集下载可参考 [ocr_datasets](../../datasets/ocr_datasets.md)。 + +数据下载完成后,请参考[文本检测训练教程](../../ppocr/model_train/detection.md)进行训练。PaddleOCR对代码进行了模块化,训练不同的检测模型只需要**更换配置文件**即可。 + +## 4. 推理部署 + +### 4.1 Python推理 + +首先将FCE文本检测训练过程中保存的模型,转换成inference model。以基于Resnet50_vd_dcn骨干网络,在CTW1500英文数据集训练的模型为例( [模型下载地址](https://paddleocr.bj.bcebos.com/contribution/det_r50_dcn_fce_ctw_v2.0_train.tar) ),可以使用如下命令进行转换: + +```bash linenums="1" +python3 tools/export_model.py -c configs/det/det_r50_vd_dcn_fce_ctw.yml -o Global.pretrained_model=./det_r50_dcn_fce_ctw_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_fce +``` + +FCE文本检测模型推理,执行非弯曲文本检测,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_det.py --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_fce/" --det_algorithm="FCE" --det_fce_box_type=quad +``` + +可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为'det_res'。结果示例如下: + +![img](./images/det_res_img_10_fce.jpg) + +如果想执行弯曲文本检测,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_det.py --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_fce/" --det_algorithm="FCE" --det_fce_box_type=poly +``` + +可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为'det_res'。结果示例如下: + +![img](./images/det_res_img623_fce.jpg) + +**注意**:由于CTW1500数据集只有1000张训练图像,且主要针对英文场景,所以上述模型对中文文本图像检测效果会比较差。 + +### 4.2 C++推理 + +由于后处理暂未使用CPP编写,FCE文本检测模型暂不支持CPP推理。 + +### 4.3 Serving服务化部署 + +暂未支持 + +### 4.4 更多推理部署 + +暂未支持 + +## 5. FAQ + +## 引用 + +```bibtex +@InProceedings{zhu2021fourier, + title={Fourier Contour Embedding for Arbitrary-Shaped Text Detection}, + author={Yiqin Zhu and Jianyong Chen and Lingyu Liang and Zhanghui Kuang and Lianwen Jin and Wayne Zhang}, + year={2021}, + booktitle = {CVPR} +} +``` diff --git a/docs/algorithm/text_detection/algorithm_det_psenet.en.md b/docs/algorithm/text_detection/algorithm_det_psenet.en.md new file mode 100644 index 0000000000..692d3f0338 --- /dev/null +++ b/docs/algorithm/text_detection/algorithm_det_psenet.en.md @@ -0,0 +1,89 @@ +--- +typora-copy-images-to: images +comments: true +--- + + +# PSENet + +## 1. 
Introduction + +Paper: +> [Shape robust text detection with progressive scale expansion network](https://arxiv.org/abs/1903.12473) +> Wang, Wenhai and Xie, Enze and Li, Xiang and Hou, Wenbo and Lu, Tong and Yu, Gang and Shao, Shuai +> CVPR, 2019 + +On the ICDAR2015 dataset, the text detection result is as follows: + +|Model|Backbone|Configuration|Precision|Recall|Hmean|Download| +| --- | --- | --- | --- | --- | --- | --- | +|PSE| ResNet50_vd | [configs/det/det_r50_vd_pse.yml](../../configs/det/det_r50_vd_pse.yml)| 85.81% |79.53%|82.55%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_r50_vd_pse_v2.0_train.tar)| +|PSE| MobileNetV3| [configs/det/det_mv3_pse.yml](../../configs/det/det_mv3_pse.yml) | 82.20% |70.48%|75.89%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_mv3_pse_v2.0_train.tar)| + +## 2. Environment + +Please prepare your environment referring to [prepare the environment](../../ppocr/environment.en.md) and [clone the repo](../../ppocr/blog/clone.en.md). + +## 3. Model Training / Evaluation / Prediction + +The above PSE model is trained using the ICDAR2015 text detection public dataset. For the download of the dataset, please refer to [ocr_datasets](./dataset/ocr_datasets_en.md). + +After the data download is complete, please refer to [Text Detection Training Tutorial](../../ppocr/model_train/detection.en.md) for training. PaddleOCR has modularized the code structure, so that you only need to **replace the configuration file** to train different detection models. + +## 4. Inference and Deployment + +### 4.1 Python Inference + +First, convert the model saved in the PSE text detection training process into an inference model. Taking the model based on the Resnet50_vd backbone network and trained on the ICDAR2015 English dataset as example ([model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_r50_vd_pse_v2.0_train.tar)), you can use the following command to convert: + +```bash linenums="1" +python3 tools/export_model.py -c configs/det/det_r50_vd_pse.yml -o Global.pretrained_model=./det_r50_vd_pse_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_pse +``` + +PSE text detection model inference, to perform non-curved text detection, you can run the following commands: + +```bash linenums="1" +python3 tools/infer/predict_det.py --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_pse/" --det_algorithm="PSE" --det_pse_box_type=quad +``` + +The visualized text detection results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with 'det_res'. Examples of results are as follows: + +![img](./images/det_res_img_10_pse.jpg) + +If you want to perform curved text detection, you can execute the following command: + +```bash linenums="1" +python3 tools/infer/predict_det.py --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_pse/" --det_algorithm="PSE" --det_pse_box_type=poly +``` + +The visualized text detection results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with 'det_res'. Examples of results are as follows: + +![](./images/det_res_img_10_pse_poly.jpg) + +**Note**: Since the ICDAR2015 dataset has only 1,000 training images, mainly for English scenes, the above model has very poor detection result on Chinese or curved text images. 
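+
+If the boxes on your own images look too fragmented or too loose, the PSE post-processing can usually be tuned directly from the command line; the flags and default values below are only an illustrative sketch and may differ between PaddleOCR versions, so check `tools/infer/utility.py` for the exact parameter list:
+
+```bash linenums="1"
+# tune the PSE binarization / box-score thresholds and the minimum kernel area
+python3 tools/infer/predict_det.py --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_pse/" --det_algorithm="PSE" --det_pse_thresh=0 --det_pse_box_thresh=0.85 --det_pse_min_area=16
+```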
+ +### 4.2 C++ Inference + +Since the post-processing is not written in CPP, the PSE text detection model does not support CPP inference. + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. FAQ + +## Citation + +```bibtex +@inproceedings{wang2019shape, + title={Shape robust text detection with progressive scale expansion network}, + author={Wang, Wenhai and Xie, Enze and Li, Xiang and Hou, Wenbo and Lu, Tong and Yu, Gang and Shao, Shuai}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={9336--9345}, + year={2019} +} +``` diff --git a/docs/algorithm/text_detection/algorithm_det_psenet.md b/docs/algorithm/text_detection/algorithm_det_psenet.md new file mode 100644 index 0000000000..0be6356dbe --- /dev/null +++ b/docs/algorithm/text_detection/algorithm_det_psenet.md @@ -0,0 +1,88 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# PSENet + +## 1. 算法简介 + +论文信息: +> [Shape robust text detection with progressive scale expansion network](https://arxiv.org/abs/1903.12473) +> Wang, Wenhai and Xie, Enze and Li, Xiang and Hou, Wenbo and Lu, Tong and Yu, Gang and Shao, Shuai +> CVPR, 2019 + +在ICDAR2015文本检测公开数据集上,算法复现效果如下: + +|模型|骨干网络|配置文件|precision|recall|Hmean|下载链接| +| --- | --- | --- | --- | --- | --- | --- | +|PSE| ResNet50_vd | [configs/det/det_r50_vd_pse.yml](../../configs/det/det_r50_vd_pse.yml)| 85.81% |79.53%|82.55%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_r50_vd_pse_v2.0_train.tar)| +|PSE| MobileNetV3| [configs/det/det_mv3_pse.yml](../../configs/det/det_mv3_pse.yml) | 82.20% |70.48%|75.89%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_mv3_pse_v2.0_train.tar)| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 模型训练、评估、预测 + +上述PSE模型使用ICDAR2015文本检测公开数据集训练得到,数据集下载可参考 [ocr_datasets](../../datasets/ocr_datasets.md)。 + +数据下载完成后,请参考[文本检测训练教程](../../ppocr/model_train/detection.md)进行训练。PaddleOCR对代码进行了模块化,训练不同的检测模型只需要**更换配置文件**即可。 + +## 4. 推理部署 + +### 4.1 Python推理 + +首先将PSE文本检测训练过程中保存的模型,转换成inference model。以基于Resnet50_vd骨干网络,在ICDAR2015英文数据集训练的模型为例( [模型下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.1/en_det/det_r50_vd_pse_v2.0_train.tar) ),可以使用如下命令进行转换: + +```bash linenums="1" +python3 tools/export_model.py -c configs/det/det_r50_vd_pse.yml -o Global.pretrained_model=./det_r50_vd_pse_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_pse +``` + +PSE文本检测模型推理,执行非弯曲文本检测,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_det.py --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_pse/" --det_algorithm="PSE" --det_pse_box_type=quad +``` + +可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为'det_res'。结果示例如下: + +![img](./images/det_res_img_10_pse.jpg) + +如果想执行弯曲文本检测,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_det.py --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_pse/" --det_algorithm="PSE" --det_pse_box_type=poly +``` + +可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为'det_res'。结果示例如下: + +![img](./images/det_res_img_10_pse_poly.jpg) + +**注意**:由于ICDAR2015数据集只有1000张训练图像,且主要针对英文场景,所以上述模型对中文或弯曲文本图像检测效果会比较差。 + +### 4.2 C++推理 + +由于后处理暂未使用CPP编写,PSE文本检测模型暂不支持CPP推理。 + +### 4.3 Serving服务化部署 + +暂未支持 + +### 4.4 更多推理部署 + +暂未支持 + +## 5. 
FAQ + +## 引用 + +```bibtex +@inproceedings{wang2019shape, + title={Shape robust text detection with progressive scale expansion network}, + author={Wang, Wenhai and Xie, Enze and Li, Xiang and Hou, Wenbo and Lu, Tong and Yu, Gang and Shao, Shuai}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={9336--9345}, + year={2019} +} +``` diff --git a/docs/algorithm/text_detection/algorithm_det_sast.en.md b/docs/algorithm/text_detection/algorithm_det_sast.en.md new file mode 100644 index 0000000000..389fc6f674 --- /dev/null +++ b/docs/algorithm/text_detection/algorithm_det_sast.en.md @@ -0,0 +1,101 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# SAST + +## 1. Introduction + +Paper: +> [A Single-Shot Arbitrarily-Shaped Text Detector based on Context Attended Multi-Task Learning](https://arxiv.org/abs/1908.05498) +> Wang, Pengfei and Zhang, Chengquan and Qi, Fei and Huang, Zuming and En, Mengyi and Han, Junyu and Liu, Jingtuo and Ding, Errui and Shi, Guangming +> ACM MM, 2019 + +On the ICDAR2015 dataset, the text detection result is as follows: + +|Model|Backbone|Configuration|Precision|Recall|Hmean|Download| +| --- | --- | --- | --- | --- | --- | --- | +|SAST|ResNet50_vd|[configs/det/det_r50_vd_sast_icdar15.yml](../../configs/det/det_r50_vd_sast_icdar15.yml)|91.39%|83.77%|87.42%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_icdar15_v2.0_train.tar)| + +On the Total-text dataset, the text detection result is as follows: + +|Model|Backbone|Configuration|Precision|Recall|Hmean|Download| +| --- | --- | --- | --- | --- | --- | --- | +|SAST|ResNet50_vd|[configs/det/det_r50_vd_sast_totaltext.yml](../../configs/det/det_r50_vd_sast_totaltext.yml)|89.63%|78.44%|83.66%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_totaltext_v2.0_train.tar)| + +## 2. Environment + +Please prepare your environment referring to [prepare the environment](../../ppocr/environment.en.md) and [clone the repo](../../ppocr/blog/clone.en.md). + +## 3. Model Training / Evaluation / Prediction + +Please refer to [text detection training tutorial](../../ppocr/model_train/detection.en.md). PaddleOCR has modularized the code structure, so that you only need to **replace the configuration file** to train different detection models. + +## 4. Inference and Deployment + +### 4.1 Python Inference + +#### (1). Quadrangle text detection model (ICDAR2015) + +First, convert the model saved in the SAST text detection training process into an inference model. 
Taking the model based on the Resnet50_vd backbone network and trained on the ICDAR2015 English dataset as an example ([model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_icdar15_v2.0_train.tar)), you can use the following command to convert: + +```bash linenums="1" +python3 tools/export_model.py -c configs/det/det_r50_vd_sast_icdar15.yml -o Global.pretrained_model=./det_r50_vd_sast_icdar15_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_sast_ic15 +``` + +**For SAST quadrangle text detection model inference, you need to set the parameter `--det_algorithm="SAST"`**, run the following command: + +```bash linenums="1" +python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_sast_ic15/" +``` + +The visualized text detection results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with 'det_res'. Examples of results are as follows: + +![img](./images/det_res_img_10_sast.jpg) + +#### (2). Curved text detection model (Total-Text) + +First, convert the model saved in the SAST text detection training process into an inference model. Taking the model based on the Resnet50_vd backbone network and trained on the Total-Text English dataset as an example ([model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_totaltext_v2.0_train.tar)), you can use the following command to convert: + +```bash linenums="1" +python3 tools/export_model.py -c configs/det/det_r50_vd_sast_totaltext.yml -o Global.pretrained_model=./det_r50_vd_sast_totaltext_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_sast_tt +``` + +For SAST curved text detection model inference, you need to set the parameter `--det_algorithm="SAST"` and `--det_box_type=poly`, run the following command: + +```bash linenums="1" +python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_sast_tt/" --det_box_type='poly' +``` + +The visualized text detection results are saved to the `./inference_results` folder by default, and the name of the result file is prefixed with 'det_res'. Examples of results are as follows: + +![img](./images/det_res_img623_sast.jpg) + +**Note**: SAST post-processing locality aware NMS has two versions: Python and C++. The speed of C++ version is obviously faster than that of Python version. Due to the compilation version problem of NMS of C++ version, C++ version NMS will be called only in Python 3.5 environment, and python version NMS will be called in other cases. + +### 4.2 C++ Inference + +Not supported + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. 
FAQ + +## Citation + +```bibtex +@inproceedings{wang2019single, + title={A Single-Shot Arbitrarily-Shaped Text Detector based on Context Attended Multi-Task Learning}, + author={Wang, Pengfei and Zhang, Chengquan and Qi, Fei and Huang, Zuming and En, Mengyi and Han, Junyu and Liu, Jingtuo and Ding, Errui and Shi, Guangming}, + booktitle={Proceedings of the 27th ACM International Conference on Multimedia}, + pages={1277--1285}, + year={2019} +} +``` diff --git a/docs/algorithm/text_detection/algorithm_det_sast.md b/docs/algorithm/text_detection/algorithm_det_sast.md new file mode 100644 index 0000000000..b9c38b375f --- /dev/null +++ b/docs/algorithm/text_detection/algorithm_det_sast.md @@ -0,0 +1,101 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# SAST + +## 1. 算法简介 + +论文信息: +> [A Single-Shot Arbitrarily-Shaped Text Detector based on Context Attended Multi-Task Learning](https://arxiv.org/abs/1908.05498) +> Wang, Pengfei and Zhang, Chengquan and Qi, Fei and Huang, Zuming and En, Mengyi and Han, Junyu and Liu, Jingtuo and Ding, Errui and Shi, Guangming +> ACM MM, 2019 + +在ICDAR2015文本检测公开数据集上,算法复现效果如下: + +|模型|骨干网络|配置文件|precision|recall|Hmean|下载链接| +| --- | --- | --- | --- | --- | --- | --- | +|SAST|ResNet50_vd|[configs/det/det_r50_vd_sast_icdar15.yml](../../configs/det/det_r50_vd_sast_icdar15.yml)|91.39%|83.77%|87.42%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_icdar15_v2.0_train.tar)| + +在Total-text文本检测公开数据集上,算法复现效果如下: + +|模型|骨干网络|配置文件|precision|recall|Hmean|下载链接| +| --- | --- | --- | --- | --- | --- | --- | +|SAST|ResNet50_vd|[configs/det/det_r50_vd_sast_totaltext.yml](../../configs/det/det_r50_vd_sast_totaltext.yml)|89.63%|78.44%|83.66%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_totaltext_v2.0_train.tar)| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 模型训练、评估、预测 + +请参考[文本检测训练教程](../../ppocr/model_train/detection.md)。PaddleOCR对代码进行了模块化,训练不同的检测模型只需要**更换配置文件**即可。 + +## 4. 推理部署 + +### 4.1 Python推理 + +#### (1). 四边形文本检测模型(ICDAR2015) + +首先将SAST文本检测训练过程中保存的模型,转换成inference model。以基于Resnet50_vd骨干网络,在ICDAR2015英文数据集训练的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_icdar15_v2.0_train.tar)),可以使用如下命令进行转换: + +```bash linenums="1" +python3 tools/export_model.py -c configs/det/det_r50_vd_sast_icdar15.yml -o Global.pretrained_model=./det_r50_vd_sast_icdar15_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_sast_ic15 +``` + +**SAST文本检测模型推理,需要设置参数`--det_algorithm="SAST"`**,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_sast_ic15/" +``` + +可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为'det_res'。结果示例如下: + +![img](./images/det_res_img_10_sast.jpg) + +#### (2). 
弯曲文本检测模型(Total-Text) + +首先将SAST文本检测训练过程中保存的模型,转换成inference model。以基于Resnet50_vd骨干网络,在Total-Text英文数据集训练的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_totaltext_v2.0_train.tar)),可以使用如下命令进行转换: + +```bash linenums="1" +python3 tools/export_model.py -c configs/det/det_r50_vd_sast_totaltext.yml -o Global.pretrained_model=./det_r50_vd_sast_totaltext_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_sast_tt +``` + +SAST文本检测模型推理,需要设置参数`--det_algorithm="SAST"`,同时,还需要增加参数`--det_box_type=poly`,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_sast_tt/" --det_box_type='poly' +``` + +可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为'det_res'。结果示例如下: + +![img](./images/det_res_img623_sast.jpg) + +**注意**:本代码库中,SAST后处理Locality-Aware NMS有python和c++两种版本,c++版速度明显快于python版。由于c++版本nms编译版本问题,只有python3.5环境下会调用c++版nms,其他情况将调用python版nms。 + +### 4.2 C++推理 + +暂未支持 + +### 4.3 Serving服务化部署 + +暂未支持 + +### 4.4 更多推理部署 + +暂未支持 + +## 5. FAQ + +## 引用 + +```bibtex +@inproceedings{wang2019single, + title={A Single-Shot Arbitrarily-Shaped Text Detector based on Context Attended Multi-Task Learning}, + author={Wang, Pengfei and Zhang, Chengquan and Qi, Fei and Huang, Zuming and En, Mengyi and Han, Junyu and Liu, Jingtuo and Ding, Errui and Shi, Guangming}, + booktitle={Proceedings of the 27th ACM International Conference on Multimedia}, + pages={1277--1285}, + year={2019} +} +``` diff --git a/docs/algorithm/text_detection/images/det_res_img623_ct.jpg b/docs/algorithm/text_detection/images/det_res_img623_ct.jpg new file mode 100644 index 0000000000..2c5f57d96c Binary files /dev/null and b/docs/algorithm/text_detection/images/det_res_img623_ct.jpg differ diff --git a/docs/algorithm/text_detection/images/det_res_img623_fce.jpg b/docs/algorithm/text_detection/images/det_res_img623_fce.jpg new file mode 100644 index 0000000000..938ae4cabf Binary files /dev/null and b/docs/algorithm/text_detection/images/det_res_img623_fce.jpg differ diff --git a/docs/algorithm/text_detection/images/det_res_img623_sast.jpg b/docs/algorithm/text_detection/images/det_res_img623_sast.jpg new file mode 100644 index 0000000000..af5e2d6e2c Binary files /dev/null and b/docs/algorithm/text_detection/images/det_res_img623_sast.jpg differ diff --git a/docs/algorithm/text_detection/images/det_res_img_10_db.jpg b/docs/algorithm/text_detection/images/det_res_img_10_db.jpg new file mode 100644 index 0000000000..6af89f6bb3 Binary files /dev/null and b/docs/algorithm/text_detection/images/det_res_img_10_db.jpg differ diff --git a/docs/algorithm/text_detection/images/det_res_img_10_east.jpg b/docs/algorithm/text_detection/images/det_res_img_10_east.jpg new file mode 100644 index 0000000000..908d077c3e Binary files /dev/null and b/docs/algorithm/text_detection/images/det_res_img_10_east.jpg differ diff --git a/docs/algorithm/text_detection/images/det_res_img_10_fce.jpg b/docs/algorithm/text_detection/images/det_res_img_10_fce.jpg new file mode 100644 index 0000000000..fb32950ffd Binary files /dev/null and b/docs/algorithm/text_detection/images/det_res_img_10_fce.jpg differ diff --git a/docs/algorithm/text_detection/images/det_res_img_10_pse.jpg b/docs/algorithm/text_detection/images/det_res_img_10_pse.jpg new file mode 100644 index 0000000000..cdb7625dd0 Binary files /dev/null and b/docs/algorithm/text_detection/images/det_res_img_10_pse.jpg differ diff --git 
a/docs/algorithm/text_detection/images/det_res_img_10_pse_poly.jpg b/docs/algorithm/text_detection/images/det_res_img_10_pse_poly.jpg new file mode 100644 index 0000000000..9c06a17ccb Binary files /dev/null and b/docs/algorithm/text_detection/images/det_res_img_10_pse_poly.jpg differ diff --git a/docs/algorithm/text_detection/images/det_res_img_10_sast.jpg b/docs/algorithm/text_detection/images/det_res_img_10_sast.jpg new file mode 100644 index 0000000000..702f773e68 Binary files /dev/null and b/docs/algorithm/text_detection/images/det_res_img_10_sast.jpg differ diff --git a/docs/algorithm/text_recognition/algorithm_rec_abinet.en.md b/docs/algorithm/text_recognition/algorithm_rec_abinet.en.md new file mode 100644 index 0000000000..d53cd2dab4 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_abinet.en.md @@ -0,0 +1,121 @@ +--- +comments: true +--- + +# ABINet + +## 1. Introduction + +Paper: +> [ABINet: Read Like Humans: Autonomous, Bidirectional and Iterative Language Modeling for Scene Text Recognition](https://openaccess.thecvf.com/content/CVPR2021/papers/Fang_Read_Like_Humans_Autonomous_Bidirectional_and_Iterative_Language_Modeling_for_CVPR_2021_paper.pdf) +> Shancheng Fang and Hongtao Xie and Yuxin Wang and Zhendong Mao and Yongdong Zhang +> CVPR, 2021 + +Using MJSynth and SynthText two text recognition datasets for training, and evaluating on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE datasets, the algorithm reproduction effect is as follows: + +|Model|Backbone|config|Acc|Download link| +| --- | --- | --- | --- | --- | +|ABINet|ResNet45|[rec_r45_abinet.yml](../../configs/rec/rec_r45_abinet.yml)|90.75%|[pretrained & trained model](https://paddleocr.bj.bcebos.com/rec_r45_abinet_train.tar)| + +## 2. Environment + +Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md)to clone the project code. + +## 3. Model Training / Evaluation / Prediction + +Please refer to [Text Recognition Tutorial](../../ppocr/model_train/recognition.en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. + +### Training + +Specifically, after the data preparation is completed, the training can be started. The training command is as follows: + +```bash linenums="1" +# Single GPU training (long training period, not recommended) +python3 tools/train.py -c configs/rec/rec_r45_abinet.yml + +# Multi GPU training, specify the gpu number through the --gpus parameter +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r45_abinet.yml +``` + +### Evaluation + +```bash linenums="1" +# GPU evaluation +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r45_abinet.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### Prediction + +```bash linenums="1" +# The configuration file used for prediction must match the training +python3 tools/infer_rec.py -c configs/rec/rec_r45_abinet.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model=./rec_r45_abinet_train/best_accuracy +``` + +## 4. Inference and Deployment + +### 4.1 Python Inference + +First, the model saved during the ABINet text recognition training process is converted into an inference model. 
( [Model download link](https://paddleocr.bj.bcebos.com/rec_r45_abinet_train.tar)) ), you can use the following command to convert: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_r45_abinet.yml -o Global.pretrained_model=./rec_r45_abinet_train/best_accuracy Global.save_inference_dir=./inference/rec_r45_abinet +``` + +**Note:** + +- If you are training the model on your own dataset and have modified the dictionary file, please pay attention to modify the `character_dict_path` in the configuration file to the modified dictionary file. +- If you modified the input size during training, please modify the `infer_shape` corresponding to ABINet in the `tools/export_model.py` file. + +After the conversion is successful, there are three files in the directory: + +```text linenums="1" +/inference/rec_r45_abinet/ + ├── inference.pdiparams + ├── inference.pdiparams.info + └── inference.pdmodel +``` + +For ABINet text recognition model inference, the following commands can be executed: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_r45_abinet/' --rec_algorithm='ABINet' --rec_image_shape='3,32,128' --rec_char_dict_path='./ppocr/utils/ic15_dict.txt' +``` + +![img](./images/word_10.png) + +After executing the command, the prediction result (recognized text and score) of the image above is printed to the screen, an example is as follows: +The result is as follows: + +```bash linenums="1" +Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9999995231628418) +``` + +### 4.2 C++ Inference + +Not supported + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. FAQ + +1. Note that the MJSynth and SynthText datasets come from [ABINet repo](https://github.com/FangShancheng/ABINet). +2. We use the pre-trained model provided by the ABINet authors for finetune training. + +## Citation + +```bibtex +@article{Fang2021ABINet, + title = {ABINet: Read Like Humans: Autonomous, Bidirectional and Iterative Language Modeling for Scene Text Recognition}, + author = {Shancheng Fang and Hongtao Xie and Yuxin Wang and Zhendong Mao and Yongdong Zhang}, + booktitle = {CVPR}, + year = {2021}, + url = {https://arxiv.org/abs/2103.06495}, + pages = {7098-7107} +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_abinet.md b/docs/algorithm/text_recognition/algorithm_rec_abinet.md new file mode 100644 index 0000000000..dd37dd9861 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_abinet.md @@ -0,0 +1,136 @@ +--- +comments: true +--- + +# 场景文本识别算法-ABINet + +## 1. 算法简介 + +论文信息: +> [ABINet: Read Like Humans: Autonomous, Bidirectional and Iterative Language Modeling for Scene Text Recognition](https://openaccess.thecvf.com/content/CVPR2021/papers/Fang_Read_Like_Humans_Autonomous_Bidirectional_and_Iterative_Language_Modeling_for_CVPR_2021_paper.pdf) +> Shancheng Fang and Hongtao Xie and Yuxin Wang and Zhendong Mao and Yongdong Zhang +> CVPR, 2021 + +`ABINet`使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法复现效果如下: + +|模型|骨干网络|配置文件|Acc|下载链接| +| --- | --- | --- | --- | --- | +|ABINet|ResNet45|[rec_r45_abinet.yml](../../configs/rec/rec_r45_abinet.yml)|90.75%|[预训练、训练模型](https://paddleocr.bj.bcebos.com/rec_r45_abinet_train.tar)| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 
模型训练、评估、预测 + +### 3.1 模型训练 + +请参考[文本识别训练教程](../../ppocr/model_train/recognition.md)。PaddleOCR对代码进行了模块化,训练`ABINet`识别模型时需要**更换配置文件**为`ABINet`的[配置文件](../../configs/rec/rec_r45_abinet.yml)。 + +#### 启动训练 + +具体地,在完成数据准备后,便可以启动训练,训练命令如下: + +```bash linenums="1" +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_r45_abinet.yml + +# 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r45_abinet.yml +``` + +### 3.2 评估 + +可下载已训练完成的模型文件,使用如下命令进行评估: + +```bash linenums="1" +# 注意将pretrained_model的路径设置为本地路径。 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r45_abinet.yml -o Global.pretrained_model=./rec_r45_abinet_train/best_accuracy +``` + +### 3.3 预测 + +使用如下命令进行单张图片预测: + +```bash linenums="1" +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/infer_rec.py -c configs/rec/rec_r45_abinet.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model=./rec_r45_abinet_train/best_accuracy +# 预测文件夹下所有图像时,可修改infer_img为文件夹,如 Global.infer_img='./doc/imgs_words_en/'。 +``` + +## 4. 推理部署 + +### 4.1 Python推理 + +首先将训练得到best模型,转换成inference model。这里以训练完成的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/rec_r45_abinet_train.tar) ),可以使用如下命令进行转换: + +```bash linenums="1" +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/export_model.py -c configs/rec/rec_r45_abinet.yml -o Global.pretrained_model=./rec_r45_abinet_train/best_accuracy Global.save_inference_dir=./inference/rec_r45_abinet/ +``` + +**注意:** + +- 如果您是在自己的数据集上训练的模型,并且调整了字典文件,请注意修改配置文件中的`character_dict_path`是否是所需要的字典文件。 +- 如果您修改了训练时的输入大小,请修改`tools/export_model.py`文件中的对应ABINet的`infer_shape`。 + +转换成功后,在目录下有三个文件: + +```text linenums="1" +/inference/rec_r45_abinet/ + ├── inference.pdiparams # 识别inference模型的参数文件 + ├── inference.pdiparams.info # 识别inference模型的参数信息,可忽略 + └── inference.pdmodel # 识别inference模型的program文件 +``` + +执行如下命令进行模型推理: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_r45_abinet/' --rec_algorithm='ABINet' --rec_image_shape='3,32,128' --rec_char_dict_path='./ppocr/utils/ic15_dict.txt' +# 预测文件夹下所有图像时,可修改image_dir为文件夹,如 --image_dir='./doc/imgs_words_en/'。 +``` + +![img](./images/word_10.png) + +执行命令后,上面图像的预测结果(识别的文本和得分)会打印到屏幕上,示例如下: +结果如下: + +```bash linenums="1" +Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9999995231628418) +``` + +**注意**: + +- 训练上述模型采用的图像分辨率是[3,32,128],需要通过参数`rec_image_shape`设置为您训练时的识别图像形状。 +- 在推理时需要设置参数`rec_char_dict_path`指定字典,如果您修改了字典,请修改该参数为您的字典文件。 +- 如果您修改了预处理方法,需修改`tools/infer/predict_rec.py`中ABINet的预处理为您的预处理方法。 + +### 4.2 C++推理部署 + +由于C++预处理后处理还未支持ABINet,所以暂未支持 + +### 4.3 Serving服务化部署 + +暂不支持 + +### 4.4 更多推理部署 + +暂不支持 + +## 5. FAQ + +1. MJSynth和SynthText两种数据集来自于[ABINet源repo](https://github.com/FangShancheng/ABINet) 。 +2. 
+我们使用ABINet作者提供的预训练模型进行finetune训练。
+
+## 引用
+
+```bibtex
+@article{Fang2021ABINet,
+  title = {ABINet: Read Like Humans: Autonomous, Bidirectional and Iterative Language Modeling for Scene Text Recognition},
+  author = {Shancheng Fang and Hongtao Xie and Yuxin Wang and Zhendong Mao and Yongdong Zhang},
+  booktitle = {CVPR},
+  year = {2021},
+  url = {https://arxiv.org/abs/2103.06495},
+  pages = {7098-7107}
+}
+```
diff --git a/docs/algorithm/text_recognition/algorithm_rec_aster.en.md b/docs/algorithm/text_recognition/algorithm_rec_aster.en.md
new file mode 100644
index 0000000000..d4ae880104
--- /dev/null
+++ b/docs/algorithm/text_recognition/algorithm_rec_aster.en.md
@@ -0,0 +1,99 @@
+---
+comments: true
+---
+
+# STAR-Net
+
+## 1. Introduction
+
+Paper:
+> [STAR-Net: a spatial attention residue network for scene text recognition.](http://www.bmva.org/bmvc/2016/papers/paper043/paper043.pdf)
+> Wei Liu, Chaofeng Chen, Kwan-Yee K. Wong, Zhizhong Su and Junyu Han.
+> BMVC, pages 43.1-43.13, 2016
+
+Using the MJSynth and SynthText text recognition datasets for training, and evaluating on the IIIT, SVT, IC03, IC13, IC15, SVTP and CUTE datasets, the algorithm reproduction effect is as follows:
+
+|Model|Backbone|ACC|config|Download link|
+| --- | --- | --- | --- | --- |
+|StarNet|Resnet34_vd|84.44%|[configs/rec/rec_r34_vd_tps_bilstm_ctc.yml](../../configs/rec/rec_r34_vd_tps_bilstm_ctc.yml)|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_ctc_v2.0_train.tar)|
+|StarNet|MobileNetV3|81.42%|[configs/rec/rec_mv3_tps_bilstm_ctc.yml](../../configs/rec/rec_mv3_tps_bilstm_ctc.yml)|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_ctc_v2.0_train.tar)|
+
+## 2. Environment
+
+Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md) to clone the project code.
+
+## 3. Model Training / Evaluation / Prediction
+
+Please refer to [Text Recognition Tutorial](../../ppocr/model_train/recognition.en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**.
+
+### Training
+
+Specifically, after the data preparation is completed, the training can be started. The training command is as follows:
+
+```bash linenums="1"
+# Single GPU training (long training period, not recommended)
+python3 tools/train.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml
+
+# Multi GPU training, specify the gpu number through the --gpus parameter
+python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml
+```
+
+### Evaluation
+
+```bash linenums="1"
+# GPU evaluation
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy
+```
+
+### Prediction
+
+```bash linenums="1"
+# The configuration file used for prediction must match the training
+python3 tools/infer_rec.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png
+```
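+
+`Global.infer_img` also accepts a directory, so a whole folder of cropped word images can be scored in one call; this is only a usage sketch that reuses the example folder shipped with the repo:
+
+```bash linenums="1"
+# predict every image under doc/imgs_words_en/ with the same trained weights
+python3 tools/infer_rec.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words_en/
+```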
+
+## 4. Inference and Deployment
+
+### 4.1 Python Inference
+
+First, convert the model saved during the STAR-Net text recognition training process into an inference model ([Model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_r31_STAR-Net_train.tar)). You can use the following command to convert:
+
+```bash linenums="1"
+python3 tools/export_model.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml -o Global.pretrained_model=./rec_r34_vd_tps_bilstm_ctc_v2.0_train/best_accuracy Global.save_inference_dir=./inference/rec_starnet
+```
+
+For STAR-Net text recognition model inference, the following command can be executed:
+
+```bash linenums="1"
+python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./inference/rec_starnet/" --rec_image_shape="3, 32, 100" --rec_char_dict_path="./ppocr/utils/ic15_dict.txt"
+```
+
+### 4.2 C++ Inference
+
+With the inference model prepared, refer to the [cpp infer](../../ppocr/infer_deploy/cpp_infer.en.md) tutorial for C++ inference.
+
+### 4.3 Serving
+
+With the inference model prepared, refer to the [pdserving](../../ppocr/infer_deploy/paddle_server.en.md) tutorial for service deployment by Paddle Serving.
+
+### 4.4 More
+
+More deployment schemes supported for STAR-Net:
+
+- Paddle2ONNX: with the inference model prepared, please refer to the [paddle2onnx](../../ppocr/infer_deploy/paddle2onnx.en.md) tutorial.
+
+## 5. FAQ
+
+## Citation
+
+```bibtex
+@inproceedings{liu2016star,
+  title={STAR-Net: a spatial attention residue network for scene text recognition.},
+  author={Liu, Wei and Chen, Chaofeng and Wong, Kwan-Yee K and Su, Zhizhong and Han, Junyu},
+  booktitle={BMVC},
+  volume={2},
+  pages={7},
+  year={2016}
+}
+```
diff --git a/docs/algorithm/text_recognition/algorithm_rec_cppd.en.md b/docs/algorithm/text_recognition/algorithm_rec_cppd.en.md
new file mode 100644
index 0000000000..04b19c9b10
--- /dev/null
+++ b/docs/algorithm/text_recognition/algorithm_rec_cppd.en.md
@@ -0,0 +1,150 @@
+---
+comments: true
+---
+
+# CPPD
+
+## 1. Introduction
+
+Paper:
+> [Context Perception Parallel Decoder for Scene Text Recognition](https://arxiv.org/abs/2307.12270)
+> Yongkun Du and Zhineng Chen and Caiyan Jia and Xiaoting Yin and Chenxia Li and Yuning Du and Yu-Gang Jiang
+
+Scene text recognition models based on deep learning typically follow an Encoder-Decoder structure, where the decoder can be categorized into two types: (1) CTC and (2) Attention-based. Currently, most state-of-the-art (SOTA) models use an Attention-based decoder, which can be further divided into AR and PD types. In general, AR decoders achieve higher recognition accuracy than PD, while PD decoders are faster than AR. CPPD, with carefully designed CO and CC modules, achieves a balance between the accuracy of AR and the speed of PD.
+
+The accuracy (%) and model files of CPPD on public scene text recognition datasets are as follows:
+
+* English dataset from [PARSeq](https://github.com/baudm/parseq).
+
+| Model |IC13<br/>857 | SVT |IIIT5k<br/>3000 |IC15<br/>1811| SVTP |CUTE80 | Avg | Download |
+|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|:-----:|:-------:|
+| CPPD Tiny | 97.1 | 94.4 | 96.6 | 86.6 | 88.5 | 90.3 | 92.25 | [en](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_tiny_en_train.tar) |
+| CPPD Base | 98.2 | 95.5 | 97.6 | 87.9 | 90.0 | 92.7 | 93.80 | [en](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_en_train.tar)|
+| CPPD Base 48*160 | 97.5 | 95.5 | 97.7 | 87.7 | 92.4 | 93.7 | 94.10 | [en](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_48_160_en_train.tar) |
+
+* Trained on the synthetic datasets (MJ+ST) and tested on the Union14M-L benchmark from [U14m](https://github.com/Mountchicken/Union14M/).
+
+| Model |Curve | Multi-<br/>Oriented |Artistic |Contextless| Salient | Multi-<br/>word | General | Avg | Download |
+|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|:-----:|:-------:|:-------:|
+| CPPD Tiny | 52.4 | 12.3 | 48.2 | 54.4 | 61.5 | 53.4 | 61.4 | 49.10 | Same as the table above. |
+| CPPD Base | 65.5 | 18.6 | 56.0 | 61.9 | 71.0 | 57.5 | 65.8 | 56.63 | Same as the table above. |
+| CPPD Base 48*160 | 71.9 | 22.1 | 60.5 | 67.9 | 78.3 | 63.9 | 67.1 | 61.69 | Same as the table above. |
+
+* Trained on the Union14M-L training dataset.
+
+| Model |IC13<br/>857 | SVT |IIIT5k<br/>3000 |IC15<br/>1811| SVTP |CUTE80 | Avg | Download |
+|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|:-----:|:-------:|
+| CPPD Base 32*128 | 98.7 | 98.5 | 99.4 | 91.7 | 96.7 | 99.7 | 97.44 | [en](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_u14m_train.tar) |
+
+| Model |Curve | Multi-<br/>Oriented |Artistic |Contextless| Salient | Multi-<br/>word | General | Avg | Download |
+|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|:-----:|:-------:|:-------:|
+| CPPD Base 32*128 | 87.5 | 70.7 | 78.2 | 82.9 | 85.5 | 85.4 | 84.3 | 82.08 | Same as the table above. |
+
+* Chinese dataset from [Chinese Benchmark](https://github.com/FudanVI/benchmarking-chinese-text-recognition).
+
+| Model | Scene | Web | Document | Handwriting | Avg | Download |
+|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|
+| CPPD Base | 74.4 | 76.1 | 98.6 | 55.3 | 76.10 | [ch](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_ch_train.tar) |
+| CPPD Base + STN | 78.4 | 79.3 | 98.9 | 57.6 | 78.55 | [ch](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_stn_ch_train.tar) |
+
+## 2. Environment
+
+Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md) to clone the project code.
+
+### Dataset Preparation
+
+[English dataset download](https://github.com/baudm/parseq)
+[Union14M-Benchmark download](https://github.com/Mountchicken/Union14M)
+[Chinese dataset download](https://github.com/fudanvi/benchmarking-chinese-text-recognition#download)
+
+## 3. Model Training / Evaluation / Prediction
+
+Please refer to [Text Recognition Tutorial](../../ppocr/model_train/recognition.en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**.
+
+### Training
+
+Specifically, after the data preparation is completed, the training can be started. The training command is as follows:
+
+```bash linenums="1"
+# Single GPU training (long training period, not recommended)
+python3 tools/train.py -c configs/rec/rec_svtrnet_cppd_base_en.yml
+
+# Multi GPU training, specify the gpu number through the --gpus parameter
+python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_svtrnet_cppd_base_en.yml
+```
+
+### Evaluation
+
+You can download the model and configuration files provided by `CPPD` ([download link](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_en_train.tar)). Taking `CPPD-B` as an example, use the following command to evaluate:
+
+```bash linenums="1"
+# Download the tar archive containing the model files and configuration files of CPPD-B and extract it
+wget https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_en_train.tar && tar xf rec_svtr_cppd_base_en_train.tar
+# GPU evaluation
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c ./rec_svtr_cppd_base_en_train/rec_svtrnet_cppd_base_en.yml -o Global.pretrained_model=./rec_svtr_cppd_base_en_train/best_model
+```
+
+### Prediction
+
+```bash linenums="1"
+python3 tools/infer_rec.py -c ./rec_svtr_cppd_base_en_train/rec_svtrnet_cppd_base_en.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model=./rec_svtr_cppd_base_en_train/best_model
+```
+
+## 4. Inference and Deployment
+
+### 4.1 Python Inference
+
+First, convert the model saved during the CPPD text recognition training process into an inference model
( [Model download link](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_en_train.tar) ), you can use the following command to convert: + +```bash linenums="1" +# export model +# en +python3 tools/export_model.py -c configs/rec/rec_svtrnet_cppd_base_en.yml -o Global.pretrained_model=./rec_svtr_cppd_base_en_train/best_model.pdparams Global.save_inference_dir=./rec_svtr_cppd_base_en_infer +# ch +python3 tools/export_model.py -c configs/rec/rec_svtrnet_cppd_base_ch.yml -o Global.pretrained_model=./rec_svtr_cppd_base_ch_train/best_model.pdparams Global.save_inference_dir=./rec_svtr_cppd_base_ch_infer + +# speed test +# docker image https://hub.docker.com/r/paddlepaddle/paddle/tags/: sudo docker pull paddlepaddle/paddle:2.4.2-gpu-cuda11.2-cudnn8.2-trt8.0 +# install auto_log: pip install https://paddleocr.bj.bcebos.com/libs/auto_log-1.2.0-py3-none-any.whl +# en +python3 tools/infer/predict_rec.py --image_dir='../iiik' --rec_model_dir='./rec_svtr_cppd_base_en_infer/' --rec_algorithm='CPPD' --rec_image_shape='3,32,100' --rec_char_dict_path='./ppocr/utils/ic15_dict.txt' --warmup=True --benchmark=True --rec_batch_num=1 --use_tensorrt=True +# ch +python3 tools/infer/predict_rec.py --image_dir='../iiik' --rec_model_dir='./rec_svtr_cppd_base_ch_infer/' --rec_algorithm='CPPDPadding' --rec_image_shape='3,32,256' --warmup=True --benchmark=True --rec_batch_num=1 --use_tensorrt=True +# stn_ch +python3 tools/infer/predict_rec.py --image_dir='../iiik' --rec_model_dir='./rec_svtr_cppd_base_stn_ch_infer/' --rec_algorithm='CPPD' --rec_image_shape='3,64,256' --warmup=True --benchmark=True --rec_batch_num=1 --use_tensorrt=True +``` + +**Note:** If you are training the model on your own dataset and have modified the dictionary file, please pay attention to modify the `character_dict_path` in the configuration file to the modified dictionary file. + +After the conversion is successful, there are three files in the directory: + +```text linenums="1" +/inference/rec_svtr_cppd_base_en_infer/ + ├── inference.pdiparams + ├── inference.pdiparams.info + └── inference.pdmodel +``` + +### 4.2 C++ Inference + +Not supported + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## Citation + +```bibtex +@article{Du2023CPPD, + title = {Context Perception Parallel Decoder for Scene Text Recognition}, + author = {Du, Yongkun and Chen, Zhineng and Jia, Caiyan and Yin, Xiaoting and Li, Chenxia and Du, Yuning and Jiang, Yu-Gang}, + booktitle = {Arxiv}, + year = {2023}, + url = {https://arxiv.org/abs/2307.12270} +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_cppd.md b/docs/algorithm/text_recognition/algorithm_rec_cppd.md new file mode 100644 index 0000000000..561fda146c --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_cppd.md @@ -0,0 +1,175 @@ +--- +comments: true +--- + +# 场景文本识别算法-CPPD + +## 1. 算法简介 + +论文信息: +> [Context Perception Parallel Decoder for Scene Text Recognition](https://arxiv.org/abs/2307.12270) +> Yongkun Du and Zhineng Chen and Caiyan Jia and Xiaoting Yin and Chenxia Li and Yuning Du and Yu-Gang Jiang + +### CPPD算法简介 + +基于深度学习的场景文本识别模型通常是Encoder-Decoder结构,其中decoder可以分为两种:(1)CTC,(2)Attention-based。目前SOTA模型大多使用Attention-based的decoder,而attention-based可以分为AR和PD两种,一般来说,AR解码器识别精度优于PD,而PD解码速度快于AR,CPPD通过精心设计的CO和CC模块,达到了“AR的精度,PD的速度”的效果。 + +CPPD在场景文本识别公开数据集上的精度(%)和模型文件如下: + +* 英文训练集和测试集来自于[PARSeq](https://github.com/baudm/parseq)。 + +| 模型 |IC13
857 | SVT |IIIT5k
3000 |IC15
1811| SVTP |CUTE80 | Avg | 下载链接 | +|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|:-----:|:-------:| +| CPPD Tiny | 97.1 | 94.4 | 96.6 | 86.6 | 88.5 | 90.3 | 92.25 | [英文](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_tiny_en_train.tar) | +| CPPD Base | 98.2 | 95.5 | 97.6 | 87.9 | 90.0 | 92.7 | 93.80 | [英文](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_en_train.tar)| +| CPPD Base 48*160 | 97.5 | 95.5 | 97.7 | 87.7 | 92.4 | 93.7 | 94.10 | [英文](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_48_160_en_train.tar) | + +* 英文合成数据集(MJ+ST)训练,英文Union14M-L benchmark测试结果[U14m](https://github.com/Mountchicken/Union14M/)。 + +| 模型 |Curve | Multi-
Oriented |Artistic |Contextless| Salient | Multi-
word | General | Avg | 下载链接 | +|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|:-----:|:-------:|:-------:| +| CPPD Tiny | 52.4 | 12.3 | 48.2 | 54.4 | 61.5 | 53.4 | 61.4 | 49.10 | 同上表 | +| CPPD Base | 65.5 | 18.6 | 56.0 | 61.9 | 71.0 | 57.5 | 65.8 | 56.63 | 同上表 | +| CPPD Base 48*160 | 71.9 | 22.1 | 60.5 | 67.9 | 78.3 | 63.9 | 67.1 | 61.69 | 同上表 | + +* Union14M-L 训练集From scratch训练,英文测试结果。 + +| 模型 |IC13
857 | SVT |IIIT5k
3000 |IC15
1811| SVTP |CUTE80 | Avg | 下载链接 | +|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|:-----:|:-------:| +| CPPD Base 32*128 | 98.5 | 97.7 | 99.2 | 90.3 | 94.6 | 98.3 | 96.42 | Coming soon | + +| 模型 |Curve | Multi-
Oriented |Artistic |Contextless| Salient | Multi-
word | General | Avg | 下载链接 | +|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|:-----:|:-------:|:-------:| +| CPPD Base 32*128 | 83.0 | 71.2 | 75.1 | 80.9 | 79.4 | 82.6 | 83.7 | 79.41 | Coming soon | + +* 加载合成数据集预训练模型,Union14M-L 训练集微调训练,英文测试结果。 + +| 模型 |IC13
857 | SVT |IIIT5k
3000 |IC15
1811| SVTP |CUTE80 | Avg | 下载链接 | +|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|:-----:|:-------:| +| CPPD Base 32*128 | 98.7 | 98.5 | 99.4 | 91.7 | 96.7 | 99.7 | 97.44 | [英文](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_u14m_train.tar) | + +| 模型 |Curve | Multi-
Oriented |Artistic |Contextless| Salient | Multi-
word | General | Avg | 下载链接 | +|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|:-----:|:-------:|:-------:| +| CPPD Base 32*128 | 87.5 | 70.7 | 78.2 | 82.9 | 85.5 | 85.4 | 84.3 | 82.08 | 同上表 | + +* 中文训练集和测试集来自于[Chinese Benckmark](https://github.com/FudanVI/benchmarking-chinese-text-recognition)。 + +| 模型 | Scene | Web | Document | Handwriting | Avg | 下载链接 | +|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:| +| CPPD Base | 74.4 | 76.1 | 98.6 | 55.3 | 76.10 | [中文](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_ch_train.tar) | +| CPPD Base + STN | 78.4 | 79.3 | 98.9 | 57.6 | 78.55 | [中文](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_stn_ch_train.tar) | + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 模型训练、评估、预测 + +### 3.1 模型训练 + +#### 数据集准备 + +[英文数据集下载](https://github.com/baudm/parseq) + +[Union14M-L 下载](https://github.com/Mountchicken/Union14M) + +[中文数据集下载](https://github.com/fudanvi/benchmarking-chinese-text-recognition#download) + +#### 启动训练 + +请参考[文本识别训练教程](../../ppocr/model_train/recognition.md)。PaddleOCR对代码进行了模块化,训练`CPPD`识别模型时需要**更换配置文件**为`CPPD`的[配置文件](../../configs/rec/rec_svtrnet_cppd_base_en.yml)。 + +具体地,在完成数据准备后,便可以启动训练,训练命令如下: + +```bash linenums="1" +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_svtrnet_cppd_base_en.yml + +# 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_svtrnet_cppd_base_en.yml +``` + +### 3.2 评估 + +可下载`CPPD`提供的模型文件和配置文件:[下载地址](https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_en_train.tar) ,以`CPPD-B`为例,使用如下命令进行评估: + +```bash linenums="1" +# 下载包含CPPD-B的模型文件和配置文件的tar压缩包并解压 +wget https://paddleocr.bj.bcebos.com/CCPD/rec_svtr_cppd_base_en_train.tar && tar xf rec_svtr_cppd_base_en_train.tar +# 注意将pretrained_model的路径设置为本地路径。 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c ./rec_svtr_cppd_base_en_train/rec_svtrnet_cppd_base_en.yml -o Global.pretrained_model=./rec_svtr_cppd_base_en_train/best_model +``` + +### 3.3 预测 + +使用如下命令进行单张图片预测: + +```bash linenums="1" +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/infer_rec.py -c ./rec_svtr_cppd_base_en_train/rec_svtrnet_cppd_base_en.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model=./rec_svtr_cppd_base_en_train/best_model +# 预测文件夹下所有图像时,可修改infer_img为文件夹,如 Global.infer_img='./doc/imgs_words_en/'。 +``` + +## 4. 
推理部署 + +### 4.1 Python推理 + +首先将训练得到best模型,转换成inference model。下面以基于`CPPD-B`,在英文数据集训练的模型为例([模型和配置文件下载地址](https://paddleocr.bj.bcebos.com/CPPD/rec_svtr_cppd_base_en_train.tar),可以使用如下命令进行转换: + +**注意:** + +* 如果您是在自己的数据集上训练的模型,并且调整了字典文件,请注意修改配置文件中的`character_dict_path`是否为所正确的字典文件。 + +执行如下命令进行模型导出和推理: + +```bash linenums="1" +# 注意将pretrained_model的路径设置为本地路径。 +# export model +# en +python3 tools/export_model.py -c configs/rec/rec_svtrnet_cppd_base_en.yml -o Global.pretrained_model=./rec_svtr_cppd_base_en_train/best_model.pdparams Global.save_inference_dir=./rec_svtr_cppd_base_en_infer +# ch +python3 tools/export_model.py -c configs/rec/rec_svtrnet_cppd_base_ch.yml -o Global.pretrained_model=./rec_svtr_cppd_base_ch_train/best_model.pdparams Global.save_inference_dir=./rec_svtr_cppd_base_ch_infer + +# speed test +# docker image https://hub.docker.com/r/paddlepaddle/paddle/tags/: sudo docker pull paddlepaddle/paddle:2.4.2-gpu-cuda11.2-cudnn8.2-trt8.0 +# install auto_log: pip install https://paddleocr.bj.bcebos.com/libs/auto_log-1.2.0-py3-none-any.whl +# en +python3 tools/infer/predict_rec.py --image_dir='../iiik' --rec_model_dir='./rec_svtr_cppd_base_en_infer/' --rec_algorithm='CPPD' --rec_image_shape='3,32,100' --rec_char_dict_path='./ppocr/utils/ic15_dict.txt' --warmup=True --benchmark=True --rec_batch_num=1 --use_tensorrt=True +# ch +python3 tools/infer/predict_rec.py --image_dir='../iiik' --rec_model_dir='./rec_svtr_cppd_base_ch_infer/' --rec_algorithm='CPPDPadding' --rec_image_shape='3,32,256' --warmup=True --benchmark=True --rec_batch_num=1 --use_tensorrt=True +# stn_ch +python3 tools/infer/predict_rec.py --image_dir='../iiik' --rec_model_dir='./rec_svtr_cppd_base_stn_ch_infer/' --rec_algorithm='CPPD' --rec_image_shape='3,64,256' --warmup=True --benchmark=True --rec_batch_num=1 --use_tensorrt=True +``` + +导出成功后,在目录下有三个文件: + +``` +/inference/rec_svtr_cppd_base_en_infer/ + ├── inference.pdiparams # 识别inference模型的参数文件 + ├── inference.pdiparams.info # 识别inference模型的参数信息,可忽略 + └── inference.pdmodel # 识别inference模型的program文件 +``` + +### 4.2 C++推理部署 + +由于C++预处理后处理还未支持CPPD,所以暂未支持 + +### 4.3 Serving服务化部署 + +暂不支持 + +### 4.4 更多推理部署 + +暂不支持 + +## 引用 + +```bibtex +@article{Du2023CPPD, + title = {Context Perception Parallel Decoder for Scene Text Recognition}, + author = {Du, Yongkun and Chen, Zhineng and Jia, Caiyan and Yin, Xiaoting and Li, Chenxia and Du, Yuning and Jiang, Yu-Gang}, + booktitle = {Arxiv}, + year = {2023}, + url = {https://arxiv.org/abs/2307.12270} +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_crnn.en.md b/docs/algorithm/text_recognition/algorithm_rec_crnn.en.md new file mode 100644 index 0000000000..db0d989552 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_crnn.en.md @@ -0,0 +1,102 @@ +--- +comments: true +--- + +# CRNN + +## 1. 
Introduction + +Paper: +> [An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition](https://arxiv.org/abs/1507.05717) + +> Baoguang Shi, Xiang Bai, Cong Yao + +> IEEE, 2015 + +Using MJSynth and SynthText two text recognition datasets for training, and evaluating on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE datasets, the algorithm reproduction effect is as follows: + +|Model|Backbone|ACC|config|Download link| +| --- | --- | --- | --- | --- | +|---|---|---|---|---| +|CRNN|Resnet34_vd|81.04%|[configs/rec/rec_r34_vd_none_bilstm_ctc.yml](../../configs/rec/rec_r34_vd_none_bilstm_ctc.yml)|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_bilstm_ctc_v2.0_train.tar)| +|CRNN|MobileNetV3|77.95%|[configs/rec/rec_mv3_none_bilstm_ctc.yml](../../configs/rec/rec_mv3_none_bilstm_ctc.yml)|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_none_bilstm_ctc_v2.0_train.tar)| + +## 2. Environment + +Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md)to clone the project code. + +## 3. Model Training / Evaluation / Prediction + +Please refer to [Text Recognition Tutorial](../../ppocr/model_train/recognition.en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. + +### Training + +Specifically, after the data preparation is completed, the training can be started. The training command is as follows: + +```bash linenums="1" +# Single GPU training (long training period, not recommended) +python3 tools/train.py -c configs/rec/rec_r34_vd_none_bilstm_ctc.yml + +# Multi GPU training, specify the gpu number through the --gpus parameter +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r34_vd_none_bilstm_ctc.yml +``` + +### Evaluation + +```bash linenums="1" +# GPU evaluation +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r34_vd_none_bilstm_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### Prediction + +```bash linenums="1" +# The configuration file used for prediction must match the training +python3 tools/infer_rec.py -c configs/rec/rec_r34_vd_none_bilstm_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +``` + +## 4. Inference and Deployment + +### 4.1 Python Inference + +First, the model saved during the CRNN text recognition training process is converted into an inference model. ( [Model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_r31_CRNN_train.tar) ), you can use the following command to convert: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_r34_vd_none_bilstm_ctc.yml -o Global.pretrained_model=./rec_r34_vd_none_bilstm_ctc_v2.0_train/best_accuracy Global.save_inference_dir=./inference/rec_crnn +``` + +For CRNN text recognition model inference, the following commands can be executed: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./inference/rec_crnn/" --rec_image_shape="3, 32, 100" --rec_char_dict_path="./ppocr/utils/ic15_dict.txt" +``` + +### 4.2 C++ Inference + +With the inference model prepared, refer to the [cpp infer](../../ppocr/infer_deploy/cpp_infer.en.md) tutorial for C++ inference. 
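+
+Whichever deployment route you choose, it can be useful to first sanity-check the exported inference model directly from Python with the Paddle Inference API. The sketch below is illustrative only: it reuses the export path and test image from section 4.1, but replaces the resize-and-pad preprocessing of `tools/infer/predict_rec.py` with a plain resize, so treat it as a quick smoke test rather than the reference implementation.
+
+```python linenums="1"
+# Hedged sketch: load the exported CRNN inference model and run a single image through it.
+import cv2
+import numpy as np
+from paddle.inference import Config, create_predictor
+
+config = Config("./inference/rec_crnn/inference.pdmodel",
+                "./inference/rec_crnn/inference.pdiparams")
+config.disable_gpu()  # or config.enable_use_gpu(200, 0) to run on GPU 0
+predictor = create_predictor(config)
+
+# Simplified preprocessing: resize to the training shape 3x32x100 and normalize to [-1, 1].
+img = cv2.imread("./doc/imgs_words_en/word_336.png")
+img = cv2.resize(img, (100, 32)).astype("float32") / 255.0
+img = ((img - 0.5) / 0.5).transpose(2, 0, 1)[np.newaxis, ...]  # NCHW batch of 1
+
+input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
+input_handle.copy_from_cpu(np.ascontiguousarray(img))
+predictor.run()
+probs = predictor.get_output_handle(predictor.get_output_names()[0]).copy_to_cpu()
+print(probs.shape)  # per-timestep character probabilities, ready for CTC greedy decoding
+```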
+ +### 4.3 Serving + +With the inference model prepared, refer to the [pdserving](../../ppocr/infer_deploy/paddle_server.en.md) tutorial for service deployment by Paddle Serving. + +### 4.4 More + +More deployment schemes supported for CRNN: + +- Paddle2ONNX: with the inference model prepared, please refer to the [paddle2onnx](../../ppocr/infer_deploy/paddle2onnx.en.md) tutorial. + +## 5. FAQ + +## Citation + +```bibtex +@ARTICLE{7801919, + author={Shi, Baoguang and Bai, Xiang and Yao, Cong}, + journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, + title={An End-to-End Trainable Neural Network for Image-Based Sequence Recognition and Its Application to Scene Text Recognition}, + year={2017}, + volume={39}, + number={11}, + pages={2298-2304}, + doi={10.1109/TPAMI.2016.2646371}} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_crnn.md b/docs/algorithm/text_recognition/algorithm_rec_crnn.md new file mode 100644 index 0000000000..1f572beca5 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_crnn.md @@ -0,0 +1,118 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# CRNN + +## 1. 算法简介 + +论文信息: +> [An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition](https://arxiv.org/abs/1507.05717) +> Baoguang Shi, Xiang Bai, Cong Yao +> IEEE, 2015 + +参考[DTRB](https://arxiv.org/abs/1904.01906) 文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下: + +|模型|骨干网络|Avg Accuracy|配置文件|下载链接| +|---|---|---|---|---| +|CRNN|Resnet34_vd|81.04%|[configs/rec/rec_r34_vd_none_bilstm_ctc.yml](../../configs/rec/rec_r34_vd_none_bilstm_ctc.yml)|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_bilstm_ctc_v2.0_train.tar)| +|CRNN|MobileNetV3|77.95%|[configs/rec/rec_mv3_none_bilstm_ctc.yml](../../configs/rec/rec_mv3_none_bilstm_ctc.yml)|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_none_bilstm_ctc_v2.0_train.tar)| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 模型训练、评估、预测 + +请参考[文本识别训练教程](../../ppocr/model_train/recognition.md)。PaddleOCR对代码进行了模块化,训练不同的识别模型只需要**更换配置文件**即可。 + +### 训练 + +在完成数据准备后,便可以启动训练,训练命令如下: + +```bash linenums="1" +# 单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_r34_vd_none_bilstm_ctc.yml + +# 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c rec_r34_vd_none_bilstm_ctc.yml +``` + +### 评估 + +```bash linenums="1" +# GPU 评估, Global.pretrained_model 为待测权重 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r34_vd_none_bilstm_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### 预测 + +```bash linenums="1" +# 预测使用的配置文件必须与训练一致 +python3 tools/infer_rec.py -c configs/rec/rec_r34_vd_none_bilstm_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +``` + +## 4. 
推理部署 + +### 4.1 Python推理 + +首先将 CRNN 文本识别训练过程中保存的模型,转换成inference model。以基于Resnet34_vd骨干网络,使用MJSynth和SynthText两个英文文本识别合成数据集训练的[模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_bilstm_ctc_v2.0_train.tar) 为例,可以使用如下命令进行转换: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_r34_vd_none_bilstm_ctc.yml -o Global.pretrained_model=./rec_r34_vd_none_bilstm_ctc_v2.0_train/best_accuracy Global.save_inference_dir=./inference/rec_crnn +``` + +CRNN 文本识别模型推理,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./inference/rec_crnn/" --rec_image_shape="3, 32, 100" --rec_char_dict_path="./ppocr/utils/ic15_dict.txt" +``` + +![img](./images/word_336-20240705082445918.png) + +执行命令后,上面图像的识别结果如下: + +```bash linenums="1" +Predicts of ./doc/imgs_words_en/word_336.png:('super', 0.9999073) +``` + +**注意**:由于上述模型是参考[DTRB](https://arxiv.org/abs/1904.01906)文本识别训练和评估流程,与超轻量级中文识别模型训练有两方面不同: + +- 训练时采用的图像分辨率不同,训练上述模型采用的图像分辨率是[3,32,100],而中文模型训练时,为了保证长文本的识别效果,训练时采用的图像分辨率是[3, 32, 320]。预测推理程序默认的形状参数是训练中文采用的图像分辨率,即[3, 32, 320]。因此,这里推理上述英文模型时,需要通过参数rec_image_shape设置识别图像的形状。 +- 字符列表,DTRB论文中实验只是针对26个小写英文本母和10个数字进行实验,总共36个字符。所有大小字符都转成了小写字符,不在上面列表的字符都忽略,认为是空格。因此这里没有输入字符字典,而是通过如下命令生成字典.因此在推理时需要设置参数rec_char_dict_path,指定为英文字典"./ppocr/utils/ic15_dict.txt"。 + + ```python linenums="1" + self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" + dict_character = list(self.character_str) + ``` + +### 4.2 C++推理 + +准备好推理模型后,参考[cpp infer](../../ppocr/infer_deploy/cpp_infer.md)教程进行操作即可。 + +### 4.3 Serving服务化部署 + +准备好推理模型后,参考[pdserving](../../ppocr/infer_deploy/paddle_server.md)教程进行Serving服务化部署,包括Python Serving和C++ Serving两种模式。 + +### 4.4 更多推理部署 + +CRNN模型还支持以下推理部署方式: + +- Paddle2ONNX推理:准备好推理模型后,参考[paddle2onnx](../../ppocr/infer_deploy/paddle2onnx.md)教程操作。 + +## 5. FAQ + +## 引用 + +```bibtex +@ARTICLE{7801919, + author={Shi, Baoguang and Bai, Xiang and Yao, Cong}, + journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, + title={An End-to-End Trainable Neural Network for Image-Based Sequence Recognition and Its Application to Scene Text Recognition}, + year={2017}, + volume={39}, + number={11}, + pages={2298-2304}, + doi={10.1109/TPAMI.2016.2646371}} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_nrtr.en.md b/docs/algorithm/text_recognition/algorithm_rec_nrtr.en.md new file mode 100644 index 0000000000..c56d78945c --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_nrtr.en.md @@ -0,0 +1,256 @@ +--- +comments: true +--- + +# NRTR + +## 1. Introduction + +Paper: +> [NRTR: A No-Recurrence Sequence-to-Sequence Model For Scene Text Recognition](https://arxiv.org/abs/1806.00926) +> Fenfen Sheng and Zhineng Chen and Bo Xu +> ICDAR, 2019 + +Using MJSynth and SynthText two text recognition datasets for training, and evaluating on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE datasets, the algorithm reproduction effect is as follows: + +|Model|Backbone|config|Acc|Download link| +| --- | --- | --- | --- | --- | +|NRTR|MTB|[rec_mtb_nrtr.yml](../../configs/rec/rec_mtb_nrtr.yml)|84.21%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mtb_nrtr_train.tar)| + +## 2. Environment + +Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md)to clone the project code. + +## 3. 
Model Training / Evaluation / Prediction + +Please refer to [Text Recognition Tutorial](../../ppocr/model_train/recognition.en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. + +### Training + +Specifically, after the data preparation is completed, the training can be started. The training command is as follows: + +```bash linenums="1" +# Single GPU training (long training period, not recommended) +python3 tools/train.py -c configs/rec/rec_mtb_nrtr.yml + +# Multi GPU training, specify the gpu number through the --gpus parameter +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_mtb_nrtr.yml +``` + +### Evaluation + +```bash linenums="1" +# GPU evaluation +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_mtb_nrtr.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### Prediction + +```bash linenums="1" +# The configuration file used for prediction must match the training +python3 tools/infer_rec.py -c configs/rec/rec_mtb_nrtr.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model=./rec_mtb_nrtr_train/best_accuracy +``` + +## 4. Inference and Deployment + +### 4.1 Python Inference + +First, the model saved during the NRTR text recognition training process is converted into an inference model. ( [Model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mtb_nrtr_train.tar)) ), you can use the following command to convert: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_mtb_nrtr.yml -o Global.pretrained_model=./rec_mtb_nrtr_train/best_accuracy Global.save_inference_dir=./inference/rec_mtb_nrtr +``` + +**Note:** + +- If you are training the model on your own dataset and have modified the dictionary file, please pay attention to modify the `character_dict_path` in the configuration file to the modified dictionary file. +- If you modified the input size during training, please modify the `infer_shape` corresponding to NRTR in the `tools/export_model.py` file. + +After the conversion is successful, there are three files in the directory: + +```text linenums="1" +/inference/rec_mtb_nrtr/ + ├── inference.pdiparams + ├── inference.pdiparams.info + └── inference.pdmodel +``` + +For NRTR text recognition model inference, the following commands can be executed: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_mtb_nrtr/' --rec_algorithm='NRTR' --rec_image_shape='1,32,100' --rec_char_dict_path='./ppocr/utils/EN_symbol_dict.txt' +``` + +![img](./images/word_10.png) + +After executing the command, the prediction result (recognized text and score) of the image above is printed to the screen, an example is as follows: + +```bash linenums="1" +Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9465042352676392) +``` + +### 4.2 C++ Inference + +Not supported + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. FAQ + +1. In the `NRTR` paper, Beam search is used to decode characters, but the speed is slow. Beam search is not used by default here, and greedy search is used to decode characters. + +## 6. Release Note + +1. The release/2.6 version updates the NRTR code structure. 
The new version of NRTR can load the model parameters of the old version (release/2.5 and before), and you may use the following code to convert the old version model parameters to the new version model parameters: + +
+ Click to expand + + ```python linenums="1" + params = paddle.load('path/' + '.pdparams') # the old version parameters + state_dict = model.state_dict() # the new version model parameters + new_state_dict = {} + + for k1, v1 in state_dict.items(): + + k = k1 + if 'encoder' in k and 'self_attn' in k and 'qkv' in k and 'weight' in k: + + k_para = k[:13] + 'layers.' + k[13:] + q = params[k_para.replace('qkv', 'conv1')].transpose((1, 0, 2, 3)) + k = params[k_para.replace('qkv', 'conv2')].transpose((1, 0, 2, 3)) + v = params[k_para.replace('qkv', 'conv3')].transpose((1, 0, 2, 3)) + + new_state_dict[k1] = np.concatenate([q[:, :, 0, 0], k[:, :, 0, 0], v[:, :, 0, 0]], -1) + + elif 'encoder' in k and 'self_attn' in k and 'qkv' in k and 'bias' in k: + + k_para = k[:13] + 'layers.' + k[13:] + q = params[k_para.replace('qkv', 'conv1')] + k = params[k_para.replace('qkv', 'conv2')] + v = params[k_para.replace('qkv', 'conv3')] + + new_state_dict[k1] = np.concatenate([q, k, v], -1) + + elif 'encoder' in k and 'self_attn' in k and 'out_proj' in k: + + k_para = k[:13] + 'layers.' + k[13:] + new_state_dict[k1] = params[k_para] + + elif 'encoder' in k and 'norm3' in k: + k_para = k[:13] + 'layers.' + k[13:] + new_state_dict[k1] = params[k_para.replace('norm3', 'norm2')] + + elif 'encoder' in k and 'norm1' in k: + k_para = k[:13] + 'layers.' + k[13:] + new_state_dict[k1] = params[k_para] + + + elif 'decoder' in k and 'self_attn' in k and 'qkv' in k and 'weight' in k: + k_para = k[:13] + 'layers.' + k[13:] + q = params[k_para.replace('qkv', 'conv1')].transpose((1, 0, 2, 3)) + k = params[k_para.replace('qkv', 'conv2')].transpose((1, 0, 2, 3)) + v = params[k_para.replace('qkv', 'conv3')].transpose((1, 0, 2, 3)) + new_state_dict[k1] = np.concatenate([q[:, :, 0, 0], k[:, :, 0, 0], v[:, :, 0, 0]], -1) + + elif 'decoder' in k and 'self_attn' in k and 'qkv' in k and 'bias' in k: + k_para = k[:13] + 'layers.' + k[13:] + q = params[k_para.replace('qkv', 'conv1')] + k = params[k_para.replace('qkv', 'conv2')] + v = params[k_para.replace('qkv', 'conv3')] + new_state_dict[k1] = np.concatenate([q, k, v], -1) + + elif 'decoder' in k and 'self_attn' in k and 'out_proj' in k: + + k_para = k[:13] + 'layers.' + k[13:] + new_state_dict[k1] = params[k_para] + + elif 'decoder' in k and 'cross_attn' in k and 'q' in k and 'weight' in k: + k_para = k[:13] + 'layers.' + k[13:] + k_para = k_para.replace('cross_attn', 'multihead_attn') + q = params[k_para.replace('q', 'conv1')].transpose((1, 0, 2, 3)) + new_state_dict[k1] = q[:, :, 0, 0] + + elif 'decoder' in k and 'cross_attn' in k and 'q' in k and 'bias' in k: + k_para = k[:13] + 'layers.' + k[13:] + k_para = k_para.replace('cross_attn', 'multihead_attn') + q = params[k_para.replace('q', 'conv1')] + new_state_dict[k1] = q + + elif 'decoder' in k and 'cross_attn' in k and 'kv' in k and 'weight' in k: + k_para = k[:13] + 'layers.' + k[13:] + k_para = k_para.replace('cross_attn', 'multihead_attn') + k = params[k_para.replace('kv', 'conv2')].transpose((1, 0, 2, 3)) + v = params[k_para.replace('kv', 'conv3')].transpose((1, 0, 2, 3)) + new_state_dict[k1] = np.concatenate([k[:, :, 0, 0], v[:, :, 0, 0]], -1) + + elif 'decoder' in k and 'cross_attn' in k and 'kv' in k and 'bias' in k: + k_para = k[:13] + 'layers.' 
+ k[13:] + k_para = k_para.replace('cross_attn', 'multihead_attn') + k = params[k_para.replace('kv', 'conv2')] + v = params[k_para.replace('kv', 'conv3')] + new_state_dict[k1] = np.concatenate([k, v], -1) + + elif 'decoder' in k and 'cross_attn' in k and 'out_proj' in k: + + k_para = k[:13] + 'layers.' + k[13:] + k_para = k_para.replace('cross_attn', 'multihead_attn') + new_state_dict[k1] = params[k_para] + elif 'decoder' in k and 'norm' in k: + k_para = k[:13] + 'layers.' + k[13:] + new_state_dict[k1] = params[k_para] + elif 'mlp' in k and 'weight' in k: + k_para = k[:13] + 'layers.' + k[13:] + k_para = k_para.replace('fc', 'conv') + k_para = k_para.replace('mlp.', '') + w = params[k_para].transpose((1, 0, 2, 3)) + new_state_dict[k1] = w[:, :, 0, 0] + elif 'mlp' in k and 'bias' in k: + k_para = k[:13] + 'layers.' + k[13:] + k_para = k_para.replace('fc', 'conv') + k_para = k_para.replace('mlp.', '') + w = params[k_para] + new_state_dict[k1] = w + + else: + new_state_dict[k1] = params[k1] + + if list(new_state_dict[k1].shape) != list(v1.shape): + print(k1) + + + for k, v1 in state_dict.items(): + if k not in new_state_dict.keys(): + print(1, k) + elif list(new_state_dict[k].shape) != list(v1.shape): + print(2, k) + + + + model.set_state_dict(new_state_dict) + paddle.save(model.state_dict(), 'nrtrnew_from_old_params.pdparams') + + ``` + +
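+
+    The snippet reads the old checkpoint with `paddle.load` and maps it onto `model.state_dict()`, so it assumes that `numpy`, `paddle` and a constructed new-version `model` are already in scope. A minimal setup sketch is shown below; the `load_config` and `build_model` helpers are assumptions based on how `tools/train.py` builds models, so check the import paths and the `Head.out_channels` handling against your local PaddleOCR version.
+
+    ```python linenums="1"
+    # Assumed setup for the conversion snippet above (illustrative, not an official recipe).
+    import numpy as np
+    import paddle
+
+    from tools.program import load_config                  # assumed PaddleOCR config loader
+    from ppocr.modeling.architectures import build_model   # assumed PaddleOCR model builder
+
+    config = load_config('configs/rec/rec_mtb_nrtr.yml')
+    # tools/train.py derives Head.out_channels from the character dict before building;
+    # set it manually here if your copy of the config does not already define it.
+    model = build_model(config['Architecture'])   # new-version (release/2.6+) NRTR instance
+    ```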
+ +2. The new version has a clean code structure and improved inference speed compared with the old version. + +## Citation + +```bibtex +@article{Sheng2019NRTR, + title = {NRTR: A No-Recurrence Sequence-to-Sequence Model For Scene Text Recognition}, + author = {Fenfen Sheng and Zhineng Chen and Bo Xu}, + booktitle = {ICDAR}, + year = {2019}, + url = {http://arxiv.org/abs/1806.00926}, + pages = {781-786} +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_nrtr.md b/docs/algorithm/text_recognition/algorithm_rec_nrtr.md new file mode 100644 index 0000000000..68bb58b46d --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_nrtr.md @@ -0,0 +1,272 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# 场景文本识别算法-NRTR + +## 1. 算法简介 + +论文信息: +> [NRTR: A No-Recurrence Sequence-to-Sequence Model For Scene Text Recognition](https://arxiv.org/abs/1806.00926) +> Fenfen Sheng and Zhineng Chen and Bo Xu +> ICDAR, 2019 + +`NRTR`使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法复现效果如下: + +|模型|骨干网络|配置文件|Acc|下载链接| +| --- | --- | --- | --- | --- | +|NRTR|MTB|[rec_mtb_nrtr.yml](../../configs/rec/rec_mtb_nrtr.yml)|84.21%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mtb_nrtr_train.tar)| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 模型训练、评估、预测 + +### 3.1 模型训练 + +请参考[文本识别训练教程](../../ppocr/model_train/recognition.md)。PaddleOCR对代码进行了模块化,训练`NRTR`识别模型时需要**更换配置文件**为`NRTR`的[配置文件](../../configs/rec/rec_mtb_nrtr.yml)。 + +#### 启动训练 + +具体地,在完成数据准备后,便可以启动训练,训练命令如下: + +```bash linenums="1" +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_mtb_nrtr.yml + +# 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_mtb_nrtr.yml +``` + +### 3.2 评估 + +可下载已训练完成的模型文件,使用如下命令进行评估: + +```bash linenums="1" +# 注意将pretrained_model的路径设置为本地路径。 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_mtb_nrtr.yml -o Global.pretrained_model=./rec_mtb_nrtr_train/best_accuracy +``` + +### 3.3 预测 + +使用如下命令进行单张图片预测: + +```bash linenums="1" +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/infer_rec.py -c configs/rec/rec_mtb_nrtr.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model=./rec_mtb_nrtr_train/best_accuracy +# 预测文件夹下所有图像时,可修改infer_img为文件夹,如 Global.infer_img='./doc/imgs_words_en/'。 +``` + +## 4. 
推理部署 + +### 4.1 Python推理 + +首先将训练得到best模型,转换成inference model。这里以训练完成的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mtb_nrtr_train.tar) ),可以使用如下命令进行转换: + +```bash linenums="1" +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/export_model.py -c configs/rec/rec_mtb_nrtr.yml -o Global.pretrained_model=./rec_mtb_nrtr_train/best_accuracy Global.save_inference_dir=./inference/rec_mtb_nrtr/ +``` + +**注意:** + +- 如果您是在自己的数据集上训练的模型,并且调整了字典文件,请注意修改配置文件中的`character_dict_path`是否是所需要的字典文件。 +- 如果您修改了训练时的输入大小,请修改`tools/export_model.py`文件中的对应NRTR的`infer_shape`。 + +转换成功后,在目录下有三个文件: + +```text linenums="1" +/inference/rec_mtb_nrtr/ + ├── inference.pdiparams # 识别inference模型的参数文件 + ├── inference.pdiparams.info # 识别inference模型的参数信息,可忽略 + └── inference.pdmodel # 识别inference模型的program文件 +``` + +执行如下命令进行模型推理: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_mtb_nrtr/' --rec_algorithm='NRTR' --rec_image_shape='1,32,100' --rec_char_dict_path='./ppocr/utils/EN_symbol_dict.txt' +# 预测文件夹下所有图像时,可修改image_dir为文件夹,如 --image_dir='./doc/imgs_words_en/'。 +``` + +![img](./images/word_10.png) + +执行命令后,上面图像的预测结果(识别的文本和得分)会打印到屏幕上,示例如下: + +```bash linenums="1" +Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9465042352676392) +``` + +**注意**: + +- 训练上述模型采用的图像分辨率是[1,32,100],需要通过参数`rec_image_shape`设置为您训练时的识别图像形状。 +- 在推理时需要设置参数`rec_char_dict_path`指定字典,如果您修改了字典,请修改该参数为您的字典文件。 +- 如果您修改了预处理方法,需修改`tools/infer/predict_rec.py`中NRTR的预处理为您的预处理方法。 + +### 4.2 C++推理部署 + +由于C++预处理后处理还未支持NRTR,所以暂未支持 + +### 4.3 Serving服务化部署 + +暂不支持 + +### 4.4 更多推理部署 + +暂不支持 + +## 5. FAQ + +1. `NRTR`论文中使用Beam搜索进行解码字符,但是速度较慢,这里默认未使用Beam搜索,以贪婪搜索进行解码字符。 + +## 6. 发行公告 + +1. release/2.6更新NRTR代码结构,新版NRTR可加载旧版(release/2.5及之前)模型参数,使用下面示例代码将旧版模型参数转换为新版模型参数: + +
+ 详情 + + ```python linenums="1" + params = paddle.load('path/' + '.pdparams') # 旧版本参数 + state_dict = model.state_dict() # 新版模型参数 + new_state_dict = {} + + for k1, v1 in state_dict.items(): + + k = k1 + if 'encoder' in k and 'self_attn' in k and 'qkv' in k and 'weight' in k: + + k_para = k[:13] + 'layers.' + k[13:] + q = params[k_para.replace('qkv', 'conv1')].transpose((1, 0, 2, 3)) + k = params[k_para.replace('qkv', 'conv2')].transpose((1, 0, 2, 3)) + v = params[k_para.replace('qkv', 'conv3')].transpose((1, 0, 2, 3)) + + new_state_dict[k1] = np.concatenate([q[:, :, 0, 0], k[:, :, 0, 0], v[:, :, 0, 0]], -1) + + elif 'encoder' in k and 'self_attn' in k and 'qkv' in k and 'bias' in k: + + k_para = k[:13] + 'layers.' + k[13:] + q = params[k_para.replace('qkv', 'conv1')] + k = params[k_para.replace('qkv', 'conv2')] + v = params[k_para.replace('qkv', 'conv3')] + + new_state_dict[k1] = np.concatenate([q, k, v], -1) + + elif 'encoder' in k and 'self_attn' in k and 'out_proj' in k: + + k_para = k[:13] + 'layers.' + k[13:] + new_state_dict[k1] = params[k_para] + + elif 'encoder' in k and 'norm3' in k: + k_para = k[:13] + 'layers.' + k[13:] + new_state_dict[k1] = params[k_para.replace('norm3', 'norm2')] + + elif 'encoder' in k and 'norm1' in k: + k_para = k[:13] + 'layers.' + k[13:] + new_state_dict[k1] = params[k_para] + + + elif 'decoder' in k and 'self_attn' in k and 'qkv' in k and 'weight' in k: + k_para = k[:13] + 'layers.' + k[13:] + q = params[k_para.replace('qkv', 'conv1')].transpose((1, 0, 2, 3)) + k = params[k_para.replace('qkv', 'conv2')].transpose((1, 0, 2, 3)) + v = params[k_para.replace('qkv', 'conv3')].transpose((1, 0, 2, 3)) + new_state_dict[k1] = np.concatenate([q[:, :, 0, 0], k[:, :, 0, 0], v[:, :, 0, 0]], -1) + + elif 'decoder' in k and 'self_attn' in k and 'qkv' in k and 'bias' in k: + k_para = k[:13] + 'layers.' + k[13:] + q = params[k_para.replace('qkv', 'conv1')] + k = params[k_para.replace('qkv', 'conv2')] + v = params[k_para.replace('qkv', 'conv3')] + new_state_dict[k1] = np.concatenate([q, k, v], -1) + + elif 'decoder' in k and 'self_attn' in k and 'out_proj' in k: + + k_para = k[:13] + 'layers.' + k[13:] + new_state_dict[k1] = params[k_para] + + elif 'decoder' in k and 'cross_attn' in k and 'q' in k and 'weight' in k: + k_para = k[:13] + 'layers.' + k[13:] + k_para = k_para.replace('cross_attn', 'multihead_attn') + q = params[k_para.replace('q', 'conv1')].transpose((1, 0, 2, 3)) + new_state_dict[k1] = q[:, :, 0, 0] + + elif 'decoder' in k and 'cross_attn' in k and 'q' in k and 'bias' in k: + k_para = k[:13] + 'layers.' + k[13:] + k_para = k_para.replace('cross_attn', 'multihead_attn') + q = params[k_para.replace('q', 'conv1')] + new_state_dict[k1] = q + + elif 'decoder' in k and 'cross_attn' in k and 'kv' in k and 'weight' in k: + k_para = k[:13] + 'layers.' + k[13:] + k_para = k_para.replace('cross_attn', 'multihead_attn') + k = params[k_para.replace('kv', 'conv2')].transpose((1, 0, 2, 3)) + v = params[k_para.replace('kv', 'conv3')].transpose((1, 0, 2, 3)) + new_state_dict[k1] = np.concatenate([k[:, :, 0, 0], v[:, :, 0, 0]], -1) + + elif 'decoder' in k and 'cross_attn' in k and 'kv' in k and 'bias' in k: + k_para = k[:13] + 'layers.' + k[13:] + k_para = k_para.replace('cross_attn', 'multihead_attn') + k = params[k_para.replace('kv', 'conv2')] + v = params[k_para.replace('kv', 'conv3')] + new_state_dict[k1] = np.concatenate([k, v], -1) + + elif 'decoder' in k and 'cross_attn' in k and 'out_proj' in k: + + k_para = k[:13] + 'layers.' 
+ k[13:] + k_para = k_para.replace('cross_attn', 'multihead_attn') + new_state_dict[k1] = params[k_para] + elif 'decoder' in k and 'norm' in k: + k_para = k[:13] + 'layers.' + k[13:] + new_state_dict[k1] = params[k_para] + elif 'mlp' in k and 'weight' in k: + k_para = k[:13] + 'layers.' + k[13:] + k_para = k_para.replace('fc', 'conv') + k_para = k_para.replace('mlp.', '') + w = params[k_para].transpose((1, 0, 2, 3)) + new_state_dict[k1] = w[:, :, 0, 0] + elif 'mlp' in k and 'bias' in k: + k_para = k[:13] + 'layers.' + k[13:] + k_para = k_para.replace('fc', 'conv') + k_para = k_para.replace('mlp.', '') + w = params[k_para] + new_state_dict[k1] = w + + else: + new_state_dict[k1] = params[k1] + + if list(new_state_dict[k1].shape) != list(v1.shape): + print(k1) + + + for k, v1 in state_dict.items(): + if k not in new_state_dict.keys(): + print(1, k) + elif list(new_state_dict[k].shape) != list(v1.shape): + print(2, k) + + + + model.set_state_dict(new_state_dict) + paddle.save(model.state_dict(), 'nrtrnew_from_old_params.pdparams') + + ``` + +
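+
+    After the conversion finishes, a quick way to sanity-check the result is to reload the file written by the snippet (`nrtrnew_from_old_params.pdparams`) and compare it with the original checkpoint. The sketch below is illustrative only and assumes the snippet above was run unchanged; replace the placeholder path with your own old-version checkpoint.
+
+    ```python linenums="1"
+    # Illustrative post-conversion check for the conversion snippet above.
+    import paddle
+
+    old_params = paddle.load('path/to/old_nrtr.pdparams')           # placeholder: release/2.5 checkpoint
+    new_params = paddle.load('nrtrnew_from_old_params.pdparams')    # file written by the snippet above
+
+    print(f'{len(old_params)} old tensors -> {len(new_params)} converted tensors')
+    for name in list(new_params)[:5]:                                # spot-check a few converted shapes
+        print(name, tuple(new_params[name].shape))
+    ```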
+ +2. 新版相比与旧版,代码结构简洁,推理速度有所提高。 + +## 引用 + +```bibtex +@article{Sheng2019NRTR, + title = {NRTR: A No-Recurrence Sequence-to-Sequence Model For Scene Text Recognition}, + author = {Fenfen Sheng and Zhineng Chen and Bo Xu}, + booktitle = {ICDAR}, + year = {2019}, + url = {http://arxiv.org/abs/1806.00926}, + pages = {781-786} +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_parseq.en.md b/docs/algorithm/text_recognition/algorithm_rec_parseq.en.md new file mode 100644 index 0000000000..2a971f88d6 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_parseq.en.md @@ -0,0 +1,105 @@ +--- +comments: true +--- + +# PasreQ + +## 1. Introduction + +Paper: +> [Scene Text Recognition with Permuted Autoregressive Sequence Models](https://arxiv.org/abs/2207.06966) +> Darwin Bautista, Rowel Atienza +> ECCV, 2021 + +Using real datasets (real) and synthetic datsets (synth) for training respectively,and evaluating on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE datasets. + +- The real datasets include COCO-Text, RCTW17, Uber-Text, ArT, LSVT, MLT19, ReCTS, TextOCR and OpenVINO datasets. +- The synthesis datasets include MJSynth and SynthText datasets. + +the algorithm reproduction effect is as follows: + +|Training Dataset|Model|Backbone|config|Acc|Download link| +| --- | --- | --- | --- | --- | --- | +|Synth|ParseQ|VIT|[rec_vit_parseq.yml](../../configs/rec/rec_vit_parseq.yml)|91.24%|[train model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/parseq/rec_vit_parseq_synth.tgz)| +|Real|ParseQ|VIT|[rec_vit_parseq.yml](../../configs/rec/rec_vit_parseq.yml)|94.74%|[train model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/parseq/rec_vit_parseq_real.tgz)| + +## 2. Environment + +Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md)to clone the project code. + +## 3. Model Training / Evaluation / Prediction + +Please refer to [Text Recognition Tutorial](../../ppocr/model_train/recognition.en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. + +### Training + +Specifically, after the data preparation is completed, the training can be started. The training command is as follows: + +```bash linenums="1" +# Single GPU training (long training period, not recommended) +python3 tools/train.py -c configs/rec/rec_vit_parseq.yml + +# Multi GPU training, specify the gpu number through the --gpus parameter +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_vit_parseq.yml +``` + +### Evaluation + +```bash linenums="1" +# GPU evaluation +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_vit_parseq.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### Prediction + +```bash linenums="1" +# The configuration file used for prediction must match the training +python3 tools/infer_rec.py -c configs/rec/rec_vit_parseq.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +``` + +## 4. Inference and Deployment + +### 4.1 Python Inference + +First, the model saved during the SAR text recognition training process is converted into an inference model. 
( [Model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.1/parseq/rec_vit_parseq_real.tgz) ), you can use the following command to convert: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_vit_parseq.yml -o Global.pretrained_model=./rec_vit_parseq_real/best_accuracy Global.save_inference_dir=./inference/rec_parseq +``` + +For SAR text recognition model inference, the following commands can be executed: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_parseq/" --rec_image_shape="3, 32, 128" --rec_algorithm="ParseQ" --rec_char_dict_path="ppocr/utils/dict/parseq_dict.txt" --max_text_length=25 --use_space_char=False +``` + +### 4.2 C++ Inference + +Not supported + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. FAQ + +## Citation + +```bibtex +@InProceedings{bautista2022parseq, + title={Scene Text Recognition with Permuted Autoregressive Sequence Models}, + author={Bautista, Darwin and Atienza, Rowel}, + booktitle={European Conference on Computer Vision}, + pages={178--196}, + month={10}, + year={2022}, + publisher={Springer Nature Switzerland}, + address={Cham}, + doi={10.1007/978-3-031-19815-1_11}, + url={https://doi.org/10.1007/978-3-031-19815-1_11} +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_parseq.md b/docs/algorithm/text_recognition/algorithm_rec_parseq.md new file mode 100644 index 0000000000..73caf946c1 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_parseq.md @@ -0,0 +1,105 @@ +--- +comments: true +--- + +# ParseQ + +## 1. 算法简介 + +论文信息: +> [Scene Text Recognition with Permuted Autoregressive Sequence Models](https://arxiv.org/abs/2207.06966) +> Darwin Bautista, Rowel Atienza +> ECCV, 2021 + +原论文分别使用真实文本识别数据集(Real)和合成文本识别数据集(Synth)进行训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估。其中: + +- 真实文本识别数据集(Real)包含COCO-Text, RCTW17, Uber-Text, ArT, LSVT, MLT19, ReCTS, TextOCR, OpenVINO数据集 +- 合成文本识别数据集(Synth)包含MJSynth和SynthText数据集 + +在不同数据集上训练的算法的复现效果如下: + +|数据集|模型|骨干网络|配置文件|Acc|下载链接| +| --- | --- | --- | --- | --- | --- | +|Synth|ParseQ|VIT|[rec_vit_parseq.yml](../../configs/rec/rec_vit_parseq.yml)|91.24%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/parseq/rec_vit_parseq_synth.tgz)| +|Real|ParseQ|VIT|[rec_vit_parseq.yml](../../configs/rec/rec_vit_parseq.yml)|94.74%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/parseq/rec_vit_parseq_real.tgz)| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 模型训练、评估、预测 + +请参考[文本识别教程](../../ppocr/model_train/recognition.md)。PaddleOCR对代码进行了模块化,训练不同的识别模型只需要**更换配置文件**即可。 + +### 训练 + +具体地,在完成数据准备后,便可以启动训练,训练命令如下: + +```bash linenums="1" +# 单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_vit_parseq.yml + +# 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_vit_parseq.yml +``` + +### 评估 + +```bash linenums="1" +# GPU 评估, Global.pretrained_model 为待测权重 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_vit_parseq.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### 预测 + +```bash linenums="1" +# 预测使用的配置文件必须与训练一致 +python3 tools/infer_rec.py -c configs/rec/rec_vit_parseq.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +``` + +## 4. 
推理部署 + +### 4.1 Python推理 + +首先将ParseQ文本识别训练过程中保存的模型,转换成inference model。( [模型下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.1/parseq/rec_vit_parseq_real.tgz) ),可以使用如下命令进行转换: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_vit_parseq.yml -o Global.pretrained_model=./rec_vit_parseq_real/best_accuracy Global.save_inference_dir=./inference/rec_parseq +``` + +ParseQ文本识别模型推理,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_parseq/" --rec_image_shape="3, 32, 128" --rec_algorithm="ParseQ" --rec_char_dict_path="ppocr/utils/dict/parseq_dict.txt" --max_text_length=25 --use_space_char=False +``` + +### 4.2 C++推理 + +由于C++预处理后处理还未支持ParseQ,所以暂未支持 + +### 4.3 Serving服务化部署 + +暂不支持 + +### 4.4 更多推理部署 + +暂不支持 + +## 5. FAQ + +## 引用 + +```bibtex +@InProceedings{bautista2022parseq, + title={Scene Text Recognition with Permuted Autoregressive Sequence Models}, + author={Bautista, Darwin and Atienza, Rowel}, + booktitle={European Conference on Computer Vision}, + pages={178--196}, + month={10}, + year={2022}, + publisher={Springer Nature Switzerland}, + address={Cham}, + doi={10.1007/978-3-031-19815-1_11}, + url={https://doi.org/10.1007/978-3-031-19815-1_11} +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_rare.en.md b/docs/algorithm/text_recognition/algorithm_rec_rare.en.md new file mode 100644 index 0000000000..599ec232f9 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_rare.en.md @@ -0,0 +1,100 @@ +--- +comments: true +--- + +# RARE + +## 1. Introduction + +Paper information: +> [Robust Scene Text Recognition with Automatic Rectification](https://arxiv.org/abs/1603.03915v2) +> Baoguang Shi, Xinggang Wang, Pengyuan Lyu, Cong Yao, Xiang Bai∗ +> CVPR, 2016 + +Using MJSynth and SynthText two text recognition datasets for training, and evaluating on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE datasets, the algorithm reproduction effect is as follows: + +|Models|Backbone Networks|Configuration Files|Avg Accuracy|Download Links| +| --- | --- | --- | --- | --- | +|RARE|Resnet34_vd|[configs/rec/rec_r34_vd_tps_bilstm_att.yml](../../configs/rec/rec_r34_vd_tps_bilstm_att.yml)|83.60%|[training model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_att_v2.0_train.tar)| +|RARE|MobileNetV3|[configs/rec/rec_mv3_tps_bilstm_att.yml](../../configs/rec/rec_mv3_tps_bilstm_att.yml)|82.50%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_att_v2.0_train.tar)| + +## 2. Environment + +Please refer to [Operating Environment Preparation](../../ppocr/environment.en.md) to configure the PaddleOCR operating environment, and refer to [Project Clone](../../ppocr/blog/clone.en.md)to clone the project code. + +## 3. Model Training / Evaluation / Prediction + +Please refer to [Text Recognition Training Tutorial](../../ppocr/model_train/recognition.en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. 
Take the backbone network based on Resnet34_vd as an example: + +### 3.1 Training + +````bash linenums="1" +# Single card training (long training period, not recommended) +python3 tools/train.py -c configs/rec/rec_r34_vd_tps_bilstm_att.yml +# Multi-card training, specify the card number through the --gpus parameter +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r34_vd_tps_bilstm_att.yml +```` + +### 3.2 Evaluation + +````bash linenums="1" +# GPU evaluation, Global.pretrained_model is the model to be evaluated +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r34_vd_tps_bilstm_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +```` + +### 3.3 Prediction + +````bash linenums="1" +python3 tools/infer_rec.py -c configs/rec/rec_r34_vd_tps_bilstm_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +```` + +## 4. Inference + +### 4.1 Python Inference + +First, convert the model saved during the RARE text recognition training process into an inference model. Take the model trained on the MJSynth and SynthText text recognition datasets based on the Resnet34_vd backbone network as an example ([Model download address](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_att_v2.0_train.tar) ), which can be converted using the following command: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_r34_vd_tps_bilstm_att.yml -o Global.pretrained_model=./rec_r34_vd_tps_bilstm_att_v2.0_train/best_accuracy Global.save_inference_dir=./inference/rec_rare +```` + +RARE text recognition model inference, you can execute the following commands: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_rare/" --rec_image_shape="3, 32, 100" --rec_char_dict_path= "./ppocr/utils/ic15_dict.txt" +```` + +The inference results are as follows: + +![img](./images/word_1-20240704184113913.png) + +````text linenums="1" +Predicts of doc/imgs_words/en/word_1.png:('joint ', 0.9999969601631165) +```` + +### 4.2 C++ Inference + +Not currently supported + +### 4.3 Serving + +Not currently supported + +### 4.4 More + +The RARE model also supports the following inference deployment methods: + +- Paddle2ONNX Inference: After preparing the inference model, refer to the [paddle2onnx](../../ppocr/infer_deploy/paddle2onnx.en.md) tutorial. + +## 5. FAQ + +## Citation + +````bibtex +@inproceedings{2016Robust, + title={Robust Scene Text Recognition with Automatic Rectification}, + author={ Shi, B. and Wang, X. and Lyu, P. and Cong, Y. and Xiang, B. }, + booktitle={2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2016}, +} +```` diff --git a/docs/algorithm/text_recognition/algorithm_rec_rare.md b/docs/algorithm/text_recognition/algorithm_rec_rare.md new file mode 100644 index 0000000000..3464f9c8fe --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_rare.md @@ -0,0 +1,100 @@ +--- +comments: true +--- + +# RARE + +## 1. 
算法简介 + +论文信息: +> [Robust Scene Text Recognition with Automatic Rectification](https://arxiv.org/abs/1603.03915v2) +> Baoguang Shi, Xinggang Wang, Pengyuan Lyu, Cong Yao, Xiang Bai∗ +> CVPR, 2016 + +使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法复现效果如下: + +|模型|骨干网络|配置文件|Avg Accuracy|下载链接| +| --- | --- | --- | --- | --- | +|RARE|Resnet34_vd|[configs/rec/rec_r34_vd_tps_bilstm_att.yml](../../configs/rec/rec_r34_vd_tps_bilstm_att.yml)|83.60%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_att_v2.0_train.tar)| +|RARE|MobileNetV3|[configs/rec/rec_mv3_tps_bilstm_att.yml](../../configs/rec/rec_mv3_tps_bilstm_att.yml)|82.50%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_att_v2.0_train.tar)| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 模型训练、评估、预测 + +请参考[文本识别训练教程](../../ppocr/model_train/recognition.md)。PaddleOCR对代码进行了模块化,训练不同的识别模型只需要**更换配置文件**即可。以基于Resnet34_vd骨干网络为例: + +### 3.1 训练 + +```bash linenums="1" +# 单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_r34_vd_tps_bilstm_att.yml +# 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r34_vd_tps_bilstm_att.yml +``` + +### 3.2 评估 + +```bash linenums="1" +# GPU评估, Global.pretrained_model为待评估模型 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r34_vd_tps_bilstm_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### 3.3 预测 + +```bash linenums="1" +python3 tools/infer_rec.py -c configs/rec/rec_r34_vd_tps_bilstm_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +``` + +## 4. 推理部署 + +### 4.1 Python推理 + +首先将RARE文本识别训练过程中保存的模型,转换成inference model。以基于Resnet34_vd骨干网络,在MJSynth和SynthText两个文字识别数据集训练得到的模型为例( [模型下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_att_v2.0_train.tar) ),可以使用如下命令进行转换: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_r34_vd_tps_bilstm_att.yml -o Global.pretrained_model=./rec_r34_vd_tps_bilstm_att_v2.0_train/best_accuracy Global.save_inference_dir=./inference/rec_rare +``` + +RARE文本识别模型推理,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_rare/" --rec_image_shape="3, 32, 100" --rec_char_dict_path="./ppocr/utils/ic15_dict.txt" +``` + +推理结果如下所示: + +![img](./images/word_1-20240704184113913.png) + +```text linenums="1" +Predicts of doc/imgs_words/en/word_1.png:('joint ', 0.9999969601631165) +``` + +### 4.2 C++推理 + +暂不支持 + +### 4.3 Serving服务化部署 + +暂不支持 + +### 4.4 更多推理部署 + +RARE模型还支持以下推理部署方式: + +- Paddle2ONNX推理:准备好推理模型后,参考[paddle2onnx](../../ppocr/infer_deploy/paddle2onnx.md)教程操作。 + +## 5. FAQ + +## 引用 + +```bibtex +@inproceedings{2016Robust, + title={Robust Scene Text Recognition with Automatic Rectification}, + author={ Shi, B. and Wang, X. and Lyu, P. and Cong, Y. and Xiang, B. }, + booktitle={2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2016}, +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_rfl.en.md b/docs/algorithm/text_recognition/algorithm_rec_rfl.en.md new file mode 100644 index 0000000000..ab586c718b --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_rfl.en.md @@ -0,0 +1,126 @@ +--- +comments: true +--- + +# RFL + +## 1. 
Introduction + +Paper: +> [Reciprocal Feature Learning via Explicit and Implicit Tasks in Scene Text Recognition](https://arxiv.org/abs/2105.06229.pdf) +> Hui Jiang, Yunlu Xu, Zhanzhan Cheng, Shiliang Pu, Yi Niu, Wenqi Ren, Fei Wu, and Wenming Tan +> ICDAR, 2021 + +Using MJSynth and SynthText two text recognition datasets for training, and evaluating on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE datasets, the algorithm reproduction effect is as follows: + +|Model|Backbone|config|Acc|Download link| +| --- | --- | --- | --- | --- | +|RFL-CNT|ResNetRFL|[rec_resnet_rfl_visual.yml](../../configs/rec/rec_resnet_rfl_visual.yml)|93.40%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl_visual_train.tar)| +|RFL-Att|ResNetRFL|[rec_resnet_rfl_att.yml](../../configs/rec/rec_resnet_rfl_att.yml)|88.63%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl_att_train.tar)| + +## 2. Environment + +Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md)to clone the project code. + +## 3. Model Training / Evaluation / Prediction + +PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. + +### Training + +Specifically, after the data preparation is completed, the training can be started. The training command is as follows: + +```bash linenums="1" +#step1:train the CNT branch +# Single GPU training (long training period, not recommended) +python3 tools/train.py -c configs/rec/rec_resnet_rfl_visual.yml + +# Multi GPU training, specify the gpu number through the --gpus parameter +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_resnet_rfl_visual.yml + +#step2:joint training of CNT and Att branches +# Single GPU training (long training period, not recommended) +python3 tools/train.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy + +# Multi GPU training, specify the gpu number through the --gpus parameter +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### Evaluation + +```bash linenums="1" +# GPU evaluation +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### Prediction + +```bash linenums="1" +# The configuration file used for prediction must match the training +python3 tools/infer_rec.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model={path/to/weights}/best_accuracy +``` + +## 4. Inference and Deployment + +### 4.1 Python Inference + +First, the model saved during the RFL text recognition training process is converted into an inference model. 
( [Model download link](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl.tar)) ), you can use the following command to convert: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.save_inference_dir=./inference/rec_resnet_rfl_att +``` + +**Note:** + +- If you are training the model on your own dataset and have modified the dictionary file, please pay attention to modify the `character_dict_path` in the configuration file to the modified dictionary file. +- If you modified the input size during training, please modify the `infer_shape` corresponding to NRTR in the `tools/export_model.py` file. + +After the conversion is successful, there are three files in the directory: + +```text linenums="1" +/inference/rec_resnet_rfl_att/ + ├── inference.pdiparams + ├── inference.pdiparams.info + └── inference.pdmodel +``` + +For RFL text recognition model inference, the following commands can be executed: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_resnet_rfl_att/' --rec_algorithm='RFL' --rec_image_shape='1,32,100' +``` + +![img](./images/word_10.png) + +After executing the command, the prediction result (recognized text and score) of the image above is printed to the screen, an example is as follows: +The result is as follows: + +```bash linenums="1" +Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9999927282333374) +``` + +### 4.2 C++ Inference + +Not supported + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. FAQ + +## Citation + +```bibtex +@article{2021Reciprocal, + title = {Reciprocal Feature Learning via Explicit and Implicit Tasks in Scene Text Recognition}, + author = {Jiang, H. and Xu, Y. and Cheng, Z. and Pu, S. and Niu, Y. and Ren, W. and Wu, F. and Tan, W. }, + booktitle = {ICDAR}, + year = {2021}, + url = {https://arxiv.org/abs/2105.06229} +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_rfl.md b/docs/algorithm/text_recognition/algorithm_rec_rfl.md new file mode 100644 index 0000000000..9ab9bd8c46 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_rfl.md @@ -0,0 +1,140 @@ +--- +comments: true +--- + +# 场景文本识别算法-RFL + +## 1. 算法简介 + +论文信息: +> [Reciprocal Feature Learning via Explicit and Implicit Tasks in Scene Text Recognition](https://arxiv.org/abs/2105.06229.pdf) +> Hui Jiang, Yunlu Xu, Zhanzhan Cheng, Shiliang Pu, Yi Niu, Wenqi Ren, Fei Wu, and Wenming Tan +> ICDAR, 2021 + +`RFL`使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法复现效果如下: + +|模型|骨干网络|配置文件|Acc|下载链接| +| --- | --- | --- | --- | --- | +|RFL-CNT|ResNetRFL|[rec_resnet_rfl_visual.yml](../../configs/rec/rec_resnet_rfl_visual.yml)|93.40%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl_visual_train.tar)| +|RFL-Att|ResNetRFL|[rec_resnet_rfl_att.yml](../../configs/rec/rec_resnet_rfl_att.yml)|88.63%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl_att_train.tar)| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 
模型训练、评估、预测 + +### 3.1 模型训练 + +PaddleOCR对代码进行了模块化,训练`RFL`识别模型时需要**更换配置文件**为`RFL`的[配置文件](../../configs/rec/rec_resnet_rfl_att.yml)。 + +#### 启动训练 + +具体地,在完成数据准备后,便可以启动训练,训练命令如下: + +```bash linenums="1" +#step1:训练CNT分支 +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_resnet_rfl_visual.yml + +# 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_resnet_rfl_visual.yml + +#step2:联合训练CNT和Att分支,注意将pretrained_model的路径设置为本地路径。 +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model=./output/rec/rec_resnet_rfl_visual/best_accuracy + +# 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model=./output/rec/rec_resnet_rfl_visual/best_accuracy +``` + +### 3.2 评估 + +可下载已训练完成的[模型文件](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl.tar),使用如下命令进行评估: + +```bash linenums="1" +# 注意将pretrained_model的路径设置为本地路径。 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model=./output/rec/rec_resnet_rfl_att/best_accuracy +``` + +### 3.3 预测 + +使用如下命令进行单张图片预测: + +```bash linenums="1" +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/infer_rec.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model=./output/rec/rec_resnet_rfl_att/best_accuracy +# 预测文件夹下所有图像时,可修改infer_img为文件夹,如 Global.infer_img='./doc/imgs_words_en/'。 +``` + +## 4. 推理部署 + +### 4.1 Python推理 + +首先将训练得到best模型,转换成inference model。这里以训练完成的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl.tar) ),可以使用如下命令进行转换: + +```bash linenums="1" +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/export_model.py -c configs/rec/rec_resnet_rfl_att.yml -o Global.pretrained_model=./output/rec/rec_resnet_rfl_att/best_accuracy Global.save_inference_dir=./inference/rec_resnet_rfl_att/ +``` + +**注意:** 如果您是在自己的数据集上训练的模型,并且调整了字典文件,请注意修改配置文件中的`character_dict_path`是否是所需要的字典文件。 + +- 如果您修改了训练时的输入大小,请修改`tools/export_model.py`文件中的对应RFL的`infer_shape`。 + +转换成功后,在目录下有三个文件: + +```text linenums="1" +/inference/rec_resnet_rfl_att/ + ├── inference.pdiparams # 识别inference模型的参数文件 + ├── inference.pdiparams.info # 识别inference模型的参数信息,可忽略 + └── inference.pdmodel # 识别inference模型的program文件 +``` + +执行如下命令进行模型推理: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_resnet_rfl_att/' --rec_algorithm='RFL' --rec_image_shape='1,32,100' +# 预测文件夹下所有图像时,可修改image_dir为文件夹,如 --image_dir='./doc/imgs_words_en/'。 +``` + +![img](./images/word_10.png) + +执行命令后,上面图像的预测结果(识别的文本和得分)会打印到屏幕上,示例如下: +结果如下: + +```bash linenums="1" +Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9999927282333374) +``` + +**注意**: + +- 训练上述模型采用的图像分辨率是[1,32,100],需要通过参数`rec_image_shape`设置为您训练时的识别图像形状。 +- 在推理时需要设置参数`rec_char_dict_path`指定字典,如果您修改了字典,请修改该参数为您的字典文件。 +- 如果您修改了预处理方法,需修改`tools/infer/predict_rec.py`中RFL的预处理为您的预处理方法。 + +### 4.2 C++推理部署 + +由于C++预处理后处理还未支持RFL,所以暂未支持 + +### 4.3 Serving服务化部署 + +暂不支持 + +### 4.4 更多推理部署 + +暂不支持 + +## 5. FAQ + +## 引用 + +```bibtex +@article{2021Reciprocal, + title = {Reciprocal Feature Learning via Explicit and Implicit Tasks in Scene Text Recognition}, + author = {Jiang, H. and Xu, Y. and Cheng, Z. and Pu, S. and Niu, Y. and Ren, W. and Wu, F. and Tan, W. 
}, + booktitle = {ICDAR}, + year = {2021}, + url = {https://arxiv.org/abs/2105.06229} +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_robustscanner.en.md b/docs/algorithm/text_recognition/algorithm_rec_robustscanner.en.md new file mode 100644 index 0000000000..a4aa3bc7c9 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_robustscanner.en.md @@ -0,0 +1,96 @@ +--- +comments: true +--- + +# RobustScanner + +## 1. Introduction + +Paper: +> [RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition](https://arxiv.org/pdf/2007.07542.pdf) +> Xiaoyu Yue, Zhanghui Kuang, Chenhao Lin, Hongbin Sun, Wayne +Zhang +> ECCV, 2020 + +Using MJSynth and SynthText two text recognition datasets for training, and evaluating on IIIT, SVT, IC13, IC15, SVTP, CUTE datasets, the algorithm reproduction effect is as follows: + +|Model|Backbone|config|Acc|Download link| +| --- | --- | --- | --- | --- | +|RobustScanner|ResNet31|[rec_r31_robustscanner.yml](../../configs/rec/rec_r31_robustscanner.yml)|87.77%|[trained model](https://paddleocr.bj.bcebos.com/contribution/rec_r31_robustscanner.tar)| + +Note:In addition to using the two text recognition datasets MJSynth and SynthText, [SynthAdd](https://pan.baidu.com/share/init?surl=uV0LtoNmcxbO-0YA7Ch4dg) data (extraction code: 627x), and some real data are used in training, the specific data details can refer to the paper. + +## 2. Environment + +Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md)to clone the project code. + +## 3. Model Training / Evaluation / Prediction + +Please refer to [Text Recognition Tutorial](../../ppocr/model_train/recognition.en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. + +### Training + +Specifically, after the data preparation is completed, the training can be started. The training command is as follows: + +```bash linenums="1" +# Single GPU training (long training period, not recommended) +python3 tools/train.py -c configs/rec/rec_r31_robustscanner.yml + +# Multi GPU training, specify the gpu number through the --gpus parameter +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r31_robustscanner.yml +``` + +### Evaluation + +```bash linenums="1" +# GPU evaluation +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r31_robustscanner.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### Prediction + +```bash linenums="1" +# The configuration file used for prediction must match the training +python3 tools/infer_rec.py -c configs/rec/rec_r31_robustscanner.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +``` + +## 4. Inference and Deployment + +### 4.1 Python Inference + +First, the model saved during the RobustScanner text recognition training process is converted into an inference model. 
you can use the following command to convert: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_r31_robustscanner.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.save_inference_dir=./inference/rec_r31_robustscanner +``` + +For RobustScanner text recognition model inference, the following commands can be executed: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_r31_robustscanner/" --rec_image_shape="3, 48, 48, 160" --rec_algorithm="RobustScanner" --rec_char_dict_path="ppocr/utils/dict90.txt" --use_space_char=False +``` + +### 4.2 C++ Inference + +Not supported + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. FAQ + +## Citation + +```bibtex +@article{2020RobustScanner, + title={RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition}, + author={Xiaoyu Yue and Zhanghui Kuang and Chenhao Lin and Hongbin Sun and Wayne Zhang}, + journal={ECCV2020}, + year={2020}, +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_robustscanner.md b/docs/algorithm/text_recognition/algorithm_rec_robustscanner.md new file mode 100644 index 0000000000..6abf33ca43 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_robustscanner.md @@ -0,0 +1,97 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# RobustScanner + +## 1. 算法简介 + +论文信息: +> [RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition](https://arxiv.org/pdf/2007.07542.pdf) +> Xiaoyu Yue, Zhanghui Kuang, Chenhao Lin, Hongbin Sun, Wayne +Zhang +> ECCV, 2020 + +使用MJSynth和SynthText两个合成文字识别数据集训练,在IIIT, SVT, IC13, IC15, SVTP, CUTE数据集上进行评估,算法复现效果如下: + +|模型|骨干网络|配置文件|Acc|下载链接| +| --- | --- | --- | --- | --- | +|RobustScanner|ResNet31|[rec_r31_robustscanner.yml](../../configs/rec/rec_r31_robustscanner.yml)|87.77%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_r31_robustscanner.tar)| + +注:除了使用MJSynth和SynthText两个文字识别数据集外,还加入了[SynthAdd](https://pan.baidu.com/share/init?surl=uV0LtoNmcxbO-0YA7Ch4dg)数据(提取码:627x),和部分真实数据,具体数据细节可以参考论文。 + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 模型训练、评估、预测 + +请参考[文本识别教程](../../ppocr/model_train/recognition.md)。PaddleOCR对代码进行了模块化,训练不同的识别模型只需要**更换配置文件**即可。 + +### 训练 + +具体地,在完成数据准备后,便可以启动训练,训练命令如下: + +```bash linenums="1" +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_r31_robustscanner.yml + +# 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r31_robustscanner.yml +``` + +### 评估 + +```bash linenums="1" +# GPU 评估, Global.pretrained_model 为待测权重 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r31_robustscanner.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### 预测 + +```bash linenums="1" +# 预测使用的配置文件必须与训练一致 +python3 tools/infer_rec.py -c configs/rec/rec_r31_robustscanner.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +``` + +## 4. 
推理部署 + +### 4.1 Python推理 + +首先将RobustScanner文本识别训练过程中保存的模型,转换成inference model。可以使用如下命令进行转换: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_r31_robustscanner.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.save_inference_dir=./inference/rec_r31_robustscanner +``` + +RobustScanner文本识别模型推理,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_r31_robustscanner/" --rec_image_shape="3, 48, 48, 160" --rec_algorithm="RobustScanner" --rec_char_dict_path="ppocr/utils/dict90.txt" --use_space_char=False +``` + +### 4.2 C++推理 + +由于C++预处理后处理还未支持RobustScanner,所以暂未支持 + +### 4.3 Serving服务化部署 + +暂不支持 + +### 4.4 更多推理部署 + +暂不支持 + +## 5. FAQ + +## 引用 + +```bibtex +@article{2020RobustScanner, + title={RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition}, + author={Xiaoyu Yue and Zhanghui Kuang and Chenhao Lin and Hongbin Sun and Wayne Zhang}, + journal={ECCV2020}, + year={2020}, +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_rosetta.en.md b/docs/algorithm/text_recognition/algorithm_rec_rosetta.en.md new file mode 100644 index 0000000000..a49ad8a802 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_rosetta.en.md @@ -0,0 +1,100 @@ +--- +comments: true +--- + +# Rosetta + +## 1. Introduction + +Paper information: +> [Rosetta: Large Scale System for Text Detection and Recognition in Images](https://arxiv.org/abs/1910.05085) +> Borisyuk F , Gordo A , V Sivakumar +> KDD, 2018 + +Using MJSynth and SynthText two text recognition datasets for training, and evaluating on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE datasets, the algorithm reproduction effect is as follows: + +|Models|Backbone Networks|Configuration Files|Avg Accuracy|Download Links| +| --- | --- | --- | --- | --- | +|Rosetta|Resnet34_vd|[configs/rec/rec_r34_vd_none_none_ctc.yml](../../configs/rec/rec_r34_vd_none_none_ctc.yml)|79.11%|[training model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_none_ctc_v2.0_train.tar)| +|Rosetta|MobileNetV3|[configs/rec/rec_mv3_none_none_ctc.yml](../../configs/rec/rec_mv3_none_none_ctc.yml)|75.80%|[training model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_none_none_ctc_v2.0_train.tar)| + +## 2. Environment + +Please refer to [Operating Environment Preparation](../../ppocr/environment.en.md) to configure the PaddleOCR operating environment, and refer to [Project Clone](../../ppocr/blog/clone.en.md)to clone the project code. + +## 3. Model Training / Evaluation / Prediction + +Please refer to [Text Recognition Training Tutorial](../../ppocr/model_train/recognition.en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. 
Take the backbone network based on Resnet34_vd as an example: + +### 3.1 Training + +```bash linenums="1" +# Single card training (long training period, not recommended) +python3 tools/train.py -c configs/rec/rec_r34_vd_none_none_ctc.yml +# Multi-card training, specify the card number through the --gpus parameter +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r34_vd_none_none_ctc.yml +``` + +### 3.2 Evaluation + +```bash linenums="1" +# GPU evaluation, Global.pretrained_model is the model to be evaluated +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r34_vd_none_none_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### 3.3 Prediction + +```bash linenums="1" +python3 tools/infer_rec.py -c configs/rec/rec_r34_vd_none_none_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +``` + +## 4. Inference and Deployment + +### 4.1 Python Inference + +First, convert the model saved during the Rosetta text recognition training process into an inference model. Take the model trained on the MJSynth and SynthText text recognition datasets based on the Resnet34_vd backbone network as an example ( [Model download address](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_none_ctc_v2.0_train.tar) ), which can be converted using the following command: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_r34_vd_none_none_ctc.yml -o Global.pretrained_model=./rec_r34_vd_none_none_ctc_v2.0_train/best_accuracy Global.save_inference_dir=./inference/rec_rosetta +``` + +Rosetta text recognition model inference, you can execute the following commands: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_rosetta/" --rec_image_shape="3, 32, 100" --rec_char_dict_path= "./ppocr/utils/ic15_dict.txt" +``` + +The inference results are as follows: + +![img](./images/word_1-20240704183926496.png) + +```bash linenums="1" +Predicts of doc/imgs_words/en/word_1.png:('joint', 0.9999982714653015) +``` + +### 4.2 C++ Inference + +Not currently supported + +### 4.3 Serving + +Not currently supported + +### 4.4 More + +The Rosetta model also supports the following inference deployment methods: + +- Paddle2ONNX Inference: After preparing the inference model, refer to the [paddle2onnx](../../ppocr/infer_deploy/paddle2onnx.en.md) tutorial. + +## 5. FAQ + +## Citation + +```bibtex +@inproceedings{2018Rosetta, + title={Rosetta: Large Scale System for Text Detection and Recognition in Images}, + author={ Borisyuk, Fedor and Gordo, Albert and Sivakumar, Viswanath }, + booktitle={the 24th ACM SIGKDD International Conference}, + year={2018}, +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_rosetta.md b/docs/algorithm/text_recognition/algorithm_rec_rosetta.md new file mode 100644 index 0000000000..659a904348 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_rosetta.md @@ -0,0 +1,101 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# Rosetta + +## 1. 
算法简介 + +论文信息: +> [Rosetta: Large Scale System for Text Detection and Recognition in Images](https://arxiv.org/abs/1910.05085) +> Borisyuk F , Gordo A , V Sivakumar +> KDD, 2018 + +使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估, 算法复现效果如下: + +|模型|骨干网络|配置文件|Avg Accuracy|下载链接| +| --- | --- | --- | --- | --- | +|Rosetta|Resnet34_vd|[configs/rec/rec_r34_vd_none_none_ctc.yml](../../configs/rec/rec_r34_vd_none_none_ctc.yml)|79.11%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_none_ctc_v2.0_train.tar)| +|Rosetta|MobileNetV3|[configs/rec/rec_mv3_none_none_ctc.yml](../../configs/rec/rec_mv3_none_none_ctc.yml)|75.80%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_none_none_ctc_v2.0_train.tar)| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 模型训练、评估、预测 + +请参考[文本识别训练教程](../../ppocr/model_train/recognition.md)。PaddleOCR对代码进行了模块化,训练不同的识别模型只需要**更换配置文件**即可。 以基于Resnet34_vd骨干网络为例: + +### 3.1 训练 + +```bash linenums="1" +# 单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_r34_vd_none_none_ctc.yml +# 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r34_vd_none_none_ctc.yml +``` + +### 3.2 评估 + +```bash linenums="1" +# GPU评估, Global.pretrained_model为待评估模型 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r34_vd_none_none_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### 3.3 预测 + +```bash linenums="1" +python3 tools/infer_rec.py -c configs/rec/rec_r34_vd_none_none_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +``` + +## 4. 推理部署 + +### 4.1 Python推理 + +首先将Rosetta文本识别训练过程中保存的模型,转换成inference model。以基于Resnet34_vd骨干网络,在MJSynth和SynthText两个文字识别数据集训练得到的模型为例( [模型下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_none_ctc_v2.0_train.tar) ),可以使用如下命令进行转换: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_r34_vd_none_none_ctc.yml -o Global.pretrained_model=./rec_r34_vd_none_none_ctc_v2.0_train/best_accuracy Global.save_inference_dir=./inference/rec_rosetta +``` + +Rosetta文本识别模型推理,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_rosetta/" --rec_image_shape="3, 32, 100" --rec_char_dict_path="./ppocr/utils/ic15_dict.txt" +``` + +推理结果如下所示: + +![img](./images/word_1-20240704184113913.png) + +```bash linenums="1" +Predicts of doc/imgs_words/en/word_1.png:('joint', 0.9999982714653015) +``` + +### 4.2 C++推理 + +暂不支持 + +### 4.3 Serving服务化部署 + +暂不支持 + +### 4.4 更多推理部署 + +Rosetta模型还支持以下推理部署方式: + +- Paddle2ONNX推理:准备好推理模型后,参考[paddle2onnx](../../ppocr/infer_deploy/paddle2onnx.md)教程操作。 + +## 5. FAQ + +## 引用 + +```bibtex +@inproceedings{2018Rosetta, + title={Rosetta: Large Scale System for Text Detection and Recognition in Images}, + author={ Borisyuk, Fedor and Gordo, Albert and Sivakumar, Viswanath }, + booktitle={the 24th ACM SIGKDD International Conference}, + year={2018}, +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_sar.en.md b/docs/algorithm/text_recognition/algorithm_rec_sar.en.md new file mode 100644 index 0000000000..6339d2cb7a --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_sar.en.md @@ -0,0 +1,96 @@ +--- +comments: true +--- + +# SAR + +## 1. 
Introduction + +Paper: +> [Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition](https://arxiv.org/abs/1811.00751) +> Hui Li, Peng Wang, Chunhua Shen, Guyu Zhang +> AAAI, 2019 + +Using MJSynth and SynthText two text recognition datasets for training, and evaluating on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE datasets, the algorithm reproduction effect is as follows: + +|Model|Backbone|config|Acc|Download link| +| --- | --- | --- | --- | --- | +|SAR|ResNet31|[rec_r31_sar.yml](../../configs/rec/rec_r31_sar.yml)|87.20%|[train model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_r31_sar_train.tar)| + +Note:In addition to using the two text recognition datasets MJSynth and SynthText, [SynthAdd](https://pan.baidu.com/share/init?surl=uV0LtoNmcxbO-0YA7Ch4dg) data (extraction code: 627x), and some real data are used in training, the specific data details can refer to the paper. + +## 2. Environment + +Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md)to clone the project code. + +## 3. Model Training / Evaluation / Prediction + +Please refer to [Text Recognition Tutorial](../../ppocr/model_train/recognition.en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. + +### Training + +Specifically, after the data preparation is completed, the training can be started. The training command is as follows: + +```bash linenums="1" +# Single GPU training (long training period, not recommended) +python3 tools/train.py -c configs/rec/rec_r31_sar.yml + +# Multi GPU training, specify the gpu number through the --gpus parameter +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r31_sar.yml +``` + +### Evaluation + +```bash linenums="1" +# GPU evaluation +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r31_sar.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### Prediction + +```bash linenums="1" +# The configuration file used for prediction must match the training +python3 tools/infer_rec.py -c configs/rec/rec_r31_sar.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +``` + +## 4. Inference and Deployment + +### 4.1 Python Inference + +First, the model saved during the SAR text recognition training process is converted into an inference model. ( [Model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_r31_sar_train.tar) ), you can use the following command to convert: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_r31_sar.yml -o Global.pretrained_model=./rec_r31_sar_train/best_accuracy Global.save_inference_dir=./inference/rec_sar +``` + +For SAR text recognition model inference, the following commands can be executed: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_sar/" --rec_image_shape="3, 48, 48, 160" --rec_algorithm="SAR" --rec_char_dict_path="ppocr/utils/dict90.txt" --max_text_length=30 --use_space_char=False +``` + +### 4.2 C++ Inference + +Not supported + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. 
FAQ + +## Citation + +```bibtex +@article{Li2019ShowAA, + title={Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition}, + author={Hui Li and Peng Wang and Chunhua Shen and Guyu Zhang}, + journal={ArXiv}, + year={2019}, + volume={abs/1811.00751} +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_sar.md b/docs/algorithm/text_recognition/algorithm_rec_sar.md new file mode 100644 index 0000000000..f8a6dcb375 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_sar.md @@ -0,0 +1,96 @@ +--- +comments: true +--- + +# SAR + +## 1. 算法简介 + +论文信息: +> [Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition](https://arxiv.org/abs/1811.00751) +> Hui Li, Peng Wang, Chunhua Shen, Guyu Zhang +> AAAI, 2019 + +使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法复现效果如下: + +|模型|骨干网络|配置文件|Acc|下载链接| +| --- | --- | --- | --- | --- | +|SAR|ResNet31|[rec_r31_sar.yml](../../configs/rec/rec_r31_sar.yml)|87.20%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_r31_sar_train.tar)| + +注:除了使用MJSynth和SynthText两个文字识别数据集外,还加入了[SynthAdd](https://pan.baidu.com/share/init?surl=uV0LtoNmcxbO-0YA7Ch4dg)数据(提取码:627x),和部分真实数据,具体数据细节可以参考论文。 + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 模型训练、评估、预测 + +请参考[文本识别教程](../../ppocr/model_train/recognition.md)。PaddleOCR对代码进行了模块化,训练不同的识别模型只需要**更换配置文件**即可。 + +### 训练 + +具体地,在完成数据准备后,便可以启动训练,训练命令如下: + +```bash linenums="1" +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_r31_sar.yml + +# 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r31_sar.yml +``` + +### 评估 + +```bash linenums="1" +# GPU 评估, Global.pretrained_model 为待测权重 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r31_sar.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### 预测 + +```bash linenums="1" +# 预测使用的配置文件必须与训练一致 +python3 tools/infer_rec.py -c configs/rec/rec_r31_sar.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +``` + +## 4. 推理部署 + +### 4.1 Python推理 + +首先将SAR文本识别训练过程中保存的模型,转换成inference model。( [模型下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_r31_sar_train.tar) ),可以使用如下命令进行转换: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_r31_sar.yml -o Global.pretrained_model=./rec_r31_sar_train/best_accuracy Global.save_inference_dir=./inference/rec_sar +``` + +SAR文本识别模型推理,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_sar/" --rec_image_shape="3, 48, 48, 160" --rec_algorithm="SAR" --rec_char_dict_path="ppocr/utils/dict90.txt" --max_text_length=30 --use_space_char=False +``` + +### 4.2 C++推理 + +由于C++预处理后处理还未支持SAR,所以暂未支持 + +### 4.3 Serving服务化部署 + +暂不支持 + +### 4.4 更多推理部署 + +暂不支持 + +## 5. 
FAQ + +## 引用 + +```bibtex +@article{Li2019ShowAA, + title={Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition}, + author={Hui Li and Peng Wang and Chunhua Shen and Guyu Zhang}, + journal={ArXiv}, + year={2019}, + volume={abs/1811.00751} +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_satrn.en.md b/docs/algorithm/text_recognition/algorithm_rec_satrn.en.md new file mode 100644 index 0000000000..885a3c4a53 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_satrn.en.md @@ -0,0 +1,93 @@ +--- +comments: true +--- + +# SATRN + +## 1. Introduction + +论文信息: +> [On Recognizing Texts of Arbitrary Shapes with 2D Self-Attention](https://arxiv.org/abs/1910.04396) +> Junyeop Lee, Sungrae Park, Jeonghun Baek, Seong Joon Oh, Seonghyeon Kim, Hwalsuk Lee +> CVPR, 2020 +Using MJSynth and SynthText two text recognition datasets for training, and evaluating on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE datasets, the algorithm reproduction effect is as follows: + +|Model|Backbone|config|Acc|Download link| +| --- | --- | --- | --- | --- | +|SATRN|ShallowCNN|88.05%|[configs/rec/rec_satrn.yml](../../configs/rec/rec_satrn.yml)|[训练模型](https://pan.baidu.com/s/10J-Bsd881bimKaclKszlaQ?pwd=lk8a)| + +## 2. Environment + +Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md)to clone the project code. + +## 3. Model Training / Evaluation / Prediction + +Please refer to [Text Recognition Tutorial](../../ppocr/model_train/recognition.en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. + +### Training + +Specifically, after the data preparation is completed, the training can be started. The training command is as follows: + +```bash linenums="1" +# Single GPU training (long training period, not recommended) +python3 tools/train.py -c configs/rec/rec_satrn.yml +# Multi GPU training, specify the gpu number through the --gpus parameter +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_satrn.yml +``` + +### Evaluation + +```bash linenums="1" +# GPU evaluation +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_satrn.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### Prediction + +```bash linenums="1" +# The configuration file used for prediction must match the training +python3 tools/infer_rec.py -c configs/rec/rec_satrn.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +``` + +## 4. Inference and Deployment + +### 4.1 Python Inference + +First, the model saved during the SATRN text recognition training process is converted into an inference model. 
( [Model download link](https://pan.baidu.com/s/10J-Bsd881bimKaclKszlaQ?pwd=lk8a) ), you can use the following command to convert: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_satrn.yml -o Global.pretrained_model=./rec_satrn_train/best_accuracy Global.save_inference_dir=./inference/rec_satrn +``` + +For SATRN text recognition model inference, the following commands can be executed: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_satrn/" --rec_image_shape="3, 48, 48, 160" --rec_algorithm="SATRN" --rec_char_dict_path="ppocr/utils/dict90.txt" --max_text_length=30 --use_space_char=False +``` + +### 4.2 C++ Inference + +Not supported + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. FAQ + +## Citation + +```bibtex +@article{lee2019recognizing, + title={On Recognizing Texts of Arbitrary Shapes with 2D Self-Attention}, + author={Junyeop Lee and Sungrae Park and Jeonghun Baek and Seong Joon Oh and Seonghyeon Kim and Hwalsuk Lee}, + year={2019}, + eprint={1910.04396}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_satrn.md b/docs/algorithm/text_recognition/algorithm_rec_satrn.md new file mode 100644 index 0000000000..52415720d5 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_satrn.md @@ -0,0 +1,93 @@ +--- +comments: true +--- + +# SATRN + +## 1. 算法简介 + +论文信息: +> [On Recognizing Texts of Arbitrary Shapes with 2D Self-Attention](https://arxiv.org/abs/1910.04396) +> Junyeop Lee, Sungrae Park, Jeonghun Baek, Seong Joon Oh, Seonghyeon Kim, Hwalsuk Lee +> CVPR, 2020 +参考[DTRB](https://arxiv.org/abs/1904.01906) 文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下: + +|模型|骨干网络|Avg Accuracy|配置文件|下载链接| +|---|---|---|---|---| +|SATRN|ShallowCNN|88.05%|[configs/rec/rec_satrn.yml](../../configs/rec/rec_satrn.yml)|[训练模型](https://pan.baidu.com/s/10J-Bsd881bimKaclKszlaQ?pwd=lk8a)| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 模型训练、评估、预测 + +请参考[文本识别训练教程](../../ppocr/model_train/recognition.md)。PaddleOCR对代码进行了模块化,训练不同的识别模型只需要**更换配置文件**即可。 + +### 训练 + +在完成数据准备后,便可以启动训练,训练命令如下: + +```bash linenums="1" +# 单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_satrn.yml +# 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c rec_satrn.yml +``` + +### 评估 + +```bash linenums="1" +# GPU 评估, Global.pretrained_model 为待测权重 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_satrn.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### 预测 + +```bash linenums="1" +# 预测使用的配置文件必须与训练一致 +python3 tools/infer_rec.py -c configs/rec/rec_satrn.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +``` + +## 4. 
推理部署 + +### 4.1 Python推理 + +首先将SATRN文本识别训练过程中保存的模型,转换成inference model。( [模型下载地址](https://pan.baidu.com/s/10J-Bsd881bimKaclKszlaQ?pwd=lk8a) ),可以使用如下命令进行转换: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_satrn.yml -o Global.pretrained_model=./rec_satrn/best_accuracy Global.save_inference_dir=./inference/rec_satrn +``` + +SATRN文本识别模型推理,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_satrn/" --rec_image_shape="3, 48, 48, 160" --rec_algorithm="SATRN" --rec_char_dict_path="ppocr/utils/dict90.txt" --max_text_length=30 --use_space_char=False +``` + +### 4.2 C++推理 + +由于C++预处理后处理还未支持SATRN,所以暂未支持 + +### 4.3 Serving服务化部署 + +暂不支持 + +### 4.4 更多推理部署 + +暂不支持 + +## 5. FAQ + +## 引用 + +```bibtex +@article{lee2019recognizing, + title={On Recognizing Texts of Arbitrary Shapes with 2D Self-Attention}, + author={Junyeop Lee and Sungrae Park and Jeonghun Baek and Seong Joon Oh and Seonghyeon Kim and Hwalsuk Lee}, + year={2019}, + eprint={1910.04396}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_seed.en.md b/docs/algorithm/text_recognition/algorithm_rec_seed.en.md new file mode 100644 index 0000000000..feceda0f66 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_seed.en.md @@ -0,0 +1,92 @@ +--- +comments: true +--- + +# SEED + +## 1. Introduction + +Paper: +> [SEED: Semantics Enhanced Encoder-Decoder Framework for Scene Text Recognition](https://arxiv.org/pdf/2005.10977.pdf) + +> Qiao, Zhi and Zhou, Yu and Yang, Dongbao and Zhou, Yucan and Wang, Weiping + +> CVPR, 2020 + +Using MJSynth and SynthText two text recognition datasets for training, and evaluating on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE datasets, the algorithm reproduction effect is as follows: + +|Model|Backbone|ACC|config|Download link| +| --- | --- | --- | --- | --- | +|SEED|Aster_Resnet| 85.20% | [configs/rec/rec_resnet_stn_bilstm_att.yml](../../configs/rec/rec_resnet_stn_bilstm_att.yml) | [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_resnet_stn_bilstm_att.tar) | + +## 2. Environment + +Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md)to clone the project code. + +## 3. Model Training / Evaluation / Prediction + +Please refer to [Text Recognition Tutorial](../../ppocr/model_train/recognition.en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. + +### Training + +The SEED model needs to additionally load the [language model](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz) trained by FastText, and install the fasttext dependencies: + +```bash linenums="1" +python3 -m pip install fasttext==0.9.1 +``` + +Specifically, after the data preparation is completed, the training can be started. 
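Before launching training, it can be worth confirming that the fasttext wheel and the downloaded `cc.en.300.bin` language model actually load; a minimal sketch, assuming the unzipped model sits in the current directory (the local path is an assumption, not part of the original instructions):

```bash linenums="1"
# Hedged sanity check: verify that fasttext and the downloaded cc.en.300.bin
# language model load before starting SEED training.
# The ./cc.en.300.bin path is a placeholder for your local copy.
python3 -c "import fasttext; lm = fasttext.load_model('./cc.en.300.bin'); print(lm.get_word_vector('text').shape)"
```

If this prints `(300,)`, the 300-dimensional word embeddings that SEED loads from FastText are available and training can proceed.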
The training command is as follows: + +```bash linenums="1" +# Single GPU training (long training period, not recommended) +python3 tools/train.py -c configs/rec/rec_resnet_stn_bilstm_att.yml + +# Multi GPU training, specify the gpu number through the --gpus parameter +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c rec_resnet_stn_bilstm_att.yml +``` + +### Evaluation + +```bash linenums="1" +# GPU evaluation +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_resnet_stn_bilstm_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### Prediction + +```bash linenums="1" +# The configuration file used for prediction must match the training +python3 tools/infer_rec.py -c configs/rec/rec_resnet_stn_bilstm_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +``` + +## 4. Inference and Deployment + +### 4.1 Python Inference + +Not support + +### 4.2 C++ Inference + +Not support + +### 4.3 Serving + +Not support + +### 4.4 More + +Not support + +## 5. FAQ + +## Citation + +```bibtex +@inproceedings{qiao2020seed, + title={Seed: Semantics enhanced encoder-decoder framework for scene text recognition}, + author={Qiao, Zhi and Zhou, Yu and Yang, Dongbao and Zhou, Yucan and Wang, Weiping}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={13528--13537}, + year={2020} +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_seed.md b/docs/algorithm/text_recognition/algorithm_rec_seed.md new file mode 100644 index 0000000000..8428f0ad5a --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_seed.md @@ -0,0 +1,91 @@ +--- +comments: true +--- + +# SEED + +## 1. 算法简介 + +论文信息: +> [SEED: Semantics Enhanced Encoder-Decoder Framework for Scene Text Recognition](https://arxiv.org/pdf/2005.10977.pdf) +> Qiao, Zhi and Zhou, Yu and Yang, Dongbao and Zhou, Yucan and Wang, Weiping +> CVPR, 2020 + +参考[DTRB](https://arxiv.org/abs/1904.01906) 文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下: + +|模型|骨干网络|Avg Accuracy|配置文件|下载链接| +|---|---|---|---|---| +|SEED|Aster_Resnet| 85.20% | [configs/rec/rec_resnet_stn_bilstm_att.yml](../../configs/rec/rec_resnet_stn_bilstm_att.yml) | [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_resnet_stn_bilstm_att.tar) | + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 
模型训练、评估、预测 + +请参考[文本识别训练教程](../../ppocr/model_train/recognition.md)。PaddleOCR对代码进行了模块化,训练不同的识别模型只需要**更换配置文件**即可。 + +### 训练 + +SEED模型需要额外加载FastText训练好的[语言模型](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz) ,并且安装 fasttext 依赖: + +```bash linenums="1" +python3 -m pip install fasttext==0.9.1 +``` + +然后,在完成数据准备后,便可以启动训练,训练命令如下: + +```bash linenums="1" +# 单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_resnet_stn_bilstm_att.yml + +# 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c rec_resnet_stn_bilstm_att.yml + +``` + +### 评估 + +```bash linenums="1" +# GPU 评估, Global.pretrained_model 为待测权重 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_resnet_stn_bilstm_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### 预测 + +```bash linenums="1" +# 预测使用的配置文件必须与训练一致 +python3 tools/infer_rec.py -c configs/rec/rec_resnet_stn_bilstm_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +``` + +## 4. 推理部署 + +### 4.1 Python推理 + +coming soon + +### 4.2 C++推理 + +coming soon + +### 4.3 Serving服务化部署 + +coming soon + +### 4.4 更多推理部署 + +coming soon + +## 5. FAQ + +## 引用 + +```bibtex +@inproceedings{qiao2020seed, + title={Seed: Semantics enhanced encoder-decoder framework for scene text recognition}, + author={Qiao, Zhi and Zhou, Yu and Yang, Dongbao and Zhou, Yucan and Wang, Weiping}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={13528--13537}, + year={2020} +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_spin.en.md b/docs/algorithm/text_recognition/algorithm_rec_spin.en.md new file mode 100644 index 0000000000..1d9a0ef037 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_spin.en.md @@ -0,0 +1,93 @@ +--- +comments: true +--- + +# SPIN: Structure-Preserving Inner Offset Network for Scene Text Recognition + +## 1. Introduction + +Paper: +> [SPIN: Structure-Preserving Inner Offset Network for Scene Text Recognition](https://arxiv.org/abs/2005.13117) +> Chengwei Zhang, Yunlu Xu, Zhanzhan Cheng, Shiliang Pu, Yi Niu, Fei Wu, Futai Zou +> AAAI, 2020 + +Using MJSynth and SynthText two text recognition datasets for training, and evaluating on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE datasets. The algorithm reproduction effect is as follows: + +|Model|Backbone|config|Acc|Download link| +| --- | --- | --- | --- | --- | +|SPIN|ResNet32|[rec_r32_gaspin_bilstm_att.yml](../../configs/rec/rec_r32_gaspin_bilstm_att.yml)|90.00%|[trained model](https://paddleocr.bj.bcebos.com/contribution/rec_r32_gaspin_bilstm_att.tar) | + +## 2. Environment + +Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md)to clone the project code. + +## 3. Model Training / Evaluation / Prediction + +Please refer to [Text Recognition Tutorial](../../ppocr/model_train/recognition.en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. + +### Training + +Specifically, after the data preparation is completed, the training can be started. 
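In addition to the base commands shown next, individual fields of the config can be overridden at launch time with `-o` instead of editing the yml; a hedged sketch with placeholder values (the `Global.*` keys follow the common PaddleOCR recognition-config layout):

```bash linenums="1"
# Hedged sketch, not part of the original doc: override the output directory and
# epoch count from the command line; the values below are placeholders.
python3 tools/train.py -c configs/rec/rec_r32_gaspin_bilstm_att.yml \
    -o Global.save_model_dir=./output/rec_spin/ Global.epoch_num=6
```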
The training command is as follows: + +```bash linenums="1" +# Single GPU training (long training period, not recommended) +python3 tools/train.py -c configs/rec/rec_r32_gaspin_bilstm_att.yml + +# Multi GPU training, specify the gpu number through the --gpus parameter +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r32_gaspin_bilstm_att.yml +``` + +### Evaluation + +```bash linenums="1" +# GPU evaluation +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r32_gaspin_bilstm_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### Prediction + +```bash linenums="1" +# The configuration file used for prediction must match the training +python3 tools/infer_rec.py -c configs/rec/rec_r32_gaspin_bilstm_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +``` + +## 4. Inference and Deployment + +### 4.1 Python Inference + +First, the model saved during the SPIN text recognition training process is converted into an inference model. you can use the following command to convert: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_r32_gaspin_bilstm_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.save_inference_dir=./inference/rec_r32_gaspin_bilstm_att +``` + +For SPIN text recognition model inference, the following commands can be executed: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_r32_gaspin_bilstm_att/" --rec_image_shape="3, 32, 100" --rec_algorithm="SPIN" --rec_char_dict_path="/ppocr/utils/dict/spin_dict.txt" --use_space_char=False +``` + +### 4.2 C++ Inference + +Not supported + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. FAQ + +## Citation + +```bibtex +@article{2020SPIN, + title={SPIN: Structure-Preserving Inner Offset Network for Scene Text Recognition}, + author={Chengwei Zhang and Yunlu Xu and Zhanzhan Cheng and Shiliang Pu and Yi Niu and Fei Wu and Futai Zou}, + journal={AAAI2020}, + year={2020}, +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_spin.md b/docs/algorithm/text_recognition/algorithm_rec_spin.md new file mode 100644 index 0000000000..bab687b696 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_spin.md @@ -0,0 +1,94 @@ +--- +comments: true +--- + +# SPIN: Structure-Preserving Inner Offset Network for Scene Text Recognition + +## 1. 算法简介 + +论文信息: +> [SPIN: Structure-Preserving Inner Offset Network for Scene Text Recognition](https://arxiv.org/abs/2005.13117) +> Chengwei Zhang, Yunlu Xu, Zhanzhan Cheng, Shiliang Pu, Yi Niu, Fei Wu, Futai Zou +> AAAI, 2020 + +SPIN收录于AAAI2020。主要用于OCR识别任务。在任意形状文本识别中,矫正网络是一种较为常见的前置处理模块,但诸如RARE\ASTER\ESIR等只考虑了空间变换,并没有考虑色度变换。本文提出了一种结构Structure-Preserving Inner Offset Network (SPIN),可以在色彩空间上进行变换。该模块是可微分的,可以加入到任意识别器中。 +使用MJSynth和SynthText两个合成文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法复现效果如下: + +|模型|骨干网络|配置文件|Acc|下载链接| +| --- | --- | --- | --- | --- | +|SPIN|ResNet32|[rec_r32_gaspin_bilstm_att.yml](../../configs/rec/rec_r32_gaspin_bilstm_att.yml)|90.00%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_r32_gaspin_bilstm_att.tar)| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 
模型训练、评估、预测 + +请参考[文本识别教程](../../ppocr/model_train/recognition.md)。PaddleOCR对代码进行了模块化,训练不同的识别模型只需要**更换配置文件**即可。 + +### 训练 + +具体地,在完成数据准备后,便可以启动训练,训练命令如下: + +```bash linenums="1" +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_r32_gaspin_bilstm_att.yml + +# 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r32_gaspin_bilstm_att.yml +``` + +### 评估 + +```bash linenums="1" +# GPU 评估, Global.pretrained_model 为待测权重 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r32_gaspin_bilstm_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### 预测 + +```bash linenums="1" +# 预测使用的配置文件必须与训练一致 +python3 tools/infer_rec.py -c configs/rec/rec_r32_gaspin_bilstm_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +``` + +## 4. 推理部署 + +### 4.1 Python推理 + +首先将SPIN文本识别训练过程中保存的模型,转换成inference model。可以使用如下命令进行转换: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_r32_gaspin_bilstm_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.save_inference_dir=./inference/rec_r32_gaspin_bilstm_att +``` + +SPIN文本识别模型推理,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_r32_gaspin_bilstm_att/" --rec_image_shape="3, 32, 100" --rec_algorithm="SPIN" --rec_char_dict_path="/ppocr/utils/dict/spin_dict.txt" --use_space_char=Falsee +``` + +### 4.2 C++推理 + +由于C++预处理后处理还未支持SPIN,所以暂未支持 + +### 4.3 Serving服务化部署 + +暂不支持 + +### 4.4 更多推理部署 + +暂不支持 + +## 5. FAQ + +## 引用 + +```bibtex +@article{2020SPIN, + title={SPIN: Structure-Preserving Inner Offset Network for Scene Text Recognition}, + author={Chengwei Zhang and Yunlu Xu and Zhanzhan Cheng and Shiliang Pu and Yi Niu and Fei Wu and Futai Zou}, + journal={AAAI2020}, + year={2020}, +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_srn.en.md b/docs/algorithm/text_recognition/algorithm_rec_srn.en.md new file mode 100644 index 0000000000..8800103916 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_srn.en.md @@ -0,0 +1,94 @@ +--- +comments: true +--- + +# SRN + +## 1. Introduction + +Paper: +> [Towards Accurate Scene Text Recognition with Semantic Reasoning Networks](https://arxiv.org/abs/2003.12294#) +> Deli Yu, Xuan Li, Chengquan Zhang, Junyu Han, Jingtuo Liu, Errui Ding +> CVPR,2020 + +Using MJSynth and SynthText two text recognition datasets for training, and evaluating on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE datasets, the algorithm reproduction effect is as follows: + +|Model|Backbone|config|Acc|Download link| +| --- | --- | --- | --- | --- | +|SRN|Resnet50_vd_fpn|[rec_r50_fpn_srn.yml](../../configs/rec/rec_r50_fpn_srn.yml)|86.31%|[train model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r50_vd_srn_train.tar)| + +## 2. Environment + +Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md)to clone the project code. + +## 3. Model Training / Evaluation / Prediction + +Please refer to [Text Recognition Tutorial](../../ppocr/model_train/recognition.en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. + +### Training + +Specifically, after the data preparation is completed, the training can be started. 
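SRN training on the MJSynth and SynthText corpora is a long job, so it is useful to know that an interrupted run can be resumed from the last saved checkpoint; a hedged sketch in which the checkpoint path is a placeholder and depends on `Global.save_model_dir` in your config (the unmodified launch commands appear below):

```bash linenums="1"
# Hedged sketch, not part of the original doc: resume an interrupted SRN run from
# the most recent checkpoint saved under Global.save_model_dir (placeholder path).
python3 tools/train.py -c configs/rec/rec_r50_fpn_srn.yml \
    -o Global.checkpoints=./output/rec/srn/latest
```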
The training command is as follows: + +```bash linenums="1" +# Single GPU training (long training period, not recommended) +python3 tools/train.py -c configs/rec/rec_r50_fpn_srn.yml + +# Multi GPU training, specify the gpu number through the --gpus parameter +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r50_fpn_srn.yml +``` + +### Evaluation + +```bash linenums="1" +# GPU evaluation +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r50_fpn_srn.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### Prediction + +```bash linenums="1" +# The configuration file used for prediction must match the training +python3 tools/infer_rec.py -c configs/rec/rec_r50_fpn_srn.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +``` + +## 4. Inference and Deployment + +### 4.1 Python Inference + +First, the model saved during the SRN text recognition training process is converted into an inference model. ( [Model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r50_vd_srn_train.tar) ), you can use the following command to convert: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_r50_fpn_srn.yml -o Global.pretrained_model=./rec_r50_vd_srn_train/best_accuracy Global.save_inference_dir=./inference/rec_srn +``` + +For SRN text recognition model inference, the following commands can be executed: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_srn/" --rec_image_shape="1,64,256" --rec_char_type="ch" --rec_algorithm="SRN" --rec_char_dict_path="ppocr/utils/ic15_dict.txt" --use_space_char=False +``` + +### 4.2 C++ Inference + +Not supported + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. FAQ + +## Citation + +```bibtex +@article{Yu2020TowardsAS, + title={Towards Accurate Scene Text Recognition With Semantic Reasoning Networks}, + author={Deli Yu and Xuan Li and Chengquan Zhang and Junyu Han and Jingtuo Liu and Errui Ding}, + journal={2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2020}, + pages={12110-12119} +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_srn.md b/docs/algorithm/text_recognition/algorithm_rec_srn.md new file mode 100644 index 0000000000..04823bcb96 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_srn.md @@ -0,0 +1,94 @@ +--- +comments: true +--- + +# SRN + +## 1. 算法简介 + +论文信息: +> [Towards Accurate Scene Text Recognition with Semantic Reasoning Networks](https://arxiv.org/abs/2003.12294#) +> Deli Yu, Xuan Li, Chengquan Zhang, Junyu Han, Jingtuo Liu, Errui Ding +> CVPR,2020 + +使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法复现效果如下: + +|模型|骨干网络|配置文件|Acc|下载链接| +| --- | --- | --- | --- | --- | +|SRN|Resnet50_vd_fpn|[rec_r50_fpn_srn.yml](../../configs/rec/rec_r50_fpn_srn.yml)|86.31%|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r50_vd_srn_train.tar)| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 
模型训练、评估、预测 + +请参考[文本识别教程](../../ppocr/model_train/recognition.md)。PaddleOCR对代码进行了模块化,训练不同的识别模型只需要**更换配置文件**即可。 + +### 训练 + +具体地,在完成数据准备后,便可以启动训练,训练命令如下: + +```bash linenums="1" +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_r50_fpn_srn.yml + +# 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r50_fpn_srn.yml +``` + +### 评估 + +```bash linenums="1" +# GPU 评估, Global.pretrained_model 为待测权重 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r50_fpn_srn.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### 预测 + +```bash linenums="1" +# 预测使用的配置文件必须与训练一致 +python3 tools/infer_rec.py -c configs/rec/rec_r50_fpn_srn.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +``` + +## 4. 推理部署 + +### 4.1 Python推理 + +首先将SRN文本识别训练过程中保存的模型,转换成inference model。( [模型下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r50_vd_srn_train.tar) ),可以使用如下命令进行转换: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_r50_fpn_srn.yml -o Global.pretrained_model=./rec_r50_vd_srn_train/best_accuracy Global.save_inference_dir=./inference/rec_srn +``` + +SRN文本识别模型推理,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./inference/rec_srn/" --rec_image_shape="1,64,256" --rec_algorithm="SRN" --rec_char_dict_path=./ppocr/utils/ic15_dict.txt --use_space_char=False +``` + +### 4.2 C++推理 + +由于C++预处理后处理还未支持SRN,所以暂未支持 + +### 4.3 Serving服务化部署 + +暂不支持 + +### 4.4 更多推理部署 + +暂不支持 + +## 5. FAQ + +## 引用 + +```bibtex +@article{Yu2020TowardsAS, + title={Towards Accurate Scene Text Recognition With Semantic Reasoning Networks}, + author={Deli Yu and Xuan Li and Chengquan Zhang and Junyu Han and Jingtuo Liu and Errui Ding}, + journal={2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2020}, + pages={12110-12119} +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_starnet.en.md b/docs/algorithm/text_recognition/algorithm_rec_starnet.en.md new file mode 100644 index 0000000000..a5552064f5 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_starnet.en.md @@ -0,0 +1,115 @@ +--- +comments: true +--- + +# STAR-Net + +## 1. Introduction + +Paper information: +> [STAR-Net: a spatial attention residue network for scene text recognition.](http://www.bmva.org/bmvc/2016/papers/paper043/paper043.pdf) +> Wei Liu, Chaofeng Chen, Kwan-Yee K. Wong, Zhizhong Su and Junyu Han. +> BMVC, pages 43.1-43.13, 2016 + +Refer to [DTRB](https://arxiv.org/abs/1904.01906) text Recognition Training and Evaluation Process . Using MJSynth and SynthText two text recognition datasets for training, and evaluating on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE datasets, the algorithm reproduction effect is as follows: + +|Models|Backbone Networks|Avg Accuracy|Configuration Files|Download Links| +| --- | --- | --- | --- | --- | +|StarNet|Resnet34_vd|84.44%|[configs/rec/rec_r34_vd_tps_bilstm_ctc.yml](../../configs/rec/rec_r34_vd_tps_bilstm_ctc.yml)|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_ctc_v2.0_train.tar)| +|StarNet|MobileNetV3|81.42%|[configs/rec/rec_mv3_tps_bilstm_ctc.yml](../../configs/rec/rec_mv3_tps_bilstm_ctc.yml)|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_ctc_v2.0_train.tar)| + +## 2. 
Environment
+
+Please refer to [Operating Environment Preparation](../../ppocr/environment.en.md) to configure the PaddleOCR operating environment, and refer to [Project Clone](../../ppocr/blog/clone.en.md) to clone the project code.
+
+## 3. Model Training / Evaluation / Prediction
+
+Please refer to [Text Recognition Training Tutorial](../../ppocr/model_train/recognition.en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. Take the backbone network based on Resnet34_vd as an example:
+
+### 3.1 Training
+
+After the data preparation is complete, the training can be started. The training command is as follows:
+
+````bash linenums="1"
+# Single card training (long training period, not recommended)
+python3 tools/train.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml
+# Multi-card training, specify the card number through the --gpus parameter
+python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml
+````
+
+### 3.2 Evaluation
+
+````bash linenums="1"
+# GPU evaluation, Global.pretrained_model is the model to be evaluated
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy
+````
+
+### 3.3 Prediction
+
+````bash linenums="1"
+# The configuration file used for prediction must match the training
+python3 tools/infer_rec.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png
+````
+
+## 4. Inference
+
+### 4.1 Python Inference
+
+First, convert the model saved during the STAR-Net text recognition training process into an inference model. Take the model trained on the MJSynth and SynthText text recognition datasets based on the Resnet34_vd backbone network as an example ([Model download address](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_bilstm_ctc_v2.0_train.tar)), which can be converted using the following command:
+
+```bash linenums="1"
+python3 tools/export_model.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml -o Global.pretrained_model=./rec_r34_vd_tps_bilstm_ctc_v2.0_train/best_accuracy Global.save_inference_dir=./inference/rec_starnet
+```
+
+For STAR-Net text recognition model inference, you can execute the following command:
+
+```bash linenums="1"
+python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./inference/rec_starnet/" --rec_image_shape="3, 32, 100" --rec_char_dict_path="./ppocr/utils/ic15_dict.txt"
+```
+
+![img](./images/word_336.png)
+
+The inference results are as follows:
+
+```bash linenums="1"
+Predicts of ./doc/imgs_words_en/word_336.png:('super', 0.9999073)
+```
+
+**Attention**: Since the above model refers to the [DTRB](https://arxiv.org/abs/1904.01906) text recognition training and evaluation process, it is different from the ultra-lightweight Chinese recognition model training in two aspects:
+
+- The image resolutions used during training are different. The image resolutions used for training the above models are [3, 32, 100], while for Chinese model training, in order to ensure the recognition effect of long texts, the image resolutions used during training are [3, 32, 320]. The default shape parameter of the predictive inference program is the image resolution used for training Chinese, i.e. [3, 32, 320]. 
Therefore, when inferring the above English model here, it is necessary to set the shape of the recognized image through the parameter rec_image_shape. + +- Character list, the experiment in the DTRB paper is only for 26 lowercase English letters and 10 numbers, a total of 36 characters. All uppercase and lowercase characters are converted to lowercase characters, and characters not listed above are ignored and considered spaces. Therefore, there is no input character dictionary here, but a dictionary is generated by the following command. Therefore, the parameter rec_char_dict_path needs to be set during inference, which is specified as an English dictionary "./ppocr/utils/ic15_dict.txt". + +```python linenums="1" +self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" +dict_character = list(self.character_str) +``` + +### 4.2 C++ Inference + +After preparing the inference model, refer to the [cpp infer](../../ppocr/infer_deploy/cpp_infer.en.md) tutorial to operate. + +### 4.3 Serving + +After preparing the inference model, refer to the [pdserving](../../ppocr/infer_deploy/paddle_server.en.md) tutorial for Serving deployment, including two modes: Python Serving and C++ Serving. + +### 4.4 More + +The STAR-Net model also supports the following inference deployment methods: + +- Paddle2ONNX Inference: After preparing the inference model, refer to the [paddle2onnx](../../ppocr/infer_deploy/paddle2onnx.en.md) tutorial. + +## 5. FAQ + +## Citation + +```bibtex +@inproceedings{liu2016star, + title={STAR-Net: a spatial attention residue network for scene text recognition.}, + author={Liu, Wei and Chen, Chaofeng and Wong, Kwan-Yee K and Su, Zhizhong and Han, Junyu}, + booktitle={BMVC}, + volume={2}, + pages={7}, + year={2016} +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_starnet.md b/docs/algorithm/text_recognition/algorithm_rec_starnet.md new file mode 100644 index 0000000000..8f95bdcd6b --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_starnet.md @@ -0,0 +1,118 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# STAR-Net + +## 1. 算法简介 + +论文信息: +> [STAR-Net: a spatial attention residue network for scene text recognition.](http://www.bmva.org/bmvc/2016/papers/paper043/paper043.pdf) +> Wei Liu, Chaofeng Chen, Kwan-Yee K. Wong, Zhizhong Su and Junyu Han. +> BMVC, pages 43.1-43.13, 2016 + +参考[DTRB](https://arxiv.org/abs/1904.01906) 文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下: + +|模型|骨干网络|Avg Accuracy|配置文件|下载链接| +|---|---|---|---|---| +|StarNet|Resnet34_vd|84.44%|[configs/rec/rec_r34_vd_tps_bilstm_ctc.yml](../../configs/rec/rec_r34_vd_tps_bilstm_ctc.yml)|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_ctc_v2.0_train.tar)| +|StarNet|MobileNetV3|81.42%|[configs/rec/rec_mv3_tps_bilstm_ctc.yml](../../configs/rec/rec_mv3_tps_bilstm_ctc.yml)|[训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_ctc_v2.0_train.tar)| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 
模型训练、评估、预测 + +请参考[文本识别训练教程](../../ppocr/model_train/recognition.md)。PaddleOCR对代码进行了模块化,训练不同的识别模型只需要**更换配置文件**即可。 + +### 训练 + +在完成数据准备后,便可以启动训练,训练命令如下: + +```bash linenums="1" +# 单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml + +# 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c rec_r34_vd_tps_bilstm_ctc.yml +``` + +### 评估 + +```bash linenums="1" +# GPU 评估, Global.pretrained_model 为待测权重 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### 预测 + +```bash linenums="1" +# 预测使用的配置文件必须与训练一致 +python3 tools/infer_rec.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +``` + +## 4. 推理部署 + +### 4.1 Python推理 + +首先将 STAR-Net 文本识别训练过程中保存的模型,转换成inference model。以基于Resnet34_vd骨干网络,使用MJSynth和SynthText两个英文文本识别合成数据集训练的[模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_bilstm_ctc_v2.0_train.tar) 为例,可以使用如下命令进行转换: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_r34_vd_tps_bilstm_ctc.yml -o Global.pretrained_model=./rec_r34_vd_tps_bilstm_ctc_v2.0_train/best_accuracy Global.save_inference_dir=./inference/rec_starnet +``` + +STAR-Net 文本识别模型推理,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./inference/rec_starnet/" --rec_image_shape="3, 32, 100" --rec_char_dict_path="./ppocr/utils/ic15_dict.txt" +``` + +![img](./images/word_336.png) + +执行命令后,上面图像的识别结果如下: + +```bash linenums="1" +Predicts of ./doc/imgs_words_en/word_336.png:('super', 0.9999073) +``` + +**注意**:由于上述模型是参考[DTRB](https://arxiv.org/abs/1904.01906)文本识别训练和评估流程,与超轻量级中文识别模型训练有两方面不同: + +- 训练时采用的图像分辨率不同,训练上述模型采用的图像分辨率是[3,32,100],而中文模型训练时,为了保证长文本的识别效果,训练时采用的图像分辨率是[3, 32, 320]。预测推理程序默认的形状参数是训练中文采用的图像分辨率,即[3, 32, 320]。因此,这里推理上述英文模型时,需要通过参数rec_image_shape设置识别图像的形状。 + +- 字符列表,DTRB论文中实验只是针对26个小写英文本母和10个数字进行实验,总共36个字符。所有大小字符都转成了小写字符,不在上面列表的字符都忽略,认为是空格。因此这里没有输入字符字典,而是通过如下命令生成字典.因此在推理时需要设置参数rec_char_dict_path,指定为英文字典"./ppocr/utils/ic15_dict.txt"。 + +```python linenums="1" +self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" +dict_character = list(self.character_str) +``` + +### 4.2 C++推理 + +准备好推理模型后,参考[cpp infer](../../ppocr/infer_deploy/cpp_infer.md)教程进行操作即可。 + +### 4.3 Serving服务化部署 + +准备好推理模型后,参考[pdserving](../../ppocr/infer_deploy/paddle_server.md)教程进行Serving服务化部署,包括Python Serving和C++ Serving两种模式。 + +### 4.4 更多推理部署 + +STAR-Net模型还支持以下推理部署方式: + +- Paddle2ONNX推理:准备好推理模型后,参考[paddle2onnx](../../ppocr/infer_deploy/paddle2onnx.md)教程操作。 + +## 5. FAQ + +## 引用 + +```bibtex +@inproceedings{liu2016star, + title={STAR-Net: a spatial attention residue network for scene text recognition.}, + author={Liu, Wei and Chen, Chaofeng and Wong, Kwan-Yee K and Su, Zhizhong and Han, Junyu}, + booktitle={BMVC}, + volume={2}, + pages={7}, + year={2016} +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_svtr.en.md b/docs/algorithm/text_recognition/algorithm_rec_svtr.en.md new file mode 100644 index 0000000000..0080846957 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_svtr.en.md @@ -0,0 +1,144 @@ +--- +comments: true +--- + +# SVTR + +## 1. 
Introduction
+
+Paper:
+> [SVTR: Scene Text Recognition with a Single Visual Model](https://arxiv.org/abs/2205.00159)
+> Yongkun Du and Zhineng Chen and Caiyan Jia and Xiaoting Yin and Tianlun Zheng and Chenxia Li and Yuning Du and Yu-Gang Jiang
+> IJCAI, 2022
+
+The accuracy (%) and model files of SVTR on the public scene text recognition datasets are as follows:
+
+* The Chinese dataset comes from [Chinese Benchmark](https://arxiv.org/abs/2112.15093), and the Chinese training and evaluation strategy of SVTR follows that paper.
+
+| Model |IC13<br/>857 | SVT |IIIT5k<br/>3000 |IC15<br/>1811| SVTP |CUTE80 | Avg_6 |IC15<br/>2077 |IC13<br/>1015 |IC03<br/>867|IC03<br/>860|Avg_10 | Chinese<br/>scene_test| Download link |
+|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|:-----:|:-------:|:-------:|:-----:|:-----:|:---------------------------------------------:|:-----:|:------:|
+| SVTR Tiny | 96.85 | 91.34 | 94.53 | 83.99 | 85.43 | 89.24 | 90.87 | 80.55 | 95.37 | 95.27 | 95.70 | 90.13 | 67.90 | [English](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar) / [Chinese](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_ch_train.tar) |
+| SVTR Small | 95.92 | 93.04 | 95.03 | 84.70 | 87.91 | 92.01 | 91.63 | 82.72 | 94.88 | 96.08 | 96.28 | 91.02 | 69.00 | [English](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_small_none_ctc_en_train.tar) / [Chinese](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_small_none_ctc_ch_train.tar) |
+| SVTR Base | 97.08 | 91.50 | 96.03 | 85.20 | 89.92 | 91.67 | 92.33 | 83.73 | 95.66 | 95.62 | 95.81 | 91.61 | 71.40 | [English](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_base_none_ctc_en_train.tar) / - |
+| SVTR Large | 97.20 | 91.65 | 96.30 | 86.58 | 88.37 | 95.14 | 92.82 | 84.54 | 96.35 | 96.54 | 96.74 | 92.24 | 72.10 | [English](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_large_none_ctc_en_train.tar) / [Chinese](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_large_none_ctc_ch_train.tar) |
+
+## 2. Environment
+
+Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md) to clone the project code.
+
+### Dataset Preparation
+
+[English dataset download](https://github.com/clovaai/deep-text-recognition-benchmark#download-lmdb-dataset-for-traininig-and-evaluation-from-here)
+[Chinese dataset download](https://github.com/fudanvi/benchmarking-chinese-text-recognition#download)
+
+## 3. Model Training / Evaluation / Prediction
+
+Please refer to [Text Recognition Tutorial](../../ppocr/model_train/recognition.en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**.
+
+### Training
+
+Specifically, after the data preparation is completed, the training can be started.
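+
+Before kicking off a long training run, it can be worth confirming that the downloaded LMDB archives are readable from the paths referenced by `Train.dataset.data_dir`. The snippet below is only an illustrative sketch, not part of the PaddleOCR tooling: it assumes the standard deep-text-recognition-benchmark LMDB layout (keys `num-samples` and `label-%09d`) and a hypothetical local path that you should replace with your own.
+
+```python linenums="1"
+# Minimal sanity check for an LMDB text recognition dataset (hypothetical path).
+import lmdb
+
+data_dir = "./train_data/data_lmdb_release/training/MJ/MJ_train"  # replace with your dataset directory
+env = lmdb.open(data_dir, readonly=True, lock=False, readahead=False, meminit=False)
+with env.begin(write=False) as txn:
+    num_samples = int(txn.get("num-samples".encode()))          # total sample count written by the converter
+    first_label = txn.get("label-000000001".encode()).decode()  # labels are keyed as label-%09d, 1-indexed
+print(f"{num_samples} samples found, first label: {first_label}")
+```
+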
The training command is as follows: + +```bash linenums="1" +# Single GPU training (long training period, not recommended) +python3 tools/train.py -c configs/rec/rec_svtrnet.yml + +# Multi GPU training, specify the gpu number through the --gpus parameter +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_svtrnet.yml +``` + +### Evaluation + +You can download the model files and configuration files provided by `SVTR`: [download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar), take `SVTR-T` as an example, using the following command to evaluate: + +```bash linenums="1" +# Download the tar archive containing the model files and configuration files of SVTR-T and extract it +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar && tar xf rec_svtr_tiny_none_ctc_en_train.tar +# GPU evaluation +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c ./rec_svtr_tiny_none_ctc_en_train/rec_svtr_tiny_6local_6global_stn_en.yml -o Global.pretrained_model=./rec_svtr_tiny_none_ctc_en_train/best_accuracy +``` + +### Prediction + +```bash linenums="1" +python3 tools/infer_rec.py -c ./rec_svtr_tiny_none_ctc_en_train/rec_svtr_tiny_6local_6global_stn_en.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model=./rec_svtr_tiny_none_ctc_en_train/best_accuracy +``` + +## 4. Inference and Deployment + +### 4.1 Python Inference + +First, the model saved during the SVTR text recognition training process is converted into an inference model. ( [Model download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar) ), you can use the following command to convert: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_svtrnet.yml -o Global.pretrained_model=./rec_svtr_tiny_none_ctc_en_train/best_accuracy Global.save_inference_dir=./inference/rec_svtr_tiny_stn_en +``` + +**Note:** If you are training the model on your own dataset and have modified the dictionary file, please pay attention to modify the `character_dict_path` in the configuration file to the modified dictionary file. + +After the conversion is successful, there are three files in the directory: + +```text linenums="1" +/inference/rec_svtr_tiny_stn_en/ + ├── inference.pdiparams + ├── inference.pdiparams.info + └── inference.pdmodel +``` + +For SVTR text recognition model inference, the following commands can be executed: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_svtr_tiny_stn_en/' --rec_algorithm='SVTR' --rec_image_shape='3,64,256' --rec_char_dict_path='./ppocr/utils/ic15_dict.txt' +``` + +![](../imgs_words_en/word_10.png) + +After executing the command, the prediction result (recognized text and score) of the image above is printed to the screen, an example is as follows: + +```bash linenums="1" +Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9999998807907104) +``` + +### 4.2 C++ Inference + +Not supported + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. FAQ + +* 1. Speed situation on CPU and GPU + * Since most of the operators used by `SVTR` are matrix multiplication, in the GPU environment, the speed has an advantage, but in the environment where mkldnn is enabled on the CPU, `SVTR` has no advantage over the optimized convolutional network. +* 2. 
SVTR model conversion to ONNX fails
+    * Ensure `paddle2onnx` and `onnxruntime` versions are up to date, and refer to [SVTR model to onnx step-by-step example](https://github.com/PaddlePaddle/PaddleOCR/issues/7821#issuecomment-1271214273) for the convert onnx command.
+* 3. SVTR model conversion to ONNX succeeds but the inference result is incorrect
+    * The possible reason is that the model parameter `out_char_num` is not set correctly, it should be set to W//4, W//8 or W//12, please refer to [Section 3.3.3 of SVTR, a high-precision Chinese scene text recognition model](https://aistudio.baidu.com/aistudio/projectdetail/5073182?contributionType=1).
+* 4. Optimization of long text recognition
+    * Refer to [Section 3.3 of SVTR, a high-precision Chinese scene text recognition model](https://aistudio.baidu.com/aistudio/projectdetail/5073182?contributionType=1).
+* 5. Notes on the reproduction of the paper results
+    * The dataset used is the one provided by [ABINet](https://github.com/FangShancheng/ABINet).
+    * By default, 4 cards of GPUs are used for training, the default Batchsize of a single card is 512, and the total Batchsize is 2048, corresponding to a learning rate of 0.0005. When modifying the Batchsize or changing the number of GPU cards, the learning rate should be modified in equal proportion.
+* 6. Exploration Directions for further optimization
+    * Learning rate adjustment: adjusting to twice the default to keep Batchsize unchanged; or reducing Batchsize to 1/2 the default to keep the learning rate unchanged.
+    * Data augmentation strategies: optionally `RecConAug` and `RecAug`.
+    * If STN is not used, `Local` of `mixer` can be replaced by `Conv` and `local_mixer` can all be modified to `[5, 5]`.
+    * Grid search for optimal `embed_dim`, `depth`, `num_heads` configurations.
+    * Use the `Post-Normalization strategy`, which is to modify the model configuration `prenorm` to `True`.
+
+## Citation
+
+```bibtex
+@article{Du2022SVTR,
+  title = {SVTR: Scene Text Recognition with a Single Visual Model},
+  author = {Du, Yongkun and Chen, Zhineng and Jia, Caiyan and Yin, Xiaoting and Zheng, Tianlun and Li, Chenxia and Du, Yuning and Jiang, Yu-Gang},
+  booktitle = {IJCAI},
+  year = {2022},
+  url = {https://arxiv.org/abs/2205.00159}
+}
+```
diff --git a/docs/algorithm/text_recognition/algorithm_rec_svtr.md b/docs/algorithm/text_recognition/algorithm_rec_svtr.md
new file mode 100644
index 0000000000..bb9dafe7cf
--- /dev/null
+++ b/docs/algorithm/text_recognition/algorithm_rec_svtr.md
@@ -0,0 +1,170 @@
+---
+comments: true
+---
+
+# 场景文本识别算法-SVTR
+
+## 1. 算法简介
+
+论文信息:
+> [SVTR: Scene Text Recognition with a Single Visual Model](https://arxiv.org/abs/2205.00159)
+> Yongkun Du and Zhineng Chen and Caiyan Jia and Xiaoting Yin and Tianlun Zheng and Chenxia Li and Yuning Du and Yu-Gang Jiang
+> IJCAI, 2022
+
+场景文本识别旨在将自然图像中的文本转录为数字字符序列,从而传达对场景理解至关重要的高级语义。这项任务由于文本变形、字体、遮挡、杂乱背景等方面的变化具有一定的挑战性。先前的方法为提高识别精度做出了许多工作。然而文本识别器除了准确度外,还因为实际需求需要考虑推理速度等因素。
+
+### SVTR算法简介
+
+主流的场景文本识别模型通常包含两个模块:用于特征提取的视觉模型和用于文本转录的序列模型。这种架构虽然准确,但复杂且效率较低,限制了在实际场景中的应用。SVTR提出了一种用于场景文本识别的单视觉模型,该模型在patch-wise image tokenization框架内,完全摒弃了序列建模,在精度具有竞争力的前提下,模型参数量更少,速度更快,主要有以下几点贡献:
+
+1. 首次发现单视觉模型可以达到与视觉语言模型相媲美甚至更高的准确率,并且其具有效率高和适应多语言的优点,在实际应用中很有前景。
+2. SVTR从字符组件的角度出发,逐渐地合并字符组件,自下而上地完成字符的识别。
+3. SVTR引入了局部和全局Mixing,分别用于提取字符组件特征和字符间依赖关系,与多尺度的特征一起,形成多粒度特征描述。
+
+SVTR在场景文本识别公开数据集上的精度(%)和模型文件如下:
+
+* 中文数据集来自于[Chinese Benchmark](https://arxiv.org/abs/2112.15093) ,SVTR的中文训练评估策略遵循该论文。
+
+| 模型 |IC13
857 | SVT |IIIT5k
3000 |IC15
1811| SVTP |CUTE80 | Avg_6 |IC15
2077 |IC13
1015 |IC03
867|IC03
860|Avg_10 | Chinese
scene_test| 下载链接 | +|:----------:|:------:|:-----:|:---------:|:------:|:-----:|:-----:|:-----:|:-------:|:-------:|:-----:|:-----:|:------:|:-----:|:-----:| +| SVTR Tiny | 96.85 | 91.34 | 94.53 | 83.99 | 85.43 | 89.24 | 90.87 | 80.55 | 95.37 | 95.27 | 95.70 | 90.13 | 67.90 | [英文](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar) / [中文](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_ch_train.tar) | +| SVTR Small | 95.92 | 93.04 | 95.03 | 84.70 | 87.91 | 92.01 | 91.63 | 82.72 | 94.88 | 96.08 | 96.28 | 91.02 | 69.00 | [英文](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_small_none_ctc_en_train.tar) / [中文](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_small_none_ctc_ch_train.tar) | +| SVTR Base | 97.08 | 91.50 | 96.03 | 85.20 | 89.92 | 91.67 | 92.33 | 83.73 | 95.66 | 95.62 | 95.81 | 91.61 | 71.40 | [英文](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_base_none_ctc_en_train.tar) / - | +| SVTR Large | 97.20 | 91.65 | 96.30 | 86.58 | 88.37 | 95.14 | 92.82 | 84.54 | 96.35 | 96.54 | 96.74 | 92.24 | 72.10 | [英文](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_large_none_ctc_en_train.tar) / [中文](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_large_none_ctc_ch_train.tar) | + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 模型训练、评估、预测 + +### 3.1 模型训练 + +#### 数据集准备 + +[英文数据集下载](https://github.com/clovaai/deep-text-recognition-benchmark#download-lmdb-dataset-for-traininig-and-evaluation-from-here) +[中文数据集下载](https://github.com/fudanvi/benchmarking-chinese-text-recognition#download) + +#### 启动训练 + +请参考[文本识别训练教程](../../ppocr/model_train/recognition.md)。PaddleOCR对代码进行了模块化,训练`SVTR`识别模型时需要**更换配置文件**为`SVTR`的[配置文件](../../configs/rec/rec_svtrnet.yml)。 + +具体地,在完成数据准备后,便可以启动训练,训练命令如下: + +```bash linenums="1" +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_svtrnet.yml + +# 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_svtrnet.yml +``` + +### 3.2 评估 + +可下载`SVTR`提供的模型文件和配置文件:[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar) ,以`SVTR-T`为例,使用如下命令进行评估: + +```bash linenums="1" +# 下载包含SVTR-T的模型文件和配置文件的tar压缩包并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar && tar xf rec_svtr_tiny_none_ctc_en_train.tar +# 注意将pretrained_model的路径设置为本地路径。 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c ./rec_svtr_tiny_none_ctc_en_train/rec_svtr_tiny_6local_6global_stn_en.yml -o Global.pretrained_model=./rec_svtr_tiny_none_ctc_en_train/best_accuracy +``` + +### 3.3 预测 + +使用如下命令进行单张图片预测: + +```bash linenums="1" +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/infer_rec.py -c ./rec_svtr_tiny_none_ctc_en_train/rec_svtr_tiny_6local_6global_stn_en.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model=./rec_svtr_tiny_none_ctc_en_train/best_accuracy +# 预测文件夹下所有图像时,可修改infer_img为文件夹,如 Global.infer_img='./doc/imgs_words_en/'。 +``` + +## 4. 
推理部署 + +### 4.1 Python推理 + +首先将训练得到best模型,转换成inference model。下面以`SVTR-T`在英文数据集训练的模型为例([模型和配置文件下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar) ),可以使用如下命令进行转换: + +```bash linenums="1" +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/export_model.py -c ./rec_svtr_tiny_none_ctc_en_train/rec_svtr_tiny_6local_6global_stn_en.yml -o Global.pretrained_model=./rec_svtr_tiny_none_ctc_en_train/best_accuracy Global.save_inference_dir=./inference/rec_svtr_tiny_stn_en +``` + +**注意:** 如果您是在自己的数据集上训练的模型,并且调整了字典文件,请注意修改配置文件中的`character_dict_path`是否为所正确的字典文件。 + +转换成功后,在目录下有三个文件: + +```text linenums="1" +/inference/rec_svtr_tiny_stn_en/ + ├── inference.pdiparams # 识别inference模型的参数文件 + ├── inference.pdiparams.info # 识别inference模型的参数信息,可忽略 + └── inference.pdmodel # 识别inference模型的program文件 +``` + +执行如下命令进行模型推理: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_svtr_tiny_stn_en/' --rec_algorithm='SVTR' --rec_image_shape='3,64,256' --rec_char_dict_path='./ppocr/utils/ic15_dict.txt' +# 预测文件夹下所有图像时,可修改image_dir为文件夹,如 --image_dir='./doc/imgs_words_en/'。 +``` + +![](../imgs_words_en/word_10.png) + +执行命令后,上面图像的预测结果(识别的文本和得分)会打印到屏幕上,示例如下: +结果如下: + +```bash linenums="1" +Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9999998807907104) +``` + +**注意**: + +* 如果您调整了训练时的输入分辨率,需要通过参数`rec_image_shape`设置为您需要的识别图像形状。 +* 在推理时需要设置参数`rec_char_dict_path`指定字典,如果您修改了字典,请修改该参数为您的字典文件。 +* 如果您修改了预处理方法,需修改`tools/infer/predict_rec.py`中SVTR的预处理为您的预处理方法。 + +### 4.2 C++推理部署 + +由于C++预处理后处理还未支持SVTR,所以暂未支持 + +### 4.3 Serving服务化部署 + +暂不支持 + +### 4.4 更多推理部署 + +暂不支持 + +## 5. FAQ + +* 1. GPU和CPU速度对比 + * 由于`SVTR`使用的算子大多为矩阵相乘,在GPU环境下,速度具有优势,但在CPU开启mkldnn加速环境下,`SVTR`相比于被优化的卷积网络没有优势。 + +* 2. SVTR模型转ONNX失败 + * 保证`paddle2onnx`和`onnxruntime`版本最新,转onnx命令参考[SVTR模型转onnx步骤实例](https://github.com/PaddlePaddle/PaddleOCR/issues/7821#issuecomment-1271214273)。 +* 3. SVTR转ONNX成功但是推理结果不正确 + * 可能的原因模型参数`out_char_num`设置不正确,应设置为W//4、W//8或者W//12,可以参考[高精度中文场景文本识别模型SVTR的3.3.3章节](https://aistudio.baidu.com/aistudio/projectdetail/5073182?contributionType=1)。 +* 4. 长文本识别优化 + * 参考[高精度中文场景文本识别模型SVTR的3.3章节](https://aistudio.baidu.com/aistudio/projectdetail/5073182?contributionType=1)。 +* 5. 论文结果复现注意事项 + * 数据集使用[ABINet](https://github.com/FangShancheng/ABINet)提供的数据集; + * 默认使用4卡GPU训练,单卡Batchsize默认为512,总Batchsize为2048,对应的学习率为0.0005,当修改Batchsize或者改变GPU卡数,学习率应等比例修改。 +* 6. 进一步优化的探索点 + * 学习率调整:可以调整为默认的两倍保持Batchsize不变;或者将Batchsize减小为默认的1/2,保持学习率不变; + * 数据增强策略:可选`RecConAug`和`RecAug`; + * 如果不使用STN时,可以将`mixer`的`Local`替换为`Conv`、`local_mixer`全部修改为`[5, 5]`; + * 网格搜索最优的`embed_dim`、`depth`、`num_heads`配置; + * 使用`后Normalization策略`,即是将模型配置`prenorm`修改为`True`。 + +## 引用 + +```bibtex +@article{Du2022SVTR, + title = {SVTR: Scene Text Recognition with a Single Visual Model}, + author = {Du, Yongkun and Chen, Zhineng and Jia, Caiyan and Yin, Xiaoting and Zheng, Tianlun and Li, Chenxia and Du, Yuning and Jiang, Yu-Gang}, + booktitle = {IJCAI}, + year = {2022}, + url = {https://arxiv.org/abs/2205.00159} +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_svtrv2.md b/docs/algorithm/text_recognition/algorithm_rec_svtrv2.md new file mode 100644 index 0000000000..22f320d3e5 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_svtrv2.md @@ -0,0 +1,121 @@ +--- +comments: true +--- + +# 场景文本识别算法-SVTRv2 + +## 1. 
算法简介
+
+### SVTRv2算法简介
+
+[PaddleOCR 算法模型挑战赛 - 赛题一:OCR 端到端识别任务](https://aistudio.baidu.com/competition/detail/1131/0/introduction)排行榜第一算法。主要思路:1、检测和识别模型的Backbone升级为RepSVTR;2、识别教师模型升级为SVTRv2,可识别长文本。
+
+## 2. 环境配置
+
+请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。
+
+## 3. 模型训练、评估、预测
+
+### 3.1 模型训练
+
+训练命令:
+
+```bash linenums="1"
+#单卡训练(训练周期长,不建议)
+python3 tools/train.py -c configs/rec/SVTRv2/rec_repsvtr_gtc.yml
+
+# 多卡训练,通过--gpus参数指定卡号
+# Rec 学生模型
+python -m paddle.distributed.launch --gpus '0,1,2,3,4,5,6,7' tools/train.py -c configs/rec/SVTRv2/rec_repsvtr_gtc.yml
+# Rec 教师模型
+python -m paddle.distributed.launch --gpus '0,1,2,3,4,5,6,7' tools/train.py -c configs/rec/SVTRv2/rec_svtrv2_gtc.yml
+# Rec 蒸馏训练
+python -m paddle.distributed.launch --gpus '0,1,2,3,4,5,6,7' tools/train.py -c configs/rec/SVTRv2/rec_svtrv2_gtc_distill.yml
+```
+
+### 3.2 评估
+
+```bash linenums="1"
+# 注意将pretrained_model的路径设置为本地路径。
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/SVTRv2/rec_repsvtr_gtc.yml -o Global.pretrained_model=output/rec_repsvtr_gtc/best_accuracy
+```
+
+### 3.3 预测
+
+使用如下命令进行单张图片预测:
+
+```bash linenums="1"
+# 注意将pretrained_model的路径设置为本地路径。
+python3 tools/infer_rec.py -c configs/rec/SVTRv2/rec_repsvtr_gtc.yml -o Global.pretrained_model=output/rec_repsvtr_gtc/best_accuracy Global.infer_img='./doc/imgs_words_en/word_10.png'
+# 预测文件夹下所有图像时,可修改infer_img为文件夹,如 Global.infer_img='./doc/imgs_words_en/'。
+```
+
+## 4. 推理部署
+
+### 4.1 Python推理
+
+首先将训练得到best模型,转换成inference model,以RepSVTR为例,可以使用如下命令进行转换:
+
+```bash linenums="1"
+# 注意将pretrained_model的路径设置为本地路径。
+python3 tools/export_model.py -c configs/rec/SVTRv2/rec_repsvtr_gtc.yml -o Global.pretrained_model=output/rec_repsvtr_gtc/best_accuracy Global.save_inference_dir=./inference/rec_repsvtr_infer
+```
+
+**注意:** 如果您是在自己的数据集上训练的模型,并且调整了字典文件,请注意修改配置文件中的`character_dict_path`是否为正确的字典文件。
+
+转换成功后,在目录下有三个文件:
+
+```text linenums="1"
+./inference/rec_repsvtr_infer/
+    ├── inference.pdiparams         # 识别inference模型的参数文件
+    ├── inference.pdiparams.info    # 识别inference模型的参数信息,可忽略
+    └── inference.pdmodel           # 识别inference模型的program文件
+```
+
+执行如下命令进行模型推理:
+
+```bash linenums="1"
+python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_repsvtr_infer/'
+# 预测文件夹下所有图像时,可修改image_dir为文件夹,如 --image_dir='./doc/imgs_words_en/'。
+```
+
+![](../imgs_words_en/word_10.png)
+
+执行命令后,上面图像的预测结果(识别的文本和得分)会打印到屏幕上,示例如下:
+
+```bash linenums="1"
+Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9999998807907104)
+```
+
+**注意**:
+
+- 如果您调整了训练时的输入分辨率,需要通过参数`rec_image_shape`设置为您需要的识别图像形状。
+- 在推理时需要设置参数`rec_char_dict_path`指定字典,如果您修改了字典,请修改该参数为您的字典文件。
+- 如果您修改了预处理方法,需修改`tools/infer/predict_rec.py`中SVTR的预处理为您的预处理方法。
+
+### 4.2 C++推理部署
+
+由于C++预处理后处理还未支持SVTRv2,所以暂未支持。
+
+### 4.3 Serving服务化部署
+
+暂不支持
+
+### 4.4 更多推理部署
+
+暂不支持
+
+## 5. 
FAQ + +## 引用 + +```bibtex +@article{Du2022SVTR, + title = {SVTR: Scene Text Recognition with a Single Visual Model}, + author = {Du, Yongkun and Chen, Zhineng and Jia, Caiyan and Yin, Xiaoting and Zheng, Tianlun and Li, Chenxia and Du, Yuning and Jiang, Yu-Gang}, + booktitle = {IJCAI}, + year = {2022}, + url = {https://arxiv.org/abs/2205.00159} +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_visionlan.en.md b/docs/algorithm/text_recognition/algorithm_rec_visionlan.en.md new file mode 100644 index 0000000000..99566d6858 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_visionlan.en.md @@ -0,0 +1,120 @@ +--- +comments: true +--- + +# VisionLAN + +## 1. Introduction + +Paper: +> [From Two to One: A New Scene Text Recognizer with Visual Language Modeling Network](https://arxiv.org/abs/2108.09661) +> Yuxin Wang, Hongtao Xie, Shancheng Fang, Jing Wang, Shenggao Zhu, Yongdong Zhang +> ICCV, 2021 + +Using MJSynth and SynthText two text recognition datasets for training, and evaluating on IIIT, SVT, IC13, IC15, SVTP, CUTE datasets, the algorithm reproduction effect is as follows: + +|Model|Backbone|config|Acc|Download link| +| --- | --- | --- | --- | --- | +|VisionLAN|ResNet45|[rec_r45_visionlan.yml](../../configs/rec/rec_r45_visionlan.yml)|90.30%|[model link](https://paddleocr.bj.bcebos.com/VisionLAN/rec_r45_visionlan_train.tar)| + +## 2. Environment + +Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md)to clone the project code. + +## 3. Model Training / Evaluation / Prediction + +Please refer to [Text Recognition Tutorial](../../ppocr/model_train/recognition.en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. + +### Training + +Specifically, after the data preparation is completed, the training can be started. The training command is as follows: + +```bash linenums="1" +# Single GPU training (long training period, not recommended) +python3 tools/train.py -c configs/rec/rec_r45_visionlan.yml + +# Multi GPU training, specify the gpu number through the --gpus parameter +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r45_visionlan.yml +``` + +### Evaluation + +```bash linenums="1" +# GPU evaluation +python3 tools/eval.py -c configs/rec/rec_r45_visionlan.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### Prediction + +```bash linenums="1" +# The configuration file used for prediction must match the training +python3 tools/infer_rec.py -c configs/rec/rec_r45_visionlan.yml -o Global.infer_img='./doc/imgs_words/en/word_2.png' Global.pretrained_model=./rec_r45_visionlan_train/best_accuracy +``` + +## 4. Inference and Deployment + +### 4.1 Python Inference + +First, the model saved during the VisionLAN text recognition training process is converted into an inference model. 
( [Model download link](https://paddleocr.bj.bcebos.com/VisionLAN/rec_r45_visionlan_train.tar)) ), you can use the following command to convert: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_r45_visionlan.yml -o Global.pretrained_model=./rec_r45_visionlan_train/best_accuracy Global.save_inference_dir=./inference/rec_r45_visionlan/ +``` + +**Note:** + +- If you are training the model on your own dataset and have modified the dictionary file, please pay attention to modify the `character_dict_path` in the configuration file to the modified dictionary file. +- If you modified the input size during training, please modify the `infer_shape` corresponding to VisionLAN in the `tools/export_model.py` file. + +After the conversion is successful, there are three files in the directory: + +```text linenums="1" +./inference/rec_r45_visionlan/ + ├── inference.pdiparams + ├── inference.pdiparams.info + └── inference.pdmodel +``` + +For VisionLAN text recognition model inference, the following commands can be executed: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words/en/word_2.png' --rec_model_dir='./inference/rec_r45_visionlan/' --rec_algorithm='VisionLAN' --rec_image_shape='3,64,256' --rec_char_dict_path='./ppocr/utils/ic15_dict.txt' --use_space_char=False +``` + +![img](./images/word_10.png) + +After executing the command, the prediction result (recognized text and score) of the image above is printed to the screen, an example is as follows: +The result is as follows: + +```bash linenums="1" +Predicts of ./doc/imgs_words/en/word_2.png:('yourself', 0.9999493) +``` + +### 4.2 C++ Inference + +Not supported + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. FAQ + +1. Note that the MJSynth and SynthText datasets come from [VisionLAN repo](https://github.com/wangyuxin87/VisionLAN). +2. We use the pre-trained model provided by the VisionLAN authors for finetune training. The dictionary for the pre-trained model is 'ppocr/utils/ic15_dict.txt'. + +## Citation + +```bibtex +@inproceedings{wang2021two, + title={From Two to One: A New Scene Text Recognizer with Visual Language Modeling Network}, + author={Wang, Yuxin and Xie, Hongtao and Fang, Shancheng and Wang, Jing and Zhu, Shenggao and Zhang, Yongdong}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages={14194--14203}, + year={2021} +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_visionlan.md b/docs/algorithm/text_recognition/algorithm_rec_visionlan.md new file mode 100644 index 0000000000..1bb4a4f20f --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_visionlan.md @@ -0,0 +1,135 @@ +--- +comments: true +--- + +# 场景文本识别算法-VisionLAN + +## 1. 算法简介 + +论文信息: +> [From Two to One: A New Scene Text Recognizer with Visual Language Modeling Network](https://arxiv.org/abs/2108.09661) +> Yuxin Wang, Hongtao Xie, Shancheng Fang, Jing Wang, Shenggao Zhu, Yongdong Zhang +> ICCV, 2021 + +`VisionLAN`使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC13, IC15, SVTP, CUTE数据集上进行评估,算法复现效果如下: + +|模型|骨干网络|配置文件|Acc|下载链接| +| --- | --- | --- | --- | --- | +|VisionLAN|ResNet45|[rec_r45_visionlan.yml](../../configs/rec/rec_r45_visionlan.yml)|90.30%|[预训练、训练模型](https://paddleocr.bj.bcebos.com/VisionLAN/rec_r45_visionlan_train.tar)| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 
模型训练、评估、预测 + +### 3.1 模型训练 + +请参考[文本识别训练教程](../../ppocr/model_train/recognition.md)。PaddleOCR对代码进行了模块化,训练`VisionLAN`识别模型时需要**更换配置文件**为`VisionLAN`的[配置文件](../../configs/rec/rec_r45_visionlan.yml)。 + +#### 启动训练 + +具体地,在完成数据准备后,便可以启动训练,训练命令如下: + +```bash linenums="1" +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_r45_visionlan.yml + +# 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_r45_visionlan.yml +``` + +### 3.2 评估 + +可下载已训练完成的模型文件,使用如下命令进行评估: + +```bash linenums="1" +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/eval.py -c configs/rec/rec_r45_visionlan.yml -o Global.pretrained_model=./rec_r45_visionlan_train/best_accuracy +``` + +### 3.3 预测 + +使用如下命令进行单张图片预测: + +```bash linenums="1" +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/infer_rec.py -c configs/rec/rec_r45_visionlan.yml -o Global.infer_img='./doc/imgs_words/en/word_2.png' Global.pretrained_model=./rec_r45_visionlan_train/best_accuracy +# 预测文件夹下所有图像时,可修改infer_img为文件夹,如 Global.infer_img='./doc/imgs_words_en/'。 +``` + +## 4. 推理部署 + +### 4.1 Python推理 + +首先将训练得到best模型,转换成inference model。这里以训练完成的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/VisionLAN/rec_r45_visionlan_train.tar)),可以使用如下命令进行转换: + +```bash linenums="1" +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/export_model.py -c configs/rec/rec_r45_visionlan.yml -o Global.pretrained_model=./rec_r45_visionlan_train/best_accuracy Global.save_inference_dir=./inference/rec_r45_visionlan/ +``` + +**注意:** + +- 如果您是在自己的数据集上训练的模型,并且调整了字典文件,请注意修改配置文件中的`character_dict_path`是否是所需要的字典文件。 +- 如果您修改了训练时的输入大小,请修改`tools/export_model.py`文件中的对应VisionLAN的`infer_shape`。 + +转换成功后,在目录下有三个文件: + +```text linenums="1" +./inference/rec_r45_visionlan/ + ├── inference.pdiparams # 识别inference模型的参数文件 + ├── inference.pdiparams.info # 识别inference模型的参数信息,可忽略 + └── inference.pdmodel # 识别inference模型的program文件 +``` + +执行如下命令进行模型推理: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words/en/word_2.png' --rec_model_dir='./inference/rec_r45_visionlan/' --rec_algorithm='VisionLAN' --rec_image_shape='3,64,256' --rec_char_dict_path='./ppocr/utils/ic15_dict.txt' --use_space_char=False +# 预测文件夹下所有图像时,可修改image_dir为文件夹,如 --image_dir='./doc/imgs_words_en/'。 +``` + +![img](./images/word_10.png) + +执行命令后,上面图像的预测结果(识别的文本和得分)会打印到屏幕上,示例如下: +结果如下: + +```bash linenums="1" +Predicts of ./doc/imgs_words/en/word_2.png:('yourself', 0.9999493) +``` + +**注意**: + +- 训练上述模型采用的图像分辨率是[3,64,256],需要通过参数`rec_image_shape`设置为您训练时的识别图像形状。 +- 在推理时需要设置参数`rec_char_dict_path`指定字典,如果您修改了字典,请修改该参数为您的字典文件。 +- 如果您修改了预处理方法,需修改`tools/infer/predict_rec.py`中VisionLAN的预处理为您的预处理方法。 + +### 4.2 C++推理部署 + +由于C++预处理后处理还未支持VisionLAN,所以暂未支持 + +### 4.3 Serving服务化部署 + +暂不支持 + +### 4.4 更多推理部署 + +暂不支持 + +## 5. FAQ + +1. MJSynth和SynthText两种数据集来自于[VisionLAN源repo](https://github.com/wangyuxin87/VisionLAN) 。 +2. 
我们使用VisionLAN作者提供的预训练模型进行finetune训练,预训练模型配套字典为'ppocr/utils/ic15_dict.txt'。 + +## 引用 + +```bibtex +@inproceedings{wang2021two, + title={From Two to One: A New Scene Text Recognizer with Visual Language Modeling Network}, + author={Wang, Yuxin and Xie, Hongtao and Fang, Shancheng and Wang, Jing and Zhu, Shenggao and Zhang, Yongdong}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages={14194--14203}, + year={2021} +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_vitstr.en.md b/docs/algorithm/text_recognition/algorithm_rec_vitstr.en.md new file mode 100644 index 0000000000..9a56504cfa --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_vitstr.en.md @@ -0,0 +1,119 @@ +--- +comments: true +--- + +# ViTSTR + +## 1. Introduction + +Paper: +> [Vision Transformer for Fast and Efficient Scene Text Recognition](https://arxiv.org/abs/2105.08582) +> Rowel Atienza +> ICDAR, 2021 + +Using MJSynth and SynthText two text recognition datasets for training, and evaluating on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE datasets, the algorithm reproduction effect is as follows: + +|Model|Backbone|config|Acc|Download link| +| --- | --- | --- | --- | --- | +|ViTSTR|ViTSTR|[rec_vitstr_none_ce.yml](../../configs/rec/rec_vitstr_none_ce.yml)|79.82%|[trained model](https://paddleocr.bj.bcebos.com/rec_vitstr_none_none_train.tar)| + +## 2. Environment + +Please refer to ["Environment Preparation"](../../ppocr/environment.en.md) to configure the PaddleOCR environment, and refer to ["Project Clone"](../../ppocr/blog/clone.en.md)to clone the project code. + +## 3. Model Training / Evaluation / Prediction + +Please refer to [Text Recognition Tutorial](../../ppocr/model_train/recognition.en.md). PaddleOCR modularizes the code, and training different recognition models only requires **changing the configuration file**. + +### Training + +Specifically, after the data preparation is completed, the training can be started. The training command is as follows: + +```bash linenums="1" +# Single GPU training (long training period, not recommended) +python3 tools/train.py -c configs/rec/rec_vitstr_none_ce.yml + +# Multi GPU training, specify the gpu number through the --gpus parameter +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_vitstr_none_ce.yml +``` + +### Evaluation + +```bash linenums="1" +# GPU evaluation +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_vitstr_none_ce.yml -o Global.pretrained_model={path/to/weights}/best_accuracy +``` + +### Prediction + +```bash linenums="1" +# The configuration file used for prediction must match the training +python3 tools/infer_rec.py -c configs/rec/rec_vitstr_none_ce.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model=./rec_vitstr_none_ce_train/best_accuracy +``` + +## 4. Inference and Deployment + +### 4.1 Python Inference + +First, the model saved during the ViTSTR text recognition training process is converted into an inference model. 
( [Model download link](https://paddleocr.bj.bcebos.com/rec_vitstr_none_none_train.tar)) ), you can use the following command to convert: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_vitstr_none_ce.yml -o Global.pretrained_model=./rec_vitstr_none_ce_train/best_accuracy Global.save_inference_dir=./inference/rec_vitstr +``` + +**Note:** + +- If you are training the model on your own dataset and have modified the dictionary file, please pay attention to modify the `character_dict_path` in the configuration file to the modified dictionary file. +- If you modified the input size during training, please modify the `infer_shape` corresponding to ViTSTR in the `tools/export_model.py` file. + +After the conversion is successful, there are three files in the directory: + +```text linenums="1" +/inference/rec_vitstr/ + ├── inference.pdiparams + ├── inference.pdiparams.info + └── inference.pdmodel +``` + +For ViTSTR text recognition model inference, the following commands can be executed: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_vitstr/' --rec_algorithm='ViTSTR' --rec_image_shape='1,224,224' --rec_char_dict_path='./ppocr/utils/EN_symbol_dict.txt' +``` + +![img](./images/word_10.png) + +After executing the command, the prediction result (recognized text and score) of the image above is printed to the screen, an example is as follows: +The result is as follows: + +```bash linenums="1" +Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9998350143432617) +``` + +### 4.2 C++ Inference + +Not supported + +### 4.3 Serving + +Not supported + +### 4.4 More + +Not supported + +## 5. FAQ + +1. In the `ViTSTR` paper, using pre-trained weights on ImageNet1k for initial training, we did not use pre-trained weights in training, and the final accuracy did not change or even improved. + +## Citation + +```bibtex +@article{Atienza2021ViTSTR, + title = {Vision Transformer for Fast and Efficient Scene Text Recognition}, + author = {Rowel Atienza}, + booktitle = {ICDAR}, + year = {2021}, + url = {https://arxiv.org/abs/2105.08582} +} +``` diff --git a/docs/algorithm/text_recognition/algorithm_rec_vitstr.md b/docs/algorithm/text_recognition/algorithm_rec_vitstr.md new file mode 100644 index 0000000000..9f2912e353 --- /dev/null +++ b/docs/algorithm/text_recognition/algorithm_rec_vitstr.md @@ -0,0 +1,135 @@ +--- +comments: true +--- + +# 场景文本识别算法-ViTSTR + +## 1. 算法简介 + +论文信息: +> [Vision Transformer for Fast and Efficient Scene Text Recognition](https://arxiv.org/abs/2105.08582) +> Rowel Atienza +> ICDAR, 2021 + +`ViTSTR`使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法复现效果如下: + +|模型|骨干网络|配置文件|Acc|下载链接| +| --- | --- | --- | --- | --- | +|ViTSTR|ViTSTR|[rec_vitstr_none_ce.yml](../../configs/rec/rec_vitstr_none_ce.yml)|79.82%|[训练模型](https://paddleocr.bj.bcebos.com/rec_vitstr_none_ce_train.tar)| + +## 2. 环境配置 + +请先参考[《运行环境准备》](../../ppocr/environment.md)配置PaddleOCR运行环境,参考[《项目克隆》](../../ppocr/blog/clone.md)克隆项目代码。 + +## 3. 
模型训练、评估、预测 + +### 3.1 模型训练 + +请参考[文本识别训练教程](../../ppocr/model_train/recognition.md)。PaddleOCR对代码进行了模块化,训练`ViTSTR`识别模型时需要**更换配置文件**为`ViTSTR`的[配置文件](../../configs/rec/rec_vitstr_none_ce.yml)。 + +#### 启动训练 + +具体地,在完成数据准备后,便可以启动训练,训练命令如下: + +```bash linenums="1" +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/rec_vitstr_none_ce.yml + +# 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_vitstr_none_ce.yml +``` + +### 3.2 评估 + +可下载已训练完成的模型文件,使用如下命令进行评估: + +```bash linenums="1" +# 注意将pretrained_model的路径设置为本地路径。 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/rec_vitstr_none_ce.yml -o Global.pretrained_model=./rec_vitstr_none_ce_train/best_accuracy +``` + +### 3.3 预测 + +使用如下命令进行单张图片预测: + +```bash linenums="1" +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/infer_rec.py -c configs/rec/rec_vitstr_none_ce.yml -o Global.infer_img='./doc/imgs_words_en/word_10.png' Global.pretrained_model=./rec_vitstr_none_ce_train/best_accuracy +# 预测文件夹下所有图像时,可修改infer_img为文件夹,如 Global.infer_img='./doc/imgs_words_en/'。 +``` + +## 4. 推理部署 + +### 4.1 Python推理 + +首先将训练得到best模型,转换成inference model。这里以训练完成的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/rec_vitstr_none_ce_train.tar) ),可以使用如下命令进行转换: + +```bash linenums="1" +# 注意将pretrained_model的路径设置为本地路径。 +python3 tools/export_model.py -c configs/rec/rec_vitstr_none_ce.yml -o Global.pretrained_model=./rec_vitstr_none_ce_train/best_accuracy Global.save_inference_dir=./inference/rec_vitstr/ +``` + +**注意:** + +- 如果您是在自己的数据集上训练的模型,并且调整了字典文件,请注意修改配置文件中的`character_dict_path`是否是所需要的字典文件。 +- 如果您修改了训练时的输入大小,请修改`tools/export_model.py`文件中的对应ViTSTR的`infer_shape`。 + +转换成功后,在目录下有三个文件: + +```text linenums="1" +/inference/rec_vitstr/ + ├── inference.pdiparams # 识别inference模型的参数文件 + ├── inference.pdiparams.info # 识别inference模型的参数信息,可忽略 + └── inference.pdmodel # 识别inference模型的program文件 +``` + +执行如下命令进行模型推理: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir='./doc/imgs_words_en/word_10.png' --rec_model_dir='./inference/rec_vitstr/' --rec_algorithm='ViTSTR' --rec_image_shape='1,224,224' --rec_char_dict_path='./ppocr/utils/EN_symbol_dict.txt' +# 预测文件夹下所有图像时,可修改image_dir为文件夹,如 --image_dir='./doc/imgs_words_en/'。 +``` + +![img](./images/word_10.png) + +执行命令后,上面图像的预测结果(识别的文本和得分)会打印到屏幕上,示例如下: +结果如下: + +```bash linenums="1" +Predicts of ./doc/imgs_words_en/word_10.png:('pain', 0.9998350143432617) +``` + +**注意**: + +- 训练上述模型采用的图像分辨率是[1,224,224],需要通过参数`rec_image_shape`设置为您训练时的识别图像形状。 +- 在推理时需要设置参数`rec_char_dict_path`指定字典,如果您修改了字典,请修改该参数为您的字典文件。 +- 如果您修改了预处理方法,需修改`tools/infer/predict_rec.py`中ViTSTR的预处理为您的预处理方法。 + +### 4.2 C++推理部署 + +由于C++预处理后处理还未支持ViTSTR,所以暂未支持 + +### 4.3 Serving服务化部署 + +暂不支持 + +### 4.4 更多推理部署 + +暂不支持 + +## 5. FAQ + +1. 在`ViTSTR`论文中,使用在ImageNet1k上的预训练权重进行初始化训练,我们在训练未采用预训练权重,最终精度没有变化甚至有所提高。 +2. 
我们仅仅复现了`ViTSTR`中的tiny版本,如果需要使用small、base版本,可将[ViTSTR源repo](https://github.com/roatienza/deep-text-recognition-benchmark) 中的预训练权重转为Paddle权重使用。 + +## 引用 + +```bibtex +@article{Atienza2021ViTSTR, + title = {Vision Transformer for Fast and Efficient Scene Text Recognition}, + author = {Rowel Atienza}, + booktitle = {ICDAR}, + year = {2021}, + url = {https://arxiv.org/abs/2105.08582} +} +``` diff --git a/docs/algorithm/text_recognition/images/word_1-20240704183926496.png b/docs/algorithm/text_recognition/images/word_1-20240704183926496.png new file mode 100644 index 0000000000..7b915fd6da Binary files /dev/null and b/docs/algorithm/text_recognition/images/word_1-20240704183926496.png differ diff --git a/docs/algorithm/text_recognition/images/word_1-20240704184113913.png b/docs/algorithm/text_recognition/images/word_1-20240704184113913.png new file mode 100644 index 0000000000..7b915fd6da Binary files /dev/null and b/docs/algorithm/text_recognition/images/word_1-20240704184113913.png differ diff --git a/docs/algorithm/text_recognition/images/word_10.png b/docs/algorithm/text_recognition/images/word_10.png new file mode 100644 index 0000000000..07370f757e Binary files /dev/null and b/docs/algorithm/text_recognition/images/word_10.png differ diff --git a/docs/algorithm/text_recognition/images/word_336-20240705082445918.png b/docs/algorithm/text_recognition/images/word_336-20240705082445918.png new file mode 100644 index 0000000000..3bddd294ed Binary files /dev/null and b/docs/algorithm/text_recognition/images/word_336-20240705082445918.png differ diff --git a/docs/algorithm/text_recognition/images/word_336.png b/docs/algorithm/text_recognition/images/word_336.png new file mode 100644 index 0000000000..3bddd294ed Binary files /dev/null and b/docs/algorithm/text_recognition/images/word_336.png differ diff --git "a/docs/applications/PCB\345\255\227\347\254\246\350\257\206\345\210\253.md" "b/docs/applications/PCB\345\255\227\347\254\246\350\257\206\345\210\253.md" new file mode 100644 index 0000000000..d611db7c2e --- /dev/null +++ "b/docs/applications/PCB\345\255\227\347\254\246\350\257\206\345\210\253.md" @@ -0,0 +1,595 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# 基于PP-OCRv3的PCB字符识别 + +## 1. 项目介绍 + +印刷电路板(PCB)是电子产品中的核心器件,对于板件质量的测试与监控是生产中必不可少的环节。在一些场景中,通过PCB中信号灯颜色和文字组合可以定位PCB局部模块质量问题,PCB文字识别中存在如下难点: + +- 裁剪出的PCB图片宽高比例较小 +- 文字区域整体面积也较小 +- 包含垂直、水平多种方向文本 + +针对本场景,PaddleOCR基于全新的PP-OCRv3通过合成数据、微调以及其他场景适配方法完成小字符文本识别任务,满足企业上线要求。PCB检测、识别效果如 **图1** 所示: + +![](./images/95d8e95bf1ab476987f2519c0f8f0c60a0cdc2c444804ed6ab08f2f7ab054880-0096678.png) + +注:欢迎在AIStudio领取免费算力体验线上实训,项目链接: [基于PP-OCRv3实现PCB字符识别](https://aistudio.baidu.com/aistudio/projectdetail/4008973) + +## 2. 安装说明 + +下载PaddleOCR源码,安装依赖环境。 + +```python linenums="1" +# 如仍需安装or安装更新,可以执行以下步骤 +git clone https://github.com/PaddlePaddle/PaddleOCR.git +# git clone https://gitee.com/PaddlePaddle/PaddleOCR +``` + +```python linenums="1" +# 安装依赖包 +pip install -r /home/aistudio/PaddleOCR/requirements.txt +``` + +## 3. 
数据准备 + +我们通过图片合成工具生成 **图2** 所示的PCB图片,整图只有高25、宽150左右、文字区域高9、宽45左右,包含垂直和水平2种方向的文本: + +![](./images/bb7a345687814a3d83a29790f2a2b7d081495b3a920b43988c93da6039cad653.jpeg) + +暂时不开源生成的PCB数据集,但是通过更换背景,通过如下代码生成数据即可: + +``` +cd gen_data +python3 gen.py --num_img=10 +``` + +生成图片参数解释: + +``` +num_img:生成图片数量 +font_min_size、font_max_size:字体最大、最小尺寸 +bg_path:文字区域背景存放路径 +det_bg_path:整图背景存放路径 +fonts_path:字体路径 +corpus_path:语料路径 +output_dir:生成图片存储路径 +``` + +这里生成 **100张** 相同尺寸和文本的图片,如 **图3** 所示,方便大家跑通实验。通过如下代码解压数据集: + +![](./images/3277b750159f4b68b2b58506bfec9005d49aeb5fb1d9411e83f96f9ff7eb66a5.png) + +```python linenums="1" +tar xf ./data/data148165/dataset.tar -C ./ +``` + +在生成数据集的时需要生成检测和识别训练需求的格式: + +- **文本检测** + +标注文件格式如下,中间用'\t'分隔: + +``` +" 图像文件名 json.dumps编码的图像标注信息" +ch4_test_images/img_61.jpg [{"transcription": "MASA", "points": [[310, 104], [416, 141], [418, 216], [312, 179]]}, {...}] +``` + +json.dumps编码前的图像标注信息是包含多个字典的list,字典中的 `points` 表示文本框的四个点的坐标(x, y),从左上角的点开始顺时针排列。 `transcription` 表示当前文本框的文字,***当其内容为“###”时,表示该文本框无效,在训练时会跳过。*** + +- **文本识别** + +标注文件的格式如下, txt文件中默认请将图片路径和图片标签用'\t'分割,如用其他方式分割将造成训练报错。 + +``` +" 图像文件名 图像标注信息 " + +train_data/rec/train/word_001.jpg 简单可依赖 +train_data/rec/train/word_002.jpg 用科技让复杂的世界更简单 +... +``` + +## 4. 文本检测 + +选用飞桨OCR开发套件[PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)中的PP-OCRv3模型进行文本检测和识别。针对检测模型和识别模型,进行了共计9个方面的升级: + +- PP-OCRv3检测模型对PP-OCRv2中的CML协同互学习文本检测蒸馏策略进行了升级,分别针对教师模型和学生模型进行进一步效果优化。其中,在对教师模型优化时,提出了大感受野的PAN结构LK-PAN和引入了DML蒸馏策略;在对学生模型优化时,提出了残差注意力机制的FPN结构RSE-FPN。 + +- PP-OCRv3的识别模块是基于文本识别算法SVTR优化。SVTR不再采用RNN结构,通过引入Transformers结构更加有效地挖掘文本行图像的上下文信息,从而提升文本识别能力。PP-OCRv3通过轻量级文本识别网络SVTR_LCNet、Attention损失指导CTC损失训练策略、挖掘文字上下文信息的数据增广策略TextConAug、TextRotNet自监督预训练模型、UDML联合互学习策略、UIM无标注数据挖掘方案,6个方面进行模型加速和效果提升。 + +更多细节请参考PP-OCRv3[技术报告](../ppocr/blog/PP-OCRv3_introduction.md)。 + +我们使用 **3种方案** 进行检测模型的训练、评估: + +- **PP-OCRv3英文超轻量检测预训练模型直接评估** +- PP-OCRv3英文超轻量检测预训练模型 + **验证集padding**直接评估 +- PP-OCRv3英文超轻量检测预训练模型 + **fine-tune** + +### 4.1 预训练模型直接评估 + +我们首先通过PaddleOCR提供的预训练模型在验证集上进行评估,如果评估指标能满足效果,可以直接使用预训练模型,不再需要训练。 + +使用预训练模型直接评估步骤如下: + +#### 1)下载预训练模型 + +PaddleOCR已经提供了PP-OCR系列模型,部分模型展示如下表所示: + +| 模型简介 | 模型名称 | 推荐场景 | 检测模型 | 方向分类器 | 识别模型 | +| ------- | ----------------------- | --------------- | ---------------- | ------- | --------- | +| 中英文超轻量PP-OCRv3模型(16.2M) | ch_PP-OCRv3_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) | +| 英文超轻量PP-OCRv3模型(13.4M) | en_PP-OCRv3_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) | +| 中英文超轻量PP-OCRv2模型(13.0M) | ch_PP-OCRv2_xx | 移动端&服务器端 | 
[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) | +| 中英文超轻量PP-OCR mobile模型(9.4M) | ch_ppocr_mobile_v2.0_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) | +| 中英文通用PP-OCR server模型(143.4M) | ch_ppocr_server_v2.0_xx | 服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) | + +更多模型下载(包括多语言),可以参[考PP-OCR系列模型下载](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.5/doc/doc_ch/models_list.md) + +这里我们使用PP-OCRv3英文超轻量检测模型,下载并解压预训练模型: + +```python linenums="1" +# 如果更换其他模型,更新下载链接和解压指令就可以 +cd /home/aistudio/PaddleOCR +mkdir pretrain_models +cd pretrain_models +# 下载英文预训练模型 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar +tar xf en_PP-OCRv3_det_distill_train.tar && rm -rf en_PP-OCRv3_det_distill_train.tar +%cd .. 
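+# (可选)确认解压结果:若下载与解压成功,pretrain_models/en_PP-OCRv3_det_distill_train/ 目录下应能看到 best_accuracy.pdparams 等权重文件
+ls pretrain_models/en_PP-OCRv3_det_distill_train/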
+``` + +**模型评估** + +首先修改配置文件`configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml`中的以下字段: + +```text linenums="1" +Eval.dataset.data_dir:指向验证集图片存放目录,'/home/aistudio/dataset' +Eval.dataset.label_file_list:指向验证集标注文件,'/home/aistudio/dataset/det_gt_val.txt' +Eval.dataset.transforms.DetResizeForTest: 尺寸 + limit_side_len: 48 + limit_type: 'min' +``` + +然后在验证集上进行评估,具体代码如下: + +```python linenums="1" +cd /home/aistudio/PaddleOCR +python tools/eval.py \ + -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml \ + -o Global.checkpoints="./pretrain_models/en_PP-OCRv3_det_distill_train/best_accuracy" +``` + +### **4.2 预训练模型+验证集padding直接评估** + +考虑到PCB图片比较小,宽度只有25左右、高度只有140-170左右,我们在原图的基础上进行padding,再进行检测评估,padding前后效果对比如 **图4** 所示: + + + +将图片都padding到300*300大小,因为坐标信息发生了变化,我们同时要修改标注文件,在`/home/aistudio/dataset`目录里也提供了padding之后的图片,大家也可以尝试训练和评估: + +同上,我们需要修改配置文件`configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml`中的以下字段: + +```text linenums="1" +Eval.dataset.data_dir:指向验证集图片存放目录,'/home/aistudio/dataset' +Eval.dataset.label_file_list:指向验证集标注文件,/home/aistudio/dataset/det_gt_padding_val.txt +Eval.dataset.transforms.DetResizeForTest: 尺寸 + limit_side_len: 1100 + limit_type: 'min' +``` + +如需获取已训练模型,请加入PaddleX官方交流频道,获取20G OCR学习大礼包(内含《动手学OCR》电子书、课程回放视频、前沿论文等重磅资料) + +- PaddleX官方交流频道: + +将下载或训练完成的模型放置在对应目录下即可完成模型推理 + +```python linenums="1" +cd /home/aistudio/PaddleOCR +python tools/eval.py \ + -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml \ + -o Global.checkpoints="./pretrain_models/en_PP-OCRv3_det_distill_train/best_accuracy" +``` + +### **4.3 预训练模型+fine-tune** + +基于预训练模型,在生成的1500图片上进行fine-tune训练和评估,其中train数据1200张,val数据300张,修改配置文件`configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml`中的以下字段: + +```yaml linenums="1" +Global.epoch_num: 这里设置为1,方便快速跑通,实际中根据数据量调整该值 +Global.save_model_dir:模型保存路径 +Global.pretrained_model:指向预训练模型路径,'./pretrain_models/en_PP-OCRv3_det_distill_train/student.pdparams' +Optimizer.lr.learning_rate:调整学习率,本实验设置为0.0005 +Train.dataset.data_dir:指向训练集图片存放目录,'/home/aistudio/dataset' +Train.dataset.label_file_list:指向训练集标注文件,'/home/aistudio/dataset/det_gt_train.txt' +Train.dataset.transforms.EastRandomCropData.size:训练尺寸改为[480,64] +Eval.dataset.data_dir:指向验证集图片存放目录,'/home/aistudio/dataset/' +Eval.dataset.label_file_list:指向验证集标注文件,'/home/aistudio/dataset/det_gt_val.txt' +Eval.dataset.transforms.DetResizeForTest:评估尺寸,添加如下参数 + limit_side_len: 64 + limit_type:'min' +``` + +执行下面命令启动训练: + +```python linenums="1" +cd /home/aistudio/PaddleOCR/ +python tools/train.py \ + -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml +``` + +**模型评估** + +使用训练好的模型进行评估,更新模型路径`Global.checkpoints`: + +```python linenums="1" +cd /home/aistudio/PaddleOCR/ +python3 tools/eval.py \ + -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml \ + -o Global.checkpoints="./output/ch_PP-OCR_V3_det/latest" +``` + +使用训练好的模型进行评估,指标如下所示: + +| 序号 | 方案 | hmean | 效果提升 | 实验分析 | +| -------- | -------- | -------- | -------- | -------- | +| 1 | PP-OCRv3英文超轻量检测预训练模型 | 64.64% | - | 提供的预训练模型具有泛化能力 | +| 2 | PP-OCRv3英文超轻量检测预训练模型 + 验证集padding | 72.13% |+7.49% | padding可以提升尺寸较小图片的检测效果| +| 3 | PP-OCRv3英文超轻量检测预训练模型 + fine-tune | 100.00% | +27.87% | fine-tune会提升垂类场景效果 | + +注:上述实验结果均是在1500张图片(1200张训练集,300张测试集)上训练、评估的得到,AIstudio只提供了100张数据,所以指标有所差异属于正常,只要策略有效、规律相同即可。 + +## 5. 
+ +同上,我们需要修改配置文件`configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml`中的以下字段: + +```text linenums="1" +Eval.dataset.data_dir:指向验证集图片存放目录,'/home/aistudio/dataset' +Eval.dataset.label_file_list:指向验证集标注文件,'/home/aistudio/dataset/det_gt_padding_val.txt' +Eval.dataset.transforms.DetResizeForTest:评估尺寸,添加如下参数 + limit_side_len: 1100 + limit_type: 'min' +``` + +如需获取已训练模型,请加入PaddleX官方交流频道,获取20G OCR学习大礼包(内含《动手学OCR》电子书、课程回放视频、前沿论文等重磅资料)。 + +- PaddleX官方交流频道: + +将下载或训练完成的模型放置在对应目录下,即可完成模型评估: + +```python linenums="1" +cd /home/aistudio/PaddleOCR +python tools/eval.py \ + -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml \ + -o Global.checkpoints="./pretrain_models/en_PP-OCRv3_det_distill_train/best_accuracy" +``` + +### **4.3 预训练模型+fine-tune** + +基于预训练模型,在生成的1500张图片上进行fine-tune训练和评估,其中train数据1200张,val数据300张,修改配置文件`configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml`中的以下字段: + +```yaml linenums="1" +Global.epoch_num: 这里设置为1,方便快速跑通,实际中根据数据量调整该值 +Global.save_model_dir:模型保存路径 +Global.pretrained_model:指向预训练模型路径,'./pretrain_models/en_PP-OCRv3_det_distill_train/student.pdparams' +Optimizer.lr.learning_rate:调整学习率,本实验设置为0.0005 +Train.dataset.data_dir:指向训练集图片存放目录,'/home/aistudio/dataset' +Train.dataset.label_file_list:指向训练集标注文件,'/home/aistudio/dataset/det_gt_train.txt' +Train.dataset.transforms.EastRandomCropData.size:训练尺寸改为[480,64] +Eval.dataset.data_dir:指向验证集图片存放目录,'/home/aistudio/dataset/' +Eval.dataset.label_file_list:指向验证集标注文件,'/home/aistudio/dataset/det_gt_val.txt' +Eval.dataset.transforms.DetResizeForTest:评估尺寸,添加如下参数 + limit_side_len: 64 + limit_type: 'min' +``` + +执行下面命令启动训练: + +```python linenums="1" +cd /home/aistudio/PaddleOCR/ +python tools/train.py \ + -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml +``` + +**模型评估** + +使用训练好的模型进行评估,更新模型路径`Global.checkpoints`: + +```python linenums="1" +cd /home/aistudio/PaddleOCR/ +python3 tools/eval.py \ + -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml \ + -o Global.checkpoints="./output/ch_PP-OCR_V3_det/latest" +``` + +评估指标如下所示: + +| 序号 | 方案 | hmean | 效果提升 | 实验分析 | +| -------- | -------- | -------- | -------- | -------- | +| 1 | PP-OCRv3英文超轻量检测预训练模型 | 64.64% | - | 提供的预训练模型具有泛化能力 | +| 2 | PP-OCRv3英文超轻量检测预训练模型 + 验证集padding | 72.13% | +7.49% | padding可以提升尺寸较小图片的检测效果 | +| 3 | PP-OCRv3英文超轻量检测预训练模型 + fine-tune | 100.00% | +27.87% | fine-tune会提升垂类场景效果 | + +注:上述实验结果均是在1500张图片(1200张训练集,300张测试集)上训练、评估得到的,AIstudio只提供了100张数据,所以指标有所差异属于正常,只要策略有效、规律相同即可。 + +## 5. 文本识别 + +我们分别使用如下4种方案进行训练、评估: + +- **方案1**:**PP-OCRv3中英文超轻量识别预训练模型直接评估** +- **方案2**:PP-OCRv3中英文超轻量识别预训练模型 + **fine-tune** +- **方案3**:PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + **公开通用识别数据集** +- **方案4**:PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + **增加PCB图像数量** + +### **5.1 预训练模型直接评估** + +同检测模型,我们首先使用PaddleOCR提供的识别预训练模型在PCB验证集上进行评估。 + +使用预训练模型直接评估步骤如下: + +**1)下载预训练模型** + +我们使用PP-OCRv3中英文超轻量文本识别模型,下载并解压预训练模型: + +```python linenums="1" +# 如果更换其他模型,更新下载链接和解压指令就可以 +cd /home/aistudio/PaddleOCR/pretrain_models/ +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar +tar xf ch_PP-OCRv3_rec_train.tar && rm -rf ch_PP-OCRv3_rec_train.tar +cd .. +``` + +**模型评估** + +首先修改配置文件`configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml`中的以下字段: + +```text linenums="1" +Metric.ignore_space: True:忽略空格 +Eval.dataset.data_dir:指向验证集图片存放目录,'/home/aistudio/dataset' +Eval.dataset.label_file_list:指向验证集标注文件,'/home/aistudio/dataset/rec_gt_val.txt' +``` + +我们使用下载的预训练模型进行评估: + +```python linenums="1" +cd /home/aistudio/PaddleOCR +python3 tools/eval.py \ + -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml \ + -o Global.checkpoints=pretrain_models/ch_PP-OCRv3_rec_train/best_accuracy + +``` + +### **5.2 三种fine-tune方案** + +方案2、3、4训练和评估方式是相同的,因此在我们了解每个技术方案之后,再具体看需要修改的参数中哪些是相同的、哪些是不同的。 + +**方案介绍:** + +1) **方案2**:预训练模型 + **fine-tune** + +- 在预训练模型的基础上进行fine-tune,使用1500张PCB进行训练和评估,其中训练集1200张,验证集300张。 + +2) **方案3**:预训练模型 + fine-tune + **公开通用识别数据集** + +- 当识别数据比较少的情况下,可以考虑添加公开通用识别数据集。在方案2的基础上,添加公开通用识别数据集,如lsvt、rctw等。 + +3) **方案4**:预训练模型 + fine-tune + **增加PCB图像数量** + +- 如果能够获取足够多的真实场景数据,我们可以通过增加数据量提升模型效果。在方案2的基础上,增加PCB的数量到2W张左右。 + +**参数修改:** + +接着我们看需要修改的参数,以上方案均需要修改配置文件`configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml`的参数,**修改一次即可**: + +```yaml linenums="1" +Global.pretrained_model:指向预训练模型路径,'pretrain_models/ch_PP-OCRv3_rec_train/best_accuracy' +Optimizer.lr.values:学习率,本实验设置为0.0005 +Train.loader.batch_size_per_card: batch size,默认128,因为数据量小于128,因此我们设置为8,数据量大时可以按默认值训练 +Eval.loader.batch_size_per_card: batch size,默认128,设置为4 +Metric.ignore_space: 忽略空格,本实验设置为True +``` + +**更换不同的方案**每次需要修改的参数: + +```yaml linenums="1" +Global.epoch_num: 这里设置为1,方便快速跑通,实际中根据数据量调整该值 +Global.save_model_dir:指向模型保存路径 +Train.dataset.data_dir:指向训练集图片存放目录 +Train.dataset.label_file_list:指向训练集标注文件 +Eval.dataset.data_dir:指向验证集图片存放目录 +Eval.dataset.label_file_list:指向验证集标注文件 +``` + +同时**方案3**还需修改以下参数: + +```text linenums="1" +Train.dataset.label_file_list:添加公开通用识别数据标注文件 +Train.dataset.ratio_list:PCB数据和公开通用识别数据每次采样的比例,按实际修改即可 +``` + +如 **图5** 所示: + +![](./images/0fa18b25819042d9bbf3397c3af0e21433b23d52f7a84b0a8681b8e6a308d433.png) + +我们提取Student模型的参数,在PCB数据集上进行fine-tune,可以参考如下代码: + +```python linenums="1" +import paddle +# 加载预训练模型 +all_params = paddle.load("./pretrain_models/ch_PP-OCRv3_rec_train/best_accuracy.pdparams") +# 查看权重参数的keys +print(all_params.keys()) +# 学生模型的权重提取 +s_params = {key[len("student_model."):]: all_params[key] for key in all_params if "student_model."
in key} +# 查看学生模型权重参数的keys +print(s_params.keys()) +# 保存 +paddle.save(s_params, "./pretrain_models/ch_PP-OCRv3_rec_train/student.pdparams") +``` + +修改参数后,**每个方案**都执行如下命令启动训练: + +```python linenums="1" +cd /home/aistudio/PaddleOCR/ +python3 tools/train.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml +``` + +使用训练好的模型进行评估,更新模型路径`Global.checkpoints`: + +```python linenums="1" +cd /home/aistudio/PaddleOCR/ +python3 tools/eval.py \ + -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml \ + -o Global.checkpoints=./output/rec_ppocr_v3/latest +``` + +所有方案评估指标如下: + +| 序号 | 方案 | acc | 效果提升 | 实验分析 | +| -------- | -------- | -------- | -------- | -------- | +| 1 | PP-OCRv3中英文超轻量识别预训练模型直接评估 | 46.67% | - | 提供的预训练模型具有泛化能力 | +| 2 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune | 42.02% |-4.65% | 在数据量不足的情况,反而比预训练模型效果低(也可以通过调整超参数再试试)| +| 3 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + 公开通用识别数据集 | 77.00% | +30.33% | 在数据量不足的情况下,可以考虑补充公开数据训练 | +| 4 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + 增加PCB图像数量 | 99.99% | +22.99% | 如果能获取更多数据量的情况,可以通过增加数据量提升效果 | + +注:上述实验结果均是在1500张图片(1200张训练集,300张测试集)、2W张图片、添加公开通用识别数据集上训练、评估的得到,AIstudio只提供了100张数据,所以指标有所差异属于正常,只要策略有效、规律相同即可。 + +## 6. 模型导出 + +inference 模型(paddle.jit.save保存的模型) 一般是模型训练,把模型结构和模型参数保存在文件中的固化模型,多用于预测部署场景。 训练过程中保存的模型是checkpoints模型,保存的只有模型的参数,多用于恢复训练等。 与checkpoints模型相比,inference 模型会额外保存模型的结构信息,在预测部署、加速推理上性能优越,灵活方便,适合于实际系统集成。 + +```python linenums="1" +# 导出检测模型 +python3 tools/export_model.py \ + -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml \ + -o Global.pretrained_model="./output/ch_PP-OCR_V3_det/latest" \ + Global.save_inference_dir="./inference_model/ch_PP-OCR_V3_det/" +``` + +因为上述模型只训练了1个epoch,因此我们使用训练最优的模型进行预测,存储在`/home/aistudio/best_models/`目录下,解压即可 + +```python linenums="1" +cd /home/aistudio/best_models/ +wget https://paddleocr.bj.bcebos.com/fanliku/PCB/det_ppocr_v3_en_infer_PCB.tar +tar xf /home/aistudio/best_models/det_ppocr_v3_en_infer_PCB.tar -C /home/aistudio/PaddleOCR/pretrain_models/ +``` + +```python linenums="1" +# 检测模型inference模型预测 +cd /home/aistudio/PaddleOCR/ +python3 tools/infer/predict_det.py \ + --image_dir="/home/aistudio/dataset/imgs/0000.jpg" \ + --det_algorithm="DB" \ + --det_model_dir="./pretrain_models/det_ppocr_v3_en_infer_PCB/" \ + --det_limit_side_len=48 \ + --det_limit_type='min' \ + --det_db_unclip_ratio=2.5 \ + --use_gpu=True +``` + +结果存储在`inference_results`目录下,检测如下图所示: + +![](./images/5939ae15a1f0445aaeec15c68107dbd897740a1ddd284bf8b583bb6242099157.jpeg) + +同理,导出识别模型并进行推理。 + +```python linenums="1" +# 导出识别模型 +python3 tools/export_model.py \ + -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml \ + -o Global.pretrained_model="./output/rec_ppocr_v3/latest" \ + Global.save_inference_dir="./inference_model/rec_ppocr_v3/" + +``` + +同检测模型,识别模型也只训练了1个epoch,因此我们使用训练最优的模型进行预测,存储在`/home/aistudio/best_models/`目录下,解压即可 + +```python linenums="1" +cd /home/aistudio/best_models/ +wget https://paddleocr.bj.bcebos.com/fanliku/PCB/rec_ppocr_v3_ch_infer_PCB.tar +tar xf /home/aistudio/best_models/rec_ppocr_v3_ch_infer_PCB.tar -C /home/aistudio/PaddleOCR/pretrain_models/ +``` + +```python linenums="1" +# 识别模型inference模型预测 +cd /home/aistudio/PaddleOCR/ +python3 tools/infer/predict_rec.py \ + --image_dir="../test_imgs/0000_rec.jpg" \ + --rec_model_dir="./pretrain_models/rec_ppocr_v3_ch_infer_PCB" \ + --rec_image_shape="3, 48, 320" \ + --use_space_char=False \ + --use_gpu=True +``` + +```python linenums="1" +# 检测+识别模型inference模型预测 +cd /home/aistudio/PaddleOCR/ +python3 tools/infer/predict_system.py \ + --image_dir="../test_imgs/0000.jpg" \ + 
--det_model_dir="./pretrain_models/det_ppocr_v3_en_infer_PCB" \ + --det_limit_side_len=48 \ + --det_limit_type='min' \ + --det_db_unclip_ratio=2.5 \ + --rec_model_dir="./pretrain_models/rec_ppocr_v3_ch_infer_PCB" \ + --rec_image_shape="3, 48, 320" \ + --draw_img_save_dir=./det_rec_infer/ \ + --use_space_char=False \ + --use_angle_cls=False \ + --use_gpu=True + +``` + +端到端预测结果存储在`det_rec_infer`文件夹内,结果如下图所示: + +![](./images/c570f343c29846c792da56ebaca16c50708477514dd048cea8bef37ffa85d03f.jpeg) + +## 7. 端对端评测 + +接下来介绍文本检测+文本识别的端对端指标评估方式。主要分为三步: + +1)首先运行`tools/infer/predict_system.py`,将`image_dir`改为需要评估的数据文件夹,得到保存的结果: + +```python linenums="1" +# 检测+识别模型inference模型预测 +python3 tools/infer/predict_system.py \ + --image_dir="../dataset/imgs/" \ + --det_model_dir="./pretrain_models/det_ppocr_v3_en_infer_PCB" \ + --det_limit_side_len=48 \ + --det_limit_type='min' \ + --det_db_unclip_ratio=2.5 \ + --rec_model_dir="./pretrain_models/rec_ppocr_v3_ch_infer_PCB" \ + --rec_image_shape="3, 48, 320" \ + --draw_img_save_dir=./det_rec_infer/ \ + --use_space_char=False \ + --use_angle_cls=False \ + --use_gpu=True +``` + +得到保存结果后,文本检测识别可视化图保存在`det_rec_infer/`目录下,预测结果保存在`det_rec_infer/system_results.txt`中,格式如下:`0018.jpg [{"transcription": "E295", "points": [[88, 33], [137, 33], [137, 40], [88, 40]]}]` + +2)然后将步骤一保存的数据转换为端对端评测需要的数据格式:修改 `tools/end2end/convert_ppocr_label.py`中的代码,在convert_label函数中设置输入标签路径、Mode、保存标签路径等,分别对评估数据的GT label和预测结果的label进行格式转换。 + +```python linenums="1" +ppocr_label_gt = "/home/aistudio/dataset/det_gt_val.txt" +convert_label(ppocr_label_gt, "gt", "./save_gt_label/") + +ppocr_label_pred = "/home/aistudio/PaddleOCR/PCB_result/det_rec_infer/system_results.txt" +convert_label(ppocr_label_pred, "pred", "./save_PPOCRV2_infer/") +``` + +运行`convert_ppocr_label.py`: + +```bash linenums="1" +python3 tools/end2end/convert_ppocr_label.py +``` + +得到如下结果: + +```text linenums="1" +├── ./save_gt_label/ +├── ./save_PPOCRV2_infer/ +``` + +3) 最后,执行端对端评测,运行`tools/end2end/eval_end2end.py`计算端对端指标,运行方式如下: + +```python linenums="1" +pip install editdistance +python3 tools/end2end/eval_end2end.py ./save_gt_label/ ./save_PPOCRV2_infer/ +``` + +使用`预训练模型+fine-tune`的检测模型、`预训练模型+fine-tune+2W张PCB图片`的识别模型,在300张PCB图片上评估得到如下结果,fmeasure为主要关注的指标: + +![](./images/37206ea48a244212ae7a821d50d1fd51faf3d7fe97ac47a29f04dfcbb377b019.png) + +注:由于数据集不相同,直接使用上述命令无法复现该结果,可以更换为自己训练好的模型,按上述流程运行。 + +## 8. Jetson部署 + +我们只需要以下步骤就可以完成Jetson nano上的模型部署,简单易操作: + +**1、在Jetson nano开发板上准备环境:** + +- 安装PaddlePaddle + +- 下载PaddleOCR并安装依赖 + +**2、执行预测** + +- 将推理模型下载到Jetson + +- 执行检测、识别、串联预测即可 + +详细[参考流程](../ppocr/infer_deploy/Jetson_infer.md)。 + +## 9.
总结 + +检测实验分别使用PP-OCRv3预训练模型在PCB数据集上进行了直接评估、验证集padding、 fine-tune 3种方案,识别实验分别使用PP-OCRv3预训练模型在PCB数据集上进行了直接评估、 fine-tune、添加公开通用识别数据集、增加PCB图片数量4种方案,指标对比如下: + +- 检测 + +| 序号 | 方案 | hmean | 效果提升 | 实验分析 | +| ---- | -------------------------------------------------------- | ------ | -------- | ------------------------------------- | +| 1 | PP-OCRv3英文超轻量检测预训练模型直接评估 | 64.64% | - | 提供的预训练模型具有泛化能力 | +| 2 | PP-OCRv3英文超轻量检测预训练模型 + 验证集padding直接评估 | 72.13% | +7.49% | padding可以提升尺寸较小图片的检测效果 | +| 3 | PP-OCRv3英文超轻量检测预训练模型 + fine-tune | 100.00% | +27.87% | fine-tune会提升垂类场景效果 | + +- 识别 + +| 序号 | 方案 | acc | 效果提升 | 实验分析 | +| ---- | ------------------------------------------------------------ | ------ | -------- | ------------------------------------------------------------ | +| 1 | PP-OCRv3中英文超轻量识别预训练模型直接评估 | 46.67% | - | 提供的预训练模型具有泛化能力 | +| 2 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune | 42.02% | -4.65% | 在数据量不足的情况,反而比预训练模型效果低(也可以通过调整超参数再试试) | +| 3 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + 公开通用识别数据集 | 77.00% | +30.33% | 在数据量不足的情况下,可以考虑补充公开数据训练 | +| 4 | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + 增加PCB图像数量 | 99.99% | +22.99% | 如果能获取更多数据量的情况,可以通过增加数据量提升效果 | + +- 端到端 + +| det | rec | fmeasure | +| --------------------------------------------- | ------------------------------------------------------------ | -------- | +| PP-OCRv3英文超轻量检测预训练模型 + fine-tune | PP-OCRv3中英文超轻量识别预训练模型 + fine-tune + 增加PCB图像数量 | 93.30% | + +*结论* + +PP-OCRv3的检测模型在未经过fine-tune的情况下,在PCB数据集上也有64.64%的精度,说明具有泛化能力。验证集padding之后,精度提升7.5%,在图片尺寸较小的情况,我们可以通过padding的方式提升检测效果。经过 fine-tune 后能够极大的提升检测效果,精度达到100%。 + +PP-OCRv3的识别模型方案1和方案2对比可以发现,当数据量不足的情况,预训练模型精度可能比fine-tune效果还要高,所以我们可以先尝试预训练模型直接评估。如果在数据量不足的情况下想进一步提升模型效果,可以通过添加公开通用识别数据集,识别效果提升30%,非常有效。最后如果我们能够采集足够多的真实场景数据集,可以通过增加数据量提升模型效果,精度达到99.99%。 + +## 更多资源 + +- 更多深度学习知识、产业案例、面试宝典等,请参考:[awesome-DeepLearning](https://github.com/paddlepaddle/awesome-DeepLearning) + +- 更多PaddleOCR使用教程,请参考:[PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR/tree/dygraph) + +- 飞桨框架相关资料,请参考:[飞桨深度学习平台](https://www.paddlepaddle.org.cn/?fr=paddleEdu_aistudio) + +## 参考 + +- 数据生成代码库: diff --git a/docs/applications/images/0639da09b774458096ae577e82b2c59e89ced6a00f55458f946997ab7472a4f8.jpeg b/docs/applications/images/0639da09b774458096ae577e82b2c59e89ced6a00f55458f946997ab7472a4f8.jpeg new file mode 100644 index 0000000000..4f8e720195 Binary files /dev/null and b/docs/applications/images/0639da09b774458096ae577e82b2c59e89ced6a00f55458f946997ab7472a4f8.jpeg differ diff --git a/docs/applications/images/06af09bde845449ba0a676410f4daa1cdc3983ac95034bdbbafac3b7fd94042f.jpeg b/docs/applications/images/06af09bde845449ba0a676410f4daa1cdc3983ac95034bdbbafac3b7fd94042f.jpeg new file mode 100644 index 0000000000..273f81077e Binary files /dev/null and b/docs/applications/images/06af09bde845449ba0a676410f4daa1cdc3983ac95034bdbbafac3b7fd94042f.jpeg differ diff --git a/docs/applications/images/07c3b060c54e4b00be7de8d41a8a4696ff53835343cc4981aab0555183306e79.jpeg b/docs/applications/images/07c3b060c54e4b00be7de8d41a8a4696ff53835343cc4981aab0555183306e79.jpeg new file mode 100644 index 0000000000..8409f79c00 Binary files /dev/null and b/docs/applications/images/07c3b060c54e4b00be7de8d41a8a4696ff53835343cc4981aab0555183306e79.jpeg differ diff --git a/docs/applications/images/0b056be24f374812b61abf43305774767ae122c8479242f98aa0799b7bfc81d4.jpeg b/docs/applications/images/0b056be24f374812b61abf43305774767ae122c8479242f98aa0799b7bfc81d4.jpeg new file mode 100644 index 0000000000..8bd59226fc Binary files /dev/null and 
b/docs/applications/images/0b056be24f374812b61abf43305774767ae122c8479242f98aa0799b7bfc81d4.jpeg differ diff --git a/docs/applications/images/0d582de9aa46474791e08654f84a614a6510e98bfe5f4ad3a26501cbf49ec151.jpeg b/docs/applications/images/0d582de9aa46474791e08654f84a614a6510e98bfe5f4ad3a26501cbf49ec151.jpeg new file mode 100644 index 0000000000..bdfeeaf7a7 Binary files /dev/null and b/docs/applications/images/0d582de9aa46474791e08654f84a614a6510e98bfe5f4ad3a26501cbf49ec151.jpeg differ diff --git a/docs/applications/images/0e25da2ccded4af19e95c85c3d3287ab4d53e31a4eed4607b6a4cb637c43f6d3.jpeg b/docs/applications/images/0e25da2ccded4af19e95c85c3d3287ab4d53e31a4eed4607b6a4cb637c43f6d3.jpeg new file mode 100644 index 0000000000..b9fbfeb51d Binary files /dev/null and b/docs/applications/images/0e25da2ccded4af19e95c85c3d3287ab4d53e31a4eed4607b6a4cb637c43f6d3.jpeg differ diff --git a/docs/applications/images/0f650c032b0f4d56bd639713924768cc820635e9977845008d233f465291a29e.jpeg b/docs/applications/images/0f650c032b0f4d56bd639713924768cc820635e9977845008d233f465291a29e.jpeg new file mode 100644 index 0000000000..0ead4514a3 Binary files /dev/null and b/docs/applications/images/0f650c032b0f4d56bd639713924768cc820635e9977845008d233f465291a29e.jpeg differ diff --git a/docs/applications/images/0f7d50a0fb924b408b93e1fbd6ca64148eed34a2e6724280acd3e113fef7dc48.jpeg b/docs/applications/images/0f7d50a0fb924b408b93e1fbd6ca64148eed34a2e6724280acd3e113fef7dc48.jpeg new file mode 100644 index 0000000000..da01579fa2 Binary files /dev/null and b/docs/applications/images/0f7d50a0fb924b408b93e1fbd6ca64148eed34a2e6724280acd3e113fef7dc48.jpeg differ diff --git a/docs/applications/images/0f84137778cd4ab6899c64109d452290e9c678ccf01744978bc9c0647adbba45.jpeg b/docs/applications/images/0f84137778cd4ab6899c64109d452290e9c678ccf01744978bc9c0647adbba45.jpeg new file mode 100644 index 0000000000..f545b5c86d Binary files /dev/null and b/docs/applications/images/0f84137778cd4ab6899c64109d452290e9c678ccf01744978bc9c0647adbba45.jpeg differ diff --git a/docs/applications/images/0fa18b25819042d9bbf3397c3af0e21433b23d52f7a84b0a8681b8e6a308d433.png b/docs/applications/images/0fa18b25819042d9bbf3397c3af0e21433b23d52f7a84b0a8681b8e6a308d433.png new file mode 100644 index 0000000000..b5ab7be9a1 Binary files /dev/null and b/docs/applications/images/0fa18b25819042d9bbf3397c3af0e21433b23d52f7a84b0a8681b8e6a308d433.png differ diff --git a/docs/applications/images/1.jpeg b/docs/applications/images/1.jpeg new file mode 100644 index 0000000000..c6724f6506 Binary files /dev/null and b/docs/applications/images/1.jpeg differ diff --git a/docs/applications/images/12d402e6a06d482a88f979e0ebdfb39f4d3fc8b80517499689ec607ddb04fbf3.jpeg b/docs/applications/images/12d402e6a06d482a88f979e0ebdfb39f4d3fc8b80517499689ec607ddb04fbf3.jpeg new file mode 100644 index 0000000000..fae7721f29 Binary files /dev/null and b/docs/applications/images/12d402e6a06d482a88f979e0ebdfb39f4d3fc8b80517499689ec607ddb04fbf3.jpeg differ diff --git a/docs/applications/images/166ce56d634c4c7589fe68fbc6e7ae663305dcc82ba144c781507341ffae7fe8.jpeg b/docs/applications/images/166ce56d634c4c7589fe68fbc6e7ae663305dcc82ba144c781507341ffae7fe8.jpeg new file mode 100644 index 0000000000..0cfb3c34ee Binary files /dev/null and b/docs/applications/images/166ce56d634c4c7589fe68fbc6e7ae663305dcc82ba144c781507341ffae7fe8.jpeg differ diff --git a/docs/applications/images/185310636-6ce02f7c-790d-479f-b163-ea97a5a04808-20240704190212828.jpg 
b/docs/applications/images/185310636-6ce02f7c-790d-479f-b163-ea97a5a04808-20240704190212828.jpg new file mode 100644 index 0000000000..6a5fd84c52 Binary files /dev/null and b/docs/applications/images/185310636-6ce02f7c-790d-479f-b163-ea97a5a04808-20240704190212828.jpg differ diff --git a/docs/applications/images/185381131-76b6e260-04fe-46d9-baca-6bdd7fe0d0ce.jpg b/docs/applications/images/185381131-76b6e260-04fe-46d9-baca-6bdd7fe0d0ce.jpg new file mode 100644 index 0000000000..fc5620caaf Binary files /dev/null and b/docs/applications/images/185381131-76b6e260-04fe-46d9-baca-6bdd7fe0d0ce.jpg differ diff --git a/docs/applications/images/185384321-61153faa-e407-45c4-8e7c-a39540248189.jpg b/docs/applications/images/185384321-61153faa-e407-45c4-8e7c-a39540248189.jpg new file mode 100644 index 0000000000..50e76d8aa8 Binary files /dev/null and b/docs/applications/images/185384321-61153faa-e407-45c4-8e7c-a39540248189.jpg differ diff --git a/docs/applications/images/185387870-dc9125a0-9ceb-4036-abf3-184b6e65dc7d-20240704190305748.jpg b/docs/applications/images/185387870-dc9125a0-9ceb-4036-abf3-184b6e65dc7d-20240704190305748.jpg new file mode 100644 index 0000000000..a80ea6c574 Binary files /dev/null and b/docs/applications/images/185387870-dc9125a0-9ceb-4036-abf3-184b6e65dc7d-20240704190305748.jpg differ diff --git a/docs/applications/images/185387870-dc9125a0-9ceb-4036-abf3-184b6e65dc7d.jpg b/docs/applications/images/185387870-dc9125a0-9ceb-4036-abf3-184b6e65dc7d.jpg new file mode 100644 index 0000000000..a80ea6c574 Binary files /dev/null and b/docs/applications/images/185387870-dc9125a0-9ceb-4036-abf3-184b6e65dc7d.jpg differ diff --git a/docs/applications/images/185393805-c67ff571-cf7e-4217-a4b0-8b396c4f22bb-20240704185610566.jpg b/docs/applications/images/185393805-c67ff571-cf7e-4217-a4b0-8b396c4f22bb-20240704185610566.jpg new file mode 100644 index 0000000000..d406a52da8 Binary files /dev/null and b/docs/applications/images/185393805-c67ff571-cf7e-4217-a4b0-8b396c4f22bb-20240704185610566.jpg differ diff --git a/docs/applications/images/185393805-c67ff571-cf7e-4217-a4b0-8b396c4f22bb-20240704190316813.jpg b/docs/applications/images/185393805-c67ff571-cf7e-4217-a4b0-8b396c4f22bb-20240704190316813.jpg new file mode 100644 index 0000000000..d406a52da8 Binary files /dev/null and b/docs/applications/images/185393805-c67ff571-cf7e-4217-a4b0-8b396c4f22bb-20240704190316813.jpg differ diff --git a/docs/applications/images/190587903-ccdfa6fb-51e8-42de-b08b-a127cb04e304.jpeg b/docs/applications/images/190587903-ccdfa6fb-51e8-42de-b08b-a127cb04e304.jpeg new file mode 100644 index 0000000000..aa899eba86 Binary files /dev/null and b/docs/applications/images/190587903-ccdfa6fb-51e8-42de-b08b-a127cb04e304.jpeg differ diff --git a/docs/applications/images/190596141-74f4feda-b082-46d7-908d-b0bd5839b430.jpg b/docs/applications/images/190596141-74f4feda-b082-46d7-908d-b0bd5839b430.jpg new file mode 100644 index 0000000000..01864b867f Binary files /dev/null and b/docs/applications/images/190596141-74f4feda-b082-46d7-908d-b0bd5839b430.jpg differ diff --git a/docs/applications/images/190597086-2e685200-22d0-4042-9e46-f61f24e02e4e.jpg b/docs/applications/images/190597086-2e685200-22d0-4042-9e46-f61f24e02e4e.jpg new file mode 100644 index 0000000000..790cfa295c Binary files /dev/null and b/docs/applications/images/190597086-2e685200-22d0-4042-9e46-f61f24e02e4e.jpg differ diff --git a/docs/applications/images/190599426-3415b38e-e16e-4e68-9253-2ff531b1b5ca.png 
b/docs/applications/images/190599426-3415b38e-e16e-4e68-9253-2ff531b1b5ca.png new file mode 100644 index 0000000000..eafb642486 Binary files /dev/null and b/docs/applications/images/190599426-3415b38e-e16e-4e68-9253-2ff531b1b5ca.png differ diff --git a/docs/applications/images/23a5a19c746441309864586e467f995ec8a551a3661640e493fc4d77520309cd.jpeg b/docs/applications/images/23a5a19c746441309864586e467f995ec8a551a3661640e493fc4d77520309cd.jpeg new file mode 100644 index 0000000000..bd16607aa4 Binary files /dev/null and b/docs/applications/images/23a5a19c746441309864586e467f995ec8a551a3661640e493fc4d77520309cd.jpeg differ diff --git a/docs/applications/images/268c707a62c54e93958d2b2ab29e0932953aad41819e44aaaaa05c8ad85c6491.jpeg b/docs/applications/images/268c707a62c54e93958d2b2ab29e0932953aad41819e44aaaaa05c8ad85c6491.jpeg new file mode 100644 index 0000000000..a80ec7cb12 Binary files /dev/null and b/docs/applications/images/268c707a62c54e93958d2b2ab29e0932953aad41819e44aaaaa05c8ad85c6491.jpeg differ diff --git a/docs/applications/images/2854aee557a74079a82dd5cd57e48bc2ce97974d5637477fb4deea137d0e312c.png b/docs/applications/images/2854aee557a74079a82dd5cd57e48bc2ce97974d5637477fb4deea137d0e312c.png new file mode 100644 index 0000000000..1039b1ac02 Binary files /dev/null and b/docs/applications/images/2854aee557a74079a82dd5cd57e48bc2ce97974d5637477fb4deea137d0e312c.png differ diff --git a/docs/applications/images/2aff41ee8fce4e9bac8295cc00720217bde2aeee7ee7473689848bed0b6fde05.jpeg b/docs/applications/images/2aff41ee8fce4e9bac8295cc00720217bde2aeee7ee7473689848bed0b6fde05.jpeg new file mode 100644 index 0000000000..94de02d894 Binary files /dev/null and b/docs/applications/images/2aff41ee8fce4e9bac8295cc00720217bde2aeee7ee7473689848bed0b6fde05.jpeg differ diff --git a/docs/applications/images/2e45f297c9d44ca5b8718ae100a365f7348eaeed4cb8495b904f28a9c8075d8a.jpeg b/docs/applications/images/2e45f297c9d44ca5b8718ae100a365f7348eaeed4cb8495b904f28a9c8075d8a.jpeg new file mode 100644 index 0000000000..afcd1eb3ab Binary files /dev/null and b/docs/applications/images/2e45f297c9d44ca5b8718ae100a365f7348eaeed4cb8495b904f28a9c8075d8a.jpeg differ diff --git a/docs/applications/images/31e3dbee31d441d2a36d45b5af660e832dfa2f437f4d49a1914312a15b6a29a7.jpeg b/docs/applications/images/31e3dbee31d441d2a36d45b5af660e832dfa2f437f4d49a1914312a15b6a29a7.jpeg new file mode 100644 index 0000000000..91aa63ad3e Binary files /dev/null and b/docs/applications/images/31e3dbee31d441d2a36d45b5af660e832dfa2f437f4d49a1914312a15b6a29a7.jpeg differ diff --git a/docs/applications/images/3277b750159f4b68b2b58506bfec9005d49aeb5fb1d9411e83f96f9ff7eb66a5.png b/docs/applications/images/3277b750159f4b68b2b58506bfec9005d49aeb5fb1d9411e83f96f9ff7eb66a5.png new file mode 100644 index 0000000000..15e0467c03 Binary files /dev/null and b/docs/applications/images/3277b750159f4b68b2b58506bfec9005d49aeb5fb1d9411e83f96f9ff7eb66a5.png differ diff --git a/docs/applications/images/37206ea48a244212ae7a821d50d1fd51faf3d7fe97ac47a29f04dfcbb377b019.png b/docs/applications/images/37206ea48a244212ae7a821d50d1fd51faf3d7fe97ac47a29f04dfcbb377b019.png new file mode 100644 index 0000000000..f0a50b47ee Binary files /dev/null and b/docs/applications/images/37206ea48a244212ae7a821d50d1fd51faf3d7fe97ac47a29f04dfcbb377b019.png differ diff --git a/docs/applications/images/39ff30e0ab0442579712255e6a9ea6b5271169c98e624e6eb2b8781f003bfea0.png b/docs/applications/images/39ff30e0ab0442579712255e6a9ea6b5271169c98e624e6eb2b8781f003bfea0.png new file mode 100644 index 
0000000000..936dee3a54 Binary files /dev/null and b/docs/applications/images/39ff30e0ab0442579712255e6a9ea6b5271169c98e624e6eb2b8781f003bfea0.png differ diff --git a/docs/applications/images/3bce057a8e0c40a0acbd26b2e29e4e2590a31bc412764be7b9e49799c69cb91c.jpg b/docs/applications/images/3bce057a8e0c40a0acbd26b2e29e4e2590a31bc412764be7b9e49799c69cb91c.jpg new file mode 100644 index 0000000000..d1e60cf7f4 Binary files /dev/null and b/docs/applications/images/3bce057a8e0c40a0acbd26b2e29e4e2590a31bc412764be7b9e49799c69cb91c.jpg differ diff --git a/docs/applications/images/3d762970e2184177a2c633695a31029332a4cd805631430ea797309492e45402.jpeg b/docs/applications/images/3d762970e2184177a2c633695a31029332a4cd805631430ea797309492e45402.jpeg new file mode 100644 index 0000000000..8124a50222 Binary files /dev/null and b/docs/applications/images/3d762970e2184177a2c633695a31029332a4cd805631430ea797309492e45402.jpeg differ diff --git a/docs/applications/images/3dc7f69fac174cde96b9d08b5e2353a1d88dc63e7be9410894c0783660b35b76.jpeg b/docs/applications/images/3dc7f69fac174cde96b9d08b5e2353a1d88dc63e7be9410894c0783660b35b76.jpeg new file mode 100644 index 0000000000..9d134f5da4 Binary files /dev/null and b/docs/applications/images/3dc7f69fac174cde96b9d08b5e2353a1d88dc63e7be9410894c0783660b35b76.jpeg differ diff --git a/docs/applications/images/3de0d475c69746d0a184029001ef07c85fd68816d66d4beaa10e6ef60030f9b4.jpeg b/docs/applications/images/3de0d475c69746d0a184029001ef07c85fd68816d66d4beaa10e6ef60030f9b4.jpeg new file mode 100644 index 0000000000..825c2c8e28 Binary files /dev/null and b/docs/applications/images/3de0d475c69746d0a184029001ef07c85fd68816d66d4beaa10e6ef60030f9b4.jpeg differ diff --git a/docs/applications/images/42d2188d3d6b498880952e12c3ceae1efabf135f8d9f4c31823f09ebe02ba9d2.jpeg b/docs/applications/images/42d2188d3d6b498880952e12c3ceae1efabf135f8d9f4c31823f09ebe02ba9d2.jpeg new file mode 100644 index 0000000000..bf72ad21c0 Binary files /dev/null and b/docs/applications/images/42d2188d3d6b498880952e12c3ceae1efabf135f8d9f4c31823f09ebe02ba9d2.jpeg differ diff --git a/docs/applications/images/456ae2acb27d4a94896c478812aee0bc3551c703d7bd40c9be4dc983c7b3fc8a.png b/docs/applications/images/456ae2acb27d4a94896c478812aee0bc3551c703d7bd40c9be4dc983c7b3fc8a.png new file mode 100644 index 0000000000..5d6ad7b223 Binary files /dev/null and b/docs/applications/images/456ae2acb27d4a94896c478812aee0bc3551c703d7bd40c9be4dc983c7b3fc8a.png differ diff --git a/docs/applications/images/45f288ce8b2c45d8aa5407785b4b40f4876fc3da23744bd7a78060797fba0190.jpeg b/docs/applications/images/45f288ce8b2c45d8aa5407785b4b40f4876fc3da23744bd7a78060797fba0190.jpeg new file mode 100644 index 0000000000..1737e0eaeb Binary files /dev/null and b/docs/applications/images/45f288ce8b2c45d8aa5407785b4b40f4876fc3da23744bd7a78060797fba0190.jpeg differ diff --git a/docs/applications/images/46258d0dc9dc40bab3ea0e70434e4a905646df8a647f4c49921e217de5142def.jpeg b/docs/applications/images/46258d0dc9dc40bab3ea0e70434e4a905646df8a647f4c49921e217de5142def.jpeg new file mode 100644 index 0000000000..4491dd4b72 Binary files /dev/null and b/docs/applications/images/46258d0dc9dc40bab3ea0e70434e4a905646df8a647f4c49921e217de5142def.jpeg differ diff --git a/docs/applications/images/498119182f0a414ab86ae2de752fa31c9ddc3a74a76847049cc57884602cb269-20240704185744623.png b/docs/applications/images/498119182f0a414ab86ae2de752fa31c9ddc3a74a76847049cc57884602cb269-20240704185744623.png new file mode 100644 index 0000000000..201198ae32 Binary files /dev/null and 
b/docs/applications/images/498119182f0a414ab86ae2de752fa31c9ddc3a74a76847049cc57884602cb269-20240704185744623.png differ diff --git a/docs/applications/images/498119182f0a414ab86ae2de752fa31c9ddc3a74a76847049cc57884602cb269.png b/docs/applications/images/498119182f0a414ab86ae2de752fa31c9ddc3a74a76847049cc57884602cb269.png new file mode 100644 index 0000000000..201198ae32 Binary files /dev/null and b/docs/applications/images/498119182f0a414ab86ae2de752fa31c9ddc3a74a76847049cc57884602cb269.png differ diff --git a/docs/applications/images/4de19ca3e54343e88961e816cad28bbacdc807f40b9440be914d871b0a914570.jpeg b/docs/applications/images/4de19ca3e54343e88961e816cad28bbacdc807f40b9440be914d871b0a914570.jpeg new file mode 100644 index 0000000000..1c76ef2f97 Binary files /dev/null and b/docs/applications/images/4de19ca3e54343e88961e816cad28bbacdc807f40b9440be914d871b0a914570.jpeg differ diff --git a/docs/applications/images/4f8f5533a2914e0a821f4a639677843c32ec1f08a1b1488d94c0b8bfb6e72d2d.jpeg b/docs/applications/images/4f8f5533a2914e0a821f4a639677843c32ec1f08a1b1488d94c0b8bfb6e72d2d.jpeg new file mode 100644 index 0000000000..2e72a9831f Binary files /dev/null and b/docs/applications/images/4f8f5533a2914e0a821f4a639677843c32ec1f08a1b1488d94c0b8bfb6e72d2d.jpeg differ diff --git a/docs/applications/images/50a49a3c9f8348bfa04e8c8b97d3cce0d0dd6b14040f43939268d120688ef7ca.jpeg b/docs/applications/images/50a49a3c9f8348bfa04e8c8b97d3cce0d0dd6b14040f43939268d120688ef7ca.jpeg new file mode 100644 index 0000000000..0e90e371dd Binary files /dev/null and b/docs/applications/images/50a49a3c9f8348bfa04e8c8b97d3cce0d0dd6b14040f43939268d120688ef7ca.jpeg differ diff --git a/docs/applications/images/54f3053e6e1b47a39b26e757006fe2c44910d60a3809422ab76c25396b92e69b-0096905.png b/docs/applications/images/54f3053e6e1b47a39b26e757006fe2c44910d60a3809422ab76c25396b92e69b-0096905.png new file mode 100644 index 0000000000..a1c79b4da8 Binary files /dev/null and b/docs/applications/images/54f3053e6e1b47a39b26e757006fe2c44910d60a3809422ab76c25396b92e69b-0096905.png differ diff --git a/docs/applications/images/54f3053e6e1b47a39b26e757006fe2c44910d60a3809422ab76c25396b92e69b.png b/docs/applications/images/54f3053e6e1b47a39b26e757006fe2c44910d60a3809422ab76c25396b92e69b.png new file mode 100644 index 0000000000..a1c79b4da8 Binary files /dev/null and b/docs/applications/images/54f3053e6e1b47a39b26e757006fe2c44910d60a3809422ab76c25396b92e69b.png differ diff --git a/docs/applications/images/560c44b8dd604da7987bd25da0a882156ffcfb7f6bcb44108fe9bde77512e572.jpeg b/docs/applications/images/560c44b8dd604da7987bd25da0a882156ffcfb7f6bcb44108fe9bde77512e572.jpeg new file mode 100644 index 0000000000..3e060015ba Binary files /dev/null and b/docs/applications/images/560c44b8dd604da7987bd25da0a882156ffcfb7f6bcb44108fe9bde77512e572.jpeg differ diff --git a/docs/applications/images/5939ae15a1f0445aaeec15c68107dbd897740a1ddd284bf8b583bb6242099157.jpeg b/docs/applications/images/5939ae15a1f0445aaeec15c68107dbd897740a1ddd284bf8b583bb6242099157.jpeg new file mode 100644 index 0000000000..030ab548f6 Binary files /dev/null and b/docs/applications/images/5939ae15a1f0445aaeec15c68107dbd897740a1ddd284bf8b583bb6242099157.jpeg differ diff --git a/docs/applications/images/59ab0411c8eb4dfd917fb2b6e5b69a17ee7ca48351444aec9ac6104b79ff1028.jpg b/docs/applications/images/59ab0411c8eb4dfd917fb2b6e5b69a17ee7ca48351444aec9ac6104b79ff1028.jpg new file mode 100644 index 0000000000..bb5f304b8c Binary files /dev/null and 
b/docs/applications/images/59ab0411c8eb4dfd917fb2b6e5b69a17ee7ca48351444aec9ac6104b79ff1028.jpg differ diff --git a/docs/applications/images/5a75137c5f924dfeb6956b5818812298cc3dc7992ac84954b4175be9adf83c77.jpeg b/docs/applications/images/5a75137c5f924dfeb6956b5818812298cc3dc7992ac84954b4175be9adf83c77.jpeg new file mode 100644 index 0000000000..9858df6571 Binary files /dev/null and b/docs/applications/images/5a75137c5f924dfeb6956b5818812298cc3dc7992ac84954b4175be9adf83c77.jpeg differ diff --git a/docs/applications/images/5df160ac39ee4d9e92a937094bc53a737272f9f2abeb4ddfaebb48e8eccf1be2.jpeg b/docs/applications/images/5df160ac39ee4d9e92a937094bc53a737272f9f2abeb4ddfaebb48e8eccf1be2.jpeg new file mode 100644 index 0000000000..ebb6feeb40 Binary files /dev/null and b/docs/applications/images/5df160ac39ee4d9e92a937094bc53a737272f9f2abeb4ddfaebb48e8eccf1be2.jpeg differ diff --git a/docs/applications/images/5ffff2093a144a6993a75eef71634a52276015ee43a04566b9c89d353198c746.jpeg b/docs/applications/images/5ffff2093a144a6993a75eef71634a52276015ee43a04566b9c89d353198c746.jpeg new file mode 100644 index 0000000000..58aa66ff99 Binary files /dev/null and b/docs/applications/images/5ffff2093a144a6993a75eef71634a52276015ee43a04566b9c89d353198c746.jpeg differ diff --git a/docs/applications/images/60b95b4945954f81a080a8f308cee66f83146479cd1142b9b6b1290938fd1df8.jpeg b/docs/applications/images/60b95b4945954f81a080a8f308cee66f83146479cd1142b9b6b1290938fd1df8.jpeg new file mode 100644 index 0000000000..b2ebcd9651 Binary files /dev/null and b/docs/applications/images/60b95b4945954f81a080a8f308cee66f83146479cd1142b9b6b1290938fd1df8.jpeg differ diff --git a/docs/applications/images/6afdbb77e8db4aef9b169e4e94c5d90a9764cfab4f2c4c04aa9afdf4f54d7680.jpeg b/docs/applications/images/6afdbb77e8db4aef9b169e4e94c5d90a9764cfab4f2c4c04aa9afdf4f54d7680.jpeg new file mode 100644 index 0000000000..20e7a228ca Binary files /dev/null and b/docs/applications/images/6afdbb77e8db4aef9b169e4e94c5d90a9764cfab4f2c4c04aa9afdf4f54d7680.jpeg differ diff --git a/docs/applications/images/6f875b6e695e4fe5aedf427beb0d4ce8064ad7cc33c44faaad59d3eb9732639d.jpeg b/docs/applications/images/6f875b6e695e4fe5aedf427beb0d4ce8064ad7cc33c44faaad59d3eb9732639d.jpeg new file mode 100644 index 0000000000..6783c686aa Binary files /dev/null and b/docs/applications/images/6f875b6e695e4fe5aedf427beb0d4ce8064ad7cc33c44faaad59d3eb9732639d.jpeg differ diff --git a/docs/applications/images/75b0e977dfb74a83851f8828460759f337b1b7a0c33c47a08a30f3570e1e2e74.jpeg b/docs/applications/images/75b0e977dfb74a83851f8828460759f337b1b7a0c33c47a08a30f3570e1e2e74.jpeg new file mode 100644 index 0000000000..00fb5f3f2a Binary files /dev/null and b/docs/applications/images/75b0e977dfb74a83851f8828460759f337b1b7a0c33c47a08a30f3570e1e2e74.jpeg differ diff --git a/docs/applications/images/76b6a0939c2c4cf49039b6563c4b28e241e11285d7464e799e81c58c0f7707a7-20240704185943337.png b/docs/applications/images/76b6a0939c2c4cf49039b6563c4b28e241e11285d7464e799e81c58c0f7707a7-20240704185943337.png new file mode 100644 index 0000000000..87c7515b86 Binary files /dev/null and b/docs/applications/images/76b6a0939c2c4cf49039b6563c4b28e241e11285d7464e799e81c58c0f7707a7-20240704185943337.png differ diff --git a/docs/applications/images/76b6a0939c2c4cf49039b6563c4b28e241e11285d7464e799e81c58c0f7707a7.png b/docs/applications/images/76b6a0939c2c4cf49039b6563c4b28e241e11285d7464e799e81c58c0f7707a7.png new file mode 100644 index 0000000000..87c7515b86 Binary files /dev/null and 
b/docs/applications/images/76b6a0939c2c4cf49039b6563c4b28e241e11285d7464e799e81c58c0f7707a7.png differ diff --git a/docs/applications/images/7a8865b2836f42d382e7c3fdaedc4d307d797fa2bcd0466e9f8b7705efff5a7b-20240708094343198.png b/docs/applications/images/7a8865b2836f42d382e7c3fdaedc4d307d797fa2bcd0466e9f8b7705efff5a7b-20240708094343198.png new file mode 100644 index 0000000000..72cfec34cf Binary files /dev/null and b/docs/applications/images/7a8865b2836f42d382e7c3fdaedc4d307d797fa2bcd0466e9f8b7705efff5a7b-20240708094343198.png differ diff --git a/docs/applications/images/7a8865b2836f42d382e7c3fdaedc4d307d797fa2bcd0466e9f8b7705efff5a7b.png b/docs/applications/images/7a8865b2836f42d382e7c3fdaedc4d307d797fa2bcd0466e9f8b7705efff5a7b.png new file mode 100644 index 0000000000..72cfec34cf Binary files /dev/null and b/docs/applications/images/7a8865b2836f42d382e7c3fdaedc4d307d797fa2bcd0466e9f8b7705efff5a7b.png differ diff --git a/docs/applications/images/7d5774a273f84efba5b9ce7fd3f86e9ef24b6473e046444db69fa3ca20ac0986-0348339.png b/docs/applications/images/7d5774a273f84efba5b9ce7fd3f86e9ef24b6473e046444db69fa3ca20ac0986-0348339.png new file mode 100644 index 0000000000..88c8dda981 Binary files /dev/null and b/docs/applications/images/7d5774a273f84efba5b9ce7fd3f86e9ef24b6473e046444db69fa3ca20ac0986-0348339.png differ diff --git a/docs/applications/images/7d5774a273f84efba5b9ce7fd3f86e9ef24b6473e046444db69fa3ca20ac0986.png b/docs/applications/images/7d5774a273f84efba5b9ce7fd3f86e9ef24b6473e046444db69fa3ca20ac0986.png new file mode 100644 index 0000000000..88c8dda981 Binary files /dev/null and b/docs/applications/images/7d5774a273f84efba5b9ce7fd3f86e9ef24b6473e046444db69fa3ca20ac0986.png differ diff --git a/docs/applications/images/864604967256461aa7c5d32cd240645e9f4c70af773341d5911f22d5a3e87b5f.jpeg b/docs/applications/images/864604967256461aa7c5d32cd240645e9f4c70af773341d5911f22d5a3e87b5f.jpeg new file mode 100644 index 0000000000..60e6db5ba0 Binary files /dev/null and b/docs/applications/images/864604967256461aa7c5d32cd240645e9f4c70af773341d5911f22d5a3e87b5f.jpeg differ diff --git a/docs/applications/images/89ba046177864d8783ced6cb31ba92a66ca2169856a44ee59ac2bb18e44a6c4b.jpeg b/docs/applications/images/89ba046177864d8783ced6cb31ba92a66ca2169856a44ee59ac2bb18e44a6c4b.jpeg new file mode 100644 index 0000000000..13581d6701 Binary files /dev/null and b/docs/applications/images/89ba046177864d8783ced6cb31ba92a66ca2169856a44ee59ac2bb18e44a6c4b.jpeg differ diff --git a/docs/applications/images/89f42eccd600439fa9e28c97ccb663726e4e54ce3a854825b4c3b7d554ea21df.jpeg b/docs/applications/images/89f42eccd600439fa9e28c97ccb663726e4e54ce3a854825b4c3b7d554ea21df.jpeg new file mode 100644 index 0000000000..dd40242afd Binary files /dev/null and b/docs/applications/images/89f42eccd600439fa9e28c97ccb663726e4e54ce3a854825b4c3b7d554ea21df.jpeg differ diff --git a/docs/applications/images/8bb381f164c54ea9b4043cf66fc92ffdea8aaf851bab484fa6e19bd2f93f154f.jpeg b/docs/applications/images/8bb381f164c54ea9b4043cf66fc92ffdea8aaf851bab484fa6e19bd2f93f154f.jpeg new file mode 100644 index 0000000000..42b598e006 Binary files /dev/null and b/docs/applications/images/8bb381f164c54ea9b4043cf66fc92ffdea8aaf851bab484fa6e19bd2f93f154f.jpeg differ diff --git a/docs/applications/images/8d1022ac25d9474daa4fb236235bd58760039d58ad46414f841559d68e0d057f.jpeg b/docs/applications/images/8d1022ac25d9474daa4fb236235bd58760039d58ad46414f841559d68e0d057f.jpeg new file mode 100644 index 0000000000..459b443624 Binary files /dev/null and 
b/docs/applications/images/8d1022ac25d9474daa4fb236235bd58760039d58ad46414f841559d68e0d057f.jpeg differ diff --git a/docs/applications/images/8dca91f016884e16ad9216d416da72ea08190f97d87b4be883f15079b7ebab9a-20240707183252997.jpeg b/docs/applications/images/8dca91f016884e16ad9216d416da72ea08190f97d87b4be883f15079b7ebab9a-20240707183252997.jpeg new file mode 100644 index 0000000000..d435a27307 Binary files /dev/null and b/docs/applications/images/8dca91f016884e16ad9216d416da72ea08190f97d87b4be883f15079b7ebab9a-20240707183252997.jpeg differ diff --git a/docs/applications/images/8dca91f016884e16ad9216d416da72ea08190f97d87b4be883f15079b7ebab9a.jpeg b/docs/applications/images/8dca91f016884e16ad9216d416da72ea08190f97d87b4be883f15079b7ebab9a.jpeg new file mode 100644 index 0000000000..d435a27307 Binary files /dev/null and b/docs/applications/images/8dca91f016884e16ad9216d416da72ea08190f97d87b4be883f15079b7ebab9a.jpeg differ diff --git a/docs/applications/images/901ab741cb46441ebec510b37e63b9d8d1b7c95f63cc4e5e8757f35179ae6373-20240704185855034.png b/docs/applications/images/901ab741cb46441ebec510b37e63b9d8d1b7c95f63cc4e5e8757f35179ae6373-20240704185855034.png new file mode 100644 index 0000000000..6a9305c62d Binary files /dev/null and b/docs/applications/images/901ab741cb46441ebec510b37e63b9d8d1b7c95f63cc4e5e8757f35179ae6373-20240704185855034.png differ diff --git a/docs/applications/images/901ab741cb46441ebec510b37e63b9d8d1b7c95f63cc4e5e8757f35179ae6373.png b/docs/applications/images/901ab741cb46441ebec510b37e63b9d8d1b7c95f63cc4e5e8757f35179ae6373.png new file mode 100644 index 0000000000..6a9305c62d Binary files /dev/null and b/docs/applications/images/901ab741cb46441ebec510b37e63b9d8d1b7c95f63cc4e5e8757f35179ae6373.png differ diff --git a/docs/applications/images/93c66a43a69e472899c1c6732408b7a42e99a43721e94e9ca3c0a64e080306e4.jpeg b/docs/applications/images/93c66a43a69e472899c1c6732408b7a42e99a43721e94e9ca3c0a64e080306e4.jpeg new file mode 100644 index 0000000000..c69632fc27 Binary files /dev/null and b/docs/applications/images/93c66a43a69e472899c1c6732408b7a42e99a43721e94e9ca3c0a64e080306e4.jpeg differ diff --git a/docs/applications/images/95d8e95bf1ab476987f2519c0f8f0c60a0cdc2c444804ed6ab08f2f7ab054880-0096678.png b/docs/applications/images/95d8e95bf1ab476987f2519c0f8f0c60a0cdc2c444804ed6ab08f2f7ab054880-0096678.png new file mode 100644 index 0000000000..db1d6bd51e Binary files /dev/null and b/docs/applications/images/95d8e95bf1ab476987f2519c0f8f0c60a0cdc2c444804ed6ab08f2f7ab054880-0096678.png differ diff --git a/docs/applications/images/95d8e95bf1ab476987f2519c0f8f0c60a0cdc2c444804ed6ab08f2f7ab054880.png b/docs/applications/images/95d8e95bf1ab476987f2519c0f8f0c60a0cdc2c444804ed6ab08f2f7ab054880.png new file mode 100644 index 0000000000..db1d6bd51e Binary files /dev/null and b/docs/applications/images/95d8e95bf1ab476987f2519c0f8f0c60a0cdc2c444804ed6ab08f2f7ab054880.png differ diff --git a/docs/applications/images/965db9f758614c6f9be301286cd5918f21110603c8aa4a1dbf5371e3afeec782.jpeg b/docs/applications/images/965db9f758614c6f9be301286cd5918f21110603c8aa4a1dbf5371e3afeec782.jpeg new file mode 100644 index 0000000000..fc2029f01f Binary files /dev/null and b/docs/applications/images/965db9f758614c6f9be301286cd5918f21110603c8aa4a1dbf5371e3afeec782.jpeg differ diff --git a/docs/applications/images/981640e17d05487e961162f8576c9e11634ca157f79048d4bd9d3bc21722afe8-20240704185952731.jpeg b/docs/applications/images/981640e17d05487e961162f8576c9e11634ca157f79048d4bd9d3bc21722afe8-20240704185952731.jpeg 
new file mode 100644 index 0000000000..067cd277b2 Binary files /dev/null and b/docs/applications/images/981640e17d05487e961162f8576c9e11634ca157f79048d4bd9d3bc21722afe8-20240704185952731.jpeg differ diff --git a/docs/applications/images/981640e17d05487e961162f8576c9e11634ca157f79048d4bd9d3bc21722afe8.jpeg b/docs/applications/images/981640e17d05487e961162f8576c9e11634ca157f79048d4bd9d3bc21722afe8.jpeg new file mode 100644 index 0000000000..067cd277b2 Binary files /dev/null and b/docs/applications/images/981640e17d05487e961162f8576c9e11634ca157f79048d4bd9d3bc21722afe8.jpeg differ diff --git a/docs/applications/images/9a709f19e7174725a8cfb09fd922ade74f8e9eb73ae1438596cbb2facef9c24a.jpeg b/docs/applications/images/9a709f19e7174725a8cfb09fd922ade74f8e9eb73ae1438596cbb2facef9c24a.jpeg new file mode 100644 index 0000000000..3fe74c42d5 Binary files /dev/null and b/docs/applications/images/9a709f19e7174725a8cfb09fd922ade74f8e9eb73ae1438596cbb2facef9c24a.jpeg differ diff --git a/docs/applications/images/9a7a4e19edc24310b46620f2ee7430f918223b93d4f14a15a52973c096926bad.jpeg b/docs/applications/images/9a7a4e19edc24310b46620f2ee7430f918223b93d4f14a15a52973c096926bad.jpeg new file mode 100644 index 0000000000..0b3f6ceaee Binary files /dev/null and b/docs/applications/images/9a7a4e19edc24310b46620f2ee7430f918223b93d4f14a15a52973c096926bad.jpeg differ diff --git a/docs/applications/images/9bd844b970f94e5ba0bc0c5799bd819ea9b1861bb306471fabc2d628864d418e.jpeg b/docs/applications/images/9bd844b970f94e5ba0bc0c5799bd819ea9b1861bb306471fabc2d628864d418e.jpeg new file mode 100644 index 0000000000..8c3c4d7f36 Binary files /dev/null and b/docs/applications/images/9bd844b970f94e5ba0bc0c5799bd819ea9b1861bb306471fabc2d628864d418e.jpeg differ diff --git a/docs/applications/images/9f45d3eef75e4842a0828bb9e518c2438300264aec0646cc9addfce860a04196.png b/docs/applications/images/9f45d3eef75e4842a0828bb9e518c2438300264aec0646cc9addfce860a04196.png new file mode 100644 index 0000000000..f62001d2ed Binary files /dev/null and b/docs/applications/images/9f45d3eef75e4842a0828bb9e518c2438300264aec0646cc9addfce860a04196.png differ diff --git a/docs/applications/images/9fc78bbcdf754898b9b2c7f000ddf562afac786482ab4f2ab063e2242faa542a.jpeg b/docs/applications/images/9fc78bbcdf754898b9b2c7f000ddf562afac786482ab4f2ab063e2242faa542a.jpeg new file mode 100644 index 0000000000..1db776bcde Binary files /dev/null and b/docs/applications/images/9fc78bbcdf754898b9b2c7f000ddf562afac786482ab4f2ab063e2242faa542a.jpeg differ diff --git a/docs/applications/images/a3b25766f3074d2facdf88d4a60fc76612f51992fd124cf5bd846b213130665b-0097611.jpeg b/docs/applications/images/a3b25766f3074d2facdf88d4a60fc76612f51992fd124cf5bd846b213130665b-0097611.jpeg new file mode 100644 index 0000000000..ff0a282351 Binary files /dev/null and b/docs/applications/images/a3b25766f3074d2facdf88d4a60fc76612f51992fd124cf5bd846b213130665b-0097611.jpeg differ diff --git a/docs/applications/images/a3b25766f3074d2facdf88d4a60fc76612f51992fd124cf5bd846b213130665b.jpeg b/docs/applications/images/a3b25766f3074d2facdf88d4a60fc76612f51992fd124cf5bd846b213130665b.jpeg new file mode 100644 index 0000000000..ff0a282351 Binary files /dev/null and b/docs/applications/images/a3b25766f3074d2facdf88d4a60fc76612f51992fd124cf5bd846b213130665b.jpeg differ diff --git a/docs/applications/images/a5973a8ddeff4bd7ac082f02dc4d0c79de21e721b41641cbb831f23c2cb8fce2.jpeg b/docs/applications/images/a5973a8ddeff4bd7ac082f02dc4d0c79de21e721b41641cbb831f23c2cb8fce2.jpeg new file mode 100644 index 
0000000000..3aabbb6c87 Binary files /dev/null and b/docs/applications/images/a5973a8ddeff4bd7ac082f02dc4d0c79de21e721b41641cbb831f23c2cb8fce2.jpeg differ diff --git a/docs/applications/images/a73180425fa14f919ce52d9bf70246c3995acea1831843cca6c17d871b8f5d95.jpeg b/docs/applications/images/a73180425fa14f919ce52d9bf70246c3995acea1831843cca6c17d871b8f5d95.jpeg new file mode 100644 index 0000000000..3a13c7d35f Binary files /dev/null and b/docs/applications/images/a73180425fa14f919ce52d9bf70246c3995acea1831843cca6c17d871b8f5d95.jpeg differ diff --git a/docs/applications/images/ab93d3d90d77437a81c9534b2dd1d3e39ef81e8473054fd3aeff6e837ebfb827.jpeg b/docs/applications/images/ab93d3d90d77437a81c9534b2dd1d3e39ef81e8473054fd3aeff6e837ebfb827.jpeg new file mode 100644 index 0000000000..ddcf375813 Binary files /dev/null and b/docs/applications/images/ab93d3d90d77437a81c9534b2dd1d3e39ef81e8473054fd3aeff6e837ebfb827.jpeg differ diff --git a/docs/applications/images/ad7c02745491498d82e0ce95f4a274f9b3920b2f467646858709359b7af9d869.png b/docs/applications/images/ad7c02745491498d82e0ce95f4a274f9b3920b2f467646858709359b7af9d869.png new file mode 100644 index 0000000000..178add3e81 Binary files /dev/null and b/docs/applications/images/ad7c02745491498d82e0ce95f4a274f9b3920b2f467646858709359b7af9d869.png differ diff --git a/docs/applications/images/b7230e9964074181837e1132029f9da8178bf564ac5c43a9a93a30e975c0d8b4.jpeg b/docs/applications/images/b7230e9964074181837e1132029f9da8178bf564ac5c43a9a93a30e975c0d8b4.jpeg new file mode 100644 index 0000000000..26bed005ba Binary files /dev/null and b/docs/applications/images/b7230e9964074181837e1132029f9da8178bf564ac5c43a9a93a30e975c0d8b4.jpeg differ diff --git a/docs/applications/images/bab32d32bdec4339b9a3e5f911e4b41f77996f3faabc40bd8309b5b20cad31e4.jpeg b/docs/applications/images/bab32d32bdec4339b9a3e5f911e4b41f77996f3faabc40bd8309b5b20cad31e4.jpeg new file mode 100644 index 0000000000..11e5cae380 Binary files /dev/null and b/docs/applications/images/bab32d32bdec4339b9a3e5f911e4b41f77996f3faabc40bd8309b5b20cad31e4.jpeg differ diff --git a/docs/applications/images/bb7a345687814a3d83a29790f2a2b7d081495b3a920b43988c93da6039cad653.jpeg b/docs/applications/images/bb7a345687814a3d83a29790f2a2b7d081495b3a920b43988c93da6039cad653.jpeg new file mode 100644 index 0000000000..628dd9acdc Binary files /dev/null and b/docs/applications/images/bb7a345687814a3d83a29790f2a2b7d081495b3a920b43988c93da6039cad653.jpeg differ diff --git a/docs/applications/images/c07c88f708ad43cc8cd615861626d0e8333c0e3d4dda49ac8cba1f8939fa8a94.jpeg b/docs/applications/images/c07c88f708ad43cc8cd615861626d0e8333c0e3d4dda49ac8cba1f8939fa8a94.jpeg new file mode 100644 index 0000000000..ec071d00dd Binary files /dev/null and b/docs/applications/images/c07c88f708ad43cc8cd615861626d0e8333c0e3d4dda49ac8cba1f8939fa8a94.jpeg differ diff --git a/docs/applications/images/c1a7d197847a4f168848c59b8e625d1d5e8066b778144395a8b9382bb85dc364.jpeg b/docs/applications/images/c1a7d197847a4f168848c59b8e625d1d5e8066b778144395a8b9382bb85dc364.jpeg new file mode 100644 index 0000000000..c93d380cf2 Binary files /dev/null and b/docs/applications/images/c1a7d197847a4f168848c59b8e625d1d5e8066b778144395a8b9382bb85dc364.jpeg differ diff --git a/docs/applications/images/c306b2f028364805a55494d435ab553a76cf5ae5dd3f4649a948ea9aeaeb28b8.png b/docs/applications/images/c306b2f028364805a55494d435ab553a76cf5ae5dd3f4649a948ea9aeaeb28b8.png new file mode 100644 index 0000000000..ccb5c8b21f Binary files /dev/null and 
b/docs/applications/images/c306b2f028364805a55494d435ab553a76cf5ae5dd3f4649a948ea9aeaeb28b8.png differ diff --git a/docs/applications/images/c570f343c29846c792da56ebaca16c50708477514dd048cea8bef37ffa85d03f.jpeg b/docs/applications/images/c570f343c29846c792da56ebaca16c50708477514dd048cea8bef37ffa85d03f.jpeg new file mode 100644 index 0000000000..b0e78bdd32 Binary files /dev/null and b/docs/applications/images/c570f343c29846c792da56ebaca16c50708477514dd048cea8bef37ffa85d03f.jpeg differ diff --git a/docs/applications/images/c7fc5e631dd44bc8b714630f4e49d9155a831d9e56c64e2482ded87081d0db22.jpeg b/docs/applications/images/c7fc5e631dd44bc8b714630f4e49d9155a831d9e56c64e2482ded87081d0db22.jpeg new file mode 100644 index 0000000000..efeed96302 Binary files /dev/null and b/docs/applications/images/c7fc5e631dd44bc8b714630f4e49d9155a831d9e56c64e2482ded87081d0db22.jpeg differ diff --git a/docs/applications/images/cbda3390cb994f98a3c8a9ba88c90c348497763f6c9f4b4797f7d63d84da5f63.jpeg b/docs/applications/images/cbda3390cb994f98a3c8a9ba88c90c348497763f6c9f4b4797f7d63d84da5f63.jpeg new file mode 100644 index 0000000000..8c3b594637 Binary files /dev/null and b/docs/applications/images/cbda3390cb994f98a3c8a9ba88c90c348497763f6c9f4b4797f7d63d84da5f63.jpeg differ diff --git a/docs/applications/images/char_spacing_compact.jpg b/docs/applications/images/char_spacing_compact.jpg new file mode 100644 index 0000000000..7355792851 Binary files /dev/null and b/docs/applications/images/char_spacing_compact.jpg differ diff --git a/docs/applications/images/d1e7780f0c7745ada4be540decefd6288e4d59257d8141f6842682a4c05d28b6.jpg b/docs/applications/images/d1e7780f0c7745ada4be540decefd6288e4d59257d8141f6842682a4c05d28b6.jpg new file mode 100644 index 0000000000..172b496d83 Binary files /dev/null and b/docs/applications/images/d1e7780f0c7745ada4be540decefd6288e4d59257d8141f6842682a4c05d28b6.jpg differ diff --git a/docs/applications/images/d445cf4d850e4063b9a7fc6a075c12204cf912ff23ec471fa2e268b661b3d693.jpeg b/docs/applications/images/d445cf4d850e4063b9a7fc6a075c12204cf912ff23ec471fa2e268b661b3d693.jpeg new file mode 100644 index 0000000000..a0db62f345 Binary files /dev/null and b/docs/applications/images/d445cf4d850e4063b9a7fc6a075c12204cf912ff23ec471fa2e268b661b3d693.jpeg differ diff --git a/docs/applications/images/d4f5344b5b854d50be738671598a89a45689c6704c4d481fb904dd7cf72f2a1a-20240704185905678.jpg b/docs/applications/images/d4f5344b5b854d50be738671598a89a45689c6704c4d481fb904dd7cf72f2a1a-20240704185905678.jpg new file mode 100644 index 0000000000..2f48d60123 Binary files /dev/null and b/docs/applications/images/d4f5344b5b854d50be738671598a89a45689c6704c4d481fb904dd7cf72f2a1a-20240704185905678.jpg differ diff --git a/docs/applications/images/d4f5344b5b854d50be738671598a89a45689c6704c4d481fb904dd7cf72f2a1a.png b/docs/applications/images/d4f5344b5b854d50be738671598a89a45689c6704c4d481fb904dd7cf72f2a1a.png new file mode 100644 index 0000000000..c8a2989e22 Binary files /dev/null and b/docs/applications/images/d4f5344b5b854d50be738671598a89a45689c6704c4d481fb904dd7cf72f2a1a.png differ diff --git a/docs/applications/images/d5143df967fa4364a38868793fe7c57b0c0b1213930243babd6ae01423dcbc4d.png b/docs/applications/images/d5143df967fa4364a38868793fe7c57b0c0b1213930243babd6ae01423dcbc4d.png new file mode 100644 index 0000000000..d89db69446 Binary files /dev/null and b/docs/applications/images/d5143df967fa4364a38868793fe7c57b0c0b1213930243babd6ae01423dcbc4d.png differ diff --git 
a/docs/applications/images/d686a48d465a43d09fbee51924fdca42ee21c50e676646da8559fb9967b94185.png b/docs/applications/images/d686a48d465a43d09fbee51924fdca42ee21c50e676646da8559fb9967b94185.png new file mode 100644 index 0000000000..93fdafc3b0 Binary files /dev/null and b/docs/applications/images/d686a48d465a43d09fbee51924fdca42ee21c50e676646da8559fb9967b94185.png differ diff --git a/docs/applications/images/d7f96effc2434a3ca2d4144ff33c50282b830670c892487d8d7dec151921cce7.jpeg b/docs/applications/images/d7f96effc2434a3ca2d4144ff33c50282b830670c892487d8d7dec151921cce7.jpeg new file mode 100644 index 0000000000..1871630ef1 Binary files /dev/null and b/docs/applications/images/d7f96effc2434a3ca2d4144ff33c50282b830670c892487d8d7dec151921cce7.jpeg differ diff --git a/docs/applications/images/d9e0533cc1df47ffa3bbe99de9e42639a3ebfa5bce834bafb1ca4574bf9db684-20240704185600953.jpeg b/docs/applications/images/d9e0533cc1df47ffa3bbe99de9e42639a3ebfa5bce834bafb1ca4574bf9db684-20240704185600953.jpeg new file mode 100644 index 0000000000..7e36cb6c8d Binary files /dev/null and b/docs/applications/images/d9e0533cc1df47ffa3bbe99de9e42639a3ebfa5bce834bafb1ca4574bf9db684-20240704185600953.jpeg differ diff --git a/docs/applications/images/d9e0533cc1df47ffa3bbe99de9e42639a3ebfa5bce834bafb1ca4574bf9db684.jpeg b/docs/applications/images/d9e0533cc1df47ffa3bbe99de9e42639a3ebfa5bce834bafb1ca4574bf9db684.jpeg new file mode 100644 index 0000000000..7e36cb6c8d Binary files /dev/null and b/docs/applications/images/d9e0533cc1df47ffa3bbe99de9e42639a3ebfa5bce834bafb1ca4574bf9db684.jpeg differ diff --git a/docs/applications/images/da82ae8ef8fd479aaa38e1049eb3a681cf020dc108fa458eb3ec79da53b45fd1.png b/docs/applications/images/da82ae8ef8fd479aaa38e1049eb3a681cf020dc108fa458eb3ec79da53b45fd1.png new file mode 100644 index 0000000000..f4c5e8e6da Binary files /dev/null and b/docs/applications/images/da82ae8ef8fd479aaa38e1049eb3a681cf020dc108fa458eb3ec79da53b45fd1.png differ diff --git a/docs/applications/images/dc10a070018d4d27946c26ec24a2a85bc3f16422f4964f72a9b63c6170d954e1.jpeg b/docs/applications/images/dc10a070018d4d27946c26ec24a2a85bc3f16422f4964f72a9b63c6170d954e1.jpeg new file mode 100644 index 0000000000..aedcbaa2a8 Binary files /dev/null and b/docs/applications/images/dc10a070018d4d27946c26ec24a2a85bc3f16422f4964f72a9b63c6170d954e1.jpeg differ diff --git a/docs/applications/images/dd721099bd50478f9d5fb13d8dd00fad69c22d6848244fd3a1d3980d7fefc63e.jpeg b/docs/applications/images/dd721099bd50478f9d5fb13d8dd00fad69c22d6848244fd3a1d3980d7fefc63e.jpeg new file mode 100644 index 0000000000..bcbf9c6af4 Binary files /dev/null and b/docs/applications/images/dd721099bd50478f9d5fb13d8dd00fad69c22d6848244fd3a1d3980d7fefc63e.jpeg differ diff --git a/docs/applications/images/dedab7b7fd6543aa9e7f625132b24e3ba3f200e361fa468dac615f7814dfb98d.jpeg b/docs/applications/images/dedab7b7fd6543aa9e7f625132b24e3ba3f200e361fa468dac615f7814dfb98d.jpeg new file mode 100644 index 0000000000..b82acbba3e Binary files /dev/null and b/docs/applications/images/dedab7b7fd6543aa9e7f625132b24e3ba3f200e361fa468dac615f7814dfb98d.jpeg differ diff --git a/docs/applications/images/e0dc05039c7444c5ab1260ff550a408748df8d4cfe864223adf390e51058dbd5.jpeg b/docs/applications/images/e0dc05039c7444c5ab1260ff550a408748df8d4cfe864223adf390e51058dbd5.jpeg new file mode 100644 index 0000000000..c841c4be8b Binary files /dev/null and b/docs/applications/images/e0dc05039c7444c5ab1260ff550a408748df8d4cfe864223adf390e51058dbd5.jpeg differ diff --git 
a/docs/applications/images/e1e798c87472477fa0bfca0da12bb0c180845a3e167a4761b0d26ff4330a5ccb.jpeg b/docs/applications/images/e1e798c87472477fa0bfca0da12bb0c180845a3e167a4761b0d26ff4330a5ccb.jpeg new file mode 100644 index 0000000000..bb260088f5 Binary files /dev/null and b/docs/applications/images/e1e798c87472477fa0bfca0da12bb0c180845a3e167a4761b0d26ff4330a5ccb.jpeg differ diff --git a/docs/applications/images/e61e6ba685534eda992cea30a63a9c461646040ffd0c4d208a5eebb85897dcf7-0096772.jpeg b/docs/applications/images/e61e6ba685534eda992cea30a63a9c461646040ffd0c4d208a5eebb85897dcf7-0096772.jpeg new file mode 100644 index 0000000000..e78ea45fdd Binary files /dev/null and b/docs/applications/images/e61e6ba685534eda992cea30a63a9c461646040ffd0c4d208a5eebb85897dcf7-0096772.jpeg differ diff --git a/docs/applications/images/e61e6ba685534eda992cea30a63a9c461646040ffd0c4d208a5eebb85897dcf7.jpeg b/docs/applications/images/e61e6ba685534eda992cea30a63a9c461646040ffd0c4d208a5eebb85897dcf7.jpeg new file mode 100644 index 0000000000..e78ea45fdd Binary files /dev/null and b/docs/applications/images/e61e6ba685534eda992cea30a63a9c461646040ffd0c4d208a5eebb85897dcf7.jpeg differ diff --git a/docs/applications/images/ee927ad9ebd442bb96f163a7ebbf4bc95e6bedee97324a51887cf82de0851fd3.jpeg b/docs/applications/images/ee927ad9ebd442bb96f163a7ebbf4bc95e6bedee97324a51887cf82de0851fd3.jpeg new file mode 100644 index 0000000000..775136bd7c Binary files /dev/null and b/docs/applications/images/ee927ad9ebd442bb96f163a7ebbf4bc95e6bedee97324a51887cf82de0851fd3.jpeg differ diff --git a/docs/applications/images/f5acbc4f50dd401a8f535ed6a263f94b0edff82c1aed4285836a9ead989b9c13.png b/docs/applications/images/f5acbc4f50dd401a8f535ed6a263f94b0edff82c1aed4285836a9ead989b9c13.png new file mode 100644 index 0000000000..f6a310be73 Binary files /dev/null and b/docs/applications/images/f5acbc4f50dd401a8f535ed6a263f94b0edff82c1aed4285836a9ead989b9c13.png differ diff --git a/docs/applications/images/f99af54fb2d14691a73b1a748e0ca22618aeddfded0c4da58bbbb03edb8c2340.png b/docs/applications/images/f99af54fb2d14691a73b1a748e0ca22618aeddfded0c4da58bbbb03edb8c2340.png new file mode 100644 index 0000000000..83f5e738ef Binary files /dev/null and b/docs/applications/images/f99af54fb2d14691a73b1a748e0ca22618aeddfded0c4da58bbbb03edb8c2340.png differ diff --git a/docs/applications/images/fcdf517af5a6466294d72db7450209378d8efd9b77764e329d3f2aff3579a20c.jpeg b/docs/applications/images/fcdf517af5a6466294d72db7450209378d8efd9b77764e329d3f2aff3579a20c.jpeg new file mode 100644 index 0000000000..bbd3b70397 Binary files /dev/null and b/docs/applications/images/fcdf517af5a6466294d72db7450209378d8efd9b77764e329d3f2aff3579a20c.jpeg differ diff --git a/docs/applications/images/fe350481be0241c58736d487d1bf06c2e65911bf01254a79944be629c4c10091.jpeg b/docs/applications/images/fe350481be0241c58736d487d1bf06c2e65911bf01254a79944be629c4c10091.jpeg new file mode 100644 index 0000000000..6b380b40ef Binary files /dev/null and b/docs/applications/images/fe350481be0241c58736d487d1bf06c2e65911bf01254a79944be629c4c10091.jpeg differ diff --git a/docs/applications/images/steps_en.gif b/docs/applications/images/steps_en.gif new file mode 100644 index 0000000000..e59339350a Binary files /dev/null and b/docs/applications/images/steps_en.gif differ diff --git a/docs/applications/images/svtr_tiny-20240708094336228.png b/docs/applications/images/svtr_tiny-20240708094336228.png new file mode 100644 index 0000000000..29d636172f Binary files /dev/null and 
b/docs/applications/images/svtr_tiny-20240708094336228.png differ diff --git a/docs/applications/images/test_add_91.jpg b/docs/applications/images/test_add_91.jpg new file mode 100644 index 0000000000..b5ded6e1de Binary files /dev/null and b/docs/applications/images/test_add_91.jpg differ diff --git a/docs/applications/overview.md b/docs/applications/overview.md new file mode 100644 index 0000000000..376567c178 --- /dev/null +++ b/docs/applications/overview.md @@ -0,0 +1,60 @@ +--- +typora-copy-images-to: images +comments: true +--- + + +# 场景应用 + +PaddleOCR场景应用覆盖通用,制造、金融、交通行业的主要OCR垂类应用,在PP-OCR、PP-Structure的通用能力基础之上,以notebook的形式展示利用场景数据微调、模型优化方法、数据增广等内容,为开发者快速落地OCR应用提供示范与启发。 + +## 教程文档 + +### 通用 + +| 类别 | 亮点 | 模型下载 | 教程 | 示例图 | +| ---------------------- | ------------------------------------------------------------ | -------------- | --------------------------------------- | ------------------------------------------------------------ | +| 高精度中文识别模型SVTR | 比PP-OCRv3识别模型精度高3%,
可用于数据挖掘或对预测效率要求不高的场景。 | [模型下载](#2) | [中文](./高精度中文识别模型.md)/English | img | +| 手写体识别 | 新增字形支持 | [模型下载](#2) | [中文](./手写文字识别.md)/English | | + +### 制造 + +| 类别 | 亮点 | 模型下载 | 教程 | 示例图 | +| -------------- | ------------------------------ | -------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | +| 数码管识别 | 数码管数据合成、漏识别调优 | [模型下载](#2) | [中文](./光功率计数码管字符识别/光功率计数码管字符识别.md)/English | | +| 液晶屏读数识别 | 检测模型蒸馏、Serving部署 | [模型下载](#2) | [中文](./液晶屏读数识别.md)/English | | +| 包装生产日期 | 点阵字符合成、过曝过暗文字识别 | [模型下载](#2) | [中文](./包装生产日期识别.md)/English | | +| PCB文字识别 | 小尺寸文本检测与识别 | [模型下载](#2) | [中文](./PCB字符识别/PCB字符识别.md)/English | | +| 电表识别 | 大分辨率图像检测调优 | [模型下载](#2) | | | +| 液晶屏缺陷检测 | 非文字字符识别 | | | | + +### 金融 + +| 类别 | 亮点 | 模型下载 | 教程 | 示例图 | +| -------------- | ----------------------------- | -------------- | ----------------------------------------- | ------------------------------------------------------------ | +| 表单VQA | 多模态通用表单结构化提取 | [模型下载](#2) | [中文](./多模态表单识别.md)/English | | +| 增值税发票 | 关键信息抽取,SER、RE任务训练 | [模型下载](#2) | [中文](./发票关键信息抽取.md)/English | | +| 印章检测与识别 | 端到端弯曲文本识别 | [模型下载](#2) | [中文](./印章弯曲文字识别.md)/English | | +| 通用卡证识别 | 通用结构化提取 | [模型下载](#2) | [中文](./快速构建卡证类OCR.md)/English | | +| 身份证识别 | 结构化提取、图像阴影 | | | | +| 合同比对 | 密集文本检测、NLP关键信息抽取 | [模型下载](#2) | [中文](./扫描合同关键信息提取.md)/English | | + +### 交通 + +| 类别 | 亮点 | 模型下载 | 教程 | 示例图 | +| ----------------- | ------------------------------ | -------------- | ----------------------------------- | ------------------------------------------------------------ | +| 车牌识别 | 多角度图像、轻量模型、端侧部署 | [模型下载](#2) | [中文](./轻量级车牌识别.md)/English | | +| 驾驶证/行驶证识别 | 尽请期待 | | | | +| 快递单识别 | 尽请期待 | | | | + +## 模型下载 + +如需下载上述场景中已经训练好的垂类模型,可以加入PaddleX官方交流频道获取20G OCR学习大礼包(内含《动手学OCR》电子书、课程回放视频、前沿论文等重磅资料) + +- PaddleX官方交流频道: + +如果您是企业开发者且未在上述场景中找到合适的方案,可以填写[OCR应用合作调研问卷](https://paddle.wjx.cn/vj/QwF7GKw.aspx),免费与官方团队展开不同层次的合作,包括但不限于问题抽象、确定技术方案、项目答疑、共同研发等。如果您已经使用PaddleOCR落地项目,也可以填写此问卷,与飞桨平台共同宣传推广,提升企业技术品宣。期待您的提交! + + +traffic + diff --git "a/docs/applications/\344\270\255\346\226\207\350\241\250\346\240\274\350\257\206\345\210\253.md" "b/docs/applications/\344\270\255\346\226\207\350\241\250\346\240\274\350\257\206\345\210\253.md" new file mode 100644 index 0000000000..97a2cec68a --- /dev/null +++ "b/docs/applications/\344\270\255\346\226\207\350\241\250\346\240\274\350\257\206\345\210\253.md" @@ -0,0 +1,437 @@ +--- +typora-copy-images-to: images +comments: true +--- + + +# 智能运营:通用中文表格识别 + +## 1. 背景介绍 + +中文表格识别在金融行业有着广泛的应用,如保险理赔、财报分析和信息录入等领域。当前,金融行业的表格识别主要以手动录入为主,开发一种自动表格识别成为丞待解决的问题。 + +![](./images/d1e7780f0c7745ada4be540decefd6288e4d59257d8141f6842682a4c05d28b6.jpg) + +在金融行业中,表格图像主要有清单类的单元格密集型表格,申请表类的大单元格表格,拍照表格和倾斜表格四种主要形式。 + +![](./images/da82ae8ef8fd479aaa38e1049eb3a681cf020dc108fa458eb3ec79da53b45fd1.png) + +![](./images/5ffff2093a144a6993a75eef71634a52276015ee43a04566b9c89d353198c746.jpg) + +当前的表格识别算法不能很好的处理这些场景下的表格图像。在本例中,我们使用PP-StructureV2最新发布的表格识别模型SLANet来演示如何进行中文表格是识别。同时,为了方便作业流程,我们使用表格属性识别模型对表格图像的属性进行识别,对表格的难易程度进行判断,加快人工进行校对速度。 + +本项目AI Studio链接: + +## 2. 中文表格识别 + +### 2.1 环境准备 + +```bash linenums="1" +# 下载PaddleOCR代码 +! git clone -b dygraph https://gitee.com/paddlepaddle/PaddleOCR +``` + +```bash linenums="1" +# 安装PaddleOCR环境 +! pip install -r PaddleOCR/requirements.txt --force-reinstall +! pip install protobuf==3.19 +``` + +### 2.2 准备数据集 + +本例中使用的数据集采用表格[生成工具](https://github.com/WenmuZhou/TableGeneration)制作。 + +使用如下命令对数据集进行解压,并查看数据集大小 + +```bash linenums="1" +! 
cd data/data165849 && tar -xf table_gen_dataset.tar && cd - +! wc -l data/data165849/table_gen_dataset/gt.txt +``` + +#### 2.2.1 划分训练测试集 + +使用下述命令将数据集划分为训练集和测试集, 这里将90%划分为训练集,10%划分为测试集 + +```python linenums="1" +import random +with open('/home/aistudio/data/data165849/table_gen_dataset/gt.txt') as f: + lines = f.readlines() +random.shuffle(lines) +train_len = int(len(lines)*0.9) +train_list = lines[:train_len] +val_list = lines[train_len:] + +# 保存结果 +with open('/home/aistudio/train.txt','w',encoding='utf-8') as f: + f.writelines(train_list) +with open('/home/aistudio/val.txt','w',encoding='utf-8') as f: + f.writelines(val_list) +``` + +划分完成后,数据集信息如下 + +|类型|数量|图片地址|标注文件路径| +|---|---|---|---| +|训练集|18000|/home/aistudio/data/data165849/table_gen_dataset|/home/aistudio/train.txt| +|测试集|2000|/home/aistudio/data/data165849/table_gen_dataset|/home/aistudio/val.txt| + +#### 2.2.2 查看数据集 + +```python linenums="1" +import cv2 +import os, json +import numpy as np +from matplotlib import pyplot as plt +%matplotlib inline + +def parse_line(data_dir, line): + data_line = line.strip("\n") + info = json.loads(data_line) + file_name = info['filename'] + cells = info['html']['cells'].copy() + structure = info['html']['structure']['tokens'].copy() + + img_path = os.path.join(data_dir, file_name) + if not os.path.exists(img_path): + print(img_path) + return None + data = { + 'img_path': img_path, + 'cells': cells, + 'structure': structure, + 'file_name': file_name + } + return data + +def draw_bbox(img_path, points, color=(255, 0, 0), thickness=2): + if isinstance(img_path, str): + img_path = cv2.imread(img_path) + img_path = img_path.copy() + for point in points: + cv2.polylines(img_path, [point.astype(int)], True, color, thickness) + return img_path + + +def rebuild_html(data): + html_code = data['structure'] + cells = data['cells'] + to_insert = [i for i, tag in enumerate(html_code) if tag in ('', '>')] + + for i, cell in zip(to_insert[::-1], cells[::-1]): + if cell['tokens']: + text = ''.join(cell['tokens']) + # skip empty text + sp_char_list = ['', '', '\u2028', ' ', '', ''] + text_remove_style = skip_char(text, sp_char_list) + if len(text_remove_style) == 0: + continue + html_code.insert(i + 1, text) + + html_code = ''.join(html_code) + return html_code + + +def skip_char(text, sp_char_list): + """ + skip empty cell + @param text: text in cell + @param sp_char_list: style char and special code + @return: + """ + for sp_char in sp_char_list: + text = text.replace(sp_char, '') + return text + +save_dir = '/home/aistudio/vis' +os.makedirs(save_dir, exist_ok=True) +image_dir = '/home/aistudio/data/data165849/' +html_str = '' + +# 解析标注信息并还原html表格 +data = parse_line(image_dir, val_list[0]) + +img = cv2.imread(data['img_path']) +img_name = ''.join(os.path.basename(data['file_name']).split('.')[:-1]) +img_save_name = os.path.join(save_dir, img_name) +boxes = [np.array(x['bbox']) for x in data['cells']] +show_img = draw_bbox(data['img_path'], boxes) +cv2.imwrite(img_save_name + '_show.jpg', show_img) + +html = rebuild_html(data) +html_str += html +html_str += '
' + +# 显示标注的html字符串 +from IPython.core.display import display, HTML +display(HTML(html_str)) +# 显示单元格坐标 +plt.figure(figsize=(15,15)) +plt.imshow(show_img) +plt.show() +``` + +### 2.3 训练 + +这里选用PP-StructureV2中的表格识别模型[SLANet](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/configs/table/SLANet.yml) + +SLANet是PP-StructureV2全新推出的表格识别模型,相比PP-StructureV1中TableRec-RARE,在速度不变的情况下精度提升4.7%。TEDS提升2% + +|算法|Acc|[TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src)|Speed| +| --- | --- | --- | ---| +| EDD[2] |x| 88.30% |x| +| TableRec-RARE(ours) | 71.73%| 93.88% |779ms| +| SLANet(ours) | 76.31%| 95.89%|766ms| + +进行训练之前先使用如下命令下载预训练模型 + +```bash linenums="1" +# 进入PaddleOCR工作目录 +os.chdir('/home/aistudio/PaddleOCR') +# 下载英文预训练模型 +! wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_train.tar --no-check-certificate +! cd ./pretrain_models/ && tar xf en_ppstructure_mobile_v2.0_SLANet_train.tar && cd ../ +``` + +使用如下命令即可启动训练,需要修改的配置有 + +|字段|修改值|含义| +|---|---|---| +|Global.pretrained_model|./pretrain_models/en_ppstructure_mobile_v2.0_SLANet_train/best_accuracy.pdparams|指向英文表格预训练模型地址| +|Global.eval_batch_step|562|模型多少step评估一次,一般设置为一个epoch总的step数| +|Optimizer.lr.name|Const|学习率衰减器 | +|Optimizer.lr.learning_rate|0.0005|学习率设为之前的0.05倍 | +|Train.dataset.data_dir|/home/aistudio/data/data165849|指向训练集图片存放目录 | +|Train.dataset.label_file_list|/home/aistudio/data/data165849/table_gen_dataset/train.txt|指向训练集标注文件 | +|Train.loader.batch_size_per_card|32|训练时每张卡的batch_size | +|Train.loader.num_workers|1|训练集多进程数据读取的进程数,在aistudio中需要设为1 | +|Eval.dataset.data_dir|/home/aistudio/data/data165849|指向测试集图片存放目录 | +|Eval.dataset.label_file_list|/home/aistudio/data/data165849/table_gen_dataset/val.txt|指向测试集标注文件 | +|Eval.loader.batch_size_per_card|32|测试时每张卡的batch_size | +|Eval.loader.num_workers|1|测试集多进程数据读取的进程数,在aistudio中需要设为1 | + +已经修改好的配置存储在 `/home/aistudio/SLANet_ch.yml` + +```python linenums="1" +import os +os.chdir('/home/aistudio/PaddleOCR') +! python3 tools/train.py -c /home/aistudio/SLANet_ch.yml +``` + +大约在7个epoch后达到最高精度 97.49% + +### 2.4 验证 + +训练完成后,可使用如下命令在测试集上评估最优模型的精度 + +```bash linenums="1" +! python3 tools/eval.py -c /home/aistudio/SLANet_ch.yml -o Global.checkpoints=/home/aistudio/PaddleOCR/output/SLANet_ch/best_accuracy.pdparams +``` + +### 2.5 训练引擎推理 + +使用如下命令可使用训练引擎对单张图片进行推理 + +```bash linenums="1" +import os;os.chdir('/home/aistudio/PaddleOCR') +! python3 tools/infer_table.py -c /home/aistudio/SLANet_ch.yml -o Global.checkpoints=/home/aistudio/PaddleOCR/output/SLANet_ch/best_accuracy.pdparams Global.infer_img=/home/aistudio/data/data165849/table_gen_dataset/img/no_border_18298_G7XZH93DDCMATGJQ8RW2.jpg +``` + +```python linenums="1" +import cv2 +from matplotlib import pyplot as plt +%matplotlib inline + +# 显示原图 +show_img = cv2.imread('/home/aistudio/data/data165849/table_gen_dataset/img/no_border_18298_G7XZH93DDCMATGJQ8RW2.jpg') +plt.figure(figsize=(15,15)) +plt.imshow(show_img) +plt.show() + +# 显示预测的单元格 +show_img = cv2.imread('/home/aistudio/PaddleOCR/output/infer/no_border_18298_G7XZH93DDCMATGJQ8RW2.jpg') +plt.figure(figsize=(15,15)) +plt.imshow(show_img) +plt.show() +``` + +### 2.6 模型导出 + +使用如下命令可将模型导出为inference模型 + +```bash linenums="1" +! 
python3 tools/export_model.py -c /home/aistudio/SLANet_ch.yml -o Global.checkpoints=/home/aistudio/PaddleOCR/output/SLANet_ch/best_accuracy.pdparams Global.save_inference_dir=/home/aistudio/SLANet_ch/infer +``` + +### 2.7 预测引擎推理 + +使用如下命令可使用预测引擎对单张图片进行推理 + +```bash linenums="1" +os.chdir('/home/aistudio/PaddleOCR/ppstructure') +! python3 table/predict_structure.py \ + --table_model_dir=/home/aistudio/SLANet_ch/infer \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --image_dir=/home/aistudio/data/data165849/table_gen_dataset/img/no_border_18298_G7XZH93DDCMATGJQ8RW2.jpg \ + --output=../output/inference +``` + +```python linenums="1" +# 显示原图 +show_img = cv2.imread('/home/aistudio/data/data165849/table_gen_dataset/img/no_border_18298_G7XZH93DDCMATGJQ8RW2.jpg') +plt.figure(figsize=(15,15)) +plt.imshow(show_img) +plt.show() + +# 显示预测的单元格 +show_img = cv2.imread('/home/aistudio/PaddleOCR/output/inference/no_border_18298_G7XZH93DDCMATGJQ8RW2.jpg') +plt.figure(figsize=(15,15)) +plt.imshow(show_img) +plt.show() +``` + +### 2.8 表格识别 + +在表格结构模型训练完成后,可结合OCR检测识别模型,对表格内容进行识别。 + +首先下载PP-OCRv3文字检测识别模型 + +```bash linenums="1" +# 下载PP-OCRv3文本检测识别模型并解压 +! wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar --no-check-certificate +! wget -nc -P ./inference/ https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar --no-check-certificate +! cd ./inference/ && tar xf ch_PP-OCRv3_det_slim_infer.tar && tar xf ch_PP-OCRv3_rec_slim_infer.tar && cd ../ +``` + +模型下载完成后,使用如下命令进行表格识别 + +```bash linenums="1" +import os;os.chdir('/home/aistudio/PaddleOCR/ppstructure') +! python3 table/predict_table.py \ + --det_model_dir=inference/ch_PP-OCRv3_det_slim_infer \ + --rec_model_dir=inference/ch_PP-OCRv3_rec_slim_infer \ + --table_model_dir=/home/aistudio/SLANet_ch/infer \ + --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --image_dir=/home/aistudio/data/data165849/table_gen_dataset/img/no_border_18298_G7XZH93DDCMATGJQ8RW2.jpg \ + --output=../output/table +``` + +```python linenums="1" +# 显示原图 +show_img = cv2.imread('/home/aistudio/data/data165849/table_gen_dataset/img/no_border_18298_G7XZH93DDCMATGJQ8RW2.jpg') +plt.figure(figsize=(15,15)) +plt.imshow(show_img) +plt.show() + +# 显示预测结果 +from IPython.core.display import display, HTML +display(HTML('
<table><tr><td>……(此处应为模型预测输出的完整HTML表格字符串,单元格内容为合成表格中的随机文本,原文较长,此处从略)……</td></tr></table>
')) +``` + +## 3. 表格属性识别 + +### 3.1 代码、环境、数据准备 + +#### 3.1.1 代码准备 + +首先,我们需要准备训练表格属性的代码,PaddleClas集成了PULC方案,该方案可以快速获得一个在CPU上用时2ms的属性识别模型。PaddleClas代码可以clone下载得到。获取方式如下: + +```bash linenums="1" +! git clone -b develop https://gitee.com/paddlepaddle/PaddleClas +``` + +#### 3.1.2 环境准备 + +其次,我们需要安装训练PaddleClas相关的依赖包 + +```bash linenums="1" +! pip install -r PaddleClas/requirements.txt --force-reinstall +! pip install protobuf==3.20.0 +``` + +#### 3.1.3 数据准备 + +最后,准备训练数据。在这里,我们一共定义了表格的6个属性,分别是表格来源、表格数量、表格颜色、表格清晰度、表格有无干扰、表格角度。其可视化如下: + +![](./images/190587903-ccdfa6fb-51e8-42de-b08b-a127cb04e304.jpg) + +这里,我们提供了一个表格属性的demo子集,可以快速迭代体验。下载方式如下: + +```bash linenums="1" +%cd PaddleClas/dataset +!wget https://paddleclas.bj.bcebos.com/data/PULC/table_attribute.tar +!tar -xf table_attribute.tar +%cd ../PaddleClas/dataset +%cd ../ +``` + +### 3.2 表格属性识别训练 + +表格属性训练整体pipelinie如下: + +![](./images/190599426-3415b38e-e16e-4e68-9253-2ff531b1b5ca.png) + +1.训练过程中,图片经过预处理之后,送入到骨干网络之中,骨干网络将抽取表格图片的特征,最终该特征连接输出的FC层,FC层经过Sigmoid激活函数后和真实标签做交叉熵损失函数,优化器通过对该损失函数做梯度下降来更新骨干网络的参数,经过多轮训练后,骨干网络的参数可以对为止图片做很好的预测; + +2.推理过程中,图片经过预处理之后,送入到骨干网络之中,骨干网络加载学习好的权重后对该表格图片做出预测,预测的结果为一个6维向量,该向量中的每个元素反映了每个属性对应的概率值,通过对该值进一步卡阈值之后,得到最终的输出,最终的输出描述了该表格的6个属性。 + +当准备好相关的数据之后,可以一键启动表格属性的训练,训练代码如下: + +```bash linenums="1" +!python tools/train.py -c ./ppcls/configs/PULC/table_attribute/PPLCNet_x1_0.yaml -o Global.device=cpu -o Global.epochs=10 +``` + +### 3.3 表格属性识别推理和部署 + +#### 3.3.1 模型转换 + +当训练好模型之后,需要将模型转换为推理模型进行部署。转换脚本如下: + +```bash linenums="1" +!python tools/export_model.py -c ppcls/configs/PULC/table_attribute/PPLCNet_x1_0.yaml -o Global.pretrained_model=output/PPLCNet_x1_0/best_model +``` + +执行以上命令之后,会在当前目录上生成`inference`文件夹,该文件夹中保存了当前精度最高的推理模型。 + +#### 3.3.2 模型推理 + +安装推理需要的paddleclas包, 此时需要通过下载安装paddleclas的develop的whl包 + +```bash linenums="1" +!pip install https://paddleclas.bj.bcebos.com/whl/paddleclas-0.0.0-py3-none-any.whl +``` + +进入`deploy`目录下即可对模型进行推理 + +```bash linenums="1" +%cd deploy/ +``` + +推理命令如下: + +```bash linenums="1" +!python python/predict_cls.py -c configs/PULC/table_attribute/inference_table_attribute.yaml -o Global.inference_model_dir="../inference" -o Global.infer_imgs="../dataset/table_attribute/Table_val/val_9.jpg" +!python python/predict_cls.py -c configs/PULC/table_attribute/inference_table_attribute.yaml -o Global.inference_model_dir="../inference" -o Global.infer_imgs="../dataset/table_attribute/Table_val/val_3253.jpg" +``` + +推理的表格图片: + +![](./images/190596141-74f4feda-b082-46d7-908d-b0bd5839b430.jpg) + +预测结果如下: + +```text linenums="1" +val_9.jpg: {'attributes': ['Scanned', 'Little', 'Black-and-White', 'Clear', 'Without-Obstacles', 'Horizontal'], 'output': [1, 1, 1, 1, 1, 1]} +``` + +推理的表格图片: + +![](https://user-images.githubusercontent.com/45199522/190597086-2e685200-22d0-4042-9e46-f61f24e02e4e.jpg) + +预测结果如下: + +```text linenums="1" +val_3253.jpg: {'attributes': ['Photo', 'Little', 'Black-and-White', 'Blurry', 'Without-Obstacles', 'Tilted'], 'output': [0, 1, 1, 0, 1, 0]} +``` + +对比两张图片可以发现,第一张图片比较清晰,表格属性的结果也偏向于比较容易识别,我们可以更相信表格识别的结果,第二张图片比较模糊,且存在倾斜现象,表格识别可能存在错误,需要我们人工进一步校验。通过表格的属性识别能力,可以进一步将“人工”和“智能”很好的结合起来,为表格识别能力的落地的精度提供保障。 diff --git "a/docs/applications/\345\205\211\345\212\237\347\216\207\350\256\241\346\225\260\347\240\201\347\256\241\345\255\227\347\254\246\350\257\206\345\210\253.md" "b/docs/applications/\345\205\211\345\212\237\347\216\207\350\256\241\346\225\260\347\240\201\347\256\241\345\255\227\347\254\246\350\257\206\345\210\253.md" new file mode 100644 index 0000000000..47c13f6463 --- /dev/null +++ 
"b/docs/applications/\345\205\211\345\212\237\347\216\207\350\256\241\346\225\260\347\240\201\347\256\241\345\255\227\347\254\246\350\257\206\345\210\253.md" @@ -0,0 +1,452 @@ +--- +typora-copy-images-to: images +comments: true +--- + + +# 光功率计数码管字符识别 + +## 1. 背景介绍 + +光功率计(optical power meter )是指用于测量绝对光功率或通过一段光纤的光功率相对损耗的仪器。在光纤系统中,测量光功率是最基本的,非常像电子学中的万用表;在光纤测量中,光功率计是重负荷常用表。 + +![](./images/1.jpeg) + +目前光功率计缺少将数据直接输出的功能,需要人工读数。这一项工作单调重复,如果可以使用机器替代人工,将节约大量成本。针对上述问题,希望通过摄像头拍照->智能读数的方式高效地完成此任务。 + +为实现智能读数,通常会采取文本检测+文本识别的方案: + +第一步,使用文本检测模型定位出光功率计中的数字部分; + +第二步,使用文本识别模型获得准确的数字和单位信息。 + +本项目主要介绍如何完成第二步文本识别部分,包括:真实评估集的建立、训练数据的合成、基于 PP-OCRv3 和 SVTR_Tiny 两个模型进行训练,以及评估和推理。 + +本项目难点如下: + +- 光功率计数码管字符数据较少,难以获取。 +- 数码管中小数点占像素较少,容易漏识别。 + +针对以上问题, 本例选用 PP-OCRv3 和 SVTR_Tiny 两个高精度模型训练,同时提供了真实数据挖掘案例和数据合成案例。基于 PP-OCRv3 模型,在构建的真实评估集上精度从 52% 提升至 72%,SVTR_Tiny 模型精度可达到 78.9%。 + +aistudio项目链接: [光功率计数码管字符识别](https://aistudio.baidu.com/aistudio/projectdetail/4049044?contributionType=1) + +## 2. PaddleOCR 快速使用 + +PaddleOCR 旨在打造一套丰富、领先、且实用的OCR工具库,助力开发者训练出更好的模型,并应用落地。 + +![](./images/test_add_91.jpg) + +官方提供了适用于通用场景的高精轻量模型,首先使用官方提供的 PP-OCRv3 模型预测图片,验证下当前模型在光功率计场景上的效果。 + +### 准备环境 + +```bash linenums="1" +python3 -m pip install -U pip +python3 -m pip install paddleocr +``` + +### 测试效果 + +测试图: + +![](./images/8dca91f016884e16ad9216d416da72ea08190f97d87b4be883f15079b7ebab9a.jpeg) + +```bash linenums="1" +paddleocr --lang=ch --det=Fase --image_dir=data +``` + +得到如下测试结果: + +```bash linenums="1" +('.7000', 0.6885431408882141) +``` + +发现数字识别较准,然而对负号和小数点识别不准确。 由于PP-OCRv3的训练数据大多为通用场景数据,在特定的场景上效果可能不够好。因此需要基于场景数据进行微调。 + +下面就主要介绍如何在光功率计(数码管)场景上微调训练。 + +## 3. 开始训练 + +### 3.1 数据准备 + +特定的工业场景往往很难获取开源的真实数据集,光功率计也是如此。在实际工业场景中,可以通过摄像头采集的方法收集大量真实数据,本例中重点介绍数据合成方法和真实数据挖掘方法,如何利用有限的数据优化模型精度。 + +数据集分为两个部分:合成数据,真实数据, 其中合成数据由 text_renderer 工具批量生成得到, 真实数据通过爬虫等方式在百度图片中搜索并使用 PPOCRLabel 标注得到。 + +#### 合成数据 + +本例中数据合成工具使用的是 [text_renderer](https://github.com/Sanster/text_renderer), 该工具可以合成用于文本识别训练的文本行数据: + +![](./images/char_spacing_compact.jpg) + +![](https://github.com/oh-my-ocr/text_renderer/raw/master/example_data/effect_layout_image/color_image.jpg) + +```bash linenums="1" +export https_proxy=http://172.19.57.45:3128 +git clone https://github.com/oh-my-ocr/text_renderer +``` + +```bash linenums="1" +python3 setup.py develop +python3 -m pip install -r docker/requirements.txt +python3 main.py \ + --config example_data/example.py \ + --dataset img \ + --num_processes 2 \ + --log_period 10 +``` + +给定字体和语料,就可以合成较为丰富样式的文本行数据。 光功率计识别场景,目标是正确识别数码管文本,因此需要收集部分数码管字体,训练语料,用于合成文本识别数据。 + +将收集好的语料存放在 example_data 路径下: + +```bash linenums="1" +ln -s ./fonts/DS* text_renderer/example_data/font/ +ln -s ./corpus/digital.txt text_renderer/example_data/text/ +``` + +修改`text_renderer/example_data/font_list/font_list.txt`,选择需要的字体开始合成: + +```bash linenums="1" +python3 main.py \ + --config example_data/digital_example.py \ + --dataset img \ + --num_processes 2 \ + --log_period 10 +``` + +合成图片会被存在目录 text_renderer/example_data/digital/chn_data 下 + +查看合成的数据样例: + +![](./images/7d5774a273f84efba5b9ce7fd3f86e9ef24b6473e046444db69fa3ca20ac0986-0348339.png) + +#### 真实数据挖掘 + +模型训练需要使用真实数据作为评价指标,否则很容易过拟合到简单的合成数据中。没有开源数据的情况下,可以利用部分无标注数据+标注工具获得真实数据。 + +##### 1. 数据搜集 + +使用[爬虫工具](https://github.com/Joeclinton1/google-images-download.git)获得无标注数据 + +##### 2. 
[PPOCRLabel](https://github.com/PFCCLab/PPOCRLabel) 完成半自动标注 + +PPOCRLabel是一款适用于OCR领域的半自动化图形标注工具,内置PP-OCR模型对数据自动标注和重新识别。使用Python3和PyQT5编写,支持矩形框标注、表格标注、不规则文本标注、关键信息标注模式,导出格式可直接用于PaddleOCR检测和识别模型的训练。 + +![img](./images/steps_en.gif) + +收集完数据后就可以进行分配了,验证集中一般都是真实数据,训练集中包含合成数据+真实数据。本例中标注了155张图片,其中训练集和验证集的数目为100和55。 + +最终 `data` 文件夹应包含以下几部分: + +```text linenums="1" +|-data + |- synth_train.txt + |- real_train.txt + |- real_eval.txt + |- synthetic_data + |- word_001.png + |- word_002.jpg + |- word_003.jpg + | ... + |- real_data + |- word_001.png + |- word_002.jpg + |- word_003.jpg + | ... + ... +``` + +### 3.2 模型选择 + +本案例提供了2种文本识别模型:PP-OCRv3 识别模型 和 SVTR_Tiny: + +[PP-OCRv3 识别模型](../ppocr/blog/PP-OCRv3_introduction.md):PP-OCRv3的识别模块是基于文本识别算法SVTR优化。SVTR不再采用RNN结构,通过引入Transformers结构更加有效地挖掘文本行图像的上下文信息,从而提升文本识别能力。并进行了一系列结构改进加速模型预测。 + +[SVTR_Tiny](https://arxiv.org/abs/2205.00159):SVTR提出了一种用于场景文本识别的单视觉模型,该模型在patch-wise image tokenization框架内,完全摒弃了序列建模,在精度具有竞争力的前提下,模型参数量更少,速度更快。 + +以上两个策略在自建中文数据集上的精度和速度对比如下: + +| ID | 策略 | 模型大小 | 精度 | 预测耗时(CPU + MKLDNN)| +|-----|-----|--------|----| --- | +| 01 | PP-OCRv2 | 8M | 74.80% | 8.54ms | +| 02 | SVTR_Tiny | 21M | 80.10% | 97.00ms | +| 03 | SVTR_LCNet(h32) | 12M | 71.90% | 6.60ms | +| 04 | SVTR_LCNet(h48) | 12M | 73.98% | 7.60ms | +| 05 | + GTC | 12M | 75.80% | 7.60ms | +| 06 | + TextConAug | 12M | 76.30% | 7.60ms | +| 07 | + TextRotNet | 12M | 76.90% | 7.60ms | +| 08 | + UDML | 12M | 78.40% | 7.60ms | +| 09 | + UIM | 12M | 79.40% | 7.60ms | + +### 3.3 开始训练 + +首先下载 PaddleOCR 代码库 + +```bash linenums="1" +git clone -b release/2.5 https://github.com/PaddlePaddle/PaddleOCR.git +``` + +PaddleOCR提供了训练脚本、评估脚本和预测脚本,本节将以 PP-OCRv3 中文识别模型为例: + +#### Step1:下载预训练模型 + +首先下载 pretrain model,您可以下载训练好的模型在自定义数据上进行finetune + +```bash linenums="1" +cd PaddleOCR/ +# 下载PP-OCRv3 中文预训练模型 +wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar +# 解压模型参数 +cd pretrain_models +tar -xf ch_PP-OCRv3_rec_train.tar && rm -rf ch_PP-OCRv3_rec_train.tar +``` + +#### Step2:自定义字典文件 + +接下来需要提供一个字典({word_dict_name}.txt),使模型在训练时,可以将所有出现的字符映射为字典的索引。 + +因此字典需要包含所有希望被正确识别的字符,{word_dict_name}.txt需要写成如下格式,并以 `utf-8` 编码格式保存: + +```text linenums="1" +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +- +. +``` + +word_dict.txt 每行有一个单字,将字符与数字索引映射在一起,“3.14” 将被映射成 [3, 11, 1, 4] + +- 内置字典 + + PaddleOCR内置了一部分字典,可以按需使用。 + + `ppocr/utils/ppocr_keys_v1.txt` 是一个包含6623个字符的中文字典 + + `ppocr/utils/ic15_dict.txt` 是一个包含36个字符的英文字典 + +- 自定义字典 + + 内置字典面向通用场景,具体的工业场景中,可能需要识别特殊字符,或者只需识别某几个字符,此时自定义字典会更提升模型精度。例如在光功率计场景中,需要识别数字和单位。 + +遍历真实数据标签中的字符,制作字典`digital_dict.txt`如下所示: + +```text linenums="1" +- +. +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +B +E +F +H +L +N +T +W +d +k +m +n +o +z +``` + +#### Step3:修改配置文件 + +为了更好的使用预训练模型,训练推荐使用[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)配置文件,并参考下列说明修改配置文件: + +以 `ch_PP-OCRv3_rec_distillation.yml` 为例: + +```yaml linenums="1" +Global: + ... + # 添加自定义字典,如修改字典请将路径指向新字典 + character_dict_path: ppocr/utils/dict/digital_dict.txt + ... + # 识别空格 + use_space_char: True + + +Optimizer: + ... + # 添加学习率衰减策略 + lr: + name: Cosine + learning_rate: 0.001 + ... + +... + +Train: + dataset: + # 数据集格式,支持LMDBDataSet以及SimpleDataSet + name: SimpleDataSet + # 数据集路径 + data_dir: ./data/ + # 训练集标签文件 + label_file_list: + - ./train_data/digital_img/digital_train.txt #11w + - ./train_data/digital_img/real_train.txt #100 + - ./train_data/digital_img/dbm_img/dbm.txt #3w + ratio_list: + - 0.3 + - 1.0 + - 1.0 + transforms: + ... 
+ - RecResizeImg: + # 修改 image_shape 以适应长文本 + image_shape: [3, 48, 320] + ... + loader: + ... + # 单卡训练的batch_size + batch_size_per_card: 256 + ... + +Eval: + dataset: + # 数据集格式,支持LMDBDataSet以及SimpleDataSet + name: SimpleDataSet + # 数据集路径 + data_dir: ./data + # 验证集标签文件 + label_file_list: + - ./train_data/digital_img/real_val.txt + transforms: + ... + - RecResizeImg: + # 修改 image_shape 以适应长文本 + image_shape: [3, 48, 320] + ... + loader: + # 单卡验证的batch_size + batch_size_per_card: 256 + ... +``` + +**注意,训练/预测/评估时的配置文件请务必与训练一致。** + +#### Step4:启动训练 + +*如果您安装的是cpu版本,请将配置文件中的 `use_gpu` 字段修改为false* + +```bash linenums="1" +# GPU训练 支持单卡,多卡训练 +# 训练数码管数据 训练日志会自动保存为 "{save_model_dir}" 下的train.log + +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml -o Global.pretrained_model=./pretrain_models/ch_PP-OCRv3_rec_train/best_accuracy + +# 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml -o Global.pretrained_model=./pretrain_models/en_PP-OCRv3_rec_train/best_accuracy +``` + +PaddleOCR支持训练和评估交替进行, 可以在 `configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml` 中修改 `eval_batch_step` 设置评估频率,默认每500个iter评估一次。评估过程中默认将最佳acc模型,保存为 `output/ch_PP-OCRv3_rec_distill/best_accuracy` 。 + +如果验证集很大,测试将会比较耗时,建议减少评估次数,或训练完再进行评估。 + +### SVTR_Tiny 训练 + +SVTR_Tiny 训练步骤与上面一致,SVTR支持的配置和模型训练权重可以参考[算法介绍文档](../algorithm/text_recognition/algorithm_rec_svtr.md) + +#### Step1:下载预训练模型 + +```bash linenums="1" +# 下载 SVTR_Tiny 中文识别预训练模型和配置文件 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_ch_train.tar +# 解压模型参数 +tar -xf rec_svtr_tiny_none_ctc_ch_train.tar && rm -rf rec_svtr_tiny_none_ctc_ch_train.tar +``` + +#### Step2:自定义字典文件 + +字典依然使用自定义的 digital_dict.txt + +#### Step3:修改配置文件 + +配置文件中对应修改字典路径和数据路径 + +#### Step4:启动训练 + +```bash linenums="1" +# 单卡训练 +python tools/train.py -c rec_svtr_tiny_none_ctc_ch_train/rec_svtr_tiny_6local_6global_stn_ch.yml \ + -o Global.pretrained_model=./rec_svtr_tiny_none_ctc_ch_train/best_accuracy +``` + +### 3.4 验证效果 + +如需获取已训练模型,请加入PaddleX官方交流频道,获取20G OCR学习大礼包(内含《动手学OCR》电子书、课程回放视频、前沿论文等重磅资料) + +- PaddleX官方交流频道: + +将下载或训练完成的模型放置在对应目录下即可完成模型推理 + +#### 指标评估 + +训练中模型参数默认保存在`Global.save_model_dir`目录下。在评估指标时,需要设置`Global.checkpoints`指向保存的参数文件。评估数据集可以通过 `configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml` 修改Eval中的 `label_file_path` 设置。 + +```bash linenums="1" +# GPU 评估, Global.checkpoints 为待测权重 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml -o Global.checkpoints={path/to/weights}/best_accuracy +``` + +#### 测试识别效果 + +使用 PaddleOCR 训练好的模型,可以通过以下脚本进行快速预测。 + +默认预测图片存储在 `infer_img` 里,通过 `-o Global.checkpoints` 加载训练好的参数文件: + +根据配置文件中设置的 `save_model_dir` 和 `save_epoch_step` 字段,会有以下几种参数被保存下来: + +```text linenums="1" +output/rec/ +├── best_accuracy.pdopt +├── best_accuracy.pdparams +├── best_accuracy.states +├── config.yml +├── iter_epoch_3.pdopt +├── iter_epoch_3.pdparams +├── iter_epoch_3.states +├── latest.pdopt +├── latest.pdparams +├── latest.states +└── train.log +``` + +其中 best_accuracy.*是评估集上的最优模型;iter_epoch_x.* 是以 `save_epoch_step` 为间隔保存下来的模型;latest.* 是最后一个epoch的模型。 + +```bash linenums="1" +# 预测英文结果 +python3 tools/infer_rec.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=test_digital.png +``` + +预测图片: + 
+![](./images/8dca91f016884e16ad9216d416da72ea08190f97d87b4be883f15079b7ebab9a-20240707183252997.jpeg) + +得到输入图像的预测结果: + +```yaml linenums="1" +infer_img: test_digital.png + result: ('-70.00', 0.9998967) +``` diff --git "a/docs/applications/\345\214\205\350\243\205\347\224\237\344\272\247\346\227\245\346\234\237\350\257\206\345\210\253.md" "b/docs/applications/\345\214\205\350\243\205\347\224\237\344\272\247\346\227\245\346\234\237\350\257\206\345\210\253.md" new file mode 100644 index 0000000000..7751af5933 --- /dev/null +++ "b/docs/applications/\345\214\205\350\243\205\347\224\237\344\272\247\346\227\245\346\234\237\350\257\206\345\210\253.md" @@ -0,0 +1,673 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# 一种基于PaddleOCR的产品包装生产日期识别模型 + +## 1. 项目介绍 + +产品包装生产日期是计算机视觉图像识别技术在工业场景中的一种应用。产品包装生产日期识别技术要求能够将产品生产日期从复杂背景中提取并识别出来,在物流管理、物资管理中得到广泛应用。 + +![](./images/d9e0533cc1df47ffa3bbe99de9e42639a3ebfa5bce834bafb1ca4574bf9db684.jpg) + +- 项目难点 + +1. 没有训练数据 +2. 图像质量层次不齐: 角度倾斜、图片模糊、光照不足、过曝等问题严重 + +针对以上问题, 本例选用PP-OCRv3这一开源超轻量OCR系统进行包装产品生产日期识别系统的开发。直接使用PP-OCRv3进行评估的精度为62.99%。为提升识别精度,我们首先使用数据合成工具合成了3k数据,基于这部分数据进行finetune,识别精度提升至73.66%。由于合成数据与真实数据之间的分布存在差异,为进一步提升精度,我们使用网络爬虫配合数据挖掘策略得到了1k带标签的真实数据,基于真实数据finetune的精度为71.33%。最后,我们综合使用合成数据和真实数据进行finetune,将识别精度提升至86.99%。各策略的精度提升效果如下: + +| 策略 | 精度| +| :--------------- | :-------- | +| PP-OCRv3评估 | 62.99| +| 合成数据finetune | 73.66| +| 真实数据finetune | 71.33| +| 真实+合成数据finetune | 86.99| + +AIStudio项目链接: [一种基于PaddleOCR的包装生产日期识别方法](https://aistudio.baidu.com/aistudio/projectdetail/4287736) + +## 2. 环境搭建 + +本任务基于Aistudio完成, 具体环境如下: + +- 操作系统: Linux +- PaddlePaddle: 2.3 +- PaddleOCR: Release/2.5 +- text_renderer: master + +下载PaddlleOCR代码并安装依赖库: + +```bash linenums="1" +git clone -b dygraph https://gitee.com/paddlepaddle/PaddleOCR + +# 安装依赖库 +cd PaddleOCR +pip install -r PaddleOCR/requirements.txt +``` + +## 3. 数据准备 + +本项目使用人工预标注的300张图像作为测试集。 + +部分数据示例如下: + +![](./images/39ff30e0ab0442579712255e6a9ea6b5271169c98e624e6eb2b8781f003bfea0.png) + +标签文件格式如下: + +```txt +数据路径 标签(中间以制表符分隔) +``` + +|数据集类型|数量| +|---|---| +|测试集| 300| + +数据集[下载链接](https://aistudio.baidu.com/aistudio/datasetdetail/149770),下载后可以通过下方命令解压: + +```bash linenums="1" +tar -xvf data.tar +mv data ${PaddleOCR_root} +``` + +数据解压后的文件结构如下: + +```bash linenums="1" +PaddleOCR +├── data +│ ├── mining_images # 挖掘的真实数据示例 +│ ├── mining_train.list # 挖掘的真实数据文件列表 +│ ├── render_images # 合成数据示例 +│ ├── render_train.list # 合成数据文件列表 +│ ├── val # 测试集数据 +│ └── val.list # 测试集数据文件列表 +| ├── bg # 合成数据所需背景图像 +│ └── corpus # 合成数据所需语料 +``` + +## 4. 直接使用PP-OCRv3模型评估 + +准备好测试数据后,可以使用PaddleOCR的PP-OCRv3模型进行识别。 + +### 下载预训练模型 + +首先需要下载PP-OCR v3中英文识别模型文件,下载链接可以在[link](../ppocr/model_list.md)获取,下载命令: + +```bash linenums="1" +cd ${PaddleOCR_root} +mkdir ckpt +wget -nc -P ckpt https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar +pushd ckpt/ +tar -xvf ch_PP-OCRv3_rec_train.tar +popd +``` + +### 模型评估 + +使用以下命令进行PP-OCRv3评估: + +```bash linenums="1" +python tools/eval.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml \ + -o Global.checkpoints=ckpt/ch_PP-OCRv3_rec_train/best_accuracy \ + Eval.dataset.data_dir=./data \ + Eval.dataset.label_file_list=["./data/val.list"] + +``` + +其中各参数含义如下: + +```bash linenums="1" +-c: 指定使用的配置文件,ch_PP-OCRv3_rec_distillation.yml对应于OCRv3识别模型。 +-o: 覆盖配置文件中参数 +Global.checkpoints: 指定评估使用的模型文件路径 +Eval.dataset.data_dir: 指定评估数据集路径 +Eval.dataset.label_file_list: 指定评估数据集文件列表 +``` + +## 5. 
基于合成数据finetune + +### 5.1 Text Renderer数据合成方法 + +#### 5.1.1 下载Text Renderer代码 + +首先从github或gitee下载Text Renderer代码,并安装相关依赖。 + +```bash linenums="1" +git clone https://gitee.com/wowowoll/text_renderer.git + +# 安装依赖库 +cd text_renderer +pip install -r requirements.txt +``` + +使用text renderer合成数据之前需要准备好背景图片、语料以及字体库,下面将逐一介绍各个步骤。 + +#### 5.1.2 准备背景图片 + +观察日常生活中常见的包装生产日期图片,我们可以发现其背景相对简单。为此我们可以从网上找一下图片,截取部分图像块作为背景图像。 + +本项目已准备了部分图像作为背景图片,在第3部分完成数据准备后,可以得到我们准备好的背景图像,示例如下: + +![](./images/456ae2acb27d4a94896c478812aee0bc3551c703d7bd40c9be4dc983c7b3fc8a.png) + +背景图像存放于如下位置: + +```bash linenums="1" +PaddleOCR +├── data +| ├── bg # 合成数据所需背景图像 +``` + +#### 5.1.3 准备语料 + +观察测试集生产日期图像,我们可以知道如下数据有如下特点: + +1. 由年月日组成,中间可能以“/”、“-”、“:”、“.”或者空格间隔,也可能以汉字年月日分隔 +2. 有些生产日期包含在产品批号中,此时可能包含具体时间、英文字母或数字标识 + +基于以上两点,我们编写语料生成脚本: + +```python linenums="1" +import random +from random import choice +import os + +cropus_num = 2000 #设置语料数量 + +def get_cropus(f): + # 随机生成年份 + year = random.randint(0, 22) + # 随机生成月份 + month = random.randint(1, 12) + # 随机生成日期 + day_dict = {31: [1,3,5,7,8,10,12], 30: [4,6,9,11], 28: [2]} + for item in day_dict: + if month in day_dict[item]: + day = random.randint(0, item) + # 随机生成小时 + hours = random.randint(0, 24) + # 随机生成分钟 + minute = random.randint(0, 60) + # 随机生成秒数 + second = random.randint(0, 60) + + # 随机生成产品标识字符 + length = random.randint(0, 6) + file_id = [] + flag = 0 + my_dict = [i for i in range(48,58)] + [j for j in range(40, 42)] + [k for k in range(65,90)] # 大小写字母 + 括号 + + for i in range(1, length): + if flag: + if i == flag+2: #括号匹配 + file_id.append(')') + flag = 0 + continue + sel = choice(my_dict) + if sel == 41: + continue + if sel == 40: + if i == 1 or i > length-3: + continue + flag = i + my_ascii = chr(sel) + file_id.append(my_ascii) + file_id_str = ''.join(file_id) + + #随机生成产品标识字符 + file_id2 = random.randint(0, 9) + + rad = random.random() + if rad < 0.3: + f.write('20{:02d}{:02d}{:02d} {}'.format(year, month, day, file_id_str)) + elif 0.3 < rad < 0.5: + f.write('20{:02d}年{:02d}月{:02d}日'.format(year, month, day)) + elif 0.5 < rad < 0.7: + f.write('20{:02d}/{:02d}/{:02d}'.format(year, month, day)) + elif 0.7 < rad < 0.8: + f.write('20{:02d}-{:02d}-{:02d}'.format(year, month, day)) + elif 0.8 < rad < 0.9: + f.write('20{:02d}.{:02d}.{:02d}'.format(year, month, day)) + else: + f.write('{:02d}:{:02d}:{:02d} {:02d}'.format(hours, minute, second, file_id2)) + +if __name__ == "__main__": + file_path = '/home/aistudio/text_renderer/my_data/cropus' + if not os.path.exists(file_path): + os.makedirs(file_path) + file_name = os.path.join(file_path, 'books.txt') + f = open(file_name, 'w') + for i in range(cropus_num): + get_cropus(f) + if i < cropus_num-1: + f.write('\n') + + f.close() +``` + +本项目已准备了部分语料,在第3部分完成数据准备后,可以得到我们准备好的语料库,默认位置如下: + +```bash linenums="1" +PaddleOCR +├── data +│ └── corpus #合成数据所需语料 +``` + +#### 5.1.4 下载字体 + +观察包装生产日期,我们可以发现其使用的字体为点阵体。字体可以在如下网址下载: + + +本项目已准备了部分字体,在第3部分完成数据准备后,可以得到我们准备好的字体,默认位置如下: + +```bash linenums="1" +PaddleOCR +├── data +│ └── fonts #合成数据所需字体 +``` + +下载好字体后,还需要在list文件中指定字体文件存放路径,脚本如下: + +```bash linenums="1" +cd text_renderer/my_data/ +touch fonts.list +ls /home/aistudio/PaddleOCR/data/fonts/* > fonts.list +``` + +#### 5.1.5 运行数据合成命令 + +完成数据准备后,my_data文件结构如下: + +```bash linenums="1" +my_data/ +├── cropus +│ └── books.txt #语料库 +├── eng.txt #字符列表 +└── fonts.list #字体列表 +``` + +在运行合成数据命令之前,还有两处细节需要手动修改: + +1. 
将默认配置文件`text_renderer/configs/default.yaml`中第9行enable的值设为`true`,即允许合成彩色图像。否则合成的都是灰度图。 + + ```yaml linenums="1" + # color boundary is in R,G,B format + font_color: + + enable: true #false + ``` + +2. 将`text_renderer/textrenderer/renderer.py`第184行作如下修改,取消padding。否则图片两端会有一些空白。 + + ```python linenums="1" + padding = random.randint(s_bbox_width // 10, s_bbox_width // 8) #修改前 + padding = 0 #修改后 + ``` + +运行数据合成命令: + +```bash linenums="1" +cd /home/aistudio/text_renderer/ +python main.py --num_img=3000 \ + --fonts_list='./my_data/fonts.list' \ + --corpus_dir "./my_data/cropus" \ + --corpus_mode "list" \ + --bg_dir "/home/aistudio/PaddleOCR/data/bg/" \ + --img_width 0 +``` + +合成好的数据默认保存在`text_renderer/output`目录下,可进入该目录查看合成的数据。 + +合成数据示例如下 +![](./images/d686a48d465a43d09fbee51924fdca42ee21c50e676646da8559fb9967b94185.png) + +数据合成好后,还需要生成如下格式的训练所需的标注文件, + +``` +图像路径 标签 +``` + +使用如下脚本即可生成标注文件: + +```python linenums="1" +import random + +abspath = '/home/aistudio/text_renderer/output/default/' + +#标注文件生成路径 +fout = open('./render_train.list', 'w', encoding='utf-8') + +with open('./output/default/tmp_labels.txt','r') as f: + lines = f.readlines() + for item in lines: + label = item[9:] + filename = item[:8] + '.jpg' + fout.write(abspath + filename + '\t' + label) + + fout.close() +``` + +经过以上步骤,我们便完成了包装生产日期数据合成。 +数据位于`text_renderer/output`,标注文件位于`text_renderer/render_train.list`。 + +本项目提供了生成好的数据供大家体验,完成步骤3的数据准备后,可得数据路径位于: + +```bash linenums="1" +PaddleOCR +├── data +│ ├── render_images # 合成数据示例 +│ ├── render_train.list #合成数据文件列表 +``` + +### 5.2 模型训练 + +准备好合成数据后,我们可以使用以下命令,利用合成数据进行finetune: + +```bash linenums="1" +cd ${PaddleOCR_root} +python tools/train.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml \ + -o Global.pretrained_model=./ckpt/ch_PP-OCRv3_rec_train/best_accuracy \ + Global.epoch_num=20 \ + Global.eval_batch_step='[0, 20]' \ + Train.dataset.data_dir=./data \ + Train.dataset.label_file_list=['./data/render_train.list'] \ + Train.loader.batch_size_per_card=64 \ + Eval.dataset.data_dir=./data \ + Eval.dataset.label_file_list=["./data/val.list"] \ + Eval.loader.batch_size_per_card=64 + +``` + +其中各参数含义如下: + +```txt +-c: 指定使用的配置文件,ch_PP-OCRv3_rec_distillation.yml对应于OCRv3识别模型。 +-o: 覆盖配置文件中参数 +Global.pretrained_model: 指定finetune使用的预训练模型 +Global.epoch_num: 指定训练的epoch数 +Global.eval_batch_step: 间隔多少step做一次评估 +Train.dataset.data_dir: 训练数据集路径 +Train.dataset.label_file_list: 训练集文件列表 +Train.loader.batch_size_per_card: 训练单卡batch size +Eval.dataset.data_dir: 评估数据集路径 +Eval.dataset.label_file_list: 评估数据集文件列表 +Eval.loader.batch_size_per_card: 评估单卡batch size +``` + +## 6. 基于真实数据finetune + +使用合成数据finetune能提升我们模型的识别精度,但由于合成数据和真实数据之间的分布可能有一定差异,因此作用有限。为进一步提高识别精度,本节介绍如何挖掘真实数据进行模型finetune。 + +数据挖掘的整体思路如下: + +1. 使用python爬虫从网上获取大量无标签数据 +2. 使用模型从大量无标签数据中构建出有效训练集 + +### 6.1 python爬虫获取数据 + +推荐使用[爬虫工具](https://github.com/Joeclinton1/google-images-download)获取无标签图片。图片获取后,可按如下目录格式组织: + +```txt +sprider +├── file.list +├── data +│ ├── 00000.jpg +│ ├── 00001.jpg +... +``` + +### 6.2 数据挖掘 + +我们使用PaddleOCR对获取到的图片进行挖掘,具体步骤如下: + +1. 使用 PP-OCRv3检测模型+svtr-tiny识别模型,对每张图片进行预测。 +2. 使用数据挖掘策略,得到有效图片。 +3. 
将有效图片对应的图像区域和标签提取出来,构建训练集。 + +首先下载预训练模型,PP-OCRv3检测模型下载链接: + +如需获取已训练模型,请加入PaddleX官方交流频道,获取20G OCR学习大礼包(内含《动手学OCR》电子书、课程回放视频、前沿论文等重磅资料) + +PaddleX官方交流频道: + +完成下载后,可将模型存储于如下位置: + +```bash linenums="1" +PaddleOCR +├── data +│ ├── rec_vit_sub_64_363_all/ # svtr_tiny高精度识别模型 +``` + +```bash linenums="1" +# 下载解压PP-OCRv3检测模型 +cd ${PaddleOCR_root} +wget -nc -P ckpt https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar +pushd ckpt +tar -xvf ch_PP-OCRv3_det_infer.tar +popd ckpt +``` + +在使用PPOCRv3检测模型+svtr-tiny识别模型进行预测之前,有如下两处细节需要手动修改: + +1. 将`tools/infer/predict_rec.py`中第110行`imgW`修改为`320` + + ```python linenums="1" + #imgW = int((imgH * max_wh_ratio)) + imgW = 320 + ``` + +2. 将`tools/infer/predict_system.py`第169行添加如下一行,将预测分数也写入结果文件中。 + + ```python linenums="1" + "scores": rec_res[idx][1], + ``` + +模型预测命令: + +```bash linenums="1" +python tools/infer/predict_system.py \ + --image_dir="/home/aistudio/sprider/data" \ + --det_model_dir="./ckpt/ch_PP-OCRv3_det_infer/" \ + --rec_model_dir="/home/aistudio/PaddleOCR/data/rec_vit_sub_64_363_all/" \ + --rec_image_shape="3,32,320" +``` + +获得预测结果后,我们使用数据挖掘策略得到有效图片。具体挖掘策略如下: + +1. 预测置信度高于95% +2. 识别结果包含字符‘20’,即年份 +3. 没有中文,或者有中文并且‘日’和'月'同时在识别结果中 + +```python linenums="1" +# 获取有效预测 + +import json +import re + +zh_pattern = re.compile(u'[\u4e00-\u9fa5]+') #正则表达式,筛选字符是否包含中文 + +file_path = '/home/aistudio/PaddleOCR/inference_results/system_results.txt' +out_path = '/home/aistudio/PaddleOCR/selected_results.txt' +f_out = open(out_path, 'w') + +with open(file_path, "r", encoding='utf-8') as fin: + lines = fin.readlines() + + +for line in lines: + flag = False + # 读取文件内容 + file_name, json_file = line.strip().split('\t') + preds = json.loads(json_file) + res = [] + for item in preds: + transcription = item['transcription'] #获取识别结果 + scores = item['scores'] #获取识别得分 + # 挖掘策略 + if scores > 0.95: + if '20' in transcription and len(transcription) > 4 and len(transcription) < 12: + word = transcription + if not(zh_pattern.search(word) and ('日' not in word or '月' not in word)): + flag = True + res.append(item) + save_pred = file_name + "\t" + json.dumps( + res, ensure_ascii=False) + "\n" + if flag ==True: + f_out.write(save_pred) + +f_out.close() +``` + +然后将有效预测对应的图像区域和标签提取出来,构建训练集。具体实现脚本如下: + +```python linenums="1" +import cv2 +import json +import numpy as np + +PATH = '/home/aistudio/PaddleOCR/inference_results/' #数据原始路径 +SAVE_PATH = '/home/aistudio/mining_images/' #裁剪后数据保存路径 +file_list = '/home/aistudio/PaddleOCR/selected_results.txt' #数据预测结果 +label_file = '/home/aistudio/mining_images/mining_train.list' #输出真实数据训练集标签list + +if not os.path.exists(SAVE_PATH): + os.mkdir(SAVE_PATH) + +f_label = open(label_file, 'w') + + +def get_rotate_crop_image(img, points): + """ + 根据检测结果points,从输入图像img中裁剪出相应的区域 + """ + assert len(points) == 4, "shape of points must be 4*2" + img_crop_width = int( + max( + np.linalg.norm(points[0] - points[1]), + np.linalg.norm(points[2] - points[3]))) + img_crop_height = int( + max( + np.linalg.norm(points[0] - points[3]), + np.linalg.norm(points[1] - points[2]))) + pts_std = np.float32([[0, 0], [img_crop_width, 0], + [img_crop_width, img_crop_height], + [0, img_crop_height]]) + M = cv2.getPerspectiveTransform(points, pts_std) + # 形变或倾斜,会做透视变换,reshape成矩形 + dst_img = cv2.warpPerspective( + img, + M, (img_crop_width, img_crop_height), + borderMode=cv2.BORDER_REPLICATE, + flags=cv2.INTER_CUBIC) + dst_img_height, dst_img_width = dst_img.shape[0:2] + if dst_img_height * 1.0 / dst_img_width >= 1.5: + dst_img = np.rot90(dst_img) + return 
dst_img + +def crop_and_get_filelist(file_list): + with open(file_list, "r", encoding='utf-8') as fin: + lines = fin.readlines() + + img_num = 0 + for line in lines: + img_name, json_file = line.strip().split('\t') + preds = json.loads(json_file) + for item in preds: + transcription = item['transcription'] + points = item['points'] + points = np.array(points).astype('float32') + #print('processing {}...'.format(img_name)) + + img = cv2.imread(PATH+img_name) + dst_img = get_rotate_crop_image(img, points) + h, w, c = dst_img.shape + newWidth = int((32. / h) * w) + newImg = cv2.resize(dst_img, (newWidth, 32)) + new_img_name = '{:05d}.jpg'.format(img_num) + cv2.imwrite(SAVE_PATH+new_img_name, dst_img) + f_label.write(SAVE_PATH+new_img_name+'\t'+transcription+'\n') + img_num += 1 + + +crop_and_get_filelist(file_list) +f_label.close() +``` + +### 6.3 模型训练 + +通过数据挖掘,我们得到了真实场景数据和对应的标签。接下来使用真实数据finetune,观察精度提升效果。 + +利用真实数据进行finetune: + +```bash linenums="1" +cd ${PaddleOCR_root} +python tools/train.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml \ + -o Global.pretrained_model=./ckpt/ch_PP-OCRv3_rec_train/best_accuracy \ + Global.epoch_num=20 \ + Global.eval_batch_step='[0, 20]' \ + Train.dataset.data_dir=./data \ + Train.dataset.label_file_list=['./data/mining_train.list'] \ + Train.loader.batch_size_per_card=64 \ + Eval.dataset.data_dir=./data \ + Eval.dataset.label_file_list=["./data/val.list"] \ + Eval.loader.batch_size_per_card=64 +``` + +各参数含义参考第6部分合成数据finetune,只需要对训练数据路径做相应的修改: + +```txt +Train.dataset.data_dir: 训练数据集路径 +Train.dataset.label_file_list: 训练集文件列表 +``` + +示例使用我们提供的真实数据进行finetune,如想换成自己的数据,只需要相应的修改`Train.dataset.data_dir`和`Train.dataset.label_file_list`参数即可。 + +由于数据量不大,这里仅训练20个epoch即可。训练完成后,可以得到合成数据finetune后的精度为best acc=**71.33%**。 + +由于数量比较少,精度会比合成数据finetue的略低。 + +## 7. 
基于合成+真实数据finetune + +为进一步提升模型精度,我们结合使用合成数据和挖掘到的真实数据进行finetune。 + +利用合成+真实数据进行finetune,各参数含义参考第6部分合成数据finetune,只需要对训练数据路径做相应的修改: + +```txt +Train.dataset.data_dir: 训练数据集路径 +Train.dataset.label_file_list: 训练集文件列表 +``` + +生成训练list文件: + +```bash linenums="1" +# 生成训练集文件list +cat /home/aistudio/PaddleOCR/data/render_train.list /home/aistudio/PaddleOCR/data/mining_train.list > /home/aistudio/PaddleOCR/data/render_mining_train.list +``` + +启动训练: + +```bash linenums="1" +cd ${PaddleOCR_root} +python tools/train.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml \ + -o Global.pretrained_model=./ckpt/ch_PP-OCRv3_rec_train/best_accuracy \ + Global.epoch_num=40 \ + Global.eval_batch_step='[0, 20]' \ + Train.dataset.data_dir=./data \ + Train.dataset.label_file_list=['./data/render_mining_train.list'] \ + Train.loader.batch_size_per_card=64 \ + Eval.dataset.data_dir=./data \ + Eval.dataset.label_file_list=["./data/val.list"] \ + Eval.loader.batch_size_per_card=64 +``` + +示例使用我们提供的真实+合成数据进行finetune,如想换成自己的数据,只需要相应的修改Train.dataset.data_dir和Train.dataset.label_file_list参数即可。 + +由于数据量不大,这里仅训练40个epoch即可。训练完成后,可以得到合成数据finetune后的精度为best acc=**86.99%**。 + +可以看到,相较于原始PP-OCRv3的识别精度62.99%,使用合成数据+真实数据finetune后,识别精度能提升24%。 + +如需获取已训练模型,请加入PaddleX官方交流频道,获取20G OCR学习大礼包(内含《动手学OCR》电子书、课程回放视频、前沿论文等重磅资料) + +PaddleX官方交流频道: + +模型的推理部署方法可以参考repo文档: [docs](../ppocr/infer_deploy/python_infer.md) diff --git "a/docs/applications/\345\215\260\347\253\240\345\274\257\346\233\262\346\226\207\345\255\227\350\257\206\345\210\253.md" "b/docs/applications/\345\215\260\347\253\240\345\274\257\346\233\262\346\226\207\345\255\227\350\257\206\345\210\253.md" new file mode 100644 index 0000000000..2eb227aa64 --- /dev/null +++ "b/docs/applications/\345\215\260\347\253\240\345\274\257\346\233\262\346\226\207\345\255\227\350\257\206\345\210\253.md" @@ -0,0 +1,1033 @@ +--- +typora-copy-images-to: images +comments: true +--- + +## 1. 项目介绍 + +弯曲文字识别在OCR任务中有着广泛的应用,比如:自然场景下的招牌,艺术文字,以及常见的印章文字识别。 + +在本项目中,将以印章识别任务为例,介绍如何使用PaddleDetection和PaddleOCR完成印章检测和印章文字识别任务。 + +项目难点: + +1. 缺乏训练数据 +2. 图像质量参差不齐,图像模糊,文字不清晰 + +针对以上问题,本项目选用PaddleOCR里的PPOCRLabel工具完成数据标注。基于PaddleDetection完成印章区域检测,然后通过PaddleOCR里的端对端OCR算法和两阶段OCR算法分别完成印章文字识别任务。不同任务的精度效果如下: + +| 任务 | 训练数据数量 | 精度 | +| -------- | - | -------- | +| 印章检测 | 1000 | 95.00% | +| 印章文字识别-端对端OCR方法 | 700 | 47.00% | +| 印章文字识别-两阶段OCR方法 | 700 | 55.00% | + +点击进入 [AI Studio 项目](https://aistudio.baidu.com/aistudio/projectdetail/4586113) + +## 2. 环境搭建 + +本项目需要准备PaddleDetection和PaddleOCR的项目运行环境,其中PaddleDetection用于实现印章检测任务,PaddleOCR用于实现文字识别任务 + +### 2.1 准备PaddleDetection环境 + +下载PaddleDetection代码: + +```bash linenums="1" +!git clone https://github.com/PaddlePaddle/PaddleDetection.git +# 如果克隆github代码较慢,请从gitee上克隆代码 +#git clone https://gitee.com/PaddlePaddle/PaddleDetection.git +``` + +安装PaddleDetection依赖 + +```bash linenums="1" +!cd PaddleDetection && pip install -r requirements.txt +``` + +### 2.2 准备PaddleOCR环境 + +下载PaddleOCR代码: + +```bash linenums="1" +!git clone https://github.com/PaddlePaddle/PaddleOCR.git +# 如果克隆github代码较慢,请从gitee上克隆代码 +#git clone https://gitee.com/PaddlePaddle/PaddleOCR.git +``` + +安装PaddleOCR依赖 + +```bash linenums="1" +!cd PaddleOCR && git checkout dygraph && pip install -r requirements.txt +``` + +## 3. 
数据集准备 + +### 3.1 数据标注 + +本项目中使用[PPOCRLabel](https://github.com/PFCCLab/PPOCRLabel)工具标注印章检测数据,标注内容包括印章的位置以及印章中文字的位置和文字内容。 + +注:PPOCRLabel的使用方法参考[文档](https://github.com/PFCCLab/PPOCRLabel)。 + +PPOCRlabel标注印章数据步骤: + +- 打开数据集所在文件夹 +- 按下快捷键Q进行4点(多点)标注——针对印章文本识别, + - 印章弯曲文字包围框采用偶数点标注(比如4点,8点,16点),按照阅读顺序,以16点标注为例,从文字左上方开始标注->到文字右上方标注8个点->到文字右下方->文字左下方8个点,一共8个点,形成包围曲线,参考下图。如果文字弯曲程度不高,为了减小标注工作量,可以采用4点、8点标注,需要注意的是,文字上下点数相同。(总点数尽量不要超过18个) + - 对于需要识别的印章中非弯曲文字,采用4点框标注即可 + - 对应包围框的文字部分默认是”待识别”,需要修改为包围框内的具体文字内容 +- 快捷键W进行矩形标注——针对印章区域检测,印章检测区域保证标注框包围整个印章,包围框对应文字可以设置为'印章区域',方便后续处理。 +- 针对印章中的水平文字可以视情况考虑矩形或四点标注:保证按行标注即可。如果背景文字与印章文字比较接近,标注时尽量避开背景文字。 +- 标注完成后修改右侧文本结果,确认无误后点击下方check(或CTRL+V),确认本张图片的标注。 +- 所有图片标注完成后,在顶部菜单栏点击File -> Export Label导出label.txt。 + +标注完成后,可视化效果如下: +![](./images/f5acbc4f50dd401a8f535ed6a263f94b0edff82c1aed4285836a9ead989b9c13.png) + +数据标注完成后,标签中包含印章检测的标注和印章文字识别的标注,如下所示: + +```text linenums="1" +img/1.png [{"transcription": "印章区域", "points": [[87, 245], [214, 245], [214, 369], [87, 369]], "difficult": false}, {"transcription": "国家税务总局泸水市税务局第二税务分局", "points": [[110, 314], [116, 290], [131, 275], [152, 273], [170, 277], [181, 289], [186, 303], [186, 312], [201, 311], [198, 289], [189, 272], [175, 259], [152, 252], [124, 257], [100, 280], [94, 312]], "difficult": false}, {"transcription": "征税专用章", "points": [[117, 334], [183, 334], [183, 352], [117, 352]], "difficult": false}] +``` + +标注中包含表示'印章区域'的坐标和'印章文字'坐标以及文字内容。 + +### 3.2 数据处理 + +标注时为了方便标注,没有区分印章区域的标注框和文字区域的标注框,可以通过python代码完成标签的划分。 + +在本项目的'/home/aistudio/work/seal_labeled_datas'目录下,存放了标注的数据示例,如下: + +![](./images/3d762970e2184177a2c633695a31029332a4cd805631430ea797309492e45402.jpeg) + +标签文件'/home/aistudio/work/seal_labeled_datas/Label.txt'中的标注内容如下: + +```text linenums="1" +img/test1.png [{"transcription": "待识别", "points": [[408, 232], [537, 232], [537, 352], [408, 352]], "difficult": false}, {"transcription": "电子回单", "points": [[437, 305], [504, 305], [504, 322], [437, 322]], "difficult": false}, {"transcription": "云南省农村信用社", "points": [[417, 290], [434, 295], [438, 281], [446, 267], [455, 261], [472, 258], [489, 264], [498, 277], [502, 295], [526, 289], [518, 267], [503, 249], [475, 232], [446, 239], [429, 255], [418, 275]], "difficult": false}, {"transcription": "专用章", "points": [[437, 319], [503, 319], [503, 338], [437, 338]], "difficult": false}] +``` + +为了方便训练,我们需要通过python代码将用于训练印章检测和训练印章文字识别的标注区分开。 + +
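下面的完整脚本较长,其核心是利用 shapely 的多边形运算判断某个文字标注框是否落在印章外接框内部:若文字框与印章框的交集面积约等于文字框自身面积(说明被印章框包含),且并集面积明显大于文字框面积(排除文字框就是印章框本身的情况),则将其归为该印章内部的文字标注。这里先给出一个最小示意(假设已安装 shapely,点坐标格式与上文标注一致),完整实现见下方脚本:

```python linenums="1"
from shapely.geometry import Polygon

def is_text_in_seal(text_poly, seal_poly, eps=1e-3):
    """示意:判断文字标注框是否位于印章外接框内部,判定思路与下方完整脚本一致。"""
    text, seal = Polygon(text_poly), Polygon(seal_poly)
    inter = text.intersection(seal).area  # 交集面积 ≈ 文字框面积,说明文字框被印章框包含
    union = text.union(seal).area         # 并集面积明显大于文字框面积,说明二者不是同一个框
    return abs(text.area - inter) < eps and abs(text.area - union) > eps
```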
+ +```python linenums="1" +import numpy as np +import json +import cv2 +import os +from shapely.geometry import Polygon + + +def poly2box(poly): + xmin = np.min(np.array(poly)[:, 0]) + ymin = np.min(np.array(poly)[:, 1]) + xmax = np.max(np.array(poly)[:, 0]) + ymax = np.max(np.array(poly)[:, 1]) + return np.array([[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax]]) + + +def draw_text_det_res(dt_boxes, src_im, color=(255, 255, 0)): + for box in dt_boxes: + box = np.array(box).astype(np.int32).reshape(-1, 2) + cv2.polylines(src_im, [box], True, color=color, thickness=2) + return src_im + +class LabelDecode(object): + def __init__(self, **kwargs): + pass + + def __call__(self, data): + label = json.loads(data['label']) + + nBox = len(label) + seal_boxes = self.get_seal_boxes(label) + + gt_label = [] + + for seal_box in seal_boxes: + seal_anno = {'seal_box': seal_box} + boxes, txts, txt_tags = [], [], [] + + for bno in range(0, nBox): + box = label[bno]['points'] + txt = label[bno]['transcription'] + try: + ints = self.get_intersection(box, seal_box) + except Exception as E: + print(E) + continue + + if abs(Polygon(box).area - self.get_intersection(box, seal_box)) < 1e-3 and \ + abs(Polygon(box).area - self.get_union(box, seal_box)) > 1e-3: + + boxes.append(box) + txts.append(txt) + if txt in ['*', '###', '待识别']: + txt_tags.append(True) + else: + txt_tags.append(False) + + seal_anno['polys'] = boxes + seal_anno['texts'] = txts + seal_anno['ignore_tags'] = txt_tags + + gt_label.append(seal_anno) + + return gt_label + + def get_seal_boxes(self, label): + + nBox = len(label) + seal_box = [] + for bno in range(0, nBox): + box = label[bno]['points'] + if len(box) == 4: + seal_box.append(box) + + if len(seal_box) == 0: + return None + + seal_box = self.valid_seal_box(seal_box) + return seal_box + + + def is_seal_box(self, box, boxes): + is_seal = True + for poly in boxes: + if list(box.shape()) != list(box.shape.shape()): + if abs(Polygon(box).area - self.get_intersection(box, poly)) < 1e-3: + return False + else: + if np.sum(np.array(box) - np.array(poly)) < 1e-3: + # continue when the box is same with poly + continue + if abs(Polygon(box).area - self.get_intersection(box, poly)) < 1e-3: + return False + return is_seal + + + def valid_seal_box(self, boxes): + if len(boxes) == 1: + return boxes + + new_boxes = [] + flag = True + for k in range(0, len(boxes)): + flag = True + tmp_box = boxes[k] + for i in range(0, len(boxes)): + if k == i: continue + if abs(Polygon(tmp_box).area - self.get_intersection(tmp_box, boxes[i])) < 1e-3: + flag = False + continue + if flag: + new_boxes.append(tmp_box) + + return new_boxes + + + def get_union(self, pD, pG): + return Polygon(pD).union(Polygon(pG)).area + + def get_intersection_over_union(self, pD, pG): + return get_intersection(pD, pG) / get_union(pD, pG) + + def get_intersection(self, pD, pG): + return Polygon(pD).intersection(Polygon(pG)).area + + def expand_points_num(self, boxes): + max_points_num = 0 + for box in boxes: + if len(box) > max_points_num: + max_points_num = len(box) + ex_boxes = [] + for box in boxes: + ex_box = box + [box[-1]] * (max_points_num - len(box)) + ex_boxes.append(ex_box) + return ex_boxes + + +def gen_extract_label(data_dir, label_file, seal_gt, seal_ppocr_gt): + label_decode_func = LabelDecode() + gts = open(label_file, "r").readlines() + + seal_gt_list = [] + seal_ppocr_list = [] + + for idx, line in enumerate(gts): + img_path, label = line.strip().split("\t") + data = {'label': label, 'img_path':img_path} + res = 
label_decode_func(data) + src_img = cv2.imread(os.path.join(data_dir, img_path)) + if res is None: + print("ERROR! res is None!") + continue + + anno = [] + for i, gt in enumerate(res): + # print(i, box, type(box), ) + anno.append({'polys': gt['seal_box'], 'cls':1}) + + seal_gt_list.append(f"{img_path}\t{json.dumps(anno)}\n") + seal_ppocr_list.append(f"{img_path}\t{json.dumps(res)}\n") + + if not os.path.exists(os.path.dirname(seal_gt)): + os.makedirs(os.path.dirname(seal_gt)) + if not os.path.exists(os.path.dirname(seal_ppocr_gt)): + os.makedirs(os.path.dirname(seal_ppocr_gt)) + + with open(seal_gt, "w") as f: + f.writelines(seal_gt_list) + f.close() + + with open(seal_ppocr_gt, 'w') as f: + f.writelines(seal_ppocr_list) + f.close() + +def vis_seal_ppocr(data_dir, label_file, save_dir): + + datas = open(label_file, 'r').readlines() + for idx, line in enumerate(datas): + img_path, label = line.strip().split('\t') + img_path = os.path.join(data_dir, img_path) + + label = json.loads(label) + src_im = cv2.imread(img_path) + if src_im is None: + continue + + for anno in label: + seal_box = anno['seal_box'] + txt_boxes = anno['polys'] + + # vis seal box + src_im = draw_text_det_res([seal_box], src_im, color=(255, 255, 0)) + src_im = draw_text_det_res(txt_boxes, src_im, color=(255, 0, 0)) + + save_path = os.path.join(save_dir, os.path.basename(img_path)) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + # print(src_im.shape) + cv2.imwrite(save_path, src_im) + + +def draw_html(img_dir, save_name): + import glob + + images_dir = glob.glob(img_dir + "/*") + print(len(images_dir)) + + html_path = save_name + with open(html_path, 'w') as html: + html.write('\n\n') + html.write('\n') + html.write("") + + html.write("\n") + html.write(f'\n") + html.write(f'' % (base)) + html.write("\n") + + html.write('\n') + html.write('
\n GT') + + for i, filename in enumerate(sorted(images_dir)): + if filename.endswith("txt"): continue + print(filename) + + base = "{}".format(filename) + if True: + html.write("
{filename}\n GT') + html.write('GT 310\n
\n') + html.write('\n\n') + print("ok") + + +def crop_seal_from_img(label_file, data_dir, save_dir, save_gt_path): + + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + datas = open(label_file, 'r').readlines() + all_gts = [] + count = 0 + for idx, line in enumerate(datas): + img_path, label = line.strip().split('\t') + img_path = os.path.join(data_dir, img_path) + + label = json.loads(label) + src_im = cv2.imread(img_path) + if src_im is None: + continue + + for c, anno in enumerate(label): + seal_poly = anno['seal_box'] + txt_boxes = anno['polys'] + txts = anno['texts'] + ignore_tags = anno['ignore_tags'] + + box = poly2box(seal_poly) + img_crop = src_im[box[0][1]:box[2][1], box[0][0]:box[2][0], :] + + save_path = os.path.join(save_dir, f"{idx}_{c}.jpg") + cv2.imwrite(save_path, np.array(img_crop)) + + img_gt = [] + for i in range(len(txts)): + txt_boxes_crop = np.array(txt_boxes[i]) + txt_boxes_crop[:, 1] -= box[0, 1] + txt_boxes_crop[:, 0] -= box[0, 0] + img_gt.append({'transcription': txts[i], "points": txt_boxes_crop.tolist(), "ignore_tag": ignore_tags[i]}) + + if len(img_gt) >= 1: + count += 1 + save_gt = f"{os.path.basename(save_path)}\t{json.dumps(img_gt)}\n" + + all_gts.append(save_gt) + + print(f"The num of all image: {len(all_gts)}, and the number of useful image: {count}") + if not os.path.exists(os.path.dirname(save_gt_path)): + os.makedirs(os.path.dirname(save_gt_path)) + + with open(save_gt_path, "w") as f: + f.writelines(all_gts) + f.close() + print("Done") + + +if __name__ == "__main__": + # 数据处理 + gen_extract_label("./seal_labeled_datas", "./seal_labeled_datas/Label.txt", "./seal_ppocr_gt/seal_det_img.txt", "./seal_ppocr_gt/seal_ppocr_img.txt") + vis_seal_ppocr("./seal_labeled_datas", "./seal_ppocr_gt/seal_ppocr_img.txt", "./seal_ppocr_gt/seal_ppocr_vis/") + draw_html("./seal_ppocr_gt/seal_ppocr_vis/", "./vis_seal_ppocr.html") + seal_ppocr_img_label = "./seal_ppocr_gt/seal_ppocr_img.txt" + crop_seal_from_img(seal_ppocr_img_label, "./seal_labeled_datas/", "./seal_img_crop", "./seal_img_crop/label.txt") +``` + +
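脚本执行完毕后,可以用类似下面的小脚本抽查 `seal_img_crop/label.txt` 中的标注与裁剪出的印章小图是否对齐(仅为示意,路径沿用上文脚本中的默认输出路径):

```python linenums="1"
import os
import json
import cv2
import numpy as np

crop_dir = "./seal_img_crop"  # 上文脚本的默认输出目录
with open(os.path.join(crop_dir, "label.txt"), "r", encoding="utf-8") as f:
    img_name, label = f.readline().strip().split("\t")

img = cv2.imread(os.path.join(crop_dir, img_name))
for anno in json.loads(label):
    pts = np.array(anno["points"]).astype(np.int32).reshape(-1, 2)
    cv2.polylines(img, [pts], True, (0, 0, 255), 2)   # 在裁剪图上画出文字框
    print(anno["transcription"], anno["ignore_tag"])
cv2.imwrite("check_seal_crop.jpg", img)               # 保存可视化结果,人工确认框与文字是否对齐
```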
+ +处理完成后,生成的文件如下: + +```text linenums="1" +├── seal_img_crop/ +│ ├── 0_0.jpg +│ ├── ... +│ └── label.txt +├── seal_ppocr_gt/ +│ ├── seal_det_img.txt +│ ├── seal_ppocr_img.txt +│ └── seal_ppocr_vis/ +│ ├── test1.png +│ ├── ... +└── vis_seal_ppocr.html + +``` + +其中`seal_img_crop/label.txt`文件为印章识别标签文件,其内容格式为: + +```text linenums="1" +0_0.jpg [{"transcription": "\u7535\u5b50\u56de\u5355", "points": [[29, 73], [96, 73], [96, 90], [29, 90]], "ignore_tag": false}, {"transcription": "\u4e91\u5357\u7701\u519c\u6751\u4fe1\u7528\u793e", "points": [[9, 58], [26, 63], [30, 49], [38, 35], [47, 29], [64, 26], [81, 32], [90, 45], [94, 63], [118, 57], [110, 35], [95, 17], [67, 0], [38, 7], [21, 23], [10, 43]], "ignore_tag": false}, {"transcription": "\u4e13\u7528\u7ae0", "points": [[29, 87], [95, 87], [95, 106], [29, 106]], "ignore_tag": false}] +``` + +可以直接用于PaddleOCR的PGNet算法的训练。 + +`seal_ppocr_gt/seal_det_img.txt`为印章检测标签文件,其内容格式为: + +```text linenums="1" +img/test1.png [{"polys": [[408, 232], [537, 232], [537, 352], [408, 352]], "cls": 1}] +``` + +为了使用PaddleDetection工具完成印章检测模型的训练,需要将`seal_det_img.txt`转换为COCO或者VOC的数据标注格式。 + +可以直接使用下述代码将印章检测标注转换成VOC格式。 + +
+ +```python linenums="1" +import numpy as np +import json +import cv2 +import os +from shapely.geometry import Polygon + +seal_train_gt = "./seal_ppocr_gt/seal_det_img.txt" +# 注:仅用于示例,实际使用中需要分别转换训练集和测试集的标签 +seal_valid_gt = "./seal_ppocr_gt/seal_det_img.txt" + +def gen_main_train_txt(mode='train'): + if mode == "train": + file_path = seal_train_gt + if mode in ['valid', 'test']: + file_path = seal_valid_gt + + save_path = f"./seal_VOC/ImageSets/Main/{mode}.txt" + save_train_path = f"./seal_VOC/{mode}.txt" + if not os.path.exists(os.path.dirname(save_path)): + os.makedirs(os.path.dirname(save_path)) + + datas = open(file_path, 'r').readlines() + img_names = [] + train_names = [] + for line in datas: + img_name = line.strip().split('\t')[0] + img_name = os.path.basename(img_name) + (i_name, extension) = os.path.splitext(img_name) + t_name = 'JPEGImages/'+str(img_name)+' '+'Annotations/'+str(i_name)+'.xml\n' + train_names.append(t_name) + img_names.append(i_name + "\n") + + with open(save_train_path, "w") as f: + f.writelines(train_names) + f.close() + + with open(save_path, "w") as f: + f.writelines(img_names) + f.close() + + print(f"{mode} save done") + + +def gen_xml_label(mode='train'): + if mode == "train": + file_path = seal_train_gt + if mode in ['valid', 'test']: + file_path = seal_valid_gt + + datas = open(file_path, 'r').readlines() + img_names = [] + train_names = [] + anno_path = "./seal_VOC/Annotations" + img_path = "./seal_VOC/JPEGImages" + + if not os.path.exists(anno_path): + os.makedirs(anno_path) + if not os.path.exists(img_path): + os.makedirs(img_path) + + for idx, line in enumerate(datas): + img_name, label = line.strip().split('\t') + img = cv2.imread(os.path.join("./seal_labeled_datas", img_name)) + cv2.imwrite(os.path.join(img_path, os.path.basename(img_name)), img) + height, width, c = img.shape + img_name = os.path.basename(img_name) + (i_name, extension) = os.path.splitext(img_name) + label = json.loads(label) + + xml_file = open(("./seal_VOC/Annotations" + '/' + i_name + '.xml'), 'w') + xml_file.write('\n') + xml_file.write(' seal_VOC\n') + xml_file.write(' ' + str(img_name) + '\n') + xml_file.write(' ' + 'Annotations/' + str(img_name) + '\n') + xml_file.write(' \n') + xml_file.write(' ' + str(width) + '\n') + xml_file.write(' ' + str(height) + '\n') + xml_file.write(' 3\n') + xml_file.write(' \n') + xml_file.write(' 0\n') + + for anno in label: + poly = anno['polys'] + if anno['cls'] == 1: + gt_cls = 'redseal' + xmin = np.min(np.array(poly)[:, 0]) + ymin = np.min(np.array(poly)[:, 1]) + xmax = np.max(np.array(poly)[:, 0]) + ymax = np.max(np.array(poly)[:, 1]) + xmin,ymin,xmax,ymax= int(xmin),int(ymin),int(xmax),int(ymax) + xml_file.write(' \n') + xml_file.write(' '+str(gt_cls)+'\n') + xml_file.write(' Unspecified\n') + xml_file.write(' 0\n') + xml_file.write(' 0\n') + xml_file.write(' \n') + xml_file.write(' '+str(xmin)+'\n') + xml_file.write(' '+str(ymin)+'\n') + xml_file.write(' '+str(xmax)+'\n') + xml_file.write(' '+str(ymax)+'\n') + xml_file.write(' \n') + xml_file.write(' \n') + xml_file.write('') + xml_file.close() + print(f'{mode} xml save done!') + + +gen_main_train_txt() +gen_main_train_txt('valid') +gen_xml_label('train') +gen_xml_label('valid') + +``` + +
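+
+除 Annotations 与 ImageSets 外,PaddleDetection 的 VOC 格式数据还需要类别列表文件 label_list.txt(即下文数据集配置中 label_list 字段引用的文件)。上述转换脚本没有生成该文件,可以参考如下示意代码补充生成,类别名需与写入 XML 的类别保持一致(这里以 redseal 为例):
+
+```python linenums="1"
+# 生成 label_list.txt,每行一个检测类别名称
+# 注意:类别名需要与 gen_xml_label 中写入 XML 的类别(如 redseal)一致
+with open("./seal_VOC/label_list.txt", "w") as f:
+    f.write("redseal\n")
+```
+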
+ +数据处理完成后,转换为VOC格式的印章检测数据存储在~/data/seal_VOC目录下,目录组织结构为: + +```text linenums="1" +├── Annotations/ +├── ImageSets/ +│   └── Main/ +│   ├── train.txt +│   └── valid.txt +├── JPEGImages/ +├── train.txt +└── valid.txt +└── label_list.txt +``` + +Annotations下为数据的标签,JPEGImages目录下为图像文件,label_list.txt为标注检测框类别标签文件。 + +在接下来一节中,将介绍如何使用PaddleDetection工具库完成印章检测模型的训练。 + +## 4. 印章检测实践 + +在实际应用中,印章多是出现在合同,发票,公告等场景中,印章文字识别的任务需要排除图像中背景文字的影响,因此需要先检测出图像中的印章区域。 + +借助PaddleDetection目标检测库可以很容易的实现印章检测任务,使用PaddleDetection训练印章检测任务流程如下: + +- 选择算法 +- 修改数据集配置路径 +- 启动训练 + +**算法选择** + +PaddleDetection中有许多检测算法可以选择,考虑到每条数据中印章区域较为清晰,且考虑到性能需求。在本项目中,我们采用mobilenetv3为backbone的ppyolo算法完成印章检测任务,对应的配置文件是:configs/ppyolo/ppyolo_mbv3_large.yml + +**修改配置文件** + +配置文件中的默认数据路径是COCO, +需要修改为印章检测的数据路径,主要修改如下: +在配置文件'configs/ppyolo/ppyolo_mbv3_large.yml'末尾增加如下内容: + +```yaml linenums="1" +metric: VOC +map_type: 11point +num_classes: 2 + +TrainDataset: + !VOCDataSet + dataset_dir: dataset/seal_VOC + anno_path: train.txt + label_list: label_list.txt + data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult'] + +EvalDataset: + !VOCDataSet + dataset_dir: dataset/seal_VOC + anno_path: test.txt + label_list: label_list.txt + data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult'] + +TestDataset: + !ImageFolder + anno_path: dataset/seal_VOC/label_list.txt +``` + +配置文件中设置的数据路径在PaddleDetection/dataset目录下,我们可以将处理后的印章检测训练数据移动到PaddleDetection/dataset目录下或者创建一个软连接。 + +```bash linenums="1" +!ln -s seal_VOC ./PaddleDetection/dataset/ +``` + +另外图象中印章数量比较少,可以调整NMS后处理的检测框数量,即keep_top_k,nms_top_k 从100,1000,调整为10,100。在配置文件'configs/ppyolo/ppyolo_mbv3_large.yml'末尾增加如下内容完成后处理参数的调整 + +```yaml linenums="1" +BBoxPostProcess: + decode: + name: YOLOBox + conf_thresh: 0.005 + downsample_ratio: 32 + clip_bbox: true + scale_x_y: 1.05 + nms: + name: MultiClassNMS + keep_top_k: 10 # 修改前100 + nms_threshold: 0.45 + nms_top_k: 100 # 修改前1000 + score_threshold: 0.005 +``` + +修改完成后,需要在PaddleDetection中增加印章数据的处理代码,即在PaddleDetection/ppdet/data/source/目录下创建seal.py文件,文件中填充如下代码: + +
+ +```python linenums="1" +import os +import numpy as np +from ppdet.core.workspace import register, serializable +from .dataset import DetDataset +import cv2 +import json + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + + +@register +@serializable +class SealDataSet(DetDataset): + """ + Load dataset with COCO format. + + Args: + dataset_dir (str): root directory for dataset. + image_dir (str): directory for images. + anno_path (str): coco annotation file path. + data_fields (list): key name of data dictionary, at least have 'image'. + sample_num (int): number of samples to load, -1 means all. + load_crowd (bool): whether to load crowded ground-truth. + False as default + allow_empty (bool): whether to load empty entry. False as default + empty_ratio (float): the ratio of empty record number to total + record's, if empty_ratio is out of [0. ,1.), do not sample the + records and use all the empty entries. 1. as default + """ + + def __init__(self, + dataset_dir=None, + image_dir=None, + anno_path=None, + data_fields=['image'], + sample_num=-1, + load_crowd=False, + allow_empty=False, + empty_ratio=1.): + super(SealDataSet, self).__init__(dataset_dir, image_dir, anno_path, + data_fields, sample_num) + self.load_image_only = False + self.load_semantic = False + self.load_crowd = load_crowd + self.allow_empty = allow_empty + self.empty_ratio = empty_ratio + + def _sample_empty(self, records, num): + # if empty_ratio is out of [0. ,1.), do not sample the records + if self.empty_ratio < 0. or self.empty_ratio >= 1.: + return records + import random + sample_num = min( + int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records)) + records = random.sample(records, sample_num) + return records + + def parse_dataset(self): + anno_path = os.path.join(self.dataset_dir, self.anno_path) + image_dir = os.path.join(self.dataset_dir, self.image_dir) + + records = [] + empty_records = [] + ct = 0 + + assert anno_path.endswith('.txt'), \ + 'invalid seal_gt file: ' + anno_path + + all_datas = open(anno_path, 'r').readlines() + + for idx, line in enumerate(all_datas): + im_path, label = line.strip().split('\t') + img_path = os.path.join(image_dir, im_path) + label = json.loads(label) + im_h, im_w, im_c = cv2.imread(img_path).shape + + coco_rec = { + 'im_file': img_path, + 'im_id': np.array([idx]), + 'h': im_h, + 'w': im_w, + } if 'image' in self.data_fields else {} + + if not self.load_image_only: + bboxes = [] + for anno in label: + poly = anno['polys'] + # poly to box + x1 = np.min(np.array(poly)[:, 0]) + y1 = np.min(np.array(poly)[:, 1]) + x2 = np.max(np.array(poly)[:, 0]) + y2 = np.max(np.array(poly)[:, 1]) + eps = 1e-5 + if x2 - x1 > eps and y2 - y1 > eps: + clean_box = [ + round(float(x), 3) for x in [x1, y1, x2, y2] + ] + anno = {'clean_box': clean_box, 'gt_cls':int(anno['cls'])} + bboxes.append(anno) + else: + logger.info("invalid box") + + num_bbox = len(bboxes) + if num_bbox <= 0: + continue + + gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32) + gt_class = np.zeros((num_bbox, 1), dtype=np.int32) + is_crowd = np.zeros((num_bbox, 1), dtype=np.int32) + # gt_poly = [None] * num_bbox + + for i, box in enumerate(bboxes): + gt_class[i][0] = box['gt_cls'] + gt_bbox[i, :] = box['clean_box'] + is_crowd[i][0] = 0 + + gt_rec = { + 'is_crowd': is_crowd, + 'gt_class': gt_class, + 'gt_bbox': gt_bbox, + # 'gt_poly': gt_poly, + } + + for k, v in gt_rec.items(): + if k in self.data_fields: + coco_rec[k] = v + + records.append(coco_rec) + ct += 1 + if self.sample_num > 0 and 
ct >= self.sample_num: + break + self.roidbs = records +``` + +
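+
+新增 seal.py 之后,通常还需要让 PaddleDetection 能够从配置文件中找到 SealDataSet,常见做法是在 ppdet/data/source/__init__.py 中导入并导出该类(以下仅为示意,具体写法请参考所用 PaddleDetection 版本中 __init__.py 的已有风格):
+
+```python linenums="1"
+# 在 ppdet/data/source/__init__.py 中追加(示意)
+from . import seal
+from .seal import SealDataSet
+```
+
+完成上述导入后,即可在数据集配置中以 `!SealDataSet` 的形式引用该类,并将其 anno_path 指向前文生成的 seal_det_img.txt 这类文本标注文件。
+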
+ +**启动训练** + +启动单卡训练的命令为: + +```bash linenums="1" +!python3 tools/train.py -c configs/ppyolo/ppyolo_mbv3_large.yml --eval + +# 分布式训练命令为: +!python3 -m paddle.distributed.launch --gpus 0,1,2,3,4,5,6,7 tools/train.py -c configs/ppyolo/ppyolo_mbv3_large.yml --eval +``` + +训练完成后,日志中会打印模型的精度: + +```bash linenums="1" +[07/05 11:42:09] ppdet.engine INFO: Eval iter: 0 +[07/05 11:42:14] ppdet.metrics.metrics INFO: Accumulating evaluatation results... +[07/05 11:42:14] ppdet.metrics.metrics INFO: mAP(0.50, 11point) = 99.31% +[07/05 11:42:14] ppdet.engine INFO: Total sample number: 112, averge FPS: 26.45840794253432 +[07/05 11:42:14] ppdet.engine INFO: Best test bbox ap is 0.996. +``` + +我们可以使用训练好的模型观察预测结果: + +```bash linenums="1" +!python3 tools/infer.py -c configs/ppyolo/ppyolo_mbv3_large.yml -o weights=./output/ppyolo_mbv3_large/model_final.pdparams --img_dir=./test.jpg +``` + +预测结果如下: + +![](./images/0f650c032b0f4d56bd639713924768cc820635e9977845008d233f465291a29e.jpeg) + +## 5. 印章文字识别实践 + +在使用ppyolo检测到印章区域后,接下来借助PaddleOCR里的文字识别能力,完成印章中文字的识别。 + +PaddleOCR中的OCR算法包含文字检测算法,文字识别算法以及OCR端对端算法。 + +文字检测算法负责检测到图像中的文字,再由文字识别模型识别出检测到的文字,进而实现OCR的任务。文字检测+文字识别串联完成OCR任务的架构称为两阶段的OCR算法。相对应的端对端的OCR方法可以用一个算法同时完成文字检测和识别的任务。 + +| 文字检测 | 文字识别 | 端对端算法 | +| -------- | -------- | -------- | +| DB\DB++\EAST\SAST\PSENet | SVTR\CRNN\NRTN\Abinet\SAR\... | PGNet | + +本节中将分别介绍端对端的文字检测识别算法以及两阶段的文字检测识别算法在印章检测识别任务上的实践。 + +### 5.1 端对端印章文字识别实践 + +本节介绍使用PaddleOCR里的PGNet算法完成印章文字识别。 + +PGNet属于端对端的文字检测识别算法,在PaddleOCR中的配置文件为: +[PaddleOCR/configs/e2e/e2e_r50_vd_pg.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6/configs/e2e/e2e_r50_vd_pg.yml) + +使用PGNet完成文字检测识别任务的步骤为: + +- 修改配置文件 +- 启动训练 + +PGNet默认配置文件的数据路径为totaltext数据集路径,本次训练中,需要修改为上一节数据处理后得到的标签文件和数据目录: + +训练数据配置修改后如下: + +```yaml linenums="1" +Train: + dataset: + name: PGDataSet + data_dir: ./train_data/seal_ppocr + label_file_list: [./train_data/seal_ppocr/seal_ppocr_img.txt] + ratio_list: [1.0] +``` + +测试数据集配置修改后如下: + +```yaml linenums="1" +Eval: + dataset: + name: PGDataSet + data_dir: ./train_data/seal_ppocr_test + label_file_list: [./train_data/seal_ppocr_test/seal_ppocr_img.txt] +``` + +启动训练的命令为: + +```bash linenums="1" +!python3 tools/train.py -c configs/e2e/e2e_r50_vd_pg.yml +``` + +模型训练完成后,可以得到最终的精度为47.4%。数据量较少,以及数据质量较差会影响模型的训练精度,如果有更多的数据参与训练,精度将进一步提升。 + +如需获取已训练模型,请点击文末的链接,加入官方交流群获取全部OCR垂类模型下载链接、《动手学OCR》电子书等全套OCR学习资料🎁 + +### 5.2 两阶段印章文字识别实践 + +上一节介绍了使用PGNet实现印章识别任务的训练流程。本小节将介绍使用PaddleOCR里的文字检测和文字识别算法分别完成印章文字的检测和识别。 + +#### 5.2.1 印章文字检测 + +PaddleOCR中包含丰富的文字检测算法,包含DB,DB++,EAST,SAST,PSENet等等。其中DB,DB++,PSENet均支持弯曲文字检测,本项目中,使用DB++作为印章弯曲文字检测算法。 + +PaddleOCR中发布的db++文字检测算法模型是英文文本检测模型,因此需要重新训练模型。 + +修改[DB++配置文件](DB++的默认配置文件位于[configs/det/det_r50_db++_icdar15.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6/configs/det/det_r50_db%2B%2B_icdar15.yml) +中的数据路径: + +```yaml linenums="1" +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/seal_ppocr + label_file_list: [./train_data/seal_ppocr/seal_ppocr_img.txt] + ratio_list: [1.0] +``` + +测试数据集配置修改后如下: + +```yaml linenums="1" +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/seal_ppocr_test + label_file_list: [./train_data/seal_ppocr_test/seal_ppocr_img.txt] +``` + +启动训练: + +```bash linenums="1" +!python3 tools/train.py -c configs/det/det_r50_db++_icdar15.yml -o Global.epoch_num=100 +``` + +考虑到数据较少,通过Global.epoch_num设置仅训练100个epoch。 +模型训练完成后,在测试集上预测的可视化效果如下: + +![](./images/498119182f0a414ab86ae2de752fa31c9ddc3a74a76847049cc57884602cb269-20240704185744623.png) + 
+如需获取已训练模型,请点击文末的链接,加入官方交流群获取全部OCR垂类模型下载链接、《动手学OCR》电子书等全套OCR学习资料🎁 + +#### 5.2.2 印章文字识别 + +上一节中完成了印章文字的检测模型训练,本节介绍印章文字识别模型的训练。识别模型采用SVTR算法,SVTR算法是IJCAI收录的文字识别算法,SVTR模型具备超轻量高精度的特点。 + +在启动训练之前,需要准备印章文字识别需要的数据集,需要使用如下代码,将印章中的文字区域剪切出来构建训练集。 + +```python linenums="1" +import cv2 +import numpy as np + +def get_rotate_crop_image(img, points): + ''' + img_height, img_width = img.shape[0:2] + left = int(np.min(points[:, 0])) + right = int(np.max(points[:, 0])) + top = int(np.min(points[:, 1])) + bottom = int(np.max(points[:, 1])) + img_crop = img[top:bottom, left:right, :].copy() + points[:, 0] = points[:, 0] - left + points[:, 1] = points[:, 1] - top + ''' + assert len(points) == 4, "shape of points must be 4*2" + img_crop_width = int( + max( + np.linalg.norm(points[0] - points[1]), + np.linalg.norm(points[2] - points[3]))) + img_crop_height = int( + max( + np.linalg.norm(points[0] - points[3]), + np.linalg.norm(points[1] - points[2]))) + pts_std = np.float32([[0, 0], [img_crop_width, 0], + [img_crop_width, img_crop_height], + [0, img_crop_height]]) + M = cv2.getPerspectiveTransform(points, pts_std) + dst_img = cv2.warpPerspective( + img, + M, (img_crop_width, img_crop_height), + borderMode=cv2.BORDER_REPLICATE, + flags=cv2.INTER_CUBIC) + dst_img_height, dst_img_width = dst_img.shape[0:2] + if dst_img_height * 1.0 / dst_img_width >= 1.5: + dst_img = np.rot90(dst_img) + return dst_img + + +def run(data_dir, label_file, save_dir): + datas = open(label_file, 'r').readlines() + for idx, line in enumerate(datas): + img_path, label = line.strip().split('\t') + img_path = os.path.join(data_dir, img_path) + + label = json.loads(label) + src_im = cv2.imread(img_path) + if src_im is None: + continue + + for anno in label: + seal_box = anno['seal_box'] + txt_boxes = anno['polys'] + crop_im = get_rotate_crop_image(src_im, text_boxes) + + save_path = os.path.join(save_dir, f'{idx}.png') + if not os.path.exists(save_dir): + os.makedirs(save_dir) + # print(src_im.shape) + cv2.imwrite(save_path, crop_im) + +``` + +数据处理完成后,即可配置训练的配置文件。SVTR配置文件选择[configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml) +修改SVTR配置文件中的训练数据部分如下: + +```yaml linenums="1" +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/seal_ppocr_crop/ + label_file_list: + - ./train_data/seal_ppocr_crop/train_list.txt +``` + +修改预测部分配置文件: + +```yaml linenums="1" +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/seal_ppocr_crop/ + label_file_list: + - ./train_data/seal_ppocr_crop_test/train_list.txt +``` + +启动训练: + +```bash linenums="1" +!python3 tools/train.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml + +``` + +训练完成后可以发现测试集指标达到了61%。 +由于数据较少,训练时会发现在训练集上的acc指标远大于测试集上的acc指标,即出现过拟合现象。通过补充数据和一些数据增强可以缓解这个问题。 + +如需获取已训练模型,请加入PaddleX官方交流频道,获取20G OCR学习大礼包(内含《动手学OCR》电子书、课程回放视频、前沿论文等重磅资料) + +- PaddleX官方交流频道: diff --git "a/docs/applications/\345\217\221\347\245\250\345\205\263\351\224\256\344\277\241\346\201\257\346\212\275\345\217\226.md" "b/docs/applications/\345\217\221\347\245\250\345\205\263\351\224\256\344\277\241\346\201\257\346\212\275\345\217\226.md" new file mode 100644 index 0000000000..2c7508f7a3 --- /dev/null +++ "b/docs/applications/\345\217\221\347\245\250\345\205\263\351\224\256\344\277\241\346\201\257\346\212\275\345\217\226.md" @@ -0,0 +1,315 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# 基于VI-LayoutXLM的发票关键信息抽取 + +## 1. 
项目背景及意义 + +关键信息抽取在文档场景中被广泛使用,如身份证中的姓名、住址信息抽取,快递单中的姓名、联系方式等关键字段内容的抽取。传统基于模板匹配的方案需要针对不同的场景制定模板并进行适配,较为繁琐,不够鲁棒。基于该问题,我们借助飞桨提供的PaddleOCR套件中的关键信息抽取方案,实现对增值税发票场景的关键信息抽取。 + +## 2. 项目内容 + +本项目基于PaddleOCR开源套件,以VI-LayoutXLM多模态关键信息抽取模型为基础,针对增值税发票场景进行适配,提取该场景的关键信息。 + +## 3. 安装环境 + +```bash linenums="1" +# 首先git官方的PaddleOCR项目,安装需要的依赖 +# 第一次运行打开该注释 +git clone https://gitee.com/PaddlePaddle/PaddleOCR.git +cd PaddleOCR +# 安装PaddleOCR的依赖 +pip install -r requirements.txt +# 安装关键信息抽取任务的依赖 +pip install -r ./ppstructure/kie/requirements.txt +``` + +## 4. 关键信息抽取 + +基于文档图像的关键信息抽取包含3个部分:(1)文本检测(2)文本识别(3)关键信息抽取方法,包括语义实体识别或者关系抽取,下面分别进行介绍。 + +### 4.1 文本检测 + +本文重点关注发票的关键信息抽取模型训练与预测过程,因此在关键信息抽取过程中,直接使用标注的文本检测与识别标注信息进行测试,如果你希望自定义该场景的文本检测模型,完成端到端的关键信息抽取部分,请参考[文本检测模型训练教程](../ppocr/model_train/detection.md),按照训练数据格式准备数据,并完成该场景下垂类文本检测模型的微调过程。 + +### 4.2 文本识别 + +本文重点关注发票的关键信息抽取模型训练与预测过程,因此在关键信息抽取过程中,直接使用提供的文本检测与识别标注信息进行测试,如果你希望自定义该场景的文本检测模型,完成端到端的关键信息抽取部分,请参考[文本识别模型训练教程](../ppocr/model_train/recognition.md),按照训练数据格式准备数据,并完成该场景下垂类文本识别模型的微调过程。 + +### 4.3 语义实体识别 (Semantic Entity Recognition) + +语义实体识别指的是给定一段文本行,确定其类别(如`姓名`、`住址`等类别)。PaddleOCR中提供了基于VI-LayoutXLM的多模态语义实体识别方法,融合文本、位置与版面信息,相比LayoutXLM多模态模型,去除了其中的视觉骨干网络特征提取部分,引入符合阅读顺序的文本行排序方法,同时使用UDML联合互蒸馏方法进行训练,最终在精度与速度方面均超越LayoutXLM。更多关于VI-LayoutXLM的算法介绍与精度指标,请参考:[VI-LayoutXLM算法介绍](../algorithm/kie/algorithm_kie_layoutxlm.md)。 + +#### 4.3.1 准备数据 + +发票场景为例,我们首先需要标注出其中的关键字段,我们将其标注为`问题-答案`的key-value pair,如下,编号No为12270830,则`No`字段标注为question,`12270830`字段标注为answer。如下图所示。 + +![](./images/185381131-76b6e260-04fe-46d9-baca-6bdd7fe0d0ce.jpg) + +**注意:** + +* 如果文本检测模型数据标注过程中,没有标注 **非关键信息内容** 的检测框,那么在标注关键信息抽取任务的时候,也不需要标注该部分,如上图所示;如果标注的过程,如果同时标注了**非关键信息内容** 的检测框,那么我们需要将该部分的label记为other。 +* 标注过程中,需要以文本行为单位进行标注,无需标注单个字符的位置信息。 + +已经处理好的增值税发票数据集从这里下载:[增值税发票数据集下载链接](https://aistudio.baidu.com/aistudio/datasetdetail/165561)。 + +下载好发票数据集,并解压在train_data目录下,目录结构如下所示。 + +```text linenums="1" +train_data + |--zzsfp + |---class_list.txt + |---imgs/ + |---train.json + |---val.json +``` + +其中`class_list.txt`是包含`other`, `question`, `answer`,3个种类的的类别列表(不区分大小写),`imgs`目录底下,`train.json`与`val.json`分别表示训练与评估集合的标注文件。训练集中包含30张图片,验证集中包含8张图片。部分标注如下所示。 + +```python linenums="1" +b33.jpg [{"transcription": "No", "label": "question", "points": [[2882, 472], [3026, 472], [3026, 588], [2882, 588]], }, {"transcription": "12269563", "label": "answer", "points": [[3066, 448], [3598, 448], [3598, 576], [3066, 576]], ]}] +``` + +相比于OCR检测的标注,仅多了`label`字段。 + +#### 4.3.2 开始训练 + +VI-LayoutXLM的配置为[ser_vi_layoutxlm_xfund_zh_udml.yml](../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml),需要修改数据、类别数目以及配置文件。 + +```yaml linenums="1" linenums="1" +Architecture: + model_type: &model_type "kie" + name: DistillationModel + algorithm: Distillation + Models: + Teacher: + pretrained: + freeze_params: false + return_all_feats: true + model_type: *model_type + algorithm: &algorithm "LayoutXLM" + Transform: + Backbone: + name: LayoutXLMForSer + pretrained: True + # one of base or vi + mode: vi + checkpoints: + # 定义类别数目 + num_classes: &num_classes 5 + ... + +PostProcess: + name: DistillationSerPostProcess + model_name: ["Student", "Teacher"] + key: backbone_out + # 定义类别文件 + class_path: &class_path train_data/zzsfp/class_list.txt + +Train: + dataset: + name: SimpleDataSet + # 定义训练数据目录与标注文件 + data_dir: train_data/zzsfp/imgs + label_file_list: + - train_data/zzsfp/train.json + ... + +Eval: + dataset: + # 定义评估数据目录与标注文件 + name: SimpleDataSet + data_dir: train_data/zzsfp/imgs + label_file_list: + - train_data/zzsfp/val.json + ... 
+``` + +LayoutXLM与VI-LayoutXLM针对该场景的训练结果如下所示。 + +| 模型 | 迭代轮数 | Hmean | +| :---: | :---: | :---: | +| LayoutXLM | 50 | 100.00% | +| VI-LayoutXLM | 50 | 100.00% | + +可以看出,由于当前数据量较少,场景比较简单,因此2个模型的Hmean均达到了100%。 + +#### 4.3.3 模型评估 + +模型训练过程中,使用的是知识蒸馏的策略,最终保留了学生模型的参数,在评估时,我们需要针对学生模型的配置文件进行修改: [ser_vi_layoutxlm_xfund_zh.yml](../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml),修改内容与训练配置相同,包括**类别数、类别映射文件、数据目录**。 + +修改完成后,执行下面的命令完成评估过程。 + +```bash linenums="1" +# 注意:需要根据你的配置文件地址与保存的模型地址,对评估命令进行修改 +python3 tools/eval.py -c ./fapiao/ser_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_udml/best_accuracy +``` + +输出结果如下所示: + +```bash linenums="1" +[2022/08/18 08:49:58] ppocr INFO: metric eval *************** +[2022/08/18 08:49:58] ppocr INFO: precision:1.0 +[2022/08/18 08:49:58] ppocr INFO: recall:1.0 +[2022/08/18 08:49:58] ppocr INFO: hmean:1.0 +[2022/08/18 08:49:58] ppocr INFO: fps:1.9740402401574881 +``` + +#### 4.3.4 模型预测 + +使用下面的命令进行预测: + +```bash linenums="1" +python3 tools/infer_kie_token_ser.py -c fapiao/ser_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_udml/best_accuracy Global.infer_img=./train_data/XFUND/zh_val/val.json Global.infer_mode=False +``` + +预测结果会保存在配置文件中的`Global.save_res_path`目录中。 + +部分预测结果如下所示。 + +![](./images/185310636-6ce02f7c-790d-479f-b163-ea97a5a04808-20240704190212828.jpg) + +* 注意:在预测时,使用的文本检测与识别结果为标注的结果,直接从json文件里面进行读取。 + +如果希望使用OCR引擎结果得到的结果进行推理,则可以使用下面的命令进行推理。 + +```bash linenums="1" +python3 tools/infer_kie_token_ser.py -c fapiao/ser_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_udml/best_accuracy Global.infer_img=./train_data/zzsfp/imgs/b25.jpg Global.infer_mode=True +``` + +结果如下所示: + +![](./images/185384321-61153faa-e407-45c4-8e7c-a39540248189.jpg) + +它会使用PP-OCRv3的文本检测与识别模型进行获取文本位置与内容信息。 + +可以看出,由于训练的过程中,没有标注额外的字段为other类别,所以大多数检测出来的字段被预测为question或者answer。 + +如果希望构建基于你在垂类场景训练得到的OCR检测与识别模型,可以使用下面的方法传入检测与识别的inference 模型路径,即可完成OCR文本检测与识别以及SER的串联过程。 + +```bash linenums="1" +python3 tools/infer_kie_token_ser.py -c fapiao/ser_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_udml/best_accuracy Global.infer_img=./train_data/zzsfp/imgs/b25.jpg Global.infer_mode=True Global.kie_rec_model_dir="your_rec_model" Global.kie_det_model_dir="your_det_model" +``` + +### 4.4 关系抽取(Relation Extraction) + +使用SER模型,可以获取图像中所有的question与answer的字段,继续这些字段的类别,我们需要进一步获取question与answer之间的连接,因此需要进一步训练关系抽取模型,解决该问题。本文也基于VI-LayoutXLM多模态预训练模型,进行下游RE任务的模型训练。 + +#### 4.4.1 准备数据 + +以发票场景为例,相比于SER任务,RE中还需要标记每个文本行的id信息以及链接关系linking,如下所示。 + +![](./images/185387870-dc9125a0-9ceb-4036-abf3-184b6e65dc7d.jpg) + +![](./images/185387870-dc9125a0-9ceb-4036-abf3-184b6e65dc7d-20240704190305748.jpg) + +标注文件的部分内容如下所示。 + +```python linenums="1" +b33.jpg [{"transcription": "No", "label": "question", "points": [[2882, 472], [3026, 472], [3026, 588], [2882, 588]], "id": 0, "linking": [[0, 1]]}, {"transcription": "12269563", "label": "answer", "points": [[3066, 448], [3598, 448], [3598, 576], [3066, 576]], "id": 1, "linking": [[0, 1]]}] +``` + +相比与SER的标注,多了`id`与`linking`的信息,分别表示唯一标识以及连接关系。 + +已经处理好的增值税发票数据集从这里下载:[增值税发票数据集下载链接](https://aistudio.baidu.com/aistudio/datasetdetail/165561)。 + +#### 4.4.2 开始训练 + +基于VI-LayoutXLM的RE任务配置为[re_vi_layoutxlm_xfund_zh_udml.yml](../configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh_udml.yml),需要修改**数据路径、类别列表文件**。 + +```yaml linenums="1" linenums="1" +Train: + dataset: + name: SimpleDataSet + # 定义训练数据目录与标注文件 + 
data_dir: train_data/zzsfp/imgs + label_file_list: + - train_data/zzsfp/train.json + transforms: + - DecodeImage: # load image + img_mode: RGB + channel_first: False + - VQATokenLabelEncode: # Class handling label + contains_re: True + algorithm: *algorithm + class_path: &class_path train_data/zzsfp/class_list.txt + ... + +Eval: + dataset: + # 定义评估数据目录与标注文件 + name: SimpleDataSet + data_dir: train_data/zzsfp/imgs + label_file_list: + - train_data/zzsfp/val.json + ... + +``` + +LayoutXLM与VI-LayoutXLM针对该场景的训练结果如下所示。 + +| 模型 | 迭代轮数 | Hmean | +| :---: | :---: | :---: | +| LayoutXLM | 50 | 98.00% | +| VI-LayoutXLM | 50 | 99.30% | + +可以看出,对于VI-LayoutXLM相比LayoutXLM的Hmean高了1.3%。 + +如需获取已训练模型,请加入PaddleX官方交流频道,获取20G OCR学习大礼包(内含《动手学OCR》电子书、课程回放视频、前沿论文等重磅资料) + +* PaddleX官方交流频道: + +#### 4.4.3 模型评估 + +模型训练过程中,使用的是知识蒸馏的策略,最终保留了学生模型的参数,在评估时,我们需要针对学生模型的配置文件进行修改: [re_vi_layoutxlm_xfund_zh.yml](../configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml),修改内容与训练配置相同,包括**类别映射文件、数据目录**。 + +修改完成后,执行下面的命令完成评估过程。 + +```bash linenums="1" +# 注意:需要根据你的配置文件地址与保存的模型地址,对评估命令进行修改 +python3 tools/eval.py -c ./fapiao/re_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/re_vi_layoutxlm_fapiao_udml/best_accuracy +``` + +输出结果如下所示: + +```python linenums="1" +[2022/08/18 12:17:14] ppocr INFO: metric eval *************** +[2022/08/18 12:17:14] ppocr INFO: precision:1.0 +[2022/08/18 12:17:14] ppocr INFO: recall:0.9873417721518988 +[2022/08/18 12:17:14] ppocr INFO: hmean:0.9936305732484078 +[2022/08/18 12:17:14] ppocr INFO: fps:2.765963539771157 +``` + +#### 4.4.4 模型预测 + +使用下面的命令进行预测: + +```bash linenums="1" +# -c 后面的是RE任务的配置文件 +# -o 后面的字段是RE任务的配置 +# -c_ser 后面的是SER任务的配置文件 +# -c_ser 后面的字段是SER任务的配置 +python3 tools/infer_kie_token_ser_re.py -c fapiao/re_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/re_vi_layoutxlm_fapiao_trained/best_accuracy Global.infer_img=./train_data/zzsfp/val.json Global.infer_mode=False -c_ser fapiao/ser_vi_layoutxlm.yml -o_ser Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_trained/best_accuracy +``` + +预测结果会保存在配置文件中的`Global.save_res_path`目录中。 + +部分预测结果如下所示。 + +![](./images/185393805-c67ff571-cf7e-4217-a4b0-8b396c4f22bb-20240704190316813.jpg) + +* 注意:在预测时,使用的文本检测与识别结果为标注的结果,直接从json文件里面进行读取。 + +如果希望使用OCR引擎结果得到的结果进行推理,则可以使用下面的命令进行推理。 + +```bash linenums="1" +python3 tools/infer_kie_token_ser_re.py -c fapiao/re_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/re_vi_layoutxlm_fapiao_udml/best_accuracy Global.infer_img=./train_data/zzsfp/val.json Global.infer_mode=True -c_ser fapiao/ser_vi_layoutxlm.yml -o_ser Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_udml/best_accuracy +``` + +如果希望构建基于你在垂类场景训练得到的OCR检测与识别模型,可以使用下面的方法传入,即可完成SER + RE的串联过程。 + +```bash linenums="1" +python3 tools/infer_kie_token_ser_re.py -c fapiao/re_vi_layoutxlm.yml -o Architecture.Backbone.checkpoints=fapiao/models/re_vi_layoutxlm_fapiao_udml/best_accuracy Global.infer_img=./train_data/zzsfp/val.json Global.infer_mode=True -c_ser fapiao/ser_vi_layoutxlm.yml -o_ser Architecture.Backbone.checkpoints=fapiao/models/ser_vi_layoutxlm_fapiao_udml/best_accuracy Global.kie_rec_model_dir="your_rec_model" Global.kie_det_model_dir="your_det_model" +``` diff --git "a/docs/applications/\345\244\232\346\250\241\346\200\201\350\241\250\345\215\225\350\257\206\345\210\253.md" "b/docs/applications/\345\244\232\346\250\241\346\200\201\350\241\250\345\215\225\350\257\206\345\210\253.md" new file mode 100644 index 0000000000..15b899e730 --- /dev/null +++ 
"b/docs/applications/\345\244\232\346\250\241\346\200\201\350\241\250\345\215\225\350\257\206\345\210\253.md" @@ -0,0 +1,797 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# 多模态表单识别 + +## 1 项目说明 + +计算机视觉在金融领域的应用覆盖文字识别、图像识别、视频识别等,其中文字识别(OCR)是金融领域中的核心AI能力,其应用覆盖客户服务、风险防控、运营管理等各项业务,针对的对象包括通用卡证票据识别(银行卡、身份证、营业执照等)、通用文本表格识别(印刷体、多语言、手写体等)以及一些金融特色票据凭证。通过因此如果能够在结构化信息提取时同时利用文字、页面布局等信息,便可增强不同版式下的泛化性。 + +表单识别旨在识别各种具有表格性质的证件、房产证、营业执照、个人信息表、发票等关键键值对(如姓名-张三),其广泛应用于银行、证券、公司财务等领域,具有很高的商业价值。本次范例项目开源了全流程表单识别方案,能够在多个场景快速实现迁移能力。表单识别通常存在以下难点: + +- 人工摘录工作效率低; +- 国内常见表单版式多; +- 传统技术方案泛化效果不满足。 + +表单识别包含两大阶段:OCR阶段和文档视觉问答阶段。 + +其中,OCR阶段选取了PaddleOCR的PP-OCRv2模型,主要由文本检测和文本识别两个模块组成。DOC-VQA文档视觉问答阶段基于PaddleNLP自然语言处理算法库实现的LayoutXLM模型,支持基于多模态方法的语义实体识别(Semantic Entity Recognition, SER)以及关系抽取(Relation Extraction, RE)任务。本案例流程如 **图1** 所示: + +![](./images/9bd844b970f94e5ba0bc0c5799bd819ea9b1861bb306471fabc2d628864d418e.jpeg) + +注:欢迎再AIStudio领取免费算力体验线上实训,项目链接: [多模态表单识别](https://aistudio.baidu.com/aistudio/projectdetail/3884375?contributionType=1) + +## 2 安装说明 + +下载PaddleOCR源码,上述AIStudio项目中已经帮大家打包好的PaddleOCR(已经修改好配置文件),无需下载解压即可,只需安装依赖环境~ + +```bash linenums="1" +unzip -q PaddleOCR.zip +``` + +```bash linenums="1" +# 如仍需安装or安装更新,可以执行以下步骤 +# git clone https://github.com/PaddlePaddle/PaddleOCR.git -b dygraph +# git clone https://gitee.com/PaddlePaddle/PaddleOCR +``` + +```bash linenums="1" +# 安装依赖包 +pip install -U pip +pip install -r /home/aistudio/PaddleOCR/requirements.txt +pip install paddleocr + +pip install yacs gnureadline paddlenlp==2.2.1 +pip install xlsxwriter +``` + +## 3 数据准备 + +这里使用[XFUN数据集](https://github.com/doc-analysis/XFUND)做为实验数据集。 XFUN数据集是微软提出的一个用于KIE任务的多语言数据集,共包含七个数据集,每个数据集包含149张训练集和50张验证集 + +分别为:ZH(中文)、JA(日语)、ES(西班牙)、FR(法语)、IT(意大利)、DE(德语)、PT(葡萄牙) + +本次实验选取中文数据集作为我们的演示数据集。法语数据集作为实践课程的数据集,数据集样例图如 **图2** 所示。 + +![](./images/0f84137778cd4ab6899c64109d452290e9c678ccf01744978bc9c0647adbba45.jpg) + +### 3.1 下载处理好的数据集 + +处理好的XFUND中文数据集下载地址:[https://paddleocr.bj.bcebos.com/dataset/XFUND.tar](https://paddleocr.bj.bcebos.com/dataset/XFUND.tar) ,可以运行如下指令完成中文数据集下载和解压。 + +![](./images/31e3dbee31d441d2a36d45b5af660e832dfa2f437f4d49a1914312a15b6a29a7.jpeg) + +```bash linenums="1" +wget https://paddleocr.bj.bcebos.com/dataset/XFUND.tar +tar -xf XFUND.tar + +# XFUN其他数据集使用下面的代码进行转换 +# 代码链接:https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.4/ppstructure/vqa/helper/trans_xfun_data.py +# %cd PaddleOCR +# python3 ppstructure/vqa/tools/trans_xfun_data.py --ori_gt_path=path/to/json_path --output_path=path/to/save_path +# %cd ../ +``` + +运行上述指令后在 /home/aistudio/PaddleOCR/ppstructure/vqa/XFUND 目录下有2个文件夹,目录结构如下所示: + +```bash linenums="1" +/home/aistudio/PaddleOCR/ppstructure/vqa/XFUND + └─ zh_train/ 训练集 + ├── image/ 图片存放文件夹 + ├── xfun_normalize_train.json 标注信息 + └─ zh_val/ 验证集 + ├── image/ 图片存放文件夹 + ├── xfun_normalize_val.json 标注信息 + +``` + +该数据集的标注格式为 + +```bash linenums="1" +{ + "height": 3508, # 图像高度 + "width": 2480, # 图像宽度 + "ocr_info": [ + { + "text": "邮政地址:", # 单个文本内容 + "label": "question", # 文本所属类别 + "bbox": [261, 802, 483, 859], # 单个文本框 + "id": 54, # 文本索引 + "linking": [[54, 60]], # 当前文本和其他文本的关系 [question, answer] + "words": [] + }, + { + "text": "湖南省怀化市市辖区", + "label": "answer", + "bbox": [487, 810, 862, 859], + "id": 60, + "linking": [[54, 60]], + "words": [] + } + ] +} +``` + +### 3.2 转换为PaddleOCR检测和识别格式 + +使用XFUND训练PaddleOCR检测和识别模型,需要将数据集格式改为训练需求的格式。 + +![](./images/9a709f19e7174725a8cfb09fd922ade74f8e9eb73ae1438596cbb2facef9c24a.jpeg) + +**文本检测** 标注文件格式如下,中间用'\t'分隔: + +" 图像文件名 json.dumps编码的图像标注信息" 
+ch4_test_images/img_61.jpg [{"transcription": "MASA", "points": [[310, 104], [416, 141], [418, 216], [312, 179]]}, {...}] + +json.dumps编码前的图像标注信息是包含多个字典的list,字典中的 `points` 表示文本框的四个点的坐标(x, y),从左上角的点开始顺时针排列。 `transcription` 表示当前文本框的文字,***当其内容为“###”时,表示该文本框无效,在训练时会跳过。*** + +**文本识别** 标注文件的格式如下, txt文件中默认请将图片路径和图片标签用'\t'分割,如用其他方式分割将造成训练报错。 + +```text linenums="1" +" 图像文件名 图像标注信息 " + +train_data/rec/train/word_001.jpg 简单可依赖 +train_data/rec/train/word_002.jpg 用科技让复杂的世界更简单 +... +``` + +```bash linenums="1" +unzip -q /home/aistudio/data/data140302/XFUND_ori.zip -d /home/aistudio/data/data140302/ +``` + +已经提供转换脚本,执行如下代码即可转换成功: + +```bash linenums="1" +%cd /home/aistudio/ +python trans_xfund_data.py +``` + +## 4 OCR + +选用飞桨OCR开发套件[PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)中的PP-OCRv2模型进行文本检测和识别。PP-OCRv2在PP-OCR的基础上,进一步在5个方面重点优化,检测模型采用CML协同互学习知识蒸馏策略和CopyPaste数据增广策略;识别模型采用LCNet轻量级骨干网络、UDML 改进知识蒸馏策略和[Enhanced CTC loss](../ppocr/blog/enhanced_ctc_loss.md)损失函数改进,进一步在推理速度和预测效果上取得明显提升。更多细节请参考PP-OCRv2[技术报告](https://arxiv.org/abs/2109.03144)。 + +### 4.1 文本检测 + +我们使用2种方案进行训练、评估: + +- **PP-OCRv2中英文超轻量检测预训练模型** +- **XFUND数据集+fine-tune** + +#### 4.1.1 方案1:预训练模型 + +##### 1)下载预训练模型 + +![](./images/2aff41ee8fce4e9bac8295cc00720217bde2aeee7ee7473689848bed0b6fde05.jpeg) + +PaddleOCR已经提供了PP-OCR系列模型,部分模型展示如下表所示: + +| 模型简介 | 模型名称 | 推荐场景 | 检测模型 | 方向分类器 | 识别模型 | +| ------------------------------------- | ----------------------- | --------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | +| 中英文超轻量PP-OCRv2模型(13.0M) | ch_PP-OCRv2_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) | +| 中英文超轻量PP-OCR mobile模型(9.4M) | ch_ppocr_mobile_v2.0_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) | +| 中英文通用PP-OCR server模型(143.4M) | ch_ppocr_server_v2.0_xx | 服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) | + +更多模型下载(包括多语言),可以参考[PP-OCR 系列模型下载](./doc/doc_ch/models_list.md) + 
+这里我们使用PP-OCRv2中英文超轻量检测模型,下载并解压预训练模型: + +```bash linenums="1" +%cd /home/aistudio/PaddleOCR/pretrain/ +wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar +tar -xf ch_PP-OCRv2_det_distill_train.tar && rm -rf ch_PP-OCRv2_det_distill_train.tar +% cd .. +``` + +##### 2)模型评估 + +![](./images/75b0e977dfb74a83851f8828460759f337b1b7a0c33c47a08a30f3570e1e2e74.jpeg) + +接着使用下载的超轻量检测模型在XFUND验证集上进行评估,由于蒸馏需要包含多个网络,甚至多个Student网络,在计算指标的时候只需要计算一个Student网络的指标即可,key字段设置为Student则表示只计算Student网络的精度。 + +```yaml linenums="1" +Metric: + name: DistillationMetric + base_metric_name: DetMetric + main_indicator: hmean + key: "Student" +``` + +首先修改配置文件`configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_distill.yml`中的以下字段: + +```yaml linenums="1" +Eval.dataset.data_dir:指向验证集图片存放目录 +Eval.dataset.label_file_list:指向验证集标注文件 +``` + +然后在XFUND验证集上进行评估,具体代码如下: + +```bash linenums="1" +%cd /home/aistudio/PaddleOCR +python tools/eval.py \ + -c configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_distill.yml \ + -o Global.checkpoints="./pretrain_models/ch_PP-OCRv2_det_distill_train/best_accuracy" +``` + +使用预训练模型进行评估,指标如下所示: + +| 方案 | hmeans | +| -------- | -------- | +| PP-OCRv2中英文超轻量检测预训练模型 | 77.26% | + +使用文本检测预训练模型在XFUND验证集上评估,达到77%左右,充分说明ppocr提供的预训练模型具有泛化能力。 + +#### 4.1.2 方案2:XFUND数据集+fine-tune + +PaddleOCR提供的蒸馏预训练模型包含了多个模型的参数,我们提取Student模型的参数,在XFUND数据集上进行finetune,可以参考如下代码: + +```python linenums="1" +import paddle +# 加载预训练模型 +all_params = paddle.load("pretrain/ch_PP-OCRv2_det_distill_train/best_accuracy.pdparams") +# 查看权重参数的keys +# print(all_params.keys()) +# 学生模型的权重提取 +s_params = {key[len("student_model."):]: all_params[key] for key in all_params if "student_model." in key} +# 查看学生模型权重参数的keys +print(s_params.keys()) +# 保存 +paddle.save(s_params, "pretrain/ch_PP-OCRv2_det_distill_train/student.pdparams") +``` + +##### 1)模型训练 + +![](./images/560c44b8dd604da7987bd25da0a882156ffcfb7f6bcb44108fe9bde77512e572.jpeg) + +修改配置文件`configs/det/ch_PP-OCRv2_det_student.yml`中的以下字段: + +```yaml linenums="1" +Global.pretrained_model:指向预训练模型路径 +Train.dataset.data_dir:指向训练集图片存放目录 +Train.dataset.label_file_list:指向训练集标注文件 +Eval.dataset.data_dir:指向验证集图片存放目录 +Eval.dataset.label_file_list:指向验证集标注文件 +Optimizer.lr.learning_rate:调整学习率,本实验设置为0.005 +Train.dataset.transforms.EastRandomCropData.size:训练尺寸改为[1600, 1600] +Eval.dataset.transforms.DetResizeForTest:评估尺寸,添加如下参数 + limit_side_len: 1600 + limit_type: 'min' + +``` + +执行下面命令启动训练: + +```bash linenums="1" +CUDA_VISIBLE_DEVICES=0 python tools/train.py \ + -c configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_student.yml +``` + +##### 2)模型评估 + +![](./images/5a75137c5f924dfeb6956b5818812298cc3dc7992ac84954b4175be9adf83c77.jpeg) + +使用训练好的模型进行评估,更新模型路径`Global.checkpoints`。 + +如需获取已训练模型,请加入PaddleX官方交流频道,获取20G OCR学习大礼包(内含《动手学OCR》电子书、课程回放视频、前沿论文等重磅资料) + +- PaddleX官方交流频道: + +将下载或训练完成的模型放置在对应目录下即可完成模型评估 + +```bash linenums="1" +%cd /home/aistudio/PaddleOCR/ +python tools/eval.py \ + -c configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_student.yml \ + -o Global.checkpoints="pretrain/ch_db_mv3-student1600-finetune/best_accuracy" +``` + +同时我们提供了未finetuen的模型,配置文件参数(`pretrained_model`设置为空,`learning_rate` 设置为0.001) + +```bash linenums="1" +%cd /home/aistudio/PaddleOCR/ +python tools/eval.py \ + -c configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_student.yml \ + -o Global.checkpoints="pretrain/ch_db_mv3-student1600/best_accuracy" +``` + +使用训练好的模型进行评估,指标如下所示: + +| 方案 | hmeans | +| -------- | -------- | +| XFUND数据集 | 79.27% | +| XFUND数据集+fine-tune | 85.24% | + +对比仅使用XFUND数据集训练的模型,使用XFUND数据集+finetune训练,在验证集上评估达到85%左右,说明 finetune会提升垂类场景效果。 + 
+##### 3)导出模型 + +![](./images/07c3b060c54e4b00be7de8d41a8a4696ff53835343cc4981aab0555183306e79.jpeg) + +在模型训练过程中保存的模型文件是包含前向预测和反向传播的过程,在实际的工业部署则不需要反向传播,因此需要将模型进行导成部署需要的模型格式。 执行下面命令,即可导出模型。 + +```bash linenums="1" +# 加载配置文件`ch_PP-OCRv2_det_student.yml`,从`pretrain/ch_db_mv3-student1600-finetune`目录下加载`best_accuracy`模型 +# inference模型保存在`./output/det_db_inference`目录下 +%cd /home/aistudio/PaddleOCR/ +python tools/export_model.py \ + -c configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_student.yml \ + -o Global.pretrained_model="pretrain/ch_db_mv3-student1600-finetune/best_accuracy" \ + Global.save_inference_dir="./output/det_db_inference/" +``` + +转换成功后,在目录下有三个文件: + +```text linenums="1" +/inference/rec_crnn/ + ├── inference.pdiparams # 识别inference模型的参数文件 + ├── inference.pdiparams.info # 识别inference模型的参数信息,可忽略 + └── inference.pdmodel # 识别inference模型的program文件 +``` + +##### 4)模型预测 + +![](./images/0d582de9aa46474791e08654f84a614a6510e98bfe5f4ad3a26501cbf49ec151.jpeg) + +加载上面导出的模型,执行如下命令对验证集或测试集图片进行预测: + +```yaml linenums="1" +det_model_dir:预测模型 +image_dir:测试图片路径 +use_gpu:是否使用GPU +``` + +检测可视化结果保存在`/home/aistudio/inference_results/`目录下,查看检测效果。 + +```bash linenums="1" +%pwd +!python tools/infer/predict_det.py \ + --det_algorithm="DB" \ + --det_model_dir="./output/det_db_inference/" \ + --image_dir="./doc/vqa/input/zh_val_21.jpg" \ + --use_gpu=True +``` + +总结,我们分别使用PP-OCRv2中英文超轻量检测预训练模型、XFUND数据集+finetune2种方案进行评估、训练等,指标对比如下: + +| 方案 | hmeans | 结果分析 | +| -------- | -------- | -------- | +| PP-OCRv2中英文超轻量检测预训练模型 | 77.26% | ppocr提供的预训练模型有泛化能力 | +| XFUND数据集 | 79.27% | | +| XFUND数据集+finetune | 85.24% | finetune会提升垂类场景效果 | + +### 4.2 文本识别 + +我们分别使用如下3种方案进行训练、评估: + +- PP-OCRv2中英文超轻量识别预训练模型 +- XFUND数据集+fine-tune +- XFUND数据集+fine-tune+真实通用识别数据 + +#### 4.2.1 方案1:预训练模型 + +##### 1)下载预训练模型 + +![](./images/b7230e9964074181837e1132029f9da8178bf564ac5c43a9a93a30e975c0d8b4.jpeg) + +我们使用PP-OCRv2中英文超轻量文本识别模型,下载并解压预训练模型: + +```bash linenums="1" +%cd /home/aistudio/PaddleOCR/pretrain/ +wget https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar +tar -xf ch_PP-OCRv2_rec_train.tar && rm -rf ch_PP-OCRv2_rec_train.tar +% cd .. +``` + +##### 2)模型评估 + +![](./images/166ce56d634c4c7589fe68fbc6e7ae663305dcc82ba144c781507341ffae7fe8.jpeg) + +首先修改配置文件`configs/det/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml`中的以下字段: + +```bash linenums="1" +Eval.dataset.data_dir:指向验证集图片存放目录 +Eval.dataset.label_file_list:指向验证集标注文件 +``` + +我们使用下载的预训练模型进行评估: + +```bash linenums="1" +%cd /home/aistudio/PaddleOCR +CUDA_VISIBLE_DEVICES=0 python tools/eval.py \ + -c configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml \ + -o Global.checkpoints=./pretrain/ch_PP-OCRv2_rec_train/best_accuracy +``` + +使用预训练模型进行评估,指标如下所示: + +| 方案 | acc | +| -------- | -------- | +| PP-OCRv2中英文超轻量识别预训练模型 | 67.48% | + +使用文本预训练模型在XFUND验证集上评估,acc达到67%左右,充分说明ppocr提供的预训练模型具有泛化能力。 + +#### 4.2.2 方案2:XFUND数据集+finetune + +同检测模型,我们提取Student模型的参数,在XFUND数据集上进行finetune,可以参考如下代码: + +```python linenums="1" +import paddle +# 加载预训练模型 +all_params = paddle.load("pretrain/ch_PP-OCRv2_rec_train/best_accuracy.pdparams") +# 查看权重参数的keys +print(all_params.keys()) +# 学生模型的权重提取 +s_params = {key[len("Student."):]: all_params[key] for key in all_params if "Student." 
in key} +# 查看学生模型权重参数的keys +print(s_params.keys()) +# 保存 +paddle.save(s_params, "pretrain/ch_PP-OCRv2_rec_train/student.pdparams") +``` + +##### 1)模型训练 + +![](./images/166ce56d634c4c7589fe68fbc6e7ae663305dcc82ba144c781507341ffae7fe8.jpeg) + +修改配置文件`configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml`中的以下字段: + +```yaml linenums="1" +Global.pretrained_model:指向预训练模型路径 +Global.character_dict_path: 字典路径 +Optimizer.lr.values:学习率 +Train.dataset.data_dir:指向训练集图片存放目录 +Train.dataset.label_file_list:指向训练集标注文件 +Eval.dataset.data_dir:指向验证集图片存放目录 +Eval.dataset.label_file_list:指向验证集标注文件 +``` + +执行如下命令启动训练: + +```bash linenums="1" +%cd /home/aistudio/PaddleOCR/ +CUDA_VISIBLE_DEVICES=0 python tools/train.py \ + -c configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml +``` + +##### 2)模型评估 + +![](./images/c07c88f708ad43cc8cd615861626d0e8333c0e3d4dda49ac8cba1f8939fa8a94.jpeg) + +使用训练好的模型进行评估,更新模型路径`Global.checkpoints`,这里为大家提供训练好的模型`./pretrain/rec_mobile_pp-OCRv2-student-finetune/best_accuracy` + +```bash linenums="1" +%cd /home/aistudio/PaddleOCR/ +CUDA_VISIBLE_DEVICES=0 python tools/eval.py \ + -c configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml \ + -o Global.checkpoints=./pretrain/rec_mobile_pp-OCRv2-student-finetune/best_accuracy +``` + +使用预训练模型进行评估,指标如下所示: + +| 方案 | acc | +| -------- | -------- | +| XFUND数据集+finetune | 72.33% | + +使用XFUND数据集+finetune训练,在验证集上评估达到72%左右,说明 finetune会提升垂类场景效果。 + +#### 4.2.3 方案3:XFUND数据集+finetune+真实通用识别数据 + +接着我们在上述`XFUND数据集+finetune`实验的基础上,添加真实通用识别数据,进一步提升识别效果。首先准备真实通用识别数据,并上传到AIStudio: + +##### 1)模型训练 + +![](./images/45f288ce8b2c45d8aa5407785b4b40f4876fc3da23744bd7a78060797fba0190.jpeg) + +在上述`XFUND数据集+finetune`实验中修改配置文件`configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml`的基础上,继续修改以下字段: + +```yaml linenums="1" +Train.dataset.label_file_list:指向真实识别训练集图片存放目录 +Train.dataset.ratio_list:动态采样 +``` + +执行如下命令启动训练: + +```bash linenums="1" +%cd /home/aistudio/PaddleOCR/ +CUDA_VISIBLE_DEVICES=0 python tools/train.py \ + -c configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml +``` + +##### 2)模型评估 + +![](./images/965db9f758614c6f9be301286cd5918f21110603c8aa4a1dbf5371e3afeec782.jpeg) + +使用训练好的模型进行评估,更新模型路径`Global.checkpoints`。 + +```bash linenums="1" +CUDA_VISIBLE_DEVICES=0 python tools/eval.py \ + -c configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml \ + -o Global.checkpoints=./pretrain/rec_mobile_pp-OCRv2-student-realdata/best_accuracy +``` + +使用预训练模型进行评估,指标如下所示: + +| 方案 | acc | +| -------- | -------- | +| XFUND数据集+fine-tune+真实通用识别数据 | 85.29% | + +使用XFUND数据集+finetune训练,在验证集上评估达到85%左右,说明真实通用识别数据对于性能提升很有帮助。 + +##### 3)导出模型 + +![](./images/3dc7f69fac174cde96b9d08b5e2353a1d88dc63e7be9410894c0783660b35b76.jpeg) + +导出模型只保留前向预测的过程: + +```bash linenums="1" +!python tools/export_model.py \ + -c configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml \ + -o Global.pretrained_model=pretrain/rec_mobile_pp-OCRv2-student-realdata/best_accuracy \ + Global.save_inference_dir=./output/rec_crnn_inference/ +``` + +##### 4)模型预测 + +![](./images/60b95b4945954f81a080a8f308cee66f83146479cd1142b9b6b1290938fd1df8.jpeg) + +加载上面导出的模型,执行如下命令对验证集或测试集图片进行预测,检测可视化结果保存在`/home/aistudio/inference_results/`目录下,查看检测、识别效果。需要通过`--rec_char_dict_path`指定使用的字典路径 + +```bash linenums="1" +python tools/infer/predict_system.py \ + --image_dir="./doc/vqa/input/zh_val_21.jpg" \ + --det_model_dir="./output/det_db_inference/" \ + --rec_model_dir="./output/rec_crnn_inference/" \ + --rec_image_shape="3, 32, 320" \ + --rec_char_dict_path="/home/aistudio/XFUND/word_dict.txt" +``` + +总结,我们分别使用PP-OCRv2中英文超轻量检测预训练模型、XFUND数据集+finetune2种方案进行评估、训练等,指标对比如下: + +| 方案 | acc | 结果分析 | +| -------- | -------- | 
-------- | +| PP-OCRv2中英文超轻量识别预训练模型 | 67.48% | ppocr提供的预训练模型具有泛化能力 | +| XFUND数据集+fine-tune |72.33% | finetune会提升垂类场景效果 | +| XFUND数据集+fine-tune+真实通用识别数据 | 85.29% | 真实通用识别数据对于性能提升很有帮助 | + +## 5 文档视觉问答(DOC-VQA) + +VQA指视觉问答,主要针对图像内容进行提问和回答,DOC-VQA是VQA任务中的一种,DOC-VQA主要针对文本图像的文字内容提出问题。 + +PaddleOCR中DOC-VQA系列算法基于PaddleNLP自然语言处理算法库实现LayoutXLM论文,支持基于多模态方法的 **语义实体识别 (Semantic Entity Recognition, SER)** 以及 **关系抽取 (Relation Extraction, RE)** 任务。 + +如果希望直接体验预测过程,可以下载我们提供的预训练模型,跳过训练过程,直接预测即可。 + +```bash linenums="1" +%cd pretrain +#下载SER模型 +wget https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar && tar -xvf ser_LayoutXLM_xfun_zh.tar +#下载RE模型 +wget https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar && tar -xvf re_LayoutXLM_xfun_zh.tar +%cd ../ +``` + +### 5.1 SER + +SER: 语义实体识别 (Semantic Entity Recognition), 可以完成对图像中的文本识别与分类。 + +![](./images/a3b25766f3074d2facdf88d4a60fc76612f51992fd124cf5bd846b213130665b-0097611.jpeg) + +**图19** 中不同颜色的框表示不同的类别,对于XFUND数据集,有QUESTION, ANSWER, HEADER 3种类别 + +- 深紫色:HEADER +- 浅紫色:QUESTION +- 军绿色:ANSWER + +在OCR检测框的左上方也标出了对应的类别和OCR识别结果。 + +#### 5.1.1 模型训练 + +![](./images/2e45f297c9d44ca5b8718ae100a365f7348eaeed4cb8495b904f28a9c8075d8a.jpeg) + +启动训练之前,需要修改配置文件 `configs/vqa/ser/layoutxlm.yml` 以下四个字段: + +```yaml linenums="1" +Train.dataset.data_dir:指向训练集图片存放目录 +Train.dataset.label_file_list:指向训练集标注文件 +Eval.dataset.data_dir:指指向验证集图片存放目录 +Eval.dataset.label_file_list:指向验证集标注文件 +``` + +```bash linenums="1" +%cd /home/aistudio/PaddleOCR/ +CUDA_VISIBLE_DEVICES=0 python tools/train.py -c configs/vqa/ser/layoutxlm.yml +``` + +最终会打印出`precision`, `recall`, `hmean`等指标。 在`./output/ser_layoutxlm/`文件夹中会保存训练日志,最优的模型和最新epoch的模型。 + +#### 5.1.2 模型评估 + +![](./images/5df160ac39ee4d9e92a937094bc53a737272f9f2abeb4ddfaebb48e8eccf1be2.jpeg) + +我们使用下载的预训练模型进行评估,如果使用自己训练好的模型进行评估,将待评估的模型所在文件夹路径赋值给 `Architecture.Backbone.checkpoints` 字段即可。 + +```bash linenums="1" +CUDA_VISIBLE_DEVICES=0 python tools/eval.py \ + -c configs/vqa/ser/layoutxlm.yml \ + -o Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/ +``` + +最终会打印出`precision`, `recall`, `hmean`等指标,预训练模型评估指标如下: + +![](./images/2854aee557a74079a82dd5cd57e48bc2ce97974d5637477fb4deea137d0e312c.png) + +#### 5.1.3 模型预测 + +![](./images/0f7d50a0fb924b408b93e1fbd6ca64148eed34a2e6724280acd3e113fef7dc48.jpeg) + +使用如下命令即可完成`OCR引擎 + SER`的串联预测, 以SER预训练模型为例: + +```bash linenums="1" +CUDA_VISIBLE_DEVICES=0 python tools/infer_vqa_token_ser.py \ + -c configs/vqa/ser/layoutxlm.yml \ + -o Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/ \ + Global.infer_img=doc/vqa/input/zh_val_42.jpg +``` + +最终会在`config.Global.save_res_path`字段所配置的目录下保存预测结果可视化图像以及预测结果文本文件,预测结果文本文件名为`infer_results.txt`。通过如下命令查看预测图片: + +```python linenums="1" +import cv2 +from matplotlib import pyplot as plt +# 在notebook中使用matplotlib.pyplot绘图时,需要添加该命令进行显示 +%matplotlib inline + +img = cv2.imread('output/ser/zh_val_42_ser.jpg') +plt.figure(figsize=(48,24)) +plt.imshow(img) +``` + +### 5.2 RE + +基于 RE 任务,可以完成对图象中的文本内容的关系提取,如判断问题对(pair)。 + +![](./images/4de19ca3e54343e88961e816cad28bbacdc807f40b9440be914d871b0a914570.jpeg) + +图中红色框表示问题,蓝色框表示答案,问题和答案之间使用绿色线连接。在OCR检测框的左上方也标出了对应的类别和OCR识别结果。 + +#### 5.2.1 模型训练 + +![](./images/268c707a62c54e93958d2b2ab29e0932953aad41819e44aaaaa05c8ad85c6491.jpeg) + +启动训练之前,需要修改配置文件`configs/vqa/re/layoutxlm.yml`中的以下四个字段 + +```yaml linenums="1" +Train.dataset.data_dir:指向训练集图片存放目录 +Train.dataset.label_file_list:指向训练集标注文件 +Eval.dataset.data_dir:指指向验证集图片存放目录 +Eval.dataset.label_file_list:指向验证集标注文件 +``` + +```bash linenums="1" 
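+# 与SER任务类似,在PaddleOCR目录下单卡启动RE任务训练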
+CUDA_VISIBLE_DEVICES=0 python3 tools/train.py -c configs/vqa/re/layoutxlm.yml +``` + +最终会打印出`precision`, `recall`, `hmean`等指标。 在`./output/re_layoutxlm/`文件夹中会保存训练日志,最优的模型和最新epoch的模型 + +#### 5.2.2 模型评估 + +![](./images/93c66a43a69e472899c1c6732408b7a42e99a43721e94e9ca3c0a64e080306e4.jpeg) + +我们使用下载的预训练模型进行评估,如果使用自己训练好的模型进行评估,将待评估的模型所在文件夹路径赋值给 `Architecture.Backbone.checkpoints` 字段即可。 + +```bash linenums="1" +CUDA_VISIBLE_DEVICES=0 python3 tools/eval.py \ + -c configs/vqa/re/layoutxlm.yml \ + -o Architecture.Backbone.checkpoints=pretrain/re_LayoutXLM_xfun_zh/ +``` + +最终会打印出`precision`, `recall`, `hmean`等指标,预训练模型评估指标如下: + +![](./images/f99af54fb2d14691a73b1a748e0ca22618aeddfded0c4da58bbbb03edb8c2340.png) + +#### 5.2.3 模型预测 + +![](./images/bab32d32bdec4339b9a3e5f911e4b41f77996f3faabc40bd8309b5b20cad31e4.jpeg) + +使用如下命令即可完成OCR引擎 + SER + RE的串联预测, 以预训练SER和RE模型为例, + +最终会在config.Global.save_res_path字段所配置的目录下保存预测结果可视化图像以及预测结果文本文件,预测结果文本文件名为infer_results.txt。 + +```bash linenums="1" +cd /home/aistudio/PaddleOCR +CUDA_VISIBLE_DEVICES=0 python3 tools/infer_vqa_token_ser_re.py \ + -c configs/vqa/re/layoutxlm.yml \ + -o Architecture.Backbone.checkpoints=pretrain/re_LayoutXLM_xfun_zh/ \ + Global.infer_img=test_imgs/ \ + -c_ser configs/vqa/ser/layoutxlm.yml \ + -o_ser Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/ +``` + +最终会在config.Global.save_res_path字段所配置的目录下保存预测结果可视化图像以及预测结果文本文件,预测结果文本文件名为infer_results.txt, 每一行表示一张图片的结果,每张图片的结果如下所示,前面表示测试图片路径,后面为测试结果:key字段及对应的value字段。 + +```bash linenums="1" +test_imgs/t131.jpg {"政治面税": "群众", "性别": "男", "籍贯": "河北省邯郸市", "婚姻状况": "亏末婚口已婚口已娇", "通讯地址": "邯郸市阳光苑7号楼003", "民族": "汉族", "毕业院校": "河南工业大学", "户口性质": "口农村城镇", "户口地址": "河北省邯郸市", "联系电话": "13288888888", "健康状况": "健康", "姓名": "小六", "好高cm": "180", "出生年月": "1996年8月9日", "文化程度": "本科", "身份证号码": "458933777777777777"} +``` + +展示预测结果 + +```python linenums="1" +import cv2 +from matplotlib import pyplot as plt +%matplotlib inline + +img = cv2.imread('./output/re/t131_ser.jpg') +plt.figure(figsize=(48,24)) +plt.imshow(img) +``` + +## 6 导出Excel + +![](./images/ab93d3d90d77437a81c9534b2dd1d3e39ef81e8473054fd3aeff6e837ebfb827.jpeg) + +为了输出信息匹配对,我们修改`tools/infer_vqa_token_ser_re.py`文件中的`line 194-197`。 + +```python linenums="1" + fout.write(img_path + "\t" + json.dumps( + { + "ser_result": result, + }, ensure_ascii=False) + "\n") + +``` + +更改为 + +```python linenums="1" +result_key = {} +for ocr_info_head, ocr_info_tail in result: + result_key[ocr_info_head['text']] = ocr_info_tail['text'] + +fout.write(img_path + "\t" + json.dumps( + result_key, ensure_ascii=False) + "\n") +``` + +同时将输出结果导出到Excel中,效果如 图28 所示: +![](./images/9f45d3eef75e4842a0828bb9e518c2438300264aec0646cc9addfce860a04196.png) + +```python linenums="1" +import json +import xlsxwriter as xw + +workbook = xw.Workbook('output/re/infer_results.xlsx') +format1 = workbook.add_format({ + 'align': 'center', + 'valign': 'vcenter', + 'text_wrap': True, +}) +worksheet1 = workbook.add_worksheet('sheet1') +worksheet1.activate() +title = ['姓名', '性别', '民族', '文化程度', '身份证号码', '联系电话', '通讯地址'] +worksheet1.write_row('A1', title) +i = 2 + +with open('output/re/infer_results.txt', 'r', encoding='utf-8') as fin: + lines = fin.readlines() + for line in lines: + img_path, result = line.strip().split('\t') + result_key = json.loads(result) + # 写入Excel + row_data = [result_key['姓名'], result_key['性别'], result_key['民族'], result_key['文化程度'], result_key['身份证号码'], + result_key['联系电话'], result_key['通讯地址']] + row = 'A' + str(i) + worksheet1.write_row(row, row_data, format1) + i+=1 
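+# 全部数据行写入完成后关闭工作簿,Excel 结果保存为 output/re/infer_results.xlsx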
+workbook.close() +``` + +## 更多资源 + +- 更多深度学习知识、产业案例、面试宝典等,请参考:[awesome-DeepLearning](https://github.com/paddlepaddle/awesome-DeepLearning) +- 更多PaddleOCR使用教程,请参考:[PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR/tree/dygraph) +- 更多PaddleNLP使用教程,请参考:[PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP) +- 飞桨框架相关资料,请参考:[飞桨深度学习平台](https://www.paddlepaddle.org.cn/?fr=paddleEdu_aistudio) + +## 参考链接 + +- LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding, +- microsoft/unilm/layoutxlm, +- XFUND dataset, diff --git "a/docs/applications/\345\277\253\351\200\237\346\236\204\345\273\272\345\215\241\350\257\201\347\261\273OCR.md" "b/docs/applications/\345\277\253\351\200\237\346\236\204\345\273\272\345\215\241\350\257\201\347\261\273OCR.md" new file mode 100644 index 0000000000..df2890c977 --- /dev/null +++ "b/docs/applications/\345\277\253\351\200\237\346\236\204\345\273\272\345\215\241\350\257\201\347\261\273OCR.md" @@ -0,0 +1,696 @@ +--- +typora-copy-images-to: images +comments: true +--- + + +# 快速构建卡证类OCR + +## 1. 金融行业卡证识别应用 + +### 1.1 金融行业中的OCR相关技术 + +《“十四五”数字经济发展规划》指出,2020年我国数字经济核心产业增加值占GDP比重达7.8%,随着数字经济迈向全面扩展,到2025年该比例将提升至10%。 + +在过去数年的跨越发展与积累沉淀中,数字金融、金融科技已在对金融业的重塑与再造中充分印证了其自身价值。 + +以智能为目标,提升金融数字化水平,实现业务流程自动化,降低人力成本。 + +![](./images/8bb381f164c54ea9b4043cf66fc92ffdea8aaf851bab484fa6e19bd2f93f154f.jpeg) + +### 1.2 金融行业中的卡证识别场景介绍 + +应用场景:身份证、银行卡、营业执照、驾驶证等。 + +应用难点:由于数据的采集来源多样,以及实际采集数据各种噪声:反光、褶皱、模糊、倾斜等各种问题干扰。 + +![](./images/981640e17d05487e961162f8576c9e11634ca157f79048d4bd9d3bc21722afe8-20240704185952731.jpeg) + +### 1.3 OCR落地挑战 + +![](./images/a5973a8ddeff4bd7ac082f02dc4d0c79de21e721b41641cbb831f23c2cb8fce2.jpeg) + +## 2. 卡证识别技术解析 + +![](./images/d7f96effc2434a3ca2d4144ff33c50282b830670c892487d8d7dec151921cce7.jpeg) + +### 2.1 卡证分类模型 + +卡证分类:基于PPLCNet + +与其他轻量级模型相比在CPU环境下ImageNet数据集上的表现 + +![](./images/cbda3390cb994f98a3c8a9ba88c90c348497763f6c9f4b4797f7d63d84da5f63.jpeg) + +![](./images/dedab7b7fd6543aa9e7f625132b24e3ba3f200e361fa468dac615f7814dfb98d.jpeg) + +模型来自模型库PaddleClas,它是一个图像识别和图像分类任务的工具集,助力使用者训练出更好的视觉模型和应用落地。 + +### 2.2 卡证识别模型 + +检测:DBNet 识别:SVRT + +![](./images/9a7a4e19edc24310b46620f2ee7430f918223b93d4f14a15a52973c096926bad.jpeg) + +PPOCRv3在文本检测、识别进行了一系列改进优化,在保证精度的同时提升预测效率 + +![](./images/6afdbb77e8db4aef9b169e4e94c5d90a9764cfab4f2c4c04aa9afdf4f54d7680.jpeg) + +![](./images/c1a7d197847a4f168848c59b8e625d1d5e8066b778144395a8b9382bb85dc364.jpeg) + +## 3. 
OCR技术拆解 + +### 3.1技术流程 + +![](./images/89ba046177864d8783ced6cb31ba92a66ca2169856a44ee59ac2bb18e44a6c4b.jpeg) + +### 3.2 OCR技术拆解---卡证分类 + +#### 卡证分类:数据、模型准备 + +A 使用爬虫获取无标注数据,将相同类别的放在同一文件夹下,文件名从0开始命名。具体格式如下图所示。 + +​注:卡证类数据,建议每个类别数据量在500张以上 + +![](./images/6f875b6e695e4fe5aedf427beb0d4ce8064ad7cc33c44faaad59d3eb9732639d.jpeg) + +B 一行命令生成标签文件 + +```bash linenums="1" +tree -r -i -f | grep -E "jpg|JPG|jpeg|JPEG|png|PNG|webp" | awk -F "/" '{print $0" "$2}' > train_list.txt +``` + +C [下载预训练模型](https://github.com/PaddlePaddle/PaddleClas/blob/release/2.4/docs/zh_CN/models/PP-LCNet.md) + +#### 卡证分类---修改配置文件 + +配置文件主要修改三个部分: + +- 全局参数:预训练模型路径/训练轮次/图像尺寸 +- 模型结构:分类数 +- 数据处理:训练/评估数据路径 + +![](./images/e0dc05039c7444c5ab1260ff550a408748df8d4cfe864223adf390e51058dbd5.jpeg) + +#### 卡证分类---训练 + +指定配置文件启动训练: + +```bash linenums="1" +!python /home/aistudio/work/PaddleClas/tools/train.py -c /home/aistudio/work/PaddleClas/ppcls/configs/PULC/text_image_orientation/PPLCNet_x1_0.yaml +``` + +![](./images/06af09bde845449ba0a676410f4daa1cdc3983ac95034bdbbafac3b7fd94042f.jpeg) + +​注:日志中显示了训练结果和评估结果(训练时可以设置固定轮数评估一次) + +### 3.2 OCR技术拆解---卡证识别 + +卡证识别(以身份证检测为例) +存在的困难及问题: + +- 在自然场景下,由于各种拍摄设备以及光线、角度不同等影响导致实际得到的证件影像千差万别。 + +- 如何快速提取需要的关键信息 + +- 多行的文本信息,检测结果如何正确拼接 + + ![](./images/4f8f5533a2914e0a821f4a639677843c32ec1f08a1b1488d94c0b8bfb6e72d2d.jpeg) + +- OCR技术拆解---OCR工具库 + + PaddleOCR是一个丰富、领先且实用的OCR工具库,助力开发者训练出更好的模型并应用落地 + +身份证识别:用现有的方法识别 + +![](./images/12d402e6a06d482a88f979e0ebdfb39f4d3fc8b80517499689ec607ddb04fbf3.jpeg) + +#### 身份证识别:检测+分类 +> +> 方法:基于现有的dbnet检测模型,加入分类方法。检测同时进行分类,从一定程度上优化识别流程 + +![](./images/e1e798c87472477fa0bfca0da12bb0c180845a3e167a4761b0d26ff4330a5ccb.jpeg) + +![](./images/23a5a19c746441309864586e467f995ec8a551a3661640e493fc4d77520309cd.jpeg) + +#### 数据标注 + +使用PaddleOCRLable进行快速标注 + +![](./images/a73180425fa14f919ce52d9bf70246c3995acea1831843cca6c17d871b8f5d95.jpeg) + +- 修改PPOCRLabel.py,将下图中的kie参数设置为True + +![](./images/d445cf4d850e4063b9a7fc6a075c12204cf912ff23ec471fa2e268b661b3d693.jpeg) + +- 数据标注踩坑分享 + +![](./images/89f42eccd600439fa9e28c97ccb663726e4e54ce3a854825b4c3b7d554ea21df.jpeg) + +​ 注:两者只有标注有差别,训练参数数据集都相同 + +## 4 . 
项目实践 + +AIStudio项目链接:[快速构建卡证类OCR](https://aistudio.baidu.com/aistudio/projectdetail/4459116) + +### 4.1 环境准备 + +1)拉取[paddleocr](https://github.com/PaddlePaddle/PaddleOCR)项目,如果从github上拉取速度慢可以选择从gitee上获取。 + +```bash linenums="1" +!git clone https://github.com/PaddlePaddle/PaddleOCR.git -b release/2.6 /home/aistudio/work/ +``` + +2)获取并解压预训练模型,如果要使用其他模型可以从模型库里自主选择合适模型。 + +```bash linenums="1" +!wget -P work/pre_trained/ https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar +!tar -vxf /home/aistudio/work/pre_trained/ch_PP-OCRv3_det_distill_train.tar -C /home/aistudio/work/pre_trained +``` + +3)安装必要依赖 + +```bash linenums="1" +!pip install -r /home/aistudio/work/requirements.txt +``` + +### 4.2 配置文件修改 + +修改配置文件 `work/configs/det/detmv3db.yml` + +具体修改说明如下: + +![](./images/fcdf517af5a6466294d72db7450209378d8efd9b77764e329d3f2aff3579a20c.jpeg) + +注:在上述的配置文件的Global变量中需要添加以下两个参数: + +​ - label_list 为标签表 +​ - num_classes 为分类数 +​上述两个参数根据实际的情况配置即可 + +![](./images/0b056be24f374812b61abf43305774767ae122c8479242f98aa0799b7bfc81d4.jpeg) + +其中lable_list内容如下例所示,***建议第一个参数设置为 background,不要设置为实际要提取的关键信息种类***: + +![](./images/9fc78bbcdf754898b9b2c7f000ddf562afac786482ab4f2ab063e2242faa542a.jpeg) + +配置文件中的其他设置说明 + +![](./images/c7fc5e631dd44bc8b714630f4e49d9155a831d9e56c64e2482ded87081d0db22.jpeg) + +![](./images/8d1022ac25d9474daa4fb236235bd58760039d58ad46414f841559d68e0d057f.jpeg) + +![](./images/ee927ad9ebd442bb96f163a7ebbf4bc95e6bedee97324a51887cf82de0851fd3.jpeg) + +### 4.3 代码修改 + +#### 4.3.1 数据读取 + +修改 PaddleOCR/ppocr/data/imaug/label_ops.py中的DetLabelEncode + +```python linenums="1" +class DetLabelEncode(object): + + # 修改检测标签的编码处,新增了参数分类数:num_classes,重写初始化方法,以及分类标签的读取 + + def __init__(self, label_list, num_classes=8, **kwargs): + self.num_classes = num_classes + self.label_list = [] + if label_list: + if isinstance(label_list, str): + with open(label_list, 'r+', encoding='utf-8') as f: + for line in f.readlines(): + self.label_list.append(line.replace("\n", "")) + else: + self.label_list = label_list + else: + assert ' please check label_list whether it is none or config is right' + + if num_classes != len(self.label_list): # 校验分类数和标签的一致性 + assert 'label_list length is not equal to the num_classes' + + def __call__(self, data): + label = data['label'] + label = json.loads(label) + nBox = len(label) + boxes, txts, txt_tags, classes = [], [], [], [] + for bno in range(0, nBox): + box = label[bno]['points'] + txt = label[bno]['key_cls'] # 此处将kie中的参数作为分类读取 + boxes.append(box) + txts.append(txt) + + if txt in ['*', '###']: + txt_tags.append(True) + if self.num_classes > 1: + classes.append(-2) + else: + txt_tags.append(False) + if self.num_classes > 1: # 将KIE内容的key标签作为分类标签使用 + classes.append(int(self.label_list.index(txt))) + + if len(boxes) == 0: + + return None + boxes = self.expand_points_num(boxes) + boxes = np.array(boxes, dtype=np.float32) + txt_tags = np.array(txt_tags, dtype=np.bool_) + classes = classes + data['polys'] = boxes + data['texts'] = txts + data['ignore_tags'] = txt_tags + if self.num_classes > 1: + data['classes'] = classes + return data +``` + +修改P`addleOCR/ppocr/data/imaug/make_shrink_map.py`中的MakeShrinkMap类。这里需要注意的是,如果我们设置的label_list中的第一个参数为要检测的信息那么会得到如下的mask, + +举例说明: +这是检测的mask图,图中有四个mask那么实际对应的分类应该是4类 + +![](./images/42d2188d3d6b498880952e12c3ceae1efabf135f8d9f4c31823f09ebe02ba9d2.jpeg) + +label_list中第一个为关键分类,则得到的分类Mask实际如下,与上图相比,少了一个box: + +![](./images/864604967256461aa7c5d32cd240645e9f4c70af773341d5911f22d5a3e87b5f.jpeg) + +```python linenums="1" +class 
MakeShrinkMap(object): + r''' + Making binary mask from detection data with ICDAR format. + Typically following the process of class `MakeICDARData`. + ''' + + def __init__(self, min_text_size=8, shrink_ratio=0.4, num_classes=8, **kwargs): + self.min_text_size = min_text_size + self.shrink_ratio = shrink_ratio + self.num_classes = num_classes # 添加了分类 + + def __call__(self, data): + image = data['image'] + text_polys = data['polys'] + ignore_tags = data['ignore_tags'] + if self.num_classes > 1: + classes = data['classes'] + + h, w = image.shape[:2] + text_polys, ignore_tags = self.validate_polygons(text_polys, + ignore_tags, h, w) + gt = np.zeros((h, w), dtype=np.float32) + mask = np.ones((h, w), dtype=np.float32) + gt_class = np.zeros((h, w), dtype=np.float32) # 新增分类 + for i in range(len(text_polys)): + polygon = text_polys[i] + height = max(polygon[:, 1]) - min(polygon[:, 1]) + width = max(polygon[:, 0]) - min(polygon[:, 0]) + if ignore_tags[i] or min(height, width) < self.min_text_size: + cv2.fillPoly(mask, + polygon.astype(np.int32)[np.newaxis, :, :], 0) + ignore_tags[i] = True + else: + polygon_shape = Polygon(polygon) + subject = [tuple(l) for l in polygon] + padding = pyclipper.PyclipperOffset() + padding.AddPath(subject, pyclipper.JT_ROUND, + pyclipper.ET_CLOSEDPOLYGON) + shrinked = [] + + # Increase the shrink ratio every time we get multiple polygon returned back + possible_ratios = np.arange(self.shrink_ratio, 1, + self.shrink_ratio) + np.append(possible_ratios, 1) + for ratio in possible_ratios: + distance = polygon_shape.area * ( + 1 - np.power(ratio, 2)) / polygon_shape.length + shrinked = padding.Execute(-distance) + if len(shrinked) == 1: + break + + if shrinked == []: + cv2.fillPoly(mask, + polygon.astype(np.int32)[np.newaxis, :, :], 0) + ignore_tags[i] = True + continue + + for each_shirnk in shrinked: + shirnk = np.array(each_shirnk).reshape(-1, 2) + cv2.fillPoly(gt, [shirnk.astype(np.int32)], 1) + if self.num_classes > 1: # 绘制分类的mask + cv2.fillPoly(gt_class, polygon.astype(np.int32)[np.newaxis, :, :], classes[i]) + + + data['shrink_map'] = gt + + if self.num_classes > 1: + data['class_mask'] = gt_class + + data['shrink_mask'] = mask + return data +``` + +由于在训练数据中会对数据进行resize设置,yml中的操作为:`EastRandomCropData`,所以需要修改`PaddleOCR/ppocr/data/imaug/random_crop_data.py`中的`EastRandomCropData` + +```python linenums="1" +class EastRandomCropData(object): + def __init__(self, + size=(640, 640), + max_tries=10, + min_crop_side_ratio=0.1, + keep_ratio=True, + num_classes=8, + **kwargs): + self.size = size + self.max_tries = max_tries + self.min_crop_side_ratio = min_crop_side_ratio + self.keep_ratio = keep_ratio + self.num_classes = num_classes + + def __call__(self, data): + img = data['image'] + text_polys = data['polys'] + ignore_tags = data['ignore_tags'] + texts = data['texts'] + if self.num_classes > 1: + classes = data['classes'] + all_care_polys = [ + text_polys[i] for i, tag in enumerate(ignore_tags) if not tag + ] + # 计算crop区域 + crop_x, crop_y, crop_w, crop_h = crop_area( + img, all_care_polys, self.min_crop_side_ratio, self.max_tries) + # crop 图片 保持比例填充 + scale_w = self.size[0] / crop_w + scale_h = self.size[1] / crop_h + scale = min(scale_w, scale_h) + h = int(crop_h * scale) + w = int(crop_w * scale) + if self.keep_ratio: + padimg = np.zeros((self.size[1], self.size[0], img.shape[2]), + img.dtype) + padimg[:h, :w] = cv2.resize( + img[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w], (w, h)) + img = padimg + else: + img = cv2.resize( + img[crop_y:crop_y + crop_h, 
crop_x:crop_x + crop_w], + tuple(self.size)) + # crop 文本框 + text_polys_crop = [] + ignore_tags_crop = [] + texts_crop = [] + classes_crop = [] + for poly, text, tag,class_index in zip(text_polys, texts, ignore_tags,classes): + poly = ((poly - (crop_x, crop_y)) * scale).tolist() + if not is_poly_outside_rect(poly, 0, 0, w, h): + text_polys_crop.append(poly) + ignore_tags_crop.append(tag) + texts_crop.append(text) + if self.num_classes > 1: + classes_crop.append(class_index) + data['image'] = img + data['polys'] = np.array(text_polys_crop) + data['ignore_tags'] = ignore_tags_crop + data['texts'] = texts_crop + if self.num_classes > 1: + data['classes'] = classes_crop + return data +``` + +#### 4.3.2 head修改 + +主要修改`ppocr/modeling/heads/det_db_head.py`,将Head类中的最后一层的输出修改为实际的分类数,同时在DBHead中新增分类的head。 + +![](./images/0e25da2ccded4af19e95c85c3d3287ab4d53e31a4eed4607b6a4cb637c43f6d3.jpeg) + +#### 4.3.3 修改loss + +修改`PaddleOCR/ppocr/losses/det_db_loss.py`中的DBLoss类,分类采用交叉熵损失函数进行计算。 + +![](./images/dc10a070018d4d27946c26ec24a2a85bc3f16422f4964f72a9b63c6170d954e1.jpeg) + +#### 4.3.4 后处理 + +由于涉及到eval以及后续推理能否正常使用,我们需要修改后处理的相关代码,修改位置`PaddleOCR/ppocr/postprocess/db_postprocess.py`中的DBPostProcess类 + +```python linenums="1" +class DBPostProcess(object): + """ + The post process for Differentiable Binarization (DB). + """ + + def __init__(self, + thresh=0.3, + box_thresh=0.7, + max_candidates=1000, + unclip_ratio=2.0, + use_dilation=False, + score_mode="fast", + **kwargs): + self.thresh = thresh + self.box_thresh = box_thresh + self.max_candidates = max_candidates + self.unclip_ratio = unclip_ratio + self.min_size = 3 + self.score_mode = score_mode + assert score_mode in [ + "slow", "fast" + ], "Score mode must be in [slow, fast] but got: {}".format(score_mode) + + self.dilation_kernel = None if not use_dilation else np.array( + [[1, 1], [1, 1]]) + + def boxes_from_bitmap(self, pred, _bitmap, classes, dest_width, dest_height): + """ + _bitmap: single map with shape (1, H, W), + whose values are binarized as {0, 1} + """ + + bitmap = _bitmap + height, width = bitmap.shape + + outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST, + cv2.CHAIN_APPROX_SIMPLE) + if len(outs) == 3: + img, contours, _ = outs[0], outs[1], outs[2] + elif len(outs) == 2: + contours, _ = outs[0], outs[1] + + num_contours = min(len(contours), self.max_candidates) + + boxes = [] + scores = [] + class_indexes = [] + class_scores = [] + for index in range(num_contours): + contour = contours[index] + points, sside = self.get_mini_boxes(contour) + if sside < self.min_size: + continue + points = np.array(points) + if self.score_mode == "fast": + score, class_index, class_score = self.box_score_fast(pred, points.reshape(-1, 2), classes) + else: + score, class_index, class_score = self.box_score_slow(pred, contour, classes) + if self.box_thresh > score: + continue + + box = self.unclip(points).reshape(-1, 1, 2) + box, sside = self.get_mini_boxes(box) + if sside < self.min_size + 2: + continue + box = np.array(box) + + box[:, 0] = np.clip( + np.round(box[:, 0] / width * dest_width), 0, dest_width) + box[:, 1] = np.clip( + np.round(box[:, 1] / height * dest_height), 0, dest_height) + + boxes.append(box.astype(np.int16)) + scores.append(score) + + class_indexes.append(class_index) + class_scores.append(class_score) + + if classes is None: + return np.array(boxes, dtype=np.int16), scores + else: + return np.array(boxes, dtype=np.int16), scores, class_indexes, class_scores + + def unclip(self, box): + unclip_ratio = self.unclip_ratio + 
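+        # 按 DB 的 unclip 公式 distance = A * r / L(A 为多边形面积,L 为周长,
+        # r 为 unclip_ratio)计算外扩距离,再用 pyclipper 将收缩后的检测轮廓
+        # 向外偏移该距离,近似还原完整的文本区域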
poly = Polygon(box) + distance = poly.area * unclip_ratio / poly.length + offset = pyclipper.PyclipperOffset() + offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON) + expanded = np.array(offset.Execute(distance)) + return expanded + + def get_mini_boxes(self, contour): + bounding_box = cv2.minAreaRect(contour) + points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0]) + + index_1, index_2, index_3, index_4 = 0, 1, 2, 3 + if points[1][1] > points[0][1]: + index_1 = 0 + index_4 = 1 + else: + index_1 = 1 + index_4 = 0 + if points[3][1] > points[2][1]: + index_2 = 2 + index_3 = 3 + else: + index_2 = 3 + index_3 = 2 + + box = [ + points[index_1], points[index_2], points[index_3], points[index_4] + ] + return box, min(bounding_box[1]) + + def box_score_fast(self, bitmap, _box, classes): + ''' + box_score_fast: use bbox mean score as the mean score + ''' + h, w = bitmap.shape[:2] + box = _box.copy() + xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int32), 0, w - 1) + xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int32), 0, w - 1) + ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int32), 0, h - 1) + ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int32), 0, h - 1) + + mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) + box[:, 0] = box[:, 0] - xmin + box[:, 1] = box[:, 1] - ymin + cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1) + + if classes is None: + return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0], None, None + else: + k = 999 + class_mask = np.full((ymax - ymin + 1, xmax - xmin + 1), k, dtype=np.int32) + + cv2.fillPoly(class_mask, box.reshape(1, -1, 2).astype(np.int32), 0) + classes = classes[ymin:ymax + 1, xmin:xmax + 1] + + new_classes = classes + class_mask + a = new_classes.reshape(-1) + b = np.where(a >= k) + classes = np.delete(a, b[0].tolist()) + + class_index = np.argmax(np.bincount(classes)) + class_score = np.sum(classes == class_index) / len(classes) + + return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0], class_index, class_score + + def box_score_slow(self, bitmap, contour, classes): + """ + box_score_slow: use polyon mean score as the mean score + """ + h, w = bitmap.shape[:2] + contour = contour.copy() + contour = np.reshape(contour, (-1, 2)) + + xmin = np.clip(np.min(contour[:, 0]), 0, w - 1) + xmax = np.clip(np.max(contour[:, 0]), 0, w - 1) + ymin = np.clip(np.min(contour[:, 1]), 0, h - 1) + ymax = np.clip(np.max(contour[:, 1]), 0, h - 1) + + mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) + + contour[:, 0] = contour[:, 0] - xmin + contour[:, 1] = contour[:, 1] - ymin + + cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype(np.int32), 1) + + if classes is None: + return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0], None, None + else: + k = 999 + class_mask = np.full((ymax - ymin + 1, xmax - xmin + 1), k, dtype=np.int32) + + cv2.fillPoly(class_mask, contour.reshape(1, -1, 2).astype(np.int32), 0) + classes = classes[ymin:ymax + 1, xmin:xmax + 1] + + new_classes = classes + class_mask + a = new_classes.reshape(-1) + b = np.where(a >= k) + classes = np.delete(a, b[0].tolist()) + + class_index = np.argmax(np.bincount(classes)) + class_score = np.sum(classes == class_index) / len(classes) + + return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0], class_index, class_score + + def __call__(self, outs_dict, shape_list): + pred = outs_dict['maps'] + if isinstance(pred, paddle.Tensor): + pred = pred.numpy() + pred = pred[:, 0, :, :] + segmentation = 
pred > self.thresh + + if "classes" in outs_dict: + classes = outs_dict['classes'] + if isinstance(classes, paddle.Tensor): + classes = classes.numpy() + classes = classes[:, 0, :, :] + + else: + classes = None + + boxes_batch = [] + for batch_index in range(pred.shape[0]): + src_h, src_w, ratio_h, ratio_w = shape_list[batch_index] + if self.dilation_kernel is not None: + mask = cv2.dilate( + np.array(segmentation[batch_index]).astype(np.uint8), + self.dilation_kernel) + else: + mask = segmentation[batch_index] + + if classes is None: + boxes, scores = self.boxes_from_bitmap(pred[batch_index], mask, None, + src_w, src_h) + boxes_batch.append({'points': boxes}) + else: + boxes, scores, class_indexes, class_scores = self.boxes_from_bitmap(pred[batch_index], mask, + classes[batch_index], + src_w, src_h) + boxes_batch.append({'points': boxes, "classes": class_indexes, "class_scores": class_scores}) + + return boxes_batch +``` + +### 4.4. 模型启动 + +在完成上述步骤后我们就可以正常启动训练 + +```bash linenums="1" +!python /home/aistudio/work/PaddleOCR/tools/train.py -c /home/aistudio/work/PaddleOCR/configs/det/det_mv3_db.yml +``` + +其他命令: + +```bash linenums="1" +!python /home/aistudio/work/PaddleOCR/tools/eval.py -c /home/aistudio/work/PaddleOCR/configs/det/det_mv3_db.yml +!python /home/aistudio/work/PaddleOCR/tools/infer_det.py -c /home/aistudio/work/PaddleOCR/configs/det/det_mv3_db.yml +``` + +模型推理 + +```bash linenums="1" +!python /home/aistudio/work/PaddleOCR/tools/infer/predict_det.py --image_dir="/home/aistudio/work/test_img/" --det_model_dir="/home/aistudio/work/PaddleOCR/output/infer" +``` + +## 5 总结 + +1. 分类+检测在一定程度上能够缩短用时,具体的模型选取要根据业务场景恰当选择。 +2. 数据标注需要多次进行测试调整标注方法,一般进行检测模型微调,需要标注至少上百张。 +3. 设置合理的batch_size以及resize大小,同时注意lr设置。 + +## References + +1. +2. +3. diff --git "a/docs/applications/\346\211\213\345\206\231\346\226\207\345\255\227\350\257\206\345\210\253.md" "b/docs/applications/\346\211\213\345\206\231\346\226\207\345\255\227\350\257\206\345\210\253.md" new file mode 100644 index 0000000000..f68f4634d9 --- /dev/null +++ "b/docs/applications/\346\211\213\345\206\231\346\226\207\345\255\227\350\257\206\345\210\253.md" @@ -0,0 +1,242 @@ +--- +typora-copy-images-to: images +comments: true +--- + + +# 基于PP-OCRv3的手写文字识别 + +## 1. 项目背景及意义 + +目前光学字符识别(OCR)技术在我们的生活当中被广泛使用,但是大多数模型在通用场景下的准确性还有待提高。针对于此我们借助飞桨提供的PaddleOCR套件较容易的实现了在垂类场景下的应用。手写体在日常生活中较为常见,然而手写体的识别却存在着很大的挑战,因为每个人的手写字体风格不一样,这对于视觉模型来说还是相当有挑战的。因此训练一个手写体识别模型具有很好的现实意义。下面给出一些手写体的示例图: + +![example](./images/7a8865b2836f42d382e7c3fdaedc4d307d797fa2bcd0466e9f8b7705efff5a7b.png) + +## 2. 项目内容 + +本项目基于PaddleOCR套件,以PP-OCRv3识别模型为基础,针对手写文字识别场景进行优化。 + +Aistudio项目链接:[OCR手写文字识别](https://aistudio.baidu.com/aistudio/projectdetail/4330587) + +## 3. PP-OCRv3识别算法介绍 + +PP-OCRv3的识别模块是基于文本识别算法[SVTR](https://arxiv.org/abs/2205.00159)优化。SVTR不再采用RNN结构,通过引入Transformers结构更加有效地挖掘文本行图像的上下文信息,从而提升文本识别能力。如下图所示,PP-OCRv3采用了6个优化策略。 + +![v3_rec](./images/d4f5344b5b854d50be738671598a89a45689c6704c4d481fb904dd7cf72f2a1a-20240704185905678.jpg) + +优化策略汇总如下: + +* SVTR_LCNet:轻量级文本识别网络 +* GTC:Attention指导CTC训练策略 +* TextConAug:挖掘文字上下文信息的数据增广策略 +* TextRotNet:自监督的预训练模型 +* UDML:联合互学习策略 +* UIM:无标注数据挖掘方案 + +详细优化策略描述请参考[PP-OCRv3优化策略](../ppocr/blog/PP-OCRv3_introduction.md#3-识别优化) + +## 4. 安装环境 + +```bash linenums="1" +# 首先git官方的PaddleOCR项目,安装需要的依赖 +git clone https://github.com/PaddlePaddle/PaddleOCR.git +cd PaddleOCR +pip install -r requirements.txt +``` + +## 5. 
数据准备 + +本项目使用公开的手写文本识别数据集,包含Chinese OCR, 中科院自动化研究所-手写中文数据集[CASIA-HWDB2.x](http://www.nlpr.ia.ac.cn/databases/handwriting/Download.html),以及由中科院手写数据和网上开源数据合并组合的[数据集](https://aistudio.baidu.com/aistudio/datasetdetail/102884/0)等,该项目已经挂载处理好的数据集,可直接下载使用进行训练。 + +```bash linenums="1" +下载并解压数据 +tar -xf hw_data.tar +``` + +## 6. 模型训练 + +### 6.1 下载预训练模型 + +首先需要下载我们需要的PP-OCRv3识别预训练模型,更多选择请自行选择其他的[文字识别模型](../ppocr/model_list.md) + +```bash linenums="1" +# 使用该指令下载需要的预训练模型 +wget -P ./pretrained_models/ https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar +# 解压预训练模型文件 +tar -xf ./pretrained_models/ch_PP-OCRv3_rec_train.tar -C pretrained_models +``` + +### 6.2 修改配置文件 + +我们使用`configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml`,主要修改训练轮数和学习率参相关参数,设置预训练模型路径,设置数据集路径。 另外,batch_size可根据自己机器显存大小进行调整。 具体修改如下几个地方: + +```yaml linenums="1" + epoch_num: 100 # 训练epoch数 + save_model_dir: ./output/ch_PP-OCR_v3_rec + save_epoch_step: 10 + eval_batch_step: [0, 100] # 评估间隔,每隔100step评估一次 + pretrained_model: ./pretrained_models/ch_PP-OCRv3_rec_train/best_accuracy # 预训练模型路径 + + + lr: + name: Cosine # 修改学习率衰减策略为Cosine + learning_rate: 0.0001 # 修改fine-tune的学习率 + warmup_epoch: 2 # 修改warmup轮数 + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data # 训练集图片路径 + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/chineseocr-data/rec_hand_line_all_label_train.txt # 训练集标签 + - ./train_data/handwrite/HWDB2.0Train_label.txt + - ./train_data/handwrite/HWDB2.1Train_label.txt + - ./train_data/handwrite/HWDB2.2Train_label.txt + - ./train_data/handwrite/hwdb_ic13/handwriting_hwdb_train_labels.txt + - ./train_data/handwrite/HW_Chinese/train_hw.txt + ratio_list: + - 0.1 + - 1.0 + - 1.0 + - 1.0 + - 0.02 + - 1.0 + loader: + shuffle: true + batch_size_per_card: 64 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data # 测试集图片路径 + label_file_list: + - ./train_data/chineseocr-data/rec_hand_line_all_label_val.txt # 测试集标签 + - ./train_data/handwrite/HWDB2.0Test_label.txt + - ./train_data/handwrite/HWDB2.1Test_label.txt + - ./train_data/handwrite/HWDB2.2Test_label.txt + - ./train_data/handwrite/hwdb_ic13/handwriting_hwdb_val_labels.txt + - ./train_data/handwrite/HW_Chinese/test_hw.txt + loader: + shuffle: false + drop_last: false + batch_size_per_card: 64 + num_workers: 4 +``` + +由于数据集大多是长文本,因此需要**注释**掉下面的数据增广策略,以便训练出更好的模型。 + +```yaml linenums="1" +- RecConAug: + prob: 0.5 + ext_data_num: 2 + image_shape: [48, 320, 3] +``` + +### 6.3 开始训练 + +我们使用上面修改好的配置文件`configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml`,预训练模型,数据集路径,学习率,训练轮数等都已经设置完毕后,可以使用下面命令开始训练。 + +```bash linenums="1" +# 开始训练识别模型 +python tools/train.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml + +``` + +## 7. 
模型评估 + +在训练之前,我们可以直接使用下面命令来评估预训练模型的效果: + +```bash linenums="1" +# 评估预训练模型 +python tools/eval.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml -o Global.pretrained_model="./pretrained_models/ch_PP-OCRv3_rec_train/best_accuracy" +``` + +``` +[2022/07/14 10:46:22] ppocr INFO: load pretrain successful from ./pretrained_models/ch_PP-OCRv3_rec_train/best_accuracy +eval model:: 100%|████████████████████████████| 687/687 [03:29<00:00, 3.27it/s] +[2022/07/14 10:49:52] ppocr INFO: metric eval *************** +[2022/07/14 10:49:52] ppocr INFO: acc:0.03724954461811258 +[2022/07/14 10:49:52] ppocr INFO: norm_edit_dis:0.4859541065843199 +[2022/07/14 10:49:52] ppocr INFO: Teacher_acc:0.0371584699368947 +[2022/07/14 10:49:52] ppocr INFO: Teacher_norm_edit_dis:0.48718814890536477 +[2022/07/14 10:49:52] ppocr INFO: fps:947.8562684823883 +``` + +可以看出,直接加载预训练模型进行评估,效果较差,因为预训练模型并不是基于手写文字进行单独训练的,所以我们需要基于预训练模型进行finetune。 +训练完成后,可以进行测试评估,评估命令如下: + +```bash linenums="1" +# 评估finetune效果 +python tools/eval.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml -o Global.pretrained_model="./output/ch_PP-OCR_v3_rec/best_accuracy" + +``` + +评估结果如下,可以看出识别准确率为54.3%。 + +``` +[2022/07/14 10:54:06] ppocr INFO: metric eval *************** +[2022/07/14 10:54:06] ppocr INFO: acc:0.5430100180913 +[2022/07/14 10:54:06] ppocr INFO: norm_edit_dis:0.9203322593158589 +[2022/07/14 10:54:06] ppocr INFO: Teacher_acc:0.5401183969626324 +[2022/07/14 10:54:06] ppocr INFO: Teacher_norm_edit_dis:0.919827504507755 +[2022/07/14 10:54:06] ppocr INFO: fps:928.948733797251 +``` + +如需获取已训练模型,请加入PaddleX官方交流频道,获取20G OCR学习大礼包(内含《动手学OCR》电子书、课程回放视频、前沿论文等重磅资料) + +* PaddleX官方交流频道: + +将下载或训练完成的模型放置在对应目录下即可完成模型推理 + +## 8. 模型导出推理 + +训练完成后,可以将训练模型转换成inference模型。inference 模型会额外保存模型的结构信息,在预测部署、加速推理上性能优越,灵活方便,适合于实际系统集成。 + +### 8.1 模型导出 + +导出命令如下: + +```bash linenums="1" +# 转化为推理模型 +python tools/export_model.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml -o Global.pretrained_model="./output/ch_PP-OCR_v3_rec/best_accuracy" Global.save_inference_dir="./inference/rec_ppocrv3/" + +``` + +### 8.2 模型推理 + +导出模型后,可以使用如下命令进行推理预测: + +```bash linenums="1" +# 推理预测 +python tools/infer/predict_rec.py --image_dir="train_data/handwrite/HWDB2.0Test_images/104-P16_4.jpg" --rec_model_dir="./inference/rec_ppocrv3/Student" +``` + +```bash linenums="1" +[2022/07/14 10:55:56] ppocr INFO: In PP-OCRv3, rec_image_shape parameter defaults to '3, 48, 320', if you are using recognition model with PP-OCRv2 or an older version, please set --rec_image_shape='3,32,320 +[2022/07/14 10:55:58] ppocr INFO: Predicts of train_data/handwrite/HWDB2.0Test_images/104-P16_4.jpg:('品结构,差异化的多品牌渗透使欧莱雅确立了其在中国化妆', 0.9904912114143372) +``` + +```python linenums="1" +# 可视化文字识别图片 +from PIL import Image +import matplotlib.pyplot as plt +import numpy as np +import os + + +img_path = 'train_data/handwrite/HWDB2.0Test_images/104-P16_4.jpg' + +def vis(img_path): + plt.figure() + image = Image.open(img_path) + plt.imshow(image) + plt.show() + # image = image.resize([208, 208]) + + +vis(img_path) +``` + +![res](./images/ad7c02745491498d82e0ce95f4a274f9b3920b2f467646858709359b7af9d869.png) diff --git "a/docs/applications/\346\211\253\346\217\217\345\220\210\345\220\214\345\205\263\351\224\256\344\277\241\346\201\257\346\217\220\345\217\226.md" "b/docs/applications/\346\211\253\346\217\217\345\220\210\345\220\214\345\205\263\351\224\256\344\277\241\346\201\257\346\217\220\345\217\226.md" new file mode 100644 index 0000000000..d70679c39f --- /dev/null +++ 
"b/docs/applications/\346\211\253\346\217\217\345\220\210\345\220\214\345\205\263\351\224\256\344\277\241\346\201\257\346\217\220\345\217\226.md" @@ -0,0 +1,280 @@ +--- +typora-copy-images-to: images +comments: true +--- + + +# 金融智能核验:扫描合同关键信息抽取 + +本案例将使用OCR技术和通用信息抽取技术,实现合同关键信息审核和比对。通过本章的学习,你可以快速掌握: + +1. 使用PaddleOCR提取扫描文本内容 +2. 使用PaddleNLP抽取自定义信息 + +点击进入 [AI Studio 项目](https://aistudio.baidu.com/aistudio/projectdetail/4545772) + +## 1. 项目背景 + +合同审核广泛应用于大中型企业、上市公司、证券、基金公司中,是规避风险的重要任务。 + +- 合同内容对比:合同审核场景中,快速找出不同版本合同修改区域、版本差异;如合同盖章归档场景中有效识别实际签署的纸质合同、电子版合同差异。 +- 合规性检查:法务人员进行合同审核,如合同完备性检查、大小写金额检查、签约主体一致性检查、双方权利和义务对等性分析等。 +- 风险点识别:通过合同审核可识别事实倾向型风险点和数值计算型风险点等,例如交付地点约定不明、合同总价款不一致、重要条款缺失等风险点。 + +![](./images/d5143df967fa4364a38868793fe7c57b0c0b1213930243babd6ae01423dcbc4d.png) + +传统业务中大多使用人工进行纸质版合同审核,存在成本高,工作量大,效率低的问题,且一旦出错将造成巨额损失。 + +本项目针对以上场景,使用PaddleOCR+PaddleNLP快速提取文本内容,经过少量数据微调即可准确抽取关键信息,**高效完成合同内容对比、合规性检查、风险点识别等任务,提高效率,降低风险**。 + +![](./images/54f3053e6e1b47a39b26e757006fe2c44910d60a3809422ab76c25396b92e69b-0096905.png) + +## 2. 解决方案 + +### 2.1 扫描合同文本内容提取 + +使用PaddleOCR开源的模型可以快速完成扫描文档的文本内容提取,在清晰文档上识别准确率可达到95%+。下面来快速体验一下: + +#### 2.1.1 环境准备 + +[PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)提供了适用于通用场景的高精轻量模型,提供数据预处理-模型推理-后处理全流程,支持pip安装: + +```bash linenums="1" +python -m pip install paddleocr +``` + +#### 2.1.2 效果测试 + +使用一张合同图片作为测试样本,感受ppocrv3模型效果: + +![](./images/46258d0dc9dc40bab3ea0e70434e4a905646df8a647f4c49921e217de5142def.jpeg) + +使用中文检测+识别模型提取文本,实例化PaddleOCR类: + +```python linenums="1" +from paddleocr import PaddleOCR, draw_ocr + +# paddleocr目前支持中英文、英文、法语、德语、韩语、日语等80个语种,可以通过修改lang参数进行切换 +ocr = PaddleOCR(use_angle_cls=False, lang="ch") # need to run only once to download and load model into memory +``` + +一行命令启动预测,预测结果包括`检测框`和`文本识别内容`: + +```python linenums="1" +img_path = "./test_img/hetong2.jpg" +result = ocr.ocr(img_path, cls=False) +for line in result: + print(line) + +# 可视化结果 +from PIL import Image + +image = Image.open(img_path).convert('RGB') +boxes = [line[0] for line in result] +txts = [line[1][0] for line in result] +scores = [line[1][1] for line in result] +im_show = draw_ocr(image, boxes, txts, scores, font_path='./simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.show() +``` + +#### 2.1.3 图片预处理 + +通过上图可视化结果可以看到,印章部分造成的文本遮盖,影响了文本识别结果,因此可以考虑通道提取,去除图片中的红色印章: + +```python linenums="1" +import cv2 +import numpy as np +import matplotlib.pyplot as plt + +#读入图像,三通道 +image=cv2.imread("./test_img/hetong2.jpg",cv2.IMREAD_COLOR) #timg.jpeg + +#获得三个通道 +Bch,Gch,Rch=cv2.split(image) + +#保存三通道图片 +cv2.imwrite('blue_channel.jpg',Bch) +cv2.imwrite('green_channel.jpg',Gch) +cv2.imwrite('red_channel.jpg',Rch) +``` + +#### 2.1.4 合同文本信息提取 + +经过2.1.3的预处理后,合同照片的红色通道被分离,获得了一张相对更干净的图片,此时可以再次使用ppocr模型提取文本内容: + +```python linenums="1" +import numpy as np +import cv2 + + +img_path = './red_channel.jpg' +result = ocr.ocr(img_path, cls=False) + +# 可视化结果 +from PIL import Image + +image = Image.open(img_path).convert('RGB') +boxes = [line[0] for line in result] +txts = [line[1][0] for line in result] +scores = [line[1][1] for line in result] +im_show = draw_ocr(image, boxes, txts, scores, font_path='./simfang.ttf') +im_show = Image.fromarray(im_show) +vis = np.array(im_show) +im_show.show() +``` + +忽略检测框内容,提取完整的合同文本: + +```python linenums="1" +txts = [line[1][0] for line in result] +all_context = "\n".join(txts) +print(all_context) +``` + +通过以上环节就完成了扫描合同关键信息抽取的第一步:文本内容提取,接下来可以基于识别出的文本内容抽取关键信息 + +### 2.2 合同关键信息抽取 + +#### 2.2.1 环境准备 + +安装PaddleNLP + +```bash linenums="1" +pip install --upgrade 
pip +pip install --upgrade paddlenlp +``` + +#### 2.2.2 合同关键信息抽取 + +PaddleNLP 使用 Taskflow 统一管理多场景任务的预测功能,其中`information_extraction` 通过大量的有标签样本进行训练,在通用的场景中一般可以直接使用,只需更换关键字即可。例如在合同信息抽取中,我们重新定义抽取关键字: + +甲方、乙方、币种、金额、付款方式 + +将使用OCR提取好的文本作为输入,使用三行命令可以对上文中提取到的合同文本进行关键信息抽取: + +```python linenums="1" +from paddlenlp import Taskflow +schema = ["甲方","乙方","总价"] +ie = Taskflow('information_extraction', schema=schema) +ie.set_schema(schema) +ie(all_context) +``` + +可以看到UIE模型可以准确的提取出关键信息,用于后续的信息比对或审核。 + +## 3.效果优化 + +### 3.1 文本识别后处理调优 + +实际图片采集过程中,可能出现部分图片弯曲等问题,导致使用默认参数识别文本时存在漏检,影响关键信息获取。 + +例如下图: + +![](./images/fe350481be0241c58736d487d1bf06c2e65911bf01254a79944be629c4c10091.jpeg) + +直接进行预测: + +```python linenums="1" +img_path = "./test_img/hetong3.jpg" +# 预测结果 +result = ocr.ocr(img_path, cls=False) +# 可视化结果 +from PIL import Image + +image = Image.open(img_path).convert('RGB') +boxes = [line[0] for line in result] +txts = [line[1][0] for line in result] +scores = [line[1][1] for line in result] +im_show = draw_ocr(image, boxes, txts, scores, font_path='./simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.show() +``` + +可视化结果可以看到,弯曲图片存在漏检,一般来说可以通过调整后处理参数解决,无需重新训练模型。漏检问题往往是因为检测模型获得的分割图太小,生成框的得分过低被过滤掉了,通常有两种方式调整参数: + +- 开启`use_dilatiion=True` 膨胀分割区域 +- 调小`det_db_box_thresh`阈值 + +```python linenums="1" +# 重新实例化 PaddleOCR +ocr = PaddleOCR(use_angle_cls=False, lang="ch", det_db_box_thresh=0.3, use_dilation=True) + +# 预测并可视化 +img_path = "./test_img/hetong3.jpg" +# 预测结果 +result = ocr.ocr(img_path, cls=False) +# 可视化结果 +image = Image.open(img_path).convert('RGB') +boxes = [line[0] for line in result] +txts = [line[1][0] for line in result] +scores = [line[1][1] for line in result] +im_show = draw_ocr(image, boxes, txts, scores, font_path='./simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.show() +``` + +可以看到漏检问题被很好的解决,提取完整的文本内容: + +```python linenums="1" +txts = [line[1][0] for line in result] +context = "\n".join(txts) +print(context) +``` + +### 3.2 关键信息提取调优 + +UIE通过大量有标签样本进行训练,得到了一个开箱即用的高精模型。 然而针对不同场景,可能会出现部分实体无法被抽取的情况。通常来说有以下几个方法进行效果调优: + +- 修改 schema +- 添加正则方法 +- 标注小样本微调模型 + +**修改schema** + +Prompt和原文描述越像,抽取效果越好,例如 + +```text linenums="1" +三:合同价格:总价为人民币大写:参拾玖万捌仟伍佰 +元,小写:398500.00元。总价中包括站房工程建设、安装 +及相关避雷、消防、接地、电力、材料费、检验费、安全、 +验收等所需费用及其他相关费用和税金。 +``` + +schema = ["总金额"] 时无法准确抽取,与原文描述差异较大。 修改 schema = ["总价"] 再次尝试: + +```python linenums="1" +from paddlenlp import Taskflow +# schema = ["总金额"] +schema = ["总价"] +ie = Taskflow('information_extraction', schema=schema) +ie.set_schema(schema) +ie(all_context) +``` + +**模型微调** +UIE的建模方式主要是通过 `Prompt` 方式来建模, `Prompt` 在小样本上进行微调效果非常有效。详细的数据标注+模型微调步骤可以参考项目: + +[PaddleNLP信息抽取技术重磅升级!](https://aistudio.baidu.com/aistudio/projectdetail/3914778?channelType=0&channel=0) + +[工单信息抽取](https://aistudio.baidu.com/aistudio/projectdetail/3914778?contributionType=1) + +[快递单信息抽取](https://aistudio.baidu.com/aistudio/projectdetail/4038499?contributionType=1) + +## 总结 + +扫描合同的关键信息提取可以使用 PaddleOCR + PaddleNLP 组合实现,两个工具均有以下优势: + +- 使用简单:whl包一键安装,3行命令调用 +- 效果领先:优秀的模型效果可覆盖几乎全部的应用场景 +- 调优成本低:OCR模型可通过后处理参数的调整适配略有偏差的扫描文本, UIE模型可以通过极少的标注样本微调,成本很低。 + +## 作业 + +尝试自己解析出 `test_img/homework.png` 扫描合同中的 [甲方、乙方] 关键词: + +![](./images/50a49a3c9f8348bfa04e8c8b97d3cce0d0dd6b14040f43939268d120688ef7ca.jpg) + +更多场景下的垂类模型获取,请加入PaddleX官方交流频道,获取20G OCR学习大礼包(内含《动手学OCR》电子书、课程回放视频、前沿论文等重磅资料) + +- PaddleX官方交流频道: diff --git "a/docs/applications/\346\266\262\346\231\266\345\261\217\350\257\273\346\225\260\350\257\206\345\210\253.md" 
"b/docs/applications/\346\266\262\346\231\266\345\261\217\350\257\273\346\225\260\350\257\206\345\210\253.md" new file mode 100644 index 0000000000..9922a055f4 --- /dev/null +++ "b/docs/applications/\346\266\262\346\231\266\345\261\217\350\257\273\346\225\260\350\257\206\345\210\253.md" @@ -0,0 +1,642 @@ +--- +typora-copy-images-to: images +comments: true +--- + + +# 基于PP-OCRv3的液晶屏读数识别 + +## 1. 项目背景及意义 + +目前光学字符识别(OCR)技术在我们的生活当中被广泛使用,但是大多数模型在通用场景下的准确性还有待提高,针对于此我们借助飞桨提供的PaddleOCR套件较容易的实现了在垂类场景下的应用。 + +该项目以国家质量基础(NQI)为准绳,充分利用大数据、云计算、物联网等高新技术,构建覆盖计量端、实验室端、数据端和硬件端的完整计量解决方案,解决传统计量校准中存在的难题,拓宽计量检测服务体系和服务领域;解决无数传接口或数传接口不统一、不公开的计量设备,以及计量设备所处的环境比较恶劣,不适合人工读取数据。通过OCR技术实现远程计量,引领计量行业向智慧计量转型和发展。 + +## 2. 项目内容 + +本项目基于PaddleOCR开源套件,以PP-OCRv3检测和识别模型为基础,针对液晶屏读数识别场景进行优化。 + +Aistudio项目链接:[OCR液晶屏读数识别](https://aistudio.baidu.com/aistudio/projectdetail/4080130) + +## 3. 安装环境 + +```bash linenums="1" +# 首先git官方的PaddleOCR项目,安装需要的依赖 +# 第一次运行打开该注释 +# git clone https://gitee.com/PaddlePaddle/PaddleOCR.git +cd PaddleOCR +pip install -r requirements.txt +``` + +## 4. 文字检测 + +文本检测的任务是定位出输入图像中的文字区域。近年来学术界关于文本检测的研究非常丰富,一类方法将文本检测视为目标检测中的一个特定场景,基于通用目标检测算法进行改进适配,如TextBoxes[1]基于一阶段目标检测器SSD[2]算法,调整目标框使之适合极端长宽比的文本行,CTPN[3]则是基于Faster RCNN[4]架构改进而来。但是文本检测与目标检测在目标信息以及任务本身上仍存在一些区别,如文本一般长宽比较大,往往呈“条状”,文本行之间可能比较密集,弯曲文本等,因此又衍生了很多专用于文本检测的算法。本项目基于PP-OCRv3算法进行优化。 + +### 4.1 PP-OCRv3检测算法介绍 + +PP-OCRv3检测模型是对PP-OCRv2中的CML(Collaborative Mutual Learning) 协同互学习文本检测蒸馏策略进行了升级。如下图所示,CML的核心思想结合了①传统的Teacher指导Student的标准蒸馏与 ②Students网络之间的DML互学习,可以让Students网络互学习的同时,Teacher网络予以指导。PP-OCRv3分别针对教师模型和学生模型进行进一步效果优化。其中,在对教师模型优化时,提出了大感受野的PAN结构LK-PAN和引入了DML(Deep Mutual Learning)蒸馏策略;在对学生模型优化时,提出了残差注意力机制的FPN结构RSE-FPN。 +![](./images/c306b2f028364805a55494d435ab553a76cf5ae5dd3f4649a948ea9aeaeb28b8.png) + +详细优化策略描述请参考[PP-OCRv3优化策略](../ppocr/blog/PP-OCRv3_introduction.md#2-检测优化) + +### 4.2 数据准备 + +[计量设备屏幕字符检测数据集](https://aistudio.baidu.com/aistudio/datasetdetail/127845)数据来源于实际项目中各种计量设备的数显屏,以及在网上搜集的一些其他数显屏,包含训练集755张,测试集355张。 + +```bash linenums="1" +# 在PaddleOCR下创建新的文件夹train_data +mkdir train_data +# 下载数据集并解压到指定路径下 +unzip icdar2015.zip -d train_data +``` + +```python linenums="1" +# 随机查看文字检测数据集图片 +from PIL import Image +import matplotlib.pyplot as plt +import numpy as np +import os + + +train = './train_data/icdar2015/text_localization/test' +# 从指定目录中选取一张图片 +def get_one_image(train): + plt.figure() + files = os.listdir(train) + n = len(files) + ind = np.random.randint(0,n) + img_dir = os.path.join(train,files[ind]) + image = Image.open(img_dir) + plt.imshow(image) + plt.show() + image = image.resize([208, 208]) + +get_one_image(train) +``` + +![det_png](./images/0639da09b774458096ae577e82b2c59e89ced6a00f55458f946997ab7472a4f8.jpeg) + +### 4.3 模型训练 + +#### 4.3.1 预训练模型直接评估 + +下载我们需要的PP-OCRv3检测预训练模型,更多选择请自行选择其他的[文字检测模型](../ppocr/model_list.md#1-文本检测模型) + +```bash linenums="1" +#使用该指令下载需要的预训练模型 +wget -P ./pretrained_models/ https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar +# 解压预训练模型文件 +tar -xf ./pretrained_models/ch_PP-OCRv3_det_distill_train.tar -C pretrained_models +``` + +在训练之前,我们可以直接使用下面命令来评估预训练模型的效果: + +```bash linenums="1" +# 评估预训练模型 +python tools/eval.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.pretrained_model="./pretrained_models/ch_PP-OCRv3_det_distill_train/best_accuracy" +``` + +结果如下: + +| | 方案 |hmeans| +|---|---------------------------|---| +| 0 | PP-OCRv3中英文超轻量检测预训练模型直接预测 |47.50%| + +#### 4.3.2 预训练模型直接finetune + +##### 修改配置文件 + 
+我们使用configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml,主要修改训练轮数和学习率参相关参数,设置预训练模型路径,设置数据集路径。 另外,batch_size可根据自己机器显存大小进行调整。 具体修改如下几个地方: + +```yaml linenums="1" +epoch:100 +save_epoch_step:10 +eval_batch_step:[0, 50] +save_model_dir: ./output/ch_PP-OCR_v3_det/ +pretrained_model: ./pretrained_models/ch_PP-OCRv3_det_distill_train/best_accuracy +learning_rate: 0.00025 +num_workers: 0 # 如果单卡训练,建议将Train和Eval的loader部分的num_workers设置为0,否则会出现`/dev/shm insufficient`的报错 +``` + +##### 开始训练 + +使用我们上面修改的配置文件configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml,训练命令如下: + +```bash linenums="1" +# 开始训练模型 +python tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.pretrained_model=./pretrained_models/ch_PP-OCRv3_det_distill_train/best_accuracy +``` + +评估训练好的模型: + +```bash linenums="1" +# 评估训练好的模型 +python tools/eval.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.pretrained_model="./output/ch_PP-OCR_v3_det/best_accuracy" +``` + +结果如下: + +| | 方案 |hmeans| +|---|---------------------------|---| +| 0 | PP-OCRv3中英文超轻量检测预训练模型直接预测 |47.50%| +| 1 | PP-OCRv3中英文超轻量检测预训练模型fintune |65.20%| + +#### 4.3.3 基于预训练模型Finetune_student模型 + +我们使用configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml,主要修改训练轮数和学习率参相关参数,设置预训练模型路径,设置数据集路径。 另外,batch_size可根据自己机器显存大小进行调整。 具体修改如下几个地方: + +```yaml linenums="1" +epoch:100 +save_epoch_step:10 +eval_batch_step:[0, 50] +save_model_dir: ./output/ch_PP-OCR_v3_det_student/ +pretrained_model: ./pretrained_models/ch_PP-OCRv3_det_distill_train/student +learning_rate: 0.00025 +num_workers: 0 # 如果单卡训练,建议将Train和Eval的loader部分的num_workers设置为0,否则会出现`/dev/shm insufficient`的报错 +``` + +训练命令如下: + +```bash linenums="1" +python tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml -o Global.pretrained_model=./pretrained_models/ch_PP-OCRv3_det_distill_train/student +``` + +评估训练好的模型: + +```bash linenums="1" +# 评估训练好的模型 +python tools/eval.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml -o Global.pretrained_model="./output/ch_PP-OCR_v3_det_student/best_accuracy" +``` + +结果如下: + +| | 方案 |hmeans| +|---|---------------------------|---| +| 0 | PP-OCRv3中英文超轻量检测预训练模型直接预测 |47.50%| +| 1 | PP-OCRv3中英文超轻量检测预训练模型fintune |65.20%| +| 2 | PP-OCRv3中英文超轻量检测预训练模型fintune学生模型 |80.00%| + +#### 4.3.4 基于预训练模型Finetune_teacher模型 + +首先需要从提供的预训练模型best_accuracy.pdparams中提取teacher参数,组合成适合dml训练的初始化模型,提取代码如下: + +```python linenums="1" +cd ./pretrained_models/ +# transform teacher params in best_accuracy.pdparams into teacher_dml.paramers +import paddle + +# load pretrained model +all_params = paddle.load("ch_PP-OCRv3_det_distill_train/best_accuracy.pdparams") +# print(all_params.keys()) + +# keep teacher params +t_params = {key[len("Teacher."):]: all_params[key] for key in all_params if "Teacher." in key} + +# print(t_params.keys()) + +s_params = {"Student." + key: t_params[key] for key in t_params} +s2_params = {"Student2." 
+ key: t_params[key] for key in t_params} +s_params = {**s_params, **s2_params} +# print(s_params.keys()) + +paddle.save(s_params, "ch_PP-OCRv3_det_distill_train/teacher_dml.pdparams") + +``` + +我们使用configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml,主要修改训练轮数和学习率参相关参数,设置预训练模型路径,设置数据集路径。 另外,batch_size可根据自己机器显存大小进行调整。 具体修改如下几个地方: + +```yaml linenums="1" +epoch:100 +save_epoch_step:10 +eval_batch_step:[0, 50] +save_model_dir: ./output/ch_PP-OCR_v3_det_teacher/ +pretrained_model: ./pretrained_models/ch_PP-OCRv3_det_distill_train/teacher_dml +learning_rate: 0.00025 +num_workers: 0 # 如果单卡训练,建议将Train和Eval的loader部分的num_workers设置为0,否则会出现`/dev/shm insufficient`的报错 +``` + +训练命令如下: + +```bash linenums="1" +python tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml -o Global.pretrained_model=./pretrained_models/ch_PP-OCRv3_det_distill_train/teacher_dml +``` + +评估训练好的模型: + +```bash linenums="1" +# 评估训练好的模型 +python tools/eval.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml -o Global.pretrained_model="./output/ch_PP-OCR_v3_det_teacher/best_accuracy" +``` + +结果如下: + +| | 方案 |hmeans| +|---|---------------------------|---| +| 0 | PP-OCRv3中英文超轻量检测预训练模型直接预测 |47.50%| +| 1 | PP-OCRv3中英文超轻量检测预训练模型fintune |65.20%| +| 2 | PP-OCRv3中英文超轻量检测预训练模型fintune学生模型 |80.00%| +| 3 | PP-OCRv3中英文超轻量检测预训练模型fintune教师模型 |84.80%| + +#### 4.3.5 采用CML蒸馏进一步提升student模型精度 + +需要从4.3.3和4.3.4训练得到的best_accuracy.pdparams中提取各自代表student和teacher的参数,组合成适合cml训练的初始化模型,提取代码如下: + +```python linenums="1" +# transform teacher params and student parameters into cml model +import paddle + +all_params = paddle.load("./pretrained_models/ch_PP-OCRv3_det_distill_train/best_accuracy.pdparams") +# print(all_params.keys()) + +t_params = paddle.load("./output/ch_PP-OCR_v3_det_teacher/best_accuracy.pdparams") +# print(t_params.keys()) + +s_params = paddle.load("./output/ch_PP-OCR_v3_det_student/best_accuracy.pdparams") +# print(s_params.keys()) + +for key in all_params: + # teacher is OK + if "Teacher." in key: + new_key = key.replace("Teacher", "Student") + #print("{} >> {}\n".format(key, new_key)) + assert all_params[key].shape == t_params[new_key].shape + all_params[key] = t_params[new_key] + + if "Student." in key: + new_key = key.replace("Student.", "") + #print("{} >> {}\n".format(key, new_key)) + assert all_params[key].shape == s_params[new_key].shape + all_params[key] = s_params[new_key] + + if "Student2." 
in key: + new_key = key.replace("Student2.", "") + print("{} >> {}\n".format(key, new_key)) + assert all_params[key].shape == s_params[new_key].shape + all_params[key] = s_params[new_key] + +paddle.save(all_params, "./pretrained_models/ch_PP-OCRv3_det_distill_train/teacher_cml_student.pdparams") +``` + +训练命令如下: + +```bash linenums="1" +python tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.pretrained_model=./pretrained_models/ch_PP-OCRv3_det_distill_train/teacher_cml_student Global.save_model_dir=./output/ch_PP-OCR_v3_det_finetune/ +``` + +评估训练好的模型: + +```bash linenums="1" +# 评估训练好的模型 +python tools/eval.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.pretrained_model="./output/ch_PP-OCR_v3_det_finetune/best_accuracy" +``` + +结果如下: + +| | 方案 |hmeans| +|---|---------------------------|---| +| 0 | PP-OCRv3中英文超轻量检测预训练模型直接预测 |47.50%| +| 1 | PP-OCRv3中英文超轻量检测预训练模型fintune |65.20%| +| 2 | PP-OCRv3中英文超轻量检测预训练模型fintune学生模型 |80.00%| +| 3 | PP-OCRv3中英文超轻量检测预训练模型fintune教师模型 |84.80%| +| 4 | 基于2和3训练好的模型fintune |82.70%| + +如需获取已训练模型,请加入PaddleX官方交流频道,获取20G OCR学习大礼包(内含《动手学OCR》电子书、课程回放视频、前沿论文等重磅资料) + +- PaddleX官方交流频道: + +将下载或训练完成的模型放置在对应目录下即可完成模型推理 + +#### 4.3.6 模型导出推理 + +训练完成后,可以将训练模型转换成inference模型。inference 模型会额外保存模型的结构信息,在预测部署、加速推理上性能优越,灵活方便,适合于实际系统集成。 + +##### 4.3.6.1 模型导出 + +导出命令如下: + +```bash linenums="1" +# 转化为推理模型 +python tools/export_model.py \ +-c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml \ +-o Global.pretrained_model=./output/ch_PP-OCR_v3_det_finetune/best_accuracy \ +-o Global.save_inference_dir="./inference/det_ppocrv3" + +``` + +##### 4.3.6.2 模型推理 + +导出模型后,可以使用如下命令进行推理预测: + +```bash linenums="1" +# 推理预测 +python tools/infer/predict_det.py --image_dir="train_data/icdar2015/text_localization/test/1.jpg" --det_model_dir="./inference/det_ppocrv3/Student" +``` + +## 5. 
文字识别 + +文本识别的任务是识别出图像中的文字内容,一般输入来自于文本检测得到的文本框截取出的图像文字区域。文本识别一般可以根据待识别文本形状分为规则文本识别和不规则文本识别两大类。规则文本主要指印刷字体、扫描文本等,文本大致处在水平线位置;不规则文本往往不在水平位置,存在弯曲、遮挡、模糊等问题。不规则文本场景具有很大的挑战性,也是目前文本识别领域的主要研究方向。本项目基于PP-OCRv3算法进行优化。 + +### 5.1 PP-OCRv3识别算法介绍 + +PP-OCRv3的识别模块是基于文本识别算法[SVTR](https://arxiv.org/abs/2205.00159)优化。SVTR不再采用RNN结构,通过引入Transformers结构更加有效地挖掘文本行图像的上下文信息,从而提升文本识别能力。如下图所示,PP-OCRv3采用了6个优化策略。 +![](./images/d4f5344b5b854d50be738671598a89a45689c6704c4d481fb904dd7cf72f2a1a.png) + +优化策略汇总如下: + +- SVTR_LCNet:轻量级文本识别网络 +- GTC:Attention指导CTC训练策略 +- TextConAug:挖掘文字上下文信息的数据增广策略 +- TextRotNet:自监督的预训练模型 +- UDML:联合互学习策略 +- UIM:无标注数据挖掘方案 + +详细优化策略描述请参考[PP-OCRv3优化策略](../ppocr/blog/PP-OCRv3_introduction.md#3-识别优化) + +### 5.2 数据准备 + +[计量设备屏幕字符识别数据集](https://aistudio.baidu.com/aistudio/datasetdetail/128714)数据来源于实际项目中各种计量设备的数显屏,以及在网上搜集的一些其他数显屏,包含训练集19912张,测试集4099张。 + +```bash linenums="1" +# 解压下载的数据集到指定路径下 +unzip ic15_data.zip -d train_data +``` + +```python linenums="1" +# 随机查看文字检测数据集图片 +from PIL import Image +import matplotlib.pyplot as plt +import numpy as np +import os + +train = './train_data/ic15_data/train' +# 从指定目录中选取一张图片 +def get_one_image(train): + plt.figure() + files = os.listdir(train) + n = len(files) + ind = np.random.randint(0,n) + img_dir = os.path.join(train,files[ind]) + image = Image.open(img_dir) + plt.imshow(image) + plt.show() + image = image.resize([208, 208]) + +get_one_image(train) +``` + +![rec_png](./images/3de0d475c69746d0a184029001ef07c85fd68816d66d4beaa10e6ef60030f9b4.jpeg) + +### 5.3 模型训练 + +#### 下载预训练模型 + +下载我们需要的PP-OCRv3识别预训练模型,更多选择请自行选择其他的[文字识别模型](../ppocr/model_list.md#2-文本识别模型) + +```bash linenums="1" +# 使用该指令下载需要的预训练模型 +wget -P ./pretrained_models/ https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar +# 解压预训练模型文件 +tar -xf ./pretrained_models/ch_PP-OCRv3_rec_train.tar -C pretrained_models +``` + +#### 修改配置文件 + +我们使用configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml,主要修改训练轮数和学习率参相关参数,设置预训练模型路径,设置数据集路径。 另外,batch_size可根据自己机器显存大小进行调整。 具体修改如下几个地方: + +```yaml linenums="1" + epoch_num: 100 # 训练epoch数 + save_model_dir: ./output/ch_PP-OCR_v3_rec + save_epoch_step: 10 + eval_batch_step: [0, 100] # 评估间隔,每隔100step评估一次 + cal_metric_during_train: true + pretrained_model: ./pretrained_models/ch_PP-OCRv3_rec_train/best_accuracy # 预训练模型路径 + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + use_space_char: true # 使用空格 + + lr: + name: Cosine # 修改学习率衰减策略为Cosine + learning_rate: 0.0002 # 修改fine-tune的学习率 + warmup_epoch: 2 # 修改warmup轮数 + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ic15_data/ # 训练集图片路径 + ext_op_transform_idx: 1 + label_file_list: + - ./train_data/ic15_data/rec_gt_train.txt # 训练集标签 + ratio_list: + - 1.0 + loader: + shuffle: true + batch_size_per_card: 64 + drop_last: true + num_workers: 4 +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ic15_data/ # 测试集图片路径 + label_file_list: + - ./train_data/ic15_data/rec_gt_test.txt # 测试集标签 + ratio_list: + - 1.0 + loader: + shuffle: false + drop_last: false + batch_size_per_card: 64 + num_workers: 4 +``` + +在训练之前,我们可以直接使用下面命令来评估预训练模型的效果: + +```bash linenums="1" +# 评估预训练模型 +python tools/eval.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml -o Global.pretrained_model="./pretrained_models/ch_PP-OCRv3_rec_train/best_accuracy" +``` + +结果如下: + +| | 方案 |accuracy| +|---|---------------------------|---| +| 0 | PP-OCRv3中英文超轻量识别预训练模型直接预测 |70.40%| + +#### 开始训练 + +我们使用上面修改好的配置文件configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml,预训练模型,数据集路径,学习率,训练轮数等都已经设置完毕后,可以使用下面命令开始训练。 + +```bash 
linenums="1" +# 开始训练识别模型 +python tools/train.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml +``` + +训练完成后,可以对训练模型中最好的进行测试,评估命令如下: + +```bash linenums="1" +# 评估finetune效果 +python tools/eval.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml -o Global.checkpoints="./output/ch_PP-OCR_v3_rec/best_accuracy" +``` + +结果如下: + +| | 方案 |accuracy| +|---|---------------------------|---| +| 0 | PP-OCRv3中英文超轻量识别预训练模型直接预测 |70.40%| +| 1 | PP-OCRv3中英文超轻量识别预训练模型finetune |82.20%| + +如需获取已训练模型,请扫码填写问卷,加入PaddleOCR官方交流群获取全部OCR垂类模型下载链接、《动手学OCR》电子书等全套OCR学习资料🎁 + + + +将下载或训练完成的模型放置在对应目录下即可完成模型推理。 + +### 5.4 模型导出推理 + +训练完成后,可以将训练模型转换成inference模型。inference 模型会额外保存模型的结构信息,在预测部署、加速推理上性能优越,灵活方便,适合于实际系统集成。 + +#### 模型导出 + +导出命令如下: + +```bash linenums="1" +# 转化为推理模型 +python tools/export_model.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml -o Global.pretrained_model="./output/ch_PP-OCR_v3_rec/best_accuracy" Global.save_inference_dir="./inference/rec_ppocrv3/" +``` + +#### 模型推理 + +导出模型后,可以使用如下命令进行推理预测 + +```bash linenums="1" +# 推理预测 +python tools/infer/predict_rec.py --image_dir="train_data/ic15_data/test/1_crop_0.jpg" --rec_model_dir="./inference/rec_ppocrv3/Student" +``` + +## 6. 系统串联 + +我们将上面训练好的检测和识别模型进行系统串联测试,命令如下: + +```bash linenums="1" +#串联测试 +python3 tools/infer/predict_system.py --image_dir="./train_data/icdar2015/text_localization/test/142.jpg" --det_model_dir="./inference/det_ppocrv3/Student" --rec_model_dir="./inference/rec_ppocrv3/Student" +``` + +测试结果保存在`./inference_results/`目录下,可以用下面代码进行可视化 + +```bash linenums="1" +%cd /home/aistudio/PaddleOCR +# 显示结果 +import matplotlib.pyplot as plt +from PIL import Image +img_path= "./inference_results/142.jpg" +img = Image.open(img_path) +plt.figure("test_img", figsize=(30,30)) +plt.imshow(img) +plt.show() +``` + +![sys_res_png](./images/901ab741cb46441ebec510b37e63b9d8d1b7c95f63cc4e5e8757f35179ae6373-20240704185855034.png) + +### 6.1 后处理 + +如果需要获取key-value信息,可以基于启发式的规则,将识别结果与关键字库进行匹配;如果匹配上了,则取该字段为key, 后面一个字段为value。 + +```python linenums="1" +def postprocess(rec_res): + keys = ["型号", "厂家", "版本号", "检定校准分类", "计量器具编号", "烟尘流量", + "累积体积", "烟气温度", "动压", "静压", "时间", "试验台编号", "预测流速", + "全压", "烟温", "流速", "工况流量", "标杆流量", "烟尘直读嘴", "烟尘采样嘴", + "大气压", "计前温度", "计前压力", "干球温度", "湿球温度", "流量", "含湿量"] + key_value = [] + if len(rec_res) > 1: + for i in range(len(rec_res) - 1): + rec_str, _ = rec_res[i] + for key in keys: + if rec_str in key: + key_value.append([rec_str, rec_res[i + 1][0]]) + break + return key_value +key_value = postprocess(filter_rec_res) +``` + +## 7. 
PaddleServing部署 + +首先需要安装PaddleServing部署相关的环境 + +```bash linenums="1" +python -m pip install paddle-serving-server-gpu +python -m pip install paddle_serving_client +python -m pip install paddle-serving-app +``` + +### 7.1 转化检测模型 + +```bash linenums="1" +cd deploy/pdserving/ +python -m paddle_serving_client.convert --dirname ../../inference/det_ppocrv3/Student/ \ + --model_filename inference.pdmodel \ + --params_filename inference.pdiparams \ + --serving_server ./ppocr_det_v3_serving/ \ + --serving_client ./ppocr_det_v3_client/ +``` + +### 7.2 转化识别模型 + +```bash linenums="1" +python -m paddle_serving_client.convert --dirname ../../inference/rec_ppocrv3/Student \ + --model_filename inference.pdmodel \ + --params_filename inference.pdiparams \ + --serving_server ./ppocr_rec_v3_serving/ \ + --serving_client ./ppocr_rec_v3_client/ +``` + +### 7.3 启动服务 + +首先可以将后处理代码加入到web_service.py中,具体修改如下: + +```python linenums="1" +# 代码153行后面增加下面代码 +def _postprocess(rec_res): + keys = ["型号", "厂家", "版本号", "检定校准分类", "计量器具编号", "烟尘流量", + "累积体积", "烟气温度", "动压", "静压", "时间", "试验台编号", "预测流速", + "全压", "烟温", "流速", "工况流量", "标杆流量", "烟尘直读嘴", "烟尘采样嘴", + "大气压", "计前温度", "计前压力", "干球温度", "湿球温度", "流量", "含湿量"] + key_value = [] + if len(rec_res) > 1: + for i in range(len(rec_res) - 1): + rec_str, _ = rec_res[i] + for key in keys: + if rec_str in key: + key_value.append([rec_str, rec_res[i + 1][0]]) + break + return key_value +key_value = _postprocess(rec_list) +res = {"result": str(key_value)} +# res = {"result": str(result_list)} +``` + +启动服务端 + +```bash linenums="1" +python web_service.py 2>&1 >log.txt +``` + +### 7.4 发送请求 + +然后再开启一个新的终端,运行下面的客户端代码 + +```bash linenums="1" +python pipeline_http_client.py --image_dir ../../train_data/icdar2015/text_localization/test/142.jpg +``` + +可以获取到最终的key-value结果: + +```text linenums="1" +大气压, 100.07kPa +干球温度, 0000℃ +计前温度, 0000℃ +湿球温度, 0000℃ +计前压力, -0000kPa +流量, 00.0L/min +静压, 00000kPa +含湿量, 00.0 % +``` diff --git "a/docs/applications/\350\275\273\351\207\217\347\272\247\350\275\246\347\211\214\350\257\206\345\210\253.md" "b/docs/applications/\350\275\273\351\207\217\347\272\247\350\275\246\347\211\214\350\257\206\345\210\253.md" new file mode 100644 index 0000000000..de15aee6ff --- /dev/null +++ "b/docs/applications/\350\275\273\351\207\217\347\272\247\350\275\246\347\211\214\350\257\206\345\210\253.md" @@ -0,0 +1,815 @@ +--- +typora-copy-images-to: images +comments: true +--- + + +# 一种基于PaddleOCR的轻量级车牌识别模型 + +## 1. 项目介绍 + +车牌识别(Vehicle License Plate Recognition,VLPR) 是计算机视频图像识别技术在车辆牌照识别中的一种应用。车牌识别技术要求能够将运动中的汽车牌照从复杂背景中提取并识别出来,在高速公路车辆管理,停车场管理和城市交通中得到广泛应用。 + +本项目难点如下: + +1. 车牌在图像中的尺度差异大、在车辆上的悬挂位置不固定 +2. 车牌图像质量层次不齐: 角度倾斜、图片模糊、光照不足、过曝等问题严重 +3. 边缘和端测场景应用对模型大小有限制,推理速度有要求 + +针对以上问题, 本例选用 PP-OCRv3 这一开源超轻量OCR系统进行车牌识别系统的开发。基于PP-OCRv3模型,在CCPD数据集达到99%的检测和94%的识别精度,模型大小12.8M(2.5M+10.3M)。基于量化对模型体积进行进一步压缩到5.8M(1M+4.8M), 同时推理速度提升25%。 + +aistudio项目链接: [基于PaddleOCR的轻量级车牌识别范例](https://aistudio.baidu.com/aistudio/projectdetail/3919091?contributionType=1) + +## 2. 环境搭建 + +本任务基于Aistudio完成, 具体环境如下: + +- 操作系统: Linux +- PaddlePaddle: 2.3 +- paddleslim: 2.2.2 +- PaddleOCR: Release/2.5 + +下载 PaddleOCR代码 + +```bash linenums="1" +git clone -b dygraph https://github.com/PaddlePaddle/PaddleOCR +``` + +安装依赖库 + +```bash linenums="1" +pip install -r PaddleOCR/requirements.txt +``` + +## 3. 
数据集准备 + +所使用的数据集为 CCPD2020 新能源车牌数据集,该数据集为 + +该数据集分布如下: + +|数据集类型|数量| +|---|---| +|训练集| 5769| +|验证集| 1001| +|测试集| 5006| + +数据集图片示例如下: + +![](./images/3bce057a8e0c40a0acbd26b2e29e4e2590a31bc412764be7b9e49799c69cb91c.jpg) + +数据集可以从这里下载 + +下载好数据集后对数据集进行解压 + +```bash linenums="1" +unzip -d /home/aistudio/data /home/aistudio/data/data101595/CCPD2020.zip +``` + +### 3.1 数据集标注规则 + +CPPD数据集的图片文件名具有特殊规则,详细可查看: + +具体规则如下: + +例如: 025-95_113-154&383_386&473-386&473_177&454_154&383_363&402-0_0_22_27_27_33_16-37-15.jpg + +每个名称可以分为七个字段,以-符号作为分割。这些字段解释如下: + +- 025:车牌面积与整个图片区域的面积比。025 (25%) +- 95_113:水平倾斜程度和垂直倾斜度。水平 95度 垂直 113度 +- 154&383_386&473:左上和右下顶点的坐标。左上(154,383) 右下(386,473) +- 386&473_177&454_154&383_363&402:整个图像中车牌的四个顶点的精确(x,y)坐标。这些坐标从右下角顶点开始。(386,473) (177,454) (154,383) (363,402) +- 0_0_22_27_27_33_16:CCPD中的每个图像只有一个车牌。每个车牌号码由一个汉字,一个字母和五个字母或数字组成。有效的中文车牌由七个字符组成:省(1个字符),字母(1个字符),字母+数字(5个字符)。“ 0_0_22_27_27_33_16”是每个字符的索引。这三个数组定义如下:每个数组的最后一个字符是字母O,而不是数字0。我们将O用作“无字符”的符号,因为中文车牌字符中没有O。因此以上车牌拼起来即为 皖AY339S +- 37:牌照区域的亮度。 37 (37%) +- 15:车牌区域的模糊度。15 (15%) + +```python linenums="1" +provinces = ["皖", "沪", "津", "渝", "冀", "晋", "蒙", "辽", "吉", "黑", "苏", "浙", "京", "闽", "赣", "鲁", "豫", "鄂", "湘", "粤", "桂", "琼", "川", "贵", "云", "藏", "陕", "甘", "青", "宁", "新", "警", "学", "O"] +alphabets = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W','X', 'Y', 'Z', 'O'] +ads = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X','Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'O'] +``` + +### 3.2 制作符合PP-OCR训练格式的标注文件 + +在开始训练之前,可使用如下代码制作符合PP-OCR训练格式的标注文件。 + +```python linenums="1" +import cv2 +import os +import json +from tqdm import tqdm +import numpy as np + +provinces = ["皖", "沪", "津", "渝", "冀", "晋", "蒙", "辽", "吉", "黑", "苏", "浙", "京", "闽", "赣", "鲁", "豫", "鄂", "湘", "粤", "桂", "琼", "川", "贵", "云", "藏", "陕", "甘", "青", "宁", "新", "警", "学", "O"] +alphabets = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'O'] +ads = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'O'] + +def make_label(img_dir, save_gt_folder, phase): + crop_img_save_dir = os.path.join(save_gt_folder, phase, 'crop_imgs') + os.makedirs(crop_img_save_dir, exist_ok=True) + + f_det = open(os.path.join(save_gt_folder, phase, 'det.txt'), 'w', encoding='utf-8') + f_rec = open(os.path.join(save_gt_folder, phase, 'rec.txt'), 'w', encoding='utf-8') + + i = 0 + for filename in tqdm(os.listdir(os.path.join(img_dir, phase))): + str_list = filename.split('-') + if len(str_list) < 5: + continue + coord_list = str_list[3].split('_') + txt_list = str_list[4].split('_') + boxes = [] + for coord in coord_list: + boxes.append([int(x) for x in coord.split("&")]) + boxes = [boxes[2], boxes[3], boxes[0], boxes[1]] + lp_number = provinces[int(txt_list[0])] + alphabets[int(txt_list[1])] + ''.join([ads[int(x)] for x in txt_list[2:]]) + + # det + det_info = [{'points':boxes, 'transcription':lp_number}] + f_det.write('{}\t{}\n'.format(os.path.join(phase, filename), json.dumps(det_info, ensure_ascii=False))) + + # rec + boxes = np.float32(boxes) + img = cv2.imread(os.path.join(img_dir, phase, filename)) + # crop_img = img[int(boxes[:,1].min()):int(boxes[:,1].max()),int(boxes[:,0].min()):int(boxes[:,0].max())] + crop_img = get_rotate_crop_image(img, boxes) + crop_img_save_filename = 
'{}_{}.jpg'.format(i,'_'.join(txt_list)) + crop_img_save_path = os.path.join(crop_img_save_dir, crop_img_save_filename) + cv2.imwrite(crop_img_save_path, crop_img) + f_rec.write('{}/crop_imgs/{}\t{}\n'.format(phase, crop_img_save_filename, lp_number)) + i+=1 + f_det.close() + f_rec.close() + +def get_rotate_crop_image(img, points): + ''' + img_height, img_width = img.shape[0:2] + left = int(np.min(points[:, 0])) + right = int(np.max(points[:, 0])) + top = int(np.min(points[:, 1])) + bottom = int(np.max(points[:, 1])) + img_crop = img[top:bottom, left:right, :].copy() + points[:, 0] = points[:, 0] - left + points[:, 1] = points[:, 1] - top + ''' + assert len(points) == 4, "shape of points must be 4*2" + img_crop_width = int( + max( + np.linalg.norm(points[0] - points[1]), + np.linalg.norm(points[2] - points[3]))) + img_crop_height = int( + max( + np.linalg.norm(points[0] - points[3]), + np.linalg.norm(points[1] - points[2]))) + pts_std = np.float32([[0, 0], [img_crop_width, 0], + [img_crop_width, img_crop_height], + [0, img_crop_height]]) + M = cv2.getPerspectiveTransform(points, pts_std) + dst_img = cv2.warpPerspective( + img, + M, (img_crop_width, img_crop_height), + borderMode=cv2.BORDER_REPLICATE, + flags=cv2.INTER_CUBIC) + dst_img_height, dst_img_width = dst_img.shape[0:2] + if dst_img_height * 1.0 / dst_img_width >= 1.5: + dst_img = np.rot90(dst_img) + return dst_img + +img_dir = '/home/aistudio/data/CCPD2020/ccpd_green' +save_gt_folder = '/home/aistudio/data/CCPD2020/PPOCR' +# phase = 'train' # change to val and test to make val dataset and test dataset +for phase in ['train','val','test']: + make_label(img_dir, save_gt_folder, phase) +``` + +通过上述命令可以完成了`训练集`,`验证集`和`测试集`的制作,制作完成的数据集信息如下: + +| 类型 | 数据集 | 图片地址 | 标签地址 | 图片数量 | +| --- | --- | --- | --- | --- | +| 检测 | 训练集 | /home/aistudio/data/CCPD2020/ccpd_green/train | /home/aistudio/data/CCPD2020/PPOCR/train/det.txt | 5769 | +| 检测 | 验证集 | /home/aistudio/data/CCPD2020/ccpd_green/val | /home/aistudio/data/CCPD2020/PPOCR/val/det.txt | 1001 | +| 检测 | 测试集 | /home/aistudio/data/CCPD2020/ccpd_green/test | /home/aistudio/data/CCPD2020/PPOCR/test/det.txt | 5006 | +| 识别 | 训练集 | /home/aistudio/data/CCPD2020/PPOCR/train/crop_imgs | /home/aistudio/data/CCPD2020/PPOCR/train/rec.txt | 5769 | +| 识别 | 验证集 | /home/aistudio/data/CCPD2020/PPOCR/val/crop_imgs | /home/aistudio/data/CCPD2020/PPOCR/val/rec.txt | 1001 | +| 识别 | 测试集 | /home/aistudio/data/CCPD2020/PPOCR/test/crop_imgs | /home/aistudio/data/CCPD2020/PPOCR/test/rec.txt | 5006 | + +在普遍的深度学习流程中,都是在训练集训练,在验证集选择最优模型后在测试集上进行测试。在本例中,我们省略中间步骤,直接在训练集训练,在测试集选择最优模型,因此我们只使用训练集和测试集。 + +## 4. 实验 + +由于数据集比较少,为了模型更好和更快的收敛,这里选用 PaddleOCR 中的 PP-OCRv3 模型进行文本检测和识别,并且使用 PP-OCRv3 模型参数作为预训练模型。PP-OCRv3在PP-OCRv2的基础上,中文场景端到端Hmean指标相比于PP-OCRv2提升5%, 英文数字模型端到端效果提升11%。详细优化细节请参考[PP-OCRv3](../ppocr/blog/PP-OCRv3_introduction.md)技术报告。 + +由于车牌场景均为端侧设备部署,因此对速度和模型大小有比较高的要求,因此还需要采用量化训练的方式进行模型大小的压缩和模型推理速度的加速。模型量化可以在基本不损失模型的精度的情况下,将FP32精度的模型参数转换为Int8精度,减小模型参数大小并加速计算,使用量化后的模型在移动端等部署时更具备速度优势。 + +因此,本实验中对于车牌检测和识别有如下3种方案: + +1. PP-OCRv3中英文超轻量预训练模型直接预测 +2. CCPD车牌数据集在PP-OCRv3模型上fine-tune +3. 
CCPD车牌数据集在PP-OCRv3模型上fine-tune后量化 + +### 4.1 检测 + +#### 4.1.1 预训练模型直接预测 + +从下表中下载PP-OCRv3文本检测预训练模型 + +|模型名称|模型简介|配置文件|推理模型大小|下载地址| +| --- | --- | --- | --- | --- | +|ch_PP-OCRv3_det| 【最新】原始超轻量模型,支持中英文、多语种文本检测 |[ch_PP-OCRv3_det_cml.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar)| + +使用如下命令下载预训练模型 + +```bash linenums="1" +mkdir models +cd models +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar +tar -xf ch_PP-OCRv3_det_distill_train.tar +cd /home/aistudio/PaddleOCR +``` + +预训练模型下载完成后,我们使用[ch_PP-OCRv3_det_student.yml](../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml) 配置文件进行后续实验,在开始评估之前需要对配置文件中部分字段进行设置,具体如下: + +1. 模型存储和训练相关: + - Global.pretrained_model: 指向PP-OCRv3文本检测预训练模型地址 +2. 数据集相关 + - Eval.dataset.data_dir:指向测试集图片存放目录 + - Eval.dataset.label_file_list:指向测试集标注文件 + +上述字段均为必须修改的字段,可以通过修改配置文件的方式改动,也可在不需要修改配置文件的情况下,改变训练的参数。这里使用不改变配置文件的方式 。使用如下命令进行PP-OCRv3文本检测预训练模型的评估 + +```bash linenums="1" +python tools/eval.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml -o \ + Global.pretrained_model=models/ch_PP-OCRv3_det_distill_train/student.pdparams \ + Eval.dataset.data_dir=/home/aistudio/data/CCPD2020/ccpd_green \ + Eval.dataset.label_file_list=[/home/aistudio/data/CCPD2020/PPOCR/test/det.txt] +``` + +上述指令中,通过-c 选择训练使用配置文件,通过-o参数在不需要修改配置文件的情况下,改变训练的参数。 + +使用预训练模型进行评估,指标如下所示: + +| 方案 |hmeans| +|---------------------------|---| +| PP-OCRv3中英文超轻量检测预训练模型直接预测 |76.12%| + +#### 4.1.2 CCPD车牌数据集fine-tune + +##### 训练 + +为了进行fine-tune训练,我们需要在配置文件中设置需要使用的预训练模型地址,学习率和数据集等参数。 具体如下: + +1. 模型存储和训练相关: + 1. Global.pretrained_model: 指向PP-OCRv3文本检测预训练模型地址 + 2. Global.eval_batch_step: 模型多少step评估一次,这里设为从第0个step开始没隔772个step评估一次,772为一个epoch总的step数。 +2. 优化器相关: + 1. Optimizer.lr.name: 学习率衰减器设为常量 Const + 2. Optimizer.lr.learning_rate: 做 fine-tune 实验,学习率需要设置的比较小,此处学习率设为配置文件中的0.05倍 + 3. Optimizer.lr.warmup_epoch: warmup_epoch设为0 +3. 数据集相关: + 1. Train.dataset.data_dir:指向训练集图片存放目录 + 2. Train.dataset.label_file_list:指向训练集标注文件 + 3. Eval.dataset.data_dir:指向测试集图片存放目录 + 4. 
Eval.dataset.label_file_list:指向测试集标注文件 + +使用如下代码即可启动在CCPD车牌数据集上的fine-tune。 + +```bash linenums="1" +python tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml -o \ + Global.pretrained_model=models/ch_PP-OCRv3_det_distill_train/student.pdparams \ + Global.save_model_dir=output/CCPD/det \ + Global.eval_batch_step="[0, 772]" \ + Optimizer.lr.name=Const \ + Optimizer.lr.learning_rate=0.0005 \ + Optimizer.lr.warmup_epoch=0 \ + Train.dataset.data_dir=/home/aistudio/data/CCPD2020/ccpd_green \ + Train.dataset.label_file_list=[/home/aistudio/data/CCPD2020/PPOCR/train/det.txt] \ + Eval.dataset.data_dir=/home/aistudio/data/CCPD2020/ccpd_green \ + Eval.dataset.label_file_list=[/home/aistudio/data/CCPD2020/PPOCR/test/det.txt] +``` + +在上述命令中,通过`-o`的方式修改了配置文件中的参数。 + +##### 评估 + +训练完成后使用如下命令进行评估 + +```bash linenums="1" +python tools/eval.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml -o \ + Global.pretrained_model=output/CCPD/det/best_accuracy.pdparams \ + Eval.dataset.data_dir=/home/aistudio/data/CCPD2020/ccpd_green \ + Eval.dataset.label_file_list=[/home/aistudio/data/CCPD2020/PPOCR/test/det.txt] +``` + +使用预训练模型和CCPD车牌数据集fine-tune,指标分别如下: + +|方案|hmeans| +|---|---| +|PP-OCRv3中英文超轻量检测预训练模型直接预测|76.12%| +|PP-OCRv3中英文超轻量检测预训练模型 fine-tune|99.00%| + +可以看到进行fine-tune能显著提升车牌检测的效果。 + +#### 4.1.3 CCPD车牌数据集fine-tune+量化训练 + +此处采用 PaddleOCR 中提供好的[量化教程](../ppocr/model_compress/quantization.md)对模型进行量化训练。 + +量化训练可通过如下命令启动: + +```bash linenums="1" +python3.7 deploy/slim/quantization/quant.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml -o \ + Global.pretrained_model=output/CCPD/det/best_accuracy.pdparams \ + Global.save_model_dir=output/CCPD/det_quant \ + Global.eval_batch_step="[0, 772]" \ + Optimizer.lr.name=Const \ + Optimizer.lr.learning_rate=0.0005 \ + Optimizer.lr.warmup_epoch=0 \ + Train.dataset.data_dir=/home/aistudio/data/CCPD2020/ccpd_green \ + Train.dataset.label_file_list=[/home/aistudio/data/CCPD2020/PPOCR/train/det.txt] \ + Eval.dataset.data_dir=/home/aistudio/data/CCPD2020/ccpd_green \ + Eval.dataset.label_file_list=[/home/aistudio/data/CCPD2020/PPOCR/test/det.txt] +``` + +量化后指标对比如下 + +|方案|hmeans| 模型大小 | 预测速度(lite) | +|---|---|------|------------| +|PP-OCRv3中英文超轻量检测预训练模型 fine-tune|99.00%| 2.5M | 223ms | +|PP-OCRv3中英文超轻量检测预训练模型 fine-tune+量化|98.91%| 1.0M | 189ms | + +可以看到通过量化训练在精度几乎无损的情况下,降低模型体积60%并且推理速度提升15%。 + +速度测试基于[PaddleOCR lite教程](../ppocr/infer_deploy/lite.md)完成。 + +#### 4.1.4 模型导出 + +使用如下命令可以将训练好的模型进行导出 + +非量化模型 + +```bash linenums="1" +python tools/export_model.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml -o \ + Global.pretrained_model=output/CCPD/det/best_accuracy.pdparams \ + Global.save_inference_dir=output/det/infer +``` + +量化模型 + +```bash linenums="1" +python deploy/slim/quantization/export_model.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml -o \ + Global.pretrained_model=output/CCPD/det_quant/best_accuracy.pdparams \ + Global.save_inference_dir=output/det/infer +``` + +### 4.2 识别 + +#### 4.2.1 预训练模型直接预测 + +从下表中下载PP-OCRv3文本识别预训练模型 + +|模型名称|模型简介|配置文件|推理模型大小|下载地址| +| --- | --- | --- | --- | --- | +|ch_PP-OCRv3_rec|【最新】原始超轻量模型,支持中英文、数字识别|[ch_PP-OCRv3_rec_distillation.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 12.4M |[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) | + +使用如下命令下载预训练模型 + +```bash linenums="1" +mkdir models +cd models +wget 
https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar +tar -xf ch_PP-OCRv3_rec_train.tar +cd /home/aistudio/PaddleOCR +``` + +PaddleOCR提供的PP-OCRv3识别模型采用蒸馏训练策略,因此提供的预训练模型中会包含`Teacher`和`Student`模型的参数,详细信息可参考[knowledge_distillation.md](../ppocr/model_compress/knowledge_distillation.md)。 因此,模型下载完成后需要使用如下代码提取`Student`模型的参数: + +```python linenums="1" +import paddle +# 加载预训练模型 +all_params = paddle.load("models/ch_PP-OCRv3_rec_train/best_accuracy.pdparams") +# 查看权重参数的keys +print(all_params.keys()) +# 学生模型的权重提取 +s_params = {key[len("Student."):]: all_params[key] for key in all_params if "Student." in key} +# 查看学生模型权重参数的keys +print(s_params.keys()) +# 保存 +paddle.save(s_params, "models/ch_PP-OCRv3_rec_train/student.pdparams") +``` + +预训练模型下载完成后,我们使用[ch_PP-OCRv3_rec.yml](../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml) 配置文件进行后续实验,在开始评估之前需要对配置文件中部分字段进行设置,具体如下: + +1. 模型存储和训练相关: + 1. Global.pretrained_model: 指向PP-OCRv3文本识别预训练模型地址 +2. 数据集相关 + 1. Eval.dataset.data_dir:指向测试集图片存放目录 + 2. Eval.dataset.label_file_list:指向测试集标注文件 + +使用如下命令进行PP-OCRv3文本识别预训练模型的评估 + +```bash linenums="1" +python tools/eval.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml -o \ + Global.pretrained_model=models/ch_PP-OCRv3_rec_train/student.pdparams \ + Eval.dataset.data_dir=/home/aistudio/data/CCPD2020/PPOCR \ + Eval.dataset.label_file_list=[/home/aistudio/data/CCPD2020/PPOCR/test/rec.txt] +``` + +如需获取已训练模型,请加入PaddleX官方交流频道,获取20G OCR学习大礼包(内含《动手学OCR》电子书、课程回放视频、前沿论文等重磅资料) + +- PaddleX官方交流频道: + +评估部分日志如下: + +```bash linenums="1" +[2022/05/12 19:52:02] ppocr INFO: load pretrain successful from models/ch_PP-OCRv3_rec_train/best_accuracy +eval model:: 100%|██████████████████████████████| 40/40 [00:15<00:00, 2.57it/s] +[2022/05/12 19:52:17] ppocr INFO: metric eval *************** +[2022/05/12 19:52:17] ppocr INFO: acc:0.0 +[2022/05/12 19:52:17] ppocr INFO: norm_edit_dis:0.8656084923002452 +[2022/05/12 19:52:17] ppocr INFO: Teacher_acc:0.000399520574511545 +[2022/05/12 19:52:17] ppocr INFO: Teacher_norm_edit_dis:0.8657902943394548 +[2022/05/12 19:52:17] ppocr INFO: fps:1443.1801978719905 + +``` + +使用预训练模型进行评估,指标如下所示: + +|方案|acc| +|---|---| +|PP-OCRv3中英文超轻量识别预训练模型直接预测|0%| + +从评估日志中可以看到,直接使用PP-OCRv3预训练模型进行评估,acc非常低,但是norm_edit_dis很高。因此,我们猜测是模型大部分文字识别是对的,只有少部分文字识别错误。使用如下命令进行infer查看模型的推理结果进行验证: + +```bash linenums="1" +python tools/infer_rec.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml -o \ + Global.pretrained_model=models/ch_PP-OCRv3_rec_train/student.pdparams \ + Global.infer_img=/home/aistudio/data/CCPD2020/PPOCR/test/crop_imgs/0_0_0_3_32_30_31_30_30.jpg +``` + +输出部分日志如下: + +```bash linenums="1" +[2022/05/01 08:51:57] ppocr INFO: train with paddle 2.2.2 and device CUDAPlace(0) +W0501 08:51:57.127391 11326 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.0, Runtime API Version: 10.1 +W0501 08:51:57.132315 11326 device_context.cc:465] device: 0, cuDNN Version: 7.6. +[2022/05/01 08:52:00] ppocr INFO: load pretrain successful from models/ch_PP-OCRv3_rec_train/student +[2022/05/01 08:52:00] ppocr INFO: infer_img: /home/aistudio/data/CCPD2020/PPOCR/test/crop_imgs/0_0_3_32_30_31_30_30.jpg +[2022/05/01 08:52:00] ppocr INFO: result: {"Student": {"label": "皖A·D86766", "score": 0.9552637934684753}, "Teacher": {"label": "皖A·D86766", "score": 0.9917094707489014}} +[2022/05/01 08:52:00] ppocr INFO: success! +``` + +从infer结果可以看到,车牌中的文字大部分都识别正确,只是多识别出了一个`·`。针对这种情况,有如下两种方案: + +1. 直接通过后处理去掉多识别的`·`。 +2. 
进行 fine-tune。 + +#### 4.2.2 预训练模型直接预测+改动后处理 + +直接通过后处理去掉多识别的`·`,在后处理的改动比较简单,只需在 [ppocr/postprocess/rec_postprocess.py](../ppocr/postprocess/rec_postprocess.py) 文件的76行添加如下代码: + +```python linenums="1" +text = text.replace('·','') +``` + +改动前后指标对比: + +|方案|acc| +|---|---| +|PP-OCRv3中英文超轻量识别预训练模型直接预测|0.20%| +|PP-OCRv3中英文超轻量识别预训练模型直接预测+后处理去掉多识别的`·`|90.97%| + +可以看到,去掉多余的`·`能大幅提高精度。 + +#### 4.2.3 CCPD车牌数据集fine-tune + +##### 训练 + +为了进行fine-tune训练,我们需要在配置文件中设置需要使用的预训练模型地址,学习率和数据集等参数。 具体如下: + +1. 模型存储和训练相关: + 1. Global.pretrained_model: 指向PP-OCRv3文本识别预训练模型地址 + 2. Global.eval_batch_step: 模型多少step评估一次,这里设为从第0个step开始没隔45个step评估一次,45为一个epoch总的step数。 +2. 优化器相关 + 1. Optimizer.lr.name: 学习率衰减器设为常量 Const + 2. Optimizer.lr.learning_rate: 做 fine-tune 实验,学习率需要设置的比较小,此处学习率设为配置文件中的0.05倍 + 3. Optimizer.lr.warmup_epoch: warmup_epoch设为0 +3. 数据集相关 + 1. Train.dataset.data_dir:指向训练集图片存放目录 + 2. Train.dataset.label_file_list:指向训练集标注文件 + 3. Eval.dataset.data_dir:指向测试集图片存放目录 + 4. Eval.dataset.label_file_list:指向测试集标注文件 + +使用如下命令启动 fine-tune + +```bash linenums="1" +python tools/train.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml -o \ + Global.pretrained_model=models/ch_PP-OCRv3_rec_train/student.pdparams \ + Global.save_model_dir=output/CCPD/rec/ \ + Global.eval_batch_step="[0, 90]" \ + Optimizer.lr.name=Const \ + Optimizer.lr.learning_rate=0.0005 \ + Optimizer.lr.warmup_epoch=0 \ + Train.dataset.data_dir=/home/aistudio/data/CCPD2020/PPOCR \ + Train.dataset.label_file_list=[/home/aistudio/data/CCPD2020/PPOCR/train/rec.txt] \ + Eval.dataset.data_dir=/home/aistudio/data/CCPD2020/PPOCR \ + Eval.dataset.label_file_list=[/home/aistudio/data/CCPD2020/PPOCR/test/rec.txt] +``` + +##### 评估 + +训练完成后使用如下命令进行评估 + +```bash linenums="1" +python tools/eval.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml -o \ + Global.pretrained_model=output/CCPD/rec/best_accuracy.pdparams \ + Eval.dataset.data_dir=/home/aistudio/data/CCPD2020/PPOCR \ + Eval.dataset.label_file_list=[/home/aistudio/data/CCPD2020/PPOCR/test/rec.txt] +``` + +使用预训练模型和CCPD车牌数据集fine-tune,指标分别如下: + +|方案| acc | +|---|--------| +|PP-OCRv3中英文超轻量识别预训练模型直接预测| 0.00% | +|PP-OCRv3中英文超轻量识别预训练模型直接预测+后处理去掉多识别的`·`| 90.97% | +|PP-OCRv3中英文超轻量识别预训练模型 fine-tune| 94.54% | + +可以看到进行fine-tune能显著提升车牌识别的效果。 + +#### 4.2.4 CCPD车牌数据集fine-tune+量化训练 + +此处采用 PaddleOCR 中提供好的[量化教程](../ppocr/model_compress/quantization.md)对模型进行量化训练。 + +量化训练可通过如下命令启动: + +```bash linenums="1" +python3.7 deploy/slim/quantization/quant.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml -o \ + Global.pretrained_model=output/CCPD/rec/best_accuracy.pdparams \ + Global.save_model_dir=output/CCPD/rec_quant/ \ + Global.eval_batch_step="[0, 90]" \ + Optimizer.lr.name=Const \ + Optimizer.lr.learning_rate=0.0005 \ + Optimizer.lr.warmup_epoch=0 \ + Train.dataset.data_dir=/home/aistudio/data/CCPD2020/PPOCR \ + Train.dataset.label_file_list=[/home/aistudio/data/CCPD2020/PPOCR/train/rec.txt] \ + Eval.dataset.data_dir=/home/aistudio/data/CCPD2020/PPOCR \ + Eval.dataset.label_file_list=[/home/aistudio/data/CCPD2020/PPOCR/test/rec.txt] +``` + +量化后指标对比如下 + +|方案| acc | 模型大小 | 预测速度(lite) | +|---|--------|-------|------------| +|PP-OCRv3中英文超轻量识别预训练模型 fine-tune| 94.54% | 10.3M | 4.2ms | +|PP-OCRv3中英文超轻量识别预训练模型 fine-tune + 量化| 93.40% | 4.8M | 1.8ms | + +可以看到量化后能降低模型体积53%并且推理速度提升57%,但是由于识别数据过少,量化带来了1%的精度下降。 + +速度测试基于[PaddleOCR lite教程](../ppocr/infer_deploy/lite.md)完成。 + +#### 4.2.5 模型导出 + +使用如下命令可以将训练好的模型进行导出。 + +非量化模型 + +```bash linenums="1" +python tools/export_model.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml -o \ + 
Global.pretrained_model=output/CCPD/rec/best_accuracy.pdparams \ + Global.save_inference_dir=output/CCPD/rec/infer +``` + +量化模型 + +```bash linenums="1" +python deploy/slim/quantization/export_model.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml -o \ + Global.pretrained_model=output/CCPD/rec_quant/best_accuracy.pdparams \ + Global.save_inference_dir=output/CCPD/rec_quant/infer +``` + +### 4.3 计算End2End指标 + +端到端指标可通过 [PaddleOCR内置脚本](../tools/end2end/readme.md) 进行计算,具体步骤如下: + +#### 1. 导出模型 + +通过如下命令进行模型的导出。注意,量化模型导出时,需要配置eval数据集 + +```bash linenums="1" +# 检测模型 + +# 预训练模型 +python tools/export_model.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml -o \ + Global.pretrained_model=models/ch_PP-OCRv3_det_distill_train/student.pdparams \ + Global.save_inference_dir=output/ch_PP-OCRv3_det_distill_train/infer + +# 非量化模型 +python tools/export_model.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml -o \ + Global.pretrained_model=output/CCPD/det/best_accuracy.pdparams \ + Global.save_inference_dir=output/CCPD/det/infer + +# 量化模型 +python deploy/slim/quantization/export_model.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml -o \ + Global.pretrained_model=output/CCPD/det_quant/best_accuracy.pdparams \ + Global.save_inference_dir=output/CCPD/det_quant/infer \ + Eval.dataset.data_dir=/home/aistudio/data/CCPD2020/ccpd_green \ + Eval.dataset.label_file_list=[/home/aistudio/data/CCPD2020/PPOCR/test/det.txt] \ + Eval.loader.num_workers=0 + +# 识别模型 + +# 预训练模型 +python tools/export_model.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml -o \ + Global.pretrained_model=models/ch_PP-OCRv3_rec_train/student.pdparams \ + Global.save_inference_dir=output/ch_PP-OCRv3_rec_train/infer + +# 非量化模型 +python tools/export_model.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml -o \ + Global.pretrained_model=output/CCPD/rec/best_accuracy.pdparams \ + Global.save_inference_dir=output/CCPD/rec/infer + +# 量化模型 +python deploy/slim/quantization/export_model.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml -o \ + Global.pretrained_model=output/CCPD/rec_quant/best_accuracy.pdparams \ + Global.save_inference_dir=output/CCPD/rec_quant/infer \ + Eval.dataset.data_dir=/home/aistudio/data/CCPD2020/PPOCR \ + Eval.dataset.label_file_list=[/home/aistudio/data/CCPD2020/PPOCR/test/rec.txt] +``` + +#### 2. 用导出的模型对测试集进行预测 + +此处,分别使用PP-OCRv3预训练模型,fintune模型和量化模型对测试集的所有图像进行预测,命令如下: + +```bash linenums="1" +# PP-OCRv3中英文超轻量检测预训练模型,PP-OCRv3中英文超轻量识别预训练模型 +python3 tools/infer/predict_system.py --det_model_dir=models/ch_PP-OCRv3_det_distill_train/infer --rec_model_dir=models/ch_PP-OCRv3_rec_train/infer --det_limit_side_len=736 --det_limit_type=min --image_dir=/home/aistudio/data/CCPD2020/ccpd_green/test/ --draw_img_save_dir=infer/pretrain --use_dilation=true + +# PP-OCRv3中英文超轻量检测预训练模型+fine-tune,PP-OCRv3中英文超轻量识别预训练模型+fine-tune +python3 tools/infer/predict_system.py --det_model_dir=output/CCPD/det/infer --rec_model_dir=output/CCPD/rec/infer --det_limit_side_len=736 --det_limit_type=min --image_dir=/home/aistudio/data/CCPD2020/ccpd_green/test/ --draw_img_save_dir=infer/fine-tune --use_dilation=true + +# PP-OCRv3中英文超轻量检测预训练模型 fine-tune +量化,PP-OCRv3中英文超轻量识别预训练模型 fine-tune +量化 结果转换和评估 +python3 tools/infer/predict_system.py --det_model_dir=output/CCPD/det_quant/infer --rec_model_dir=output/CCPD/rec_quant/infer --det_limit_side_len=736 --det_limit_type=min --image_dir=/home/aistudio/data/CCPD2020/ccpd_green/test/ --draw_img_save_dir=infer/quant --use_dilation=true +``` + +#### 3. 
转换label并计算指标 + +将gt和上一步保存的预测结果转换为端对端评测需要的数据格式,并根据转换后的数据进行端到端指标计算 + +```bash linenums="1" +python3 tools/end2end/convert_ppocr_label.py --mode=gt --label_path=/home/aistudio/data/CCPD2020/PPOCR/test/det.txt --save_folder=end2end/gt + +# PP-OCRv3中英文超轻量检测预训练模型,PP-OCRv3中英文超轻量识别预训练模型 结果转换和评估 +python3 tools/end2end/convert_ppocr_label.py --mode=pred --label_path=infer/pretrain/system_results.txt --save_folder=end2end/pretrain +python3 tools/end2end/eval_end2end.py end2end/gt end2end/pretrain + +# PP-OCRv3中英文超轻量检测预训练模型,PP-OCRv3中英文超轻量识别预训练模型+后处理去掉多识别的`·` 结果转换和评估 +# 需手动修改后处理函数 +python3 tools/end2end/convert_ppocr_label.py --mode=pred --label_path=infer/post/system_results.txt --save_folder=end2end/post +python3 tools/end2end/eval_end2end.py end2end/gt end2end/post + +# PP-OCRv3中英文超轻量检测预训练模型 fine-tune,PP-OCRv3中英文超轻量识别预训练模型 fine-tune 结果转换和评估 +python3 tools/end2end/convert_ppocr_label.py --mode=pred --label_path=infer/fine-tune/system_results.txt --save_folder=end2end/fine-tune +python3 tools/end2end/eval_end2end.py end2end/gt end2end/fine-tune + +# PP-OCRv3中英文超轻量检测预训练模型 fine-tune +量化,PP-OCRv3中英文超轻量识别预训练模型 fine-tune +量化 结果转换和评估 +python3 tools/end2end/convert_ppocr_label.py --mode=pred --label_path=infer/quant/system_results.txt --save_folder=end2end/quant +python3 tools/end2end/eval_end2end.py end2end/gt end2end/quant +``` + +日志如下: + +```bash linenums="1" +The convert label saved in end2end/gt +The convert label saved in end2end/pretrain +start testing... +hit, dt_count, gt_count 2 5988 5006 +character_acc: 70.42% +avg_edit_dist_field: 2.37 +avg_edit_dist_img: 2.37 +precision: 0.03% +recall: 0.04% +fmeasure: 0.04% +The convert label saved in end2end/post +start testing... +hit, dt_count, gt_count 4224 5988 5006 +character_acc: 81.59% +avg_edit_dist_field: 1.47 +avg_edit_dist_img: 1.47 +precision: 70.54% +recall: 84.38% +fmeasure: 76.84% +The convert label saved in end2end/fine-tune +start testing... +hit, dt_count, gt_count 4286 4898 5006 +character_acc: 94.16% +avg_edit_dist_field: 0.47 +avg_edit_dist_img: 0.47 +precision: 87.51% +recall: 85.62% +fmeasure: 86.55% +The convert label saved in end2end/quant +start testing... +hit, dt_count, gt_count 4349 4951 5006 +character_acc: 94.13% +avg_edit_dist_field: 0.47 +avg_edit_dist_img: 0.47 +precision: 87.84% +recall: 86.88% +fmeasure: 87.36% +``` + +各个方案端到端指标如下: + +|模型| 指标 | +|---|--------| +|PP-OCRv3中英文超轻量检测预训练模型
PP-OCRv3中英文超轻量识别预训练模型| 0.04% | +|PP-OCRv3中英文超轻量检测预训练模型
PP-OCRv3中英文超轻量识别预训练模型 + 后处理去掉多识别的`·`| 78.27% | +|PP-OCRv3中英文超轻量检测预训练模型+fine-tune
PP-OCRv3中英文超轻量识别预训练模型+fine-tune| 87.14% | +|PP-OCRv3中英文超轻量检测预训练模型+fine-tune+量化
PP-OCRv3中英文超轻量识别预训练模型+fine-tune+量化| 88.00% | + +从结果中可以看到对预训练模型不做修改,只根据场景下的具体情况进行后处理的修改就能大幅提升端到端指标到78.27%,在CCPD数据集上进行 fine-tune 后指标进一步提升到87.14%, 在经过量化训练之后,由于检测模型的recall变高,指标进一步提升到88%。但是这个结果仍旧不符合检测模型+识别模型的真实性能(99%*94%=93%),因此我们需要对 base case 进行具体分析。 + +在之前的端到端预测结果中,可以看到很多不符合车牌标注的文字被识别出来, 因此可以进行简单的过滤来提升precision + +为了快速评估,我们在 `tools/end2end/convert_ppocr_label.py` 脚本的 58 行加入如下代码,对非8个字符的结果进行过滤 + +```python linenums="1" +if len(txt) != 8: # 车牌字符串长度为8 + continue +``` + +此外,通过可视化box可以发现有很多框都是竖直翻转之后的框,并且没有完全框住车牌边界,因此需要进行框的竖直翻转以及轻微扩大,示意图如下: + +![](./images/59ab0411c8eb4dfd917fb2b6e5b69a17ee7ca48351444aec9ac6104b79ff1028.jpg) + +修改前后个方案指标对比如下: + +各个方案端到端指标如下: + +|模型|base|A:识别结果过滤|B:use_dilation|C:flip_box|best| +|---|---|---|---|---|---| +|PP-OCRv3中英文超轻量检测预训练模型
PP-OCRv3中英文超轻量识别预训练模型|0.04%|0.08%|0.02%|0.05%|0.00%(A)| +|PP-OCRv3中英文超轻量检测预训练模型
PP-OCRv3中英文超轻量识别预训练模型 + 后处理去掉多识别的`·`|78.27%|90.84%|78.61%|79.43%|91.66%(A+B+C)| +|PP-OCRv3中英文超轻量检测预训练模型+fine-tune
PP-OCRv3中英文超轻量识别预训练模型+fine-tune|87.14%|90.40%|87.66%|89.98%|92.50%(A+B+C)| +|PP-OCRv3中英文超轻量检测预训练模型+fine-tune+量化
PP-OCRv3中英文超轻量识别预训练模型+fine-tune+量化|88.00%|90.54%|88.50%|89.46%|92.02%(A+B+C)| + +从结果中可以看到对预训练模型不做修改,只根据场景下的具体情况进行后处理的修改就能大幅提升端到端指标到91.66%,在CCPD数据集上进行 fine-tune 后指标进一步提升到92.5%, 在经过量化训练之后,指标变为92.02%。 + +### 4.4 部署 + +#### 基于 Paddle Inference 的python推理 + +检测模型和识别模型分别 fine-tune 并导出为inference模型之后,可以使用如下命令基于 Paddle Inference 进行端到端推理并对结果进行可视化。 + +```bash linenums="1" +python tools/infer/predict_system.py \ + --det_model_dir=output/CCPD/det/infer/ \ + --rec_model_dir=output/CCPD/rec/infer/ \ + --image_dir="/home/aistudio/data/CCPD2020/ccpd_green/test/04131106321839081-92_258-159&509_530&611-527&611_172&599_159&509_530&525-0_0_3_32_30_31_30_30-109-106.jpg" \ + --rec_image_shape=3,48,320 +``` + +推理结果如下 + +![](./images/76b6a0939c2c4cf49039b6563c4b28e241e11285d7464e799e81c58c0f7707a7-20240704185943337.png) + +#### 端侧部署 + +端侧部署我们采用基于 PaddleLite 的 cpp 推理。Paddle Lite是飞桨轻量化推理引擎,为手机、IOT端提供高效推理能力,并广泛整合跨平台硬件,为端侧部署及应用落地问题提供轻量化的部署方案。具体可参考 [PaddleOCR lite教程](../ppocr/infer_deploy/lite.md) + +### 4.5 实验总结 + +我们分别使用PP-OCRv3中英文超轻量预训练模型在车牌数据集上进行了直接评估和 fine-tune 和 fine-tune +量化3种方案的实验,并基于[PaddleOCR lite教程](../ppocr/infer_deploy/lite.md)进行了速度测试,指标对比如下: + +- 检测 + +|方案|hmeans| 模型大小 | 预测速度(lite) | +|---|---|------|------------| +|PP-OCRv3中英文超轻量检测预训练模型直接预测|76.12%|2.5M| 233ms | +|PP-OCRv3中英文超轻量检测预训练模型 fine-tune|99.00%| 2.5M | 233ms | +|PP-OCRv3中英文超轻量检测预训练模型 fine-tune + 量化|98.91%| 1.0M | 189ms | + +- 识别 + +|方案| acc | 模型大小 | 预测速度(lite) | +|---|--------|-------|------------| +|PP-OCRv3中英文超轻量识别预训练模型直接预测| 0.00% |10.3M| 4.2ms | +|PP-OCRv3中英文超轻量识别预训练模型直接预测+后处理去掉多识别的`·`| 90.97% |10.3M| 4.2ms | +|PP-OCRv3中英文超轻量识别预训练模型 fine-tune| 94.54% | 10.3M | 4.2ms | +|PP-OCRv3中英文超轻量识别预训练模型 fine-tune + 量化| 93.40% | 4.8M | 1.8ms | + +- 端到端指标如下: + +|方案|fmeasure|模型大小|预测速度(lite) | +|---|---|---|---| +|PP-OCRv3中英文超轻量检测预训练模型
PP-OCRv3中英文超轻量识别预训练模型|0.08%|12.8M|298ms| +|PP-OCRv3中英文超轻量检测预训练模型
PP-OCRv3中英文超轻量识别预训练模型 + 后处理去掉多识别的`·`|91.66%|12.8M|298ms| +|PP-OCRv3中英文超轻量检测预训练模型+fine-tune
PP-OCRv3中英文超轻量识别预训练模型+fine-tune|92.50%|12.8M|298ms| +|PP-OCRv3中英文超轻量检测预训练模型+fine-tune+量化
PP-OCRv3中英文超轻量识别预训练模型+fine-tune+量化|92.02%|5.80M|224ms| + +## **结论** + +PP-OCRv3的检测模型在未经过fine-tune的情况下,在车牌数据集上也有一定的精度,经过 fine-tune 后能够极大的提升检测效果,精度达到99%。在使用量化训练后检测模型的精度几乎无损,并且模型大小压缩60%。 + +PP-OCRv3的识别模型在未经过fine-tune的情况下,在车牌数据集上精度为0,但是经过分析可以知道,模型大部分字符都预测正确,但是会多预测一个特殊字符,去掉这个特殊字符后,精度达到90%。PP-OCRv3识别模型在经过 fine-tune 后识别精度进一步提升,达到94.4%。在使用量化训练后识别模型大小压缩53%,但是由于数据量多少,带来了1%的精度损失。 + +从端到端结果中可以看到对预训练模型不做修改,只根据场景下的具体情况进行后处理的修改就能大幅提升端到端指标到91.66%,在CCPD数据集上进行 fine-tune 后指标进一步提升到92.5%, 在经过量化训练之后,指标轻微下降到92.02%但模型大小降低54%。 diff --git "a/docs/applications/\351\253\230\347\262\276\345\272\246\344\270\255\346\226\207\350\257\206\345\210\253\346\250\241\345\236\213.md" "b/docs/applications/\351\253\230\347\262\276\345\272\246\344\270\255\346\226\207\350\257\206\345\210\253\346\250\241\345\236\213.md" new file mode 100644 index 0000000000..11d059190a --- /dev/null +++ "b/docs/applications/\351\253\230\347\262\276\345\272\246\344\270\255\346\226\207\350\257\206\345\210\253\346\250\241\345\236\213.md" @@ -0,0 +1,113 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# 高精度中文场景文本识别模型SVTR + +## 1. 简介 + +PP-OCRv3是百度开源的超轻量级场景文本检测识别模型库,其中超轻量的场景中文识别模型SVTR_LCNet使用了SVTR算法结构。为了保证速度,SVTR_LCNet将SVTR模型的Local Blocks替换为LCNet,使用两层Global Blocks。在中文场景中,PP-OCRv3识别主要使用如下优化策略([详细技术报告](../ppocr/blog/PP-OCRv3_introduction.md)): + +- GTC:Attention指导CTC训练策略; +- TextConAug:挖掘文字上下文信息的数据增广策略; +- TextRotNet:自监督的预训练模型; +- UDML:联合互学习策略; +- UIM:无标注数据挖掘方案。 + +其中 *UIM:无标注数据挖掘方案* 使用了高精度的SVTR中文模型进行无标注文件的刷库,该模型在PP-OCRv3识别的数据集上训练,精度对比如下表。 + +|中文识别算法|模型|UIM|精度| +| --- | --- | --- |--- | +|PP-OCRv3|SVTR_LCNet| w/o |78.40%| +|PP-OCRv3|SVTR_LCNet| w |79.40%| +|SVTR|SVTR-Tiny|-|82.50%| + +aistudio项目链接: [高精度中文场景文本识别模型SVTR](https://aistudio.baidu.com/aistudio/projectdetail/4263032) + +## 2. SVTR中文模型使用 + +### 环境准备 + +本任务基于Aistudio完成, 具体环境如下: + +- 操作系统: Linux +- PaddlePaddle: 2.3 +- PaddleOCR: dygraph + +下载PaddleOCR代码 + +```bash linenums="1" +git clone -b dygraph https://github.com/PaddlePaddle/PaddleOCR +``` + +安装依赖库 + +```bash linenums="1" +pip install -r PaddleOCR/requirements.txt -i https://mirror.baidu.com/pypi/simple +``` + +### 快速使用 + +获取SVTR中文模型文件,请加入PaddleX官方交流频道,获取20G OCR学习大礼包(内含《动手学OCR》电子书、课程回放视频、前沿论文等重磅资料) + +- PaddleX官方交流频道: + +```bash linenums="1" +# 解压模型文件 +tar xf svtr_ch_high_accuracy.tar +``` + +预测中文文本,以下图为例: +![](../doc/imgs_words/ch/word_1.jpg) + +预测命令: + +```bash linenums="1" +# CPU预测 +python tools/infer_rec.py -c configs/rec/rec_svtrnet_ch.yml -o Global.pretrained_model=./svtr_ch_high_accuracy/best_accuracy Global.infer_img=./doc/imgs_words/ch/word_1.jpg Global.use_gpu=False + +# GPU预测 +#python tools/infer_rec.py -c configs/rec/rec_svtrnet_ch.yml -o Global.pretrained_model=./svtr_ch_high_accuracy/best_accuracy Global.infer_img=./doc/imgs_words/ch/word_1.jpg Global.use_gpu=True +``` + +可以看到最后打印结果为 + +- result: 韩国小馆 0.9853458404541016 + +0.9853458404541016为预测置信度。 + +### 推理模型导出与预测 + +inference 模型(paddle.jit.save保存的模型) 一般是模型训练,把模型结构和模型参数保存在文件中的固化模型,多用于预测部署场景。 训练过程中保存的模型是checkpoints模型,保存的只有模型的参数,多用于恢复训练等。 与checkpoints模型相比,inference 模型会额外保存模型的结构信息,在预测部署、加速推理上性能优越,灵活方便,适合于实际系统集成。 + +运行识别模型转inference模型命令,如下: + +```bash linenums="1" +python tools/export_model.py -c configs/rec/rec_svtrnet_ch.yml -o Global.pretrained_model=./svtr_ch_high_accuracy/best_accuracy Global.save_inference_dir=./inference/svtr_ch +``` + +转换成功后,在目录下有三个文件: + +```bash linenums="1" +inference/svtr_ch/ + ├── inference.pdiparams # 识别inference模型的参数文件 + ├── inference.pdiparams.info # 识别inference模型的参数信息,可忽略 + └── inference.pdmodel # 识别inference模型的program文件 +``` + 
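+在运行预测命令之前,也可以先用几行 Python 代码确认导出的 inference 模型能够被 Paddle Inference 正常加载。以下仅为示意代码(非本教程原有内容),假设使用 PaddlePaddle 2.x,模型目录为上文导出的 `./inference/svtr_ch/`:
+
+```python linenums="1"
+from paddle.inference import Config, create_predictor
+
+# 指定导出得到的模型结构文件与参数文件
+config = Config("inference/svtr_ch/inference.pdmodel",
+                "inference/svtr_ch/inference.pdiparams")
+config.disable_gpu()  # 仅做加载检查,使用 CPU 即可
+
+# 能成功创建 predictor 并打印输入名,说明模型导出无误
+predictor = create_predictor(config)
+print(predictor.get_input_names())
+```
+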
+inference模型预测,命令如下: + +```bash linenums="1" +# CPU预测 +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/ch/word_1.jpg" --rec_algorithm='SVTR' --rec_model_dir=./inference/svtr_ch/ --rec_image_shape='3, 32, 320' --rec_char_dict_path=ppocr/utils/ppocr_keys_v1.txt --use_gpu=False + +# GPU预测 +#python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/ch/word_1.jpg" --rec_algorithm='SVTR' --rec_model_dir=./inference/svtr_ch/ --rec_image_shape='3, 32, 320' --rec_char_dict_path=ppocr/utils/ppocr_keys_v1.txt --use_gpu=True +``` + +**注意** + +- 使用SVTR算法时,需要指定--rec_algorithm='SVTR' +- 如果使用自定义字典训练的模型,需要将--rec_char_dict_path=ppocr/utils/ppocr_keys_v1.txt修改为自定义的字典 +- --rec_image_shape='3, 32, 320' 该参数不能去掉 diff --git a/docs/community/code_and_doc.en.md b/docs/community/code_and_doc.en.md new file mode 100644 index 0000000000..ad846752d2 --- /dev/null +++ b/docs/community/code_and_doc.en.md @@ -0,0 +1,336 @@ +--- +comments: true +typora-copy-images-to: images +--- + +# Appendix + + This appendix contains python, document specifications and Pull Request process. + +## Appendix 1:Python Code Specification + +The Python code of PaddleOCR follows [PEP8 Specification]( https://www.python.org/dev/peps/pep-0008/ ), some of the key concerns include the following + +- Space + + - Spaces should be added after commas, semicolons, colons, not before them + + ```python linenums="1" + # true: + print(x, y) + + # false: + print(x , y) + ``` + + - When specifying a keyword parameter or default parameter value in a function, do not use spaces on both sides of it + + ```python linenums="1" + # true: + def complex(real, imag=0.0) + # false: + def complex(real, imag = 0.0) + ``` + +- comment + + - Inline comments: inline comments are indicated by the` # `sign. Two spaces should be left between code and` # `, and one space should be left between` # `and comments, for example + + ```python linenums="1" + x = x + 1 # Compensate for border + ``` + + - Functions and methods: The definition of each function should include the following: + + - Function description: Utility, input and output of function + - Args: Name and description of each parameter + - Returns: The meaning and type of the return value + + ```python linenums="1" + def fetch_bigtable_rows(big_table, keys, other_silly_variable=None): + """Fetches rows from a Bigtable. + + Retrieves rows pertaining to the given keys from the Table instance + represented by big_table. Silly things may happen if + other_silly_variable is not None. + + Args: + big_table: An open Bigtable Table instance. + keys: A sequence of strings representing the key of each table row + to fetch. + other_silly_variable: Another optional variable, that has a much + longer name than the other args, and which does nothing. + + Returns: + A dict mapping keys to the corresponding table row data + fetched. Each row is represented as a tuple of strings. For + example: + + {'Serak': ('Rigel VII', 'Preparer'), + 'Zim': ('Irk', 'Invader'), + 'Lrrr': ('Omicron Persei 8', 'Emperor')} + + If a key from the keys argument is missing from the dictionary, + then that row was not found in the table. + """ + pass + ``` + +## Appendix 2: Document Specification + +### 2.1 Overall Description + +- Document Location: If you add new features to your original Markdown file, please **Do not re-create** a new file. If you don't know where to add it, you can first PR the code and then ask the official in commit. 
+
+- New Markdown Document Name: Describe the content of the document in English, typically a combination of lowercase letters and underscores, such as `add_new_algorithm.md`
+
+- New Markdown Document Format: Catalog - Body - FAQ
+
+  > The table of contents can be generated with [this site](https://ecotrust-canada.github.io/markdown-toc/): copy in the Markdown content to extract the directory automatically, and then add `
+
+- English and Chinese: Any changes or additions to the document need to be made in both the Chinese and English documents.
+
+### 2.2 Format Specification
+
+- Title format: Document titles follow the format: Arabic numerals with decimal points - space - title (for example, `2.1 XXXX`, `2. XXXX`)
+
+- Code block: Show code that needs to be run in a code block, and describe the meaning of the command parameters before the code block. For example:
+
+  > Pipeline of detection + direction classification + recognition: Vertical text can be recognized after setting the direction classifier parameter `--use_angle_cls true`.
+  >
+  > ```bash linenums="1"
+  > paddleocr --image_dir ./imgs/11.jpg --use_angle_cls true
+  > ```
+
+- Variable references: If code variables or command parameters are referenced inline, they need to be written as inline code, for example `--use_angle_cls true` above, with one space before and one space after
+
+- Uniform naming: e.g. PP-OCRv2, PP-OCR mobile, `paddleocr` whl package, PPOCRLabel, Paddle Lite, etc.
+
+- Supplementary notes: Add supplementary notes with the quote format `>`.
+
+- Picture: If a picture is added to the document, give it a descriptive name (describing its content) and add it under `doc/`.
+
+- Title: Capitalize the first letter of each word in the title.
+
+## Appendix 3: Pull Request Description
+
+### 3.1 PaddleOCR Branch Description
+
+PaddleOCR will maintain two kinds of branches in the future:
+
+- release/x.x family branches: stable release branches, which are also the default branches. PaddleOCR releases a new release branch based on feature updates and adapts it to the release version of Paddle. As versions iterate, there will be more and more release/x.x branches; by default the latest release branch is maintained.
+- dygraph branch: the development branch, which adapts to the dygraph (dynamic graph) version of Paddle and is mainly used to develop new functionality. If you need to do secondary development, choose the dygraph branch. To ensure that a release/x.x branch can be cut from the dygraph branch when needed, the code on the dygraph branch may only use APIs that are available in the latest release branch of Paddle. That is, if a new API has been developed on the Paddle dygraph branch but has not yet appeared in the release branch code, do not use it in PaddleOCR (a quick way to check which Paddle release you are developing against is shown at the end of this section). Apart from that, performance optimization, parameter tuning, and strategy updates that do not involve APIs can be developed normally.
+
+The historical branches of PaddleOCR will no longer be actively maintained. Considering that some of you may still be using them, these branches will be kept:
+
+- develop branch: This branch was used for the development and testing of the static graph version and is currently compatible with Paddle >= 1.7. If you have a special need to adapt to an older version of Paddle, you can still use this branch, but the code will no longer be updated except for bug fixes.
+
+PaddleOCR welcomes you to actively contribute code to the repo. Here are some basic processes for contributing code.
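+
+Before developing on the dygraph branch, it can also help to confirm that your local PaddlePaddle installation is a released version rather than a development build, since code on the dygraph branch may only use APIs that already exist in the latest Paddle release (as noted in the branch description above). The snippet below is only an illustrative check and not part of the official workflow:
+
+```python linenums="1"
+import paddle
+
+# A release build prints a plain version number such as 2.x.y;
+# development builds usually report "0.0.0" or a commit-suffixed string.
+print(paddle.__version__)
+
+# Optional sanity check that the installation itself works correctly
+paddle.utils.run_check()
+```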
+ +### 3.2 PaddleOCR Code Submission Process And Specification + +If you are familiar with Git use, you can jump directly to [Some Conventions For Submitting Code in 3.2.10](#Some_conventions_for_submitting_code) + +#### 3.2.1 Create Your `Remote Repo` + +In PaddleOCR [GitHub Home]( https://github.com/PaddlePaddle/PaddleOCR ) Click the `Fork` button in the upper left corner to create a `remote repo`in your personal directory, such as `https://github.com/ {your_name}/PaddleOCR`. + + ![banner](./images/banner.png) + +Clone `Remote repo` + + ```bash linenums="1" + # pull code of develop branch + git clone https://github.com/{your_name}/PaddleOCR.git -b dygraph + cd PaddleOCR + ``` + +Clone failures are mostly due to network reasons, try again later or configure the proxy + +#### 3.2.2 Login And Connect Using Token + +Start by viewing the information for the current `remote repo`. + +```bash linenums="1" +git remote -v +# origin https://github.com/{your_name}/PaddleOCR.git (fetch) +# origin https://github.com/{your_name}/PaddleOCR.git (push) +``` + +Only the information of the clone `remote repo`, i.e. the PaddleOCR under your username, is available. Due to the change in Github's login method, you need to reconfigure the `remote repo` address by means of a Token. The token is generated as follows: + +1. Find Personal Access Tokens: Click on your avatar in the upper right corner of the Github page and choose Settings --> Developer settings --> Personal access tokens, + +2. Click Generate new token: Fill in the token name in Note, such as 'paddle'. In Select scopes, select repo (required), admin:repo_hook, delete_repo, etc. You can check them according to your needs. Then click Generate token to generate the token, and finally copy the generated token. + +Delete the original origin configuration + +```bash linenums="1" +git remote rm origin +``` + +Change the remote branch to `https://oauth2:{token}@github.com/{your_name}/PaddleOCR.git`. For example, if the token value is 12345 and your user name is PPOCR, run the following command + +```bash linenums="1" +git remote add origin https://oauth2:12345@github.com/PPOCR/PaddleOCR.git +``` + +This establishes a connection to our own `remote repo`. Next we create a remote host of the original PaddleOCR repo, named upstream. + +```bash linenums="1" +git remote add upstream https://github.com/PaddlePaddle/PaddleOCR.git +``` + +Use `git remote -v` to view current `remote warehouse` information, output as follows, found to include two origin and two upstream of `remote repo` . + +```bash linenums="1" +origin https://github.com/{your_name}/PaddleOCR.git (fetch) +origin https://github.com/{your_name}/PaddleOCR.git (push) +upstream https://github.com/PaddlePaddle/PaddleOCR.git (fetch) +upstream https://github.com/PaddlePaddle/PaddleOCR.git (push) +``` + +This is mainly to keep the local repository up to date when subsequent pull request (PR) submissions are made. + +#### 3.2.3 Create Local Branch + +First get the latest code of upstream, then create a new_branch branch based on the dygraph of the upstream repo (upstream). 
+ +```bash linenums="1" +git fetch upstream +git checkout -b new_branch upstream/dygraph +``` + +> If for a newly forked PaddleOCR project, the user's remote repo (origin) has the same branch updates as the upstream repository (upstream), you can also create a new local branch based on the default branch of the origin repo or a specified branch with the following command +> +> ```bash linenums="1" +> # Create new_branch branch on user remote repo (origin) based on develop branch +> git checkout -b new_branch origin/develop +> # Create new_branch branch based on upstream remote repo develop branch +> # If you need to create a new branch from upstream, +> # you need to first use git fetch upstream to get upstream code +> git checkout -b new_branch upstream/develop +> ``` + +The final switch to the new branch is displayed with the following output information. + +Branch new_branch set up to track remote branch develop from upstream. +Switched to a new branch 'new_branch' + +After switching branches, file changes can be made on this branch + +#### 3.2.4 Use Pre-Commit Hook + +Paddle developers use the pre-commit tool to manage Git pre-submit hooks. It helps us format the source code (C++, Python) and automatically check for basic things (such as having only one EOL per file, not adding large files to Git) before committing it. + +The pre-commit test is part of the unit test in Travis-CI. PR that does not satisfy the hook cannot be submitted to PaddleOCR. Install it first and run it in the current directory: + +```bash linenums="1" +pip install pre-commit +pre-commit install +``` + +> 1. Paddle uses clang-format to adjust the C/C++ source code format. Make sure the `clang-format` version is above 3.8. +> +> 2. Yapf installed through pip install pre-commit is slightly different from conda install-c conda-forge pre-commit, and PaddleOCR developers use `pip install pre-commit`. + +#### 3.2.5 Modify And Submit Code + +If you make some changes on `README.Md` on PaddleOCR, you can view the changed file through `git status`, and then add the changed file using `git add`。 + +```bash linenums="1" +git status # View change files +git add README.md +pre-commit +``` + +Repeat these steps until the pre-comit format check does not error. As shown below. + +![img](./images/precommit_pass.png) + +Use the following command to complete the submission. + +```bash linenums="1" +git commit -m "your commit info" +``` + +#### 3.2.6 Keep Local Repo Up To Date + +Get the latest code for upstream and update the current branch. Here the upstream comes from section 2.2, `Connecting to a remote repo`. + +```bash linenums="1" +git fetch upstream +# If you want to commit to another branch, you need to pull code from another branch of upstream, here is develop +git pull upstream develop +``` + +#### 3.2.7 Push To Remote Repo + +```bash linenums="1" +git push origin new_branch +``` + +#### 3.2.7 Submit Pull Request + +Click the new pull request to select the local branch and the target branch, as shown in the following figure. In the description of PR, fill in the functions completed by the PR. Next, wait for review, and if you need to modify something, update the corresponding branch in origin with the steps above. + +![banner](./images/pr.png) + +#### 3.2.8 Sign CLA Agreement And Pass Unit Tests + +Signing the CLA When submitting a Pull Request to PaddlePaddle for the first time, you need to sign a CLA (Contributor License Agreement) agreement to ensure that your code can be incorporated as follows: + +1. 
Please check the Check section in PR, find the license/cla, and click on the right detail to enter the CLA website + +2. Click Sign in with GitHub to agree on the CLA website and when clicked, it will jump back to your Pull Request page + +#### 3.2.9 Delete Branch + +- Remove remote branch + + After PR is merged into the main repo, we can delete the branch of the remote repofrom the PR page. + You can also use `git push origin:branch name` to delete remote branches, such as: + + ```bash linenums="1" + git push origin :new_branch + ``` + +- Delete local branch + + ```bash linenums="1" + # Switch to the development branch, otherwise the current branch cannot be deleted + git checkout develop + + # Delete new_ Branch Branch + git branch -D new_branch + ``` + +#### 3.2.10 Some Conventions For Submitting Code + +In order for official maintainers to better focus on the code itself when reviewing it, please follow the following conventions each time you submit your code: + +1)Please ensure that the unit tests in Travis-CI pass smoothly. If not, indicate that there is a problem with the submitted code, and the official maintainer generally does not review it. + +2)Before submitting a Pull Request. + +- Note the number of commits. + + Reason: If you only modify one file and submit more than a dozen commits, each commit will only make a few modifications, which can be very confusing to the reviewer. The reviewer needs to look at each commit individually to see what changes have been made, and does not exclude the fact that changes between commits overlap each other. + + Suggestion: Keep as few commits as possible each time you submit, and supplement your last commit with git commit --amend. For multiple commits that have been Push to a remote warehouse, you can refer to [squash commits after push](https://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed ). + +- Note the name of each commit: it should reflect the content of the current commit, not be too arbitrary. + +3) If you have solved a problem, add in the first comment box of the Pull Request:fix #issue_number,This will automatically close the corresponding Issue when the Pull Request is merged. Key words include:close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved,please choose the right vocabulary. Detailed reference [Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages). + +In addition, in response to the reviewer's comments, you are requested to abide by the following conventions: + +1) Each review comment from an official maintainer would like a response, which would better enhance the contribution of the open source community. + +- If you agree to the review opinion and modify it accordingly, give a simple Done. +- If you disagree with the review, please give your own reasons for refuting. + +2)If there are many reviews: + +- Please give an overview of the changes. +- Please reply with `start a review', not directly. The reason is that each reply sends an e-mail message, which can cause a mail disaster. 
diff --git a/docs/community/code_and_doc.md b/docs/community/code_and_doc.md new file mode 100644 index 0000000000..c81a4cd322 --- /dev/null +++ b/docs/community/code_and_doc.md @@ -0,0 +1,327 @@ +--- +comments: true +typora-copy-images-to: images +--- + +# 附录 + +本附录包含了Python、文档规范以及Pull Request流程,请各位开发者遵循相关内容 + +## 附录1:Python代码规范 + +PaddleOCR的Python代码遵循 [PEP8规范](https://www.python.org/dev/peps/pep-0008/),其中一些关注的重点包括如下内容 + +- 空格 + + - 空格应该加在逗号、分号、冒号后,而非他们的前面 + + ```python linenums="1" + # 正确: + print(x, y) + + # 错误: + print(x , y) + ``` + + - 在函数中指定关键字参数或默认参数值时, 不要在其两侧使用空格 + + ```python linenums="1" + # 正确: + def complex(real, imag=0.0) + # 错误: + def complex(real, imag = 0.0) + ``` + +- 注释 + + - 行内注释:行内注释使用 `#` 号表示,在代码与 `#` 之间需要空两个空格, `#` 与注释之间应当空一个空格,例如 + + ```python linenums="1" + x = x + 1 # Compensate for border + ``` + + - 函数和方法:每个函数的定义后的描述应该包括以下内容: + + - 函数描述:函数的作用,输入输出的 + - Args:每个参数的名字以及对该参数的描述 + - Returns:返回值的含义和类型 + + ```python linenums="1" + def fetch_bigtable_rows(big_table, keys, other_silly_variable=None): + """Fetches rows from a Bigtable. + + Retrieves rows pertaining to the given keys from the Table instance + represented by big_table. Silly things may happen if + other_silly_variable is not None. + + Args: + big_table: An open Bigtable Table instance. + keys: A sequence of strings representing the key of each table row + to fetch. + other_silly_variable: Another optional variable, that has a much + longer name than the other args, and which does nothing. + + Returns: + A dict mapping keys to the corresponding table row data + fetched. Each row is represented as a tuple of strings. For + example: + + {'Serak': ('Rigel VII', 'Preparer'), + 'Zim': ('Irk', 'Invader'), + 'Lrrr': ('Omicron Persei 8', 'Emperor')} + + If a key from the keys argument is missing from the dictionary, + then that row was not found in the table. + """ + pass + ``` + +## 附录2:文档规范 + +### 2.1 总体说明 + +- 文档位置:如果您增加的新功能可以补充在原有的Markdown文件中,请**不要重新新建**一个文件。如果您对添加的位置不清楚,可以先PR代码,然后在commit中询问官方人员。 + +- 新增Markdown文档名称:使用英文描述文档内容,一般由小写字母与下划线组合而成,例如 `add_new_algorithm.md` + +- 新增Markdown文档格式:目录 - 正文 - FAQ + + > 目录生成方法可以使用 [此网站](https://ecotrust-canada.github.io/markdown-toc/) 将md内容复制之后自动提取目录,然后在md文件的每个标题前添加 + +- 中英双语:任何对文档的改动或新增都需要分别在中文和英文文档上进行。 + +### 2.2 格式规范 + +- 标题格式:文档标题格式按照:阿拉伯数字小数点组合 - 空格 - 标题的格式(例如 `2.1 XXXX` , `2. 
XXXX`) + +- 代码块:通过代码块格式展示需要运行的代码,在代码块前描述命令参数的含义。例如: + + > 检测+方向分类器+识别全流程:设置方向分类器参数 `--use_angle_cls true` 后可对竖排文本进行识别。 + > + > ```bash linenums="1" + > paddleocr --image_dir ./imgs/11.jpg --use_angle_cls true + > ``` + +- 变量引用:如果在行内引用到代码变量或命令参数,需要用行内代码表示,例如上方 `--use_angle_cls true` ,并在前后各空一格 + +- 统一命名:如PP-OCRv2、PP-OCR mobile、`paddleocr` whl包、PPOCRLabel、Paddle Lite等 + +- 补充说明:通过引用格式 `>` 补充说明,或对注意事项进行说明 + +- 图片:如果在说明文档中增加了图片,请规范图片的命名形式(描述图片内容),并将图片添加在 `doc/` 下 + +## 附录3:Pull Request说明 + +### 3.1 PaddleOCR分支说明 + +PaddleOCR未来将维护2种分支,分别为: + +- release/x.x系列分支:为稳定的发行版本分支,也是默认分支。PaddleOCR会根据功能更新情况发布新的release分支,同时适配Paddle的release版本。随着版本迭代,release/x.x系列分支会越来越多,默认维护最新版本的release分支。 +- dygraph分支:为开发分支,适配Paddle动态图的dygraph版本,主要用于开发新功能。如果有同学需要进行二次开发,请选择dygraph分支。为了保证dygraph分支能在需要的时候拉出release/x.x分支,dygraph分支的代码只能使用Paddle最新release分支中有效的api。也就是说,如果Paddle dygraph分支中开发了新的api,但尚未出现在release分支代码中,那么请不要在PaddleOCR中使用。除此之外,对于不涉及api的性能优化、参数调整、策略更新等,都可以正常进行开发。 + +PaddleOCR的历史分支,未来将不再维护。考虑到一些同学可能仍在使用,这些分支还会继续保留: + +- develop分支:这个分支曾用于静态图的开发与测试,目前兼容>=1.7版本的Paddle。如果有特殊需求,要适配旧版本的Paddle,那还可以使用这个分支,但除了修复bug外不再更新代码。 + +PaddleOCR欢迎大家向repo中积极贡献代码,下面给出一些贡献代码的基本流程。 + +### 3.2 PaddleOCR代码提交流程与规范 + +> 如果你熟悉Git使用,可以直接跳转到 [3.2.10 提交代码的一些约定](#提交代码的一些约定) + +#### 3.2.1 创建你的 `远程仓库` + +- 在PaddleOCR的 [GitHub首页](https://github.com/PaddlePaddle/PaddleOCR),点击左上角 `Fork` 按钮,在你的个人目录下创建 `远程仓库`,比如`https://github.com/{your_name}/PaddleOCR`。 + + ![banner](./images/banner.png) + +- 将 `远程仓库` Clone到本地 + + ```bash linenums="1" + # 拉取dygraph分支的代码 + git clone https://github.com/{your_name}/PaddleOCR.git -b dygraph + cd PaddleOCR + ``` + +> 多数情况下clone失败是由于网络原因,请稍后重试或配置代理 + +#### 3.2.2 通过Token方式登录与建立连接 + +首先查看当前 `远程仓库` 的信息。 + +```bash linenums="1" +git remote -v +# origin https://github.com/{your_name}/PaddleOCR.git (fetch) +# origin https://github.com/{your_name}/PaddleOCR.git (push) +``` + +只有clone的 `远程仓库` 的信息,也就是自己用户名下的 PaddleOCR。由于Github的登录方式变化,需要通过Token的方式重新配置 `远程仓库` 的地址。生成Token的方式如下: + +1. 找到个人访问令牌(token):在Github页面右上角点击自己的头像,然后依次选择 Settings --> Developer settings --> Personal access tokens +2. 
点击 Generate new token:在Note中填入token名称,例如’paddle‘。在Select scopes选择repo(必选)、admin:repo_hook、delete_repo等,可根据自身需要勾选。然后点击Generate token生成token。最后复制生成的token。 + +删除原始的origin配置 + +```bash linenums="1" +git remote rm origin +``` + +将remote分支改成 `https://oauth2:{token}@github.com/{your_name}/PaddleOCR.git`。例如:如果token值为12345,你的用户名为PPOCR,则运行下方命令 + +```bash linenums="1" +git remote add origin https://oauth2:12345@github.com/PPOCR/PaddleOCR.git +``` + +这样我们就与自己的 `远程仓库` 建立了连接。接下来我们创建一个原始 PaddleOCR 仓库的远程主机,命名为 upstream。 + +```bash linenums="1" +git remote add upstream https://github.com/PaddlePaddle/PaddleOCR.git +``` + +使用 `git remote -v` 查看当前 `远程仓库` 的信息,输出如下,发现包括了origin和upstream 2个 `远程仓库` 。 + +```bash linenums="1" +origin https://oauth2:{token}@github.com/{your_name}/PaddleOCR.git (fetch) +origin https://oauth2:{token}@github.com/{your_name}/PaddleOCR.git (push) +upstream https://github.com/PaddlePaddle/PaddleOCR.git (fetch) +upstream https://github.com/PaddlePaddle/PaddleOCR.git (push) +``` + +这主要是为了后续在提交pull request(PR)时,始终保持本地仓库最新。 + +#### 3.2.3 创建本地分支 + +首先获取 upstream 的最新代码,然后基于上游仓库 (upstream)的dygraph创建new_branch分支。 + +```bash linenums="1" +git fetch upstream +git checkout -b new_branch upstream/dygraph +``` + +> 如果对于新Fork的PaddleOCR项目,用户远程仓库(origin)与上游(upstream)仓库的分支更新情况相同,也可以基于origin仓库的默认分支或指定分支创建新的本地分支,命令如下。 +> +> ```bash linenums="1" +> # 基于用户远程仓库(origin)的dygraph创建new_branch分支 +> git checkout -b new_branch origin/dygraph +> +> # 基于用户远程仓库(origin)的默认分支创建new_branch分支 +> git checkout -b new_branch +> ``` + +最终会显示切换到新的分支,输出信息如下 + +```bash linenums="1" +Branch new_branch set up to track remote branch develop from upstream. +Switched to a new branch 'new_branch' +``` + +切换分支之后即可在此分支上进行文件改动 + +#### 3.2.4 使用pre-commit勾子 + +Paddle 开发人员使用 pre-commit 工具来管理 Git 预提交钩子。 它可以帮助我们格式化源代码(C++,Python),在提交(commit)前自动检查一些基本事宜(如每个文件只有一个 EOL,Git 中不要添加大文件等)。 + +pre-commit测试是 Travis-CI 中单元测试的一部分,不满足钩子的 PR 不能被提交到 PaddleOCR,首先安装并在当前目录运行它: + +```bash linenums="1" +pip install pre-commit +pre-commit install +``` + + > 1. Paddle 使用 clang-format 来调整 C/C++ 源代码格式,请确保 `clang-format` 版本在 3.8 以上。 + > + > 2. 通过pip install pre-commit和conda install -c conda-forge pre-commit安装的yapf稍有不同的,PaddleOCR 开发人员使用的是 `pip install pre-commit`。 + +#### 3.2.5 修改与提交代码 + + 假设对PaddleOCR的 `README.md` 做了一些修改,可以通过 `git status` 查看改动的文件,然后使用 `git add` 添加改动文件。 + +```bash linenums="1" +git status # 查看改动文件 +git add README.md +pre-commit +``` + +重复上述步骤,直到pre-comit格式检查不报错。如下所示。 + +![img](./images/precommit_pass.png) + +提交修改,并写明修改内容("your commit info") + +```bash linenums="1" +git commit -m "your commit info" +``` + +#### 3.2.6 Push到远程仓库 + +使用push命令将修改的commit提交到 `远程仓库` + +```bash linenums="1" +git push origin new_branch +``` + +#### 3.2.7 提交Pull Request + +打开自己的远程仓库界面,选择提交的分支。点击new pull request或contribute进入PR界面。选择本地分支和目标分支,如下图所示。在PR的描述说明中,填写该PR所完成的功能。接下来等待review,如果有需要修改的地方,参照上述步骤更新 origin 中的对应分支即可。 + +![img](./images/pr.png) + +#### 3.2.8 签署CLA协议和通过单元测试 + +- 签署CLA 在首次向PaddlePaddle提交Pull Request时,您需要您签署一次CLA(Contributor License Agreement)协议,以保证您的代码可以被合入,具体签署方式如下: + + 1. 请您查看PR中的Check部分,找到license/cla,并点击右侧detail,进入CLA网站 + + 2. 
点击CLA网站中的“Sign in with GitHub to agree”,点击完成后将会跳转回您的Pull Request页面 + +#### 3.2.9 删除分支 + +- 删除远程分支 + + 在 PR 被 merge 进主仓库后,我们可以在 PR 的页面删除远程仓库的分支。 + + 也可以使用 `git push origin :分支名` 删除远程分支,如: + + ```bash linenums="1" + git push origin :new_branch + ``` + +- 删除本地分支 + + ```bash linenums="1" + # 切换到dygraph分支,否则无法删除当前分支 + git checkout dygraph + + # 删除new_branch分支 + git branch -D new_branch + ``` + +#### 3.2.10 提交代码的一些约定 + +为了使官方维护人员在评审代码时更好地专注于代码本身,请您每次提交代码时,遵守以下约定: + +1)请保证Travis-CI 中单元测试能顺利通过。如果没过,说明提交的代码存在问题,官方维护人员一般不做评审。 + +2)提交Pull Request前: + +- 请注意commit的数量。 + + 原因:如果仅仅修改一个文件但提交了十几个commit,每个commit只做了少量的修改,这会给评审人带来很大困扰。评审人需要逐一查看每个commit才能知道做了哪些修改,且不排除commit之间的修改存在相互覆盖的情况。 + + 建议:每次提交时,保持尽量少的commit,可以通过git commit --amend补充上次的commit。对已经Push到远程仓库的多个commit,可以参考[squash commits after push](https://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed)。 + +- 请注意每个commit的名称:应能反映当前commit的内容,不能太随意。 + +3)如果解决了某个Issue的问题,请在该Pull Request的第一个评论框中加上:fix #issue_number,这样当该Pull Request被合并后,会自动关闭对应的Issue。关键词包括:close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved,请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。 + +此外,在回复评审人意见时,请您遵守以下约定: + +1)官方维护人员的每一个review意见都希望得到回复,这样会更好地提升开源社区的贡献。 + +- 对评审意见同意且按其修改完的,给个简单的Done即可; +- 对评审意见不同意的,请给出您自己的反驳理由。 + +2)如果评审意见比较多: + +- 请给出总体的修改情况。 +- 请采用`start a review`进行回复,而非直接回复的方式。原因是每个回复都会发送一封邮件,会造成邮件灾难。 diff --git a/docs/community/community_contribution.md b/docs/community/community_contribution.md new file mode 100644 index 0000000000..54c5b45957 --- /dev/null +++ b/docs/community/community_contribution.md @@ -0,0 +1,141 @@ +--- +comments: true +typora-copy-images-to: images +--- + +# 社区贡献 + +感谢大家长久以来对PaddleOCR的支持和关注,与广大开发者共同构建一个专业、和谐、相互帮助的开源社区是PaddleOCR的目标。本文档展示了已有的社区贡献、对于各类贡献说明、新的机会与流程,希望贡献流程更加高效、路径更加清晰。 + +PaddleOCR希望可以通过AI的力量助力任何一位有梦想的开发者实现自己的想法,享受创造价值带来的愉悦。 + + + + + +--- + +## 1. 
社区贡献 + +### 1.1 基于PaddleOCR的社区项目 + +| 类别 | 项目 | 描述 | 开发者 | +| -------- | ------ | ------ | --------- | +| 通用工具 | [FastOCRLabel](https://gitee.com/BaoJianQiang/FastOCRLabel) | 完整的C#版本标注GUI | [包建强](https://gitee.com/BaoJianQiang) | +| 通用工具 | [DangoOCR离线版](https://github.com/PantsuDango/DangoOCR) | 通用型桌面级即时翻译GUI | [PantsuDango](https://github.com/PantsuDango) | +| 通用工具 | [scr2txt](https://github.com/lstwzd/scr2txt) | 截屏转文字GUI | [lstwzd](https://github.com/lstwzd) | +| 通用工具 | [ocr_sdk](https://github.com/mymagicpower/AIAS/blob/main/1_image_sdks/text_recognition/ocr_sdk) | OCR java SDK工具箱 | [Calvin](https://github.com/mymagicpower) | +| 通用工具 | [iocr](https://github.com/mymagicpower/AIAS/blob/main/8_suite_hub/iocr) | IOCR 自定义模板识别(支持表格识别) | [Calvin](https://github.com/mymagicpower) | +| 通用工具 | [Lmdb Dataset Format Conversion Tool](https://github.com/OneYearIsEnough/PaddleOCR-Recog-LmdbDataset-Conversion) | 文本识别任务中lmdb数据格式转换工具 | [OneYearIsEnough](https://github.com/OneYearIsEnough) | +| 通用工具 | [用paddleocr打造一款“盗幕笔记”](https://github.com/kjf4096/paddleocr_dmbj) | 用PaddleOCR记笔记 | [kjf4096](https://github.com/kjf4096) | +| 垂类工具 | [AI Studio项目](https://aistudio.baidu.com/aistudio/projectdetail/1054614?channelType=0&channel=0) | 英文视频自动生成字幕 | [叶月水狐](https://aistudio.baidu.com/aistudio/personalcenter/thirdview/322052) | +| 垂类工具 | [id_card_ocr](https://github.com/baseli/id_card_ocr) | 身份证复印件识别 | [baseli](https://github.com/baseli) | +| 垂类工具 | [Paddle_Table_Image_Reader](https://github.com/thunder95/Paddle_Table_Image_Reader) | 能看懂表格图片的数据助手 | [thunder95](https://github.com/thunder95]) | +| 垂类工具 | [AI Studio项目](https://aistudio.baidu.com/aistudio/projectdetail/3382897) | OCR流程中对手写体进行过滤 | [daassh](https://github.com/daassh) | +| 垂类场景调优 | [AI Studio项目](https://aistudio.baidu.com/aistudio/projectdetail/2803693) | 电表读数和编号识别 | [深渊上的坑](https://github.com/edencfc) | +| 垂类场景调优 | [AI Studio项目](https://aistudio.baidu.com/aistudio/projectdetail/3284199) | LCD液晶字符检测 | [Dream拒杰](https://github.com/zhangyingying520) | +| 前后处理 | [paddleOCRCorrectOutputs](https://github.com/yuranusduke/paddleOCRCorrectOutputs) | 获取OCR识别结果的key-value | [yuranusduke](https://github.com/yuranusduke) | +|前处理| [optlab](https://github.com/GreatV/optlab) |OCR前处理工具箱,基于Qt和Leptonica。|[GreatV](https://github.com/GreatV)| +|应用部署| [PaddleOCRSharp](https://github.com/raoyutian/PaddleOCRSharp) |PaddleOCR的.NET封装与应用部署。|[raoyutian](https://github.com/raoyutian/PaddleOCRSharp)| +|应用部署| [PaddleSharp](https://github.com/sdcb/PaddleSharp) |PaddleOCR的.NET封装与应用部署,支持跨平台、GPU|[sdcb](https://github.com/sdcb)| +| 应用部署 | [PaddleOCR-Streamlit-Demo](https://github.com/Lovely-Pig/PaddleOCR-Streamlit-Demo) | 使用Streamlit部署PaddleOCR | [Lovely-Pig](https://github.com/Lovely-Pig) | +| 应用部署 | [PaddleOCR-PyWebIO-Demo](https://github.com/Lovely-Pig/PaddleOCR-PyWebIO-Demo) | 使用PyWebIO部署PaddleOCR | [Lovely-Pig](https://github.com/Lovely-Pig) | +| 应用部署 | [PaddleOCR-Paddlejs-Vue-Demo](https://github.com/Lovely-Pig/PaddleOCR-Paddlejs-Vue-Demo) | 使用Paddle.js和Vue部署PaddleOCR | [Lovely-Pig](https://github.com/Lovely-Pig) | +| 应用部署 | [PaddleOCR-Paddlejs-React-Demo](https://github.com/Lovely-Pig/PaddleOCR-Paddlejs-React-Demo) | 使用Paddle.js和React部署PaddleOCR | [Lovely-Pig](https://github.com/Lovely-Pig) | +| 学术前沿模型训练与推理 | [AI Studio项目](https://aistudio.baidu.com/aistudio/projectdetail/3397137) | StarNet-MobileNetV3算法–中文训练 | [xiaoyangyang2](https://github.com/xiaoyangyang2) | +| 学术前沿模型训练与推理 | [ABINet-paddle](https://github.com/Huntersdeng/abinet-paddle) | ABINet算法前向运算的paddle实现以及模型各部分的实现细节分析 | 
[Huntersdeng](https://github.com/Huntersdeng) | + +### 1.2 为PaddleOCR新增功能 + +- 非常感谢 [authorfu](https://github.com/authorfu) 贡献Android([#340](https://github.com/PaddlePaddle/PaddleOCR/pull/340))和[xiadeye](https://github.com/xiadeye) 贡献IOS的demo代码([#325](https://github.com/PaddlePaddle/PaddleOCR/pull/325)) +- 非常感谢 [tangmq](https://gitee.com/tangmq) 给PaddleOCR增加Docker化部署服务,支持快速发布可调用的Restful API服务([#507](https://github.com/PaddlePaddle/PaddleOCR/pull/507))。 +- 非常感谢 [lijinhan](https://github.com/lijinhan) 给PaddleOCR增加java SpringBoot 调用OCR Hubserving接口完成对OCR服务化部署的使用([#1027](https://github.com/PaddlePaddle/PaddleOCR/pull/1027))。 +- 非常感谢 [Evezerest](https://github.com/Evezerest), [ninetailskim](https://github.com/ninetailskim), [edencfc](https://github.com/edencfc), [BeyondYourself](https://github.com/BeyondYourself), [1084667371](https://github.com/1084667371) 贡献了[PPOCRLabel](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.3/PPOCRLabel/README_ch.md) 的完整代码。 +- 非常感谢 [bupt906](https://github.com/bupt906) 贡献MicroNet结构代码([#5251](https://github.com/PaddlePaddle/PaddleOCR/pull/5251))和贡献OneCycle学习率策略代码([#5252](https://github.com/PaddlePaddle/PaddleOCR/pull/5252)) + +### 1.3 代码修复 + +- 非常感谢 [zhangxin](https://github.com/ZhangXinNan)([Blog](https://blog.csdn.net/sdlypyzq)) 贡献新的可视化方式、添加.gitgnore、处理手动设置PYTHONPATH环境变量的问题([#210](https://github.com/PaddlePaddle/PaddleOCR/pull/210))。 +- 非常感谢 [lyl120117](https://github.com/lyl120117) 贡献打印网络结构的代码([#304](https://github.com/PaddlePaddle/PaddleOCR/pull/304))。 +- 非常感谢 [BeyondYourself](https://github.com/BeyondYourself) 给PaddleOCR提了很多非常棒的建议,并简化了PaddleOCR的部分代码风格([so many commits)](https://github.com/PaddlePaddle/PaddleOCR/commits?author=BeyondYourself)。 + +### 1.4 文档优化与翻译 + +- 非常感谢 **[RangeKing](https://github.com/RangeKing),[HustBestCat](https://github.com/HustBestCat),[v3fc](https://github.com/v3fc),[1084667371](https://github.com/1084667371)** 贡献翻译《动手学OCR》notebook[电子书英文版](https://github.com/PaddlePaddle/PaddleOCR/tree/dygraph/notebook/notebook_en)。 +- 非常感谢 [thunderstudying](https://github.com/thunderstudying),[RangeKing](https://github.com/RangeKing),[livingbody](https://github.com/livingbody), [WZMIAOMIAO](https://github.com/WZMIAOMIAO),[haigang1975](https://github.com/haigang1975) 补充多个英文markdown文档。 +- 非常感谢 **[fanruinet](https://github.com/fanruinet)** 润色和修复35篇英文文档([#5205](https://github.com/PaddlePaddle/PaddleOCR/pull/5205))。 +- 非常感谢 [Khanh Tran](https://github.com/xxxpsyduck) 和 [Karl Horky](https://github.com/karlhorky) 贡献修改英文文档。 + +### 1.5 多语言语料 + +- 非常感谢 [xiangyubo](https://github.com/xiangyubo) 贡献手写中文OCR数据集([#321](https://github.com/PaddlePaddle/PaddleOCR/pull/321))。 +- 非常感谢 [Mejans](https://github.com/Mejans) 给PaddleOCR增加新语言奥克西坦语Occitan的字典和语料([#954](https://github.com/PaddlePaddle/PaddleOCR/pull/954))。 + +## 2. 
贡献说明 + +### 2.1 新增功能类 + +PaddleOCR非常欢迎社区贡献以PaddleOCR为核心的各种服务、部署实例与软件应用,经过认证的社区贡献会被添加在上述社区贡献表中,为广大开发者增加曝光,也是PaddleOCR的荣耀,其中: + +- 项目形式:官方社区认证的项目代码应有良好的规范和结构,同时,还应配备一个详细的README.md,说明项目的使用方法。通过在requirements.txt文件中增加一行 `paddleocr` 可以自动收录到PaddleOCR的usedby中。 + +- 合入方式:如果是对PaddleOCR现有工具的更新升级,则会合入主repo。如果为PaddleOCR拓展了新功能,请先与官方人员联系,确认项目是否合入主repo,*即使新功能未合入主repo,我们同样也会以社区贡献的方式为您的个人项目增加曝光。* + +### 2.2 代码优化 + +如果您在使用PaddleOCR时遇到了代码bug、功能不符合预期等问题,可以为PaddleOCR贡献您的修改,其中: + +- Python代码规范可参考[附录1:Python代码规范](./code_and_doc.md#附录1python代码规范)。 + +- 提交代码前请再三确认不会引入新的bug,并在PR中描述优化点。如果该PR解决了某个issue,请在PR中连接到该issue。所有的PR都应该遵守附录3中的[3.2.10 提交代码的一些约定。](./code_and_doc.md#附录3pull-request说明) + +- 请在提交之前参考下方的[附录3:Pull Request说明](./code_and_doc.md#附录3pull-request说明)。如果您对git的提交流程不熟悉,同样可以参考附录3的3.2节。 + +### 2.3 文档优化 + +如果您在使用PaddleOCR时遇到了文档表述不清楚、描述缺失、链接失效等问题,可以为PaddleOCR贡献您的修改。文档书写规范请参考[附录2:文档规范](./code_and_doc.md#附录2文档规范)。 + +## 3. 更多贡献机会 + +我们非常鼓励开发者使用PaddleOCR实现自己的想法,同时我们也列出一些经过分析后认为有价值的拓展方向,整体收集在社区项目常规赛中。 + +## 4. 联系我们 + +我们非常欢迎广大开发者在有意向为PaddleOCR贡献代码、文档、语料等内容前与我们联系,这样可以大大降低PR过程中的沟通成本。同时,如果您觉得某些想法个人难以实现,我们也可以通过SIG的形式定向为项目招募志同道合的开发者一起共建。通过SIG渠道贡献的项目将会获得深层次的研发支持与运营资源(如公众号宣传、直播课等)。 + +我们推荐的贡献流程是: + +- 通过在github issue的题目中增加 `【third-party】` 标记,说明遇到的问题(以及解决的思路)或想拓展的功能,等待值班人员回复。例如 `【third-party】为PaddleOCR贡献IOS示例` +- 与我们沟通确认技术方案或bug、优化点准确无误后进行功能新增或相应的修改,代码与文档遵循相关规范。 +- PR链接到上述issue,等待review。 + +## 5. 致谢与后续 + +- 合入代码之后会在本文档第一节中更新信息,默认链接为github名字及主页,如果有需要更换主页,也可以联系我们。 +- 新增重要功能类,会在用户群广而告之,享受开源社区荣誉时刻。 +- **如果您有基于PaddleOCR的项目,但未出现在上述列表中,请按照 `4. 联系我们` 的步骤与我们联系。** + +## 附录:社区常规赛积分榜 + +| 开发者| 总积分 | 开发者| 总积分 | +| ---- | ------ | ----- | ------ | +| [RangeKing](https://github.com/RangeKing) | 220 | [WZMIAOMIAO](https://github.com/WZMIAOMIAO) | 36 | +| [hao6699](https://github.com/hao6699) | 145 | [v3fc](https://github.com/v3fc) | 35 | +| [mymagicpower](https://github.com/mymagicpower) | 140 | [imiyu](https://github.com/imiyu) | 30 | +| [raoyutian](https://github.com/raoyutian) | 90 | [haigang1975](https://github.com/haigang1975) | 29 | +| [sdcb](https://github.com/sdcb) | 80 | [daassh](https://github.com/daassh) | 23 | +| [zhiminzhang0830](https://github.com/zhiminzhang0830) | 70 | [xiaoyangyang2](https://github.com/xiaoyangyang2) | 20 | +| [Lovely-Pig](https://github.com/Lovely-Pig) | 70 | [prettyocean85](https://github.com/prettyocean85) | 20 | +| [livingbody](https://github.com/livingbody) | 70 | [nmusik](https://github.com/nmusik) | 20 | +| [fanruinet](https://github.com/fanruinet) | 70 | [kjf4096](https://github.com/kjf4096) | 20 | +| [bupt906](https://github.com/bupt906) | 60 | [chccc1994](https://github.com/chccc1994) | 20 | +| [edencfc](https://github.com/edencfc) | 57 | [BeyondYourself](https://github.com/BeyondYourself) | 20 | +| [zhangyingying520](https://github.com/zhangyingying520) | 57 | chenguoqi08161 | 18 | +| [ITerydh](https://github.com/ITerydh) | 55 | [weiwenlan](https://github.com/weiwenlan) | 10 | +| [telppa](https://github.com/telppa) | 40 | [shaoshenchen thinc](https://github.com/shaoshenchen) | 10 | +| sosojust1984 | 40 | [jordan2013](https://github.com/jordan2013) | 10 | +| [redearly123](https://github.com/redearly123) | 40 | [JimEverest](https://github.com/JimEverest) | 10 | +| [OneYearIsEnough](https://github.com/OneYearIsEnough) | 40 | [HustBestCat](https://github.com/HustBestCat) | 10 | +| [Huntersdeng](https://github.com/Huntersdeng) | 40 | | | +| [GreatV](https://github.com/GreatV) | 40 | | | +| CLXK294 | 40 | | | diff --git a/docs/community/images/banner.png b/docs/community/images/banner.png new file mode 
100644 index 0000000000..d72b997a80 Binary files /dev/null and b/docs/community/images/banner.png differ diff --git a/docs/community/images/pr.png b/docs/community/images/pr.png new file mode 100644 index 0000000000..3d0d15f78d Binary files /dev/null and b/docs/community/images/pr.png differ diff --git a/docs/community/images/precommit_pass.png b/docs/community/images/precommit_pass.png new file mode 100644 index 0000000000..067fb75ddb Binary files /dev/null and b/docs/community/images/precommit_pass.png differ diff --git a/docs/data_anno_synth/data_annotation.en.md b/docs/data_anno_synth/data_annotation.en.md new file mode 100644 index 0000000000..772a5f46ee --- /dev/null +++ b/docs/data_anno_synth/data_annotation.en.md @@ -0,0 +1,31 @@ +--- +comments: true +--- + +# DATA ANNOTATION TOOLS + +There are the commonly used data annotation tools, which will be continuously updated. Welcome to contribute tools~ + +## 1. labelImg + +- Tool description: Rectangular label +- Tool address: +- Sketch diagram: + + ![labelimg](./images/labelimg.jpg) + +## 2. roLabelImg + +- Tool description: Label tool rewritten based on labelImg, supporting rotating rectangular label +- Tool address: +- Sketch diagram: + + ![roLabelImg](./images/roLabelImg.png) + +## 3. labelme + +- Tool description: Support four points, polygons, circles and other labels +- Tool address: +- Sketch diagram: + + ![labelme](./images/labelme.jpg) diff --git a/docs/data_anno_synth/data_annotation.md b/docs/data_anno_synth/data_annotation.md new file mode 100644 index 0000000000..3446bfbbf7 --- /dev/null +++ b/docs/data_anno_synth/data_annotation.md @@ -0,0 +1,35 @@ +--- +comments: true +--- + +# 数据标注工具 + +这里整理了常用的数据标注工具,持续更新中,欢迎各位小伙伴贡献工具~ + +## 1. labelImg + +- 工具描述:矩形标注 +- 工具地址: +- 示意图: + ![](./images/labelimg.jpg) + +## 2. roLabelImg + +- 工具描述:基于labelImg重写的标注工具,支持旋转矩形标注 +- 工具地址: +- 示意图: + ![](./images/roLabelImg.png) + +## 3. labelme + +- 工具描述:支持四点、多边形、圆形等多种标注 +- 工具地址: +- 示意图: + ![](./images/labelme.jpg) + +## 4. Vott + +- 工具描述:支持矩形,多边形等图片标注.支持视频标注.方便使用的快捷键以及比较好看的界面.同时支持导出多种标签格式. +- 工具地址: +- 示意图: + ![](./images/VoTT.jpg) diff --git a/docs/data_anno_synth/data_synthesis.en.md b/docs/data_anno_synth/data_synthesis.en.md new file mode 100644 index 0000000000..5d8a6fc2b8 --- /dev/null +++ b/docs/data_anno_synth/data_synthesis.en.md @@ -0,0 +1,17 @@ +--- +comments: true +--- + + +# DATA SYNTHESIS TOOLS + +In addition to open source data, users can also use synthesis tools to synthesize data. +There are the commonly used data synthesis tools, which will be continuously updated. 
Welcome to contribute tools~ + +* [Text_renderer](https://github.com/Sanster/text_renderer) +* [SynthText](https://github.com/ankush-me/SynthText) +* [SynthText_Chinese_version](https://github.com/JarveeLee/SynthText_Chinese_version) +* [TextRecognitionDataGenerator](https://github.com/Belval/TextRecognitionDataGenerator) +* [SynthText3D](https://github.com/MhLiao/SynthText3D) +* [UnrealText](https://github.com/Jyouhou/UnrealText/) +* [SynthTIGER](https://github.com/clovaai/synthtiger) diff --git a/docs/data_anno_synth/data_synthesis.md b/docs/data_anno_synth/data_synthesis.md new file mode 100644 index 0000000000..7ebcf862fa --- /dev/null +++ b/docs/data_anno_synth/data_synthesis.md @@ -0,0 +1,15 @@ +--- +comments: true +--- + +# 数据合成工具 + +除了开源数据,用户还可使用合成工具自行合成。这里整理了常用的数据合成工具,持续更新中,欢迎各位小伙伴贡献工具~ + +- [text_renderer](https://github.com/Sanster/text_renderer) +- [SynthText](https://github.com/ankush-me/SynthText) +- [SynthText_Chinese_version](https://github.com/JarveeLee/SynthText_Chinese_version) +- [TextRecognitionDataGenerator](https://github.com/Belval/TextRecognitionDataGenerator) +- [SynthText3D](https://github.com/MhLiao/SynthText3D) +- [UnrealText](https://github.com/Jyouhou/UnrealText/) +- [SynthTIGER](https://github.com/clovaai/synthtiger) diff --git a/docs/data_anno_synth/images/VoTT.jpg b/docs/data_anno_synth/images/VoTT.jpg new file mode 100644 index 0000000000..7c5c27ba84 Binary files /dev/null and b/docs/data_anno_synth/images/VoTT.jpg differ diff --git a/docs/data_anno_synth/images/labelimg.jpg b/docs/data_anno_synth/images/labelimg.jpg new file mode 100644 index 0000000000..8d58a445ca Binary files /dev/null and b/docs/data_anno_synth/images/labelimg.jpg differ diff --git a/docs/data_anno_synth/images/labelme.jpg b/docs/data_anno_synth/images/labelme.jpg new file mode 100644 index 0000000000..ce44e504df Binary files /dev/null and b/docs/data_anno_synth/images/labelme.jpg differ diff --git a/docs/data_anno_synth/images/roLabelImg.png b/docs/data_anno_synth/images/roLabelImg.png new file mode 100644 index 0000000000..3b02a3226d Binary files /dev/null and b/docs/data_anno_synth/images/roLabelImg.png differ diff --git a/docs/data_anno_synth/overview.en.md b/docs/data_anno_synth/overview.en.md new file mode 100644 index 0000000000..186157e233 --- /dev/null +++ b/docs/data_anno_synth/overview.en.md @@ -0,0 +1,6 @@ +--- +comments: true +--- + +- Semi-automatic Annotation Tool: PPOCRLabel: +- Data Synthesis Tool: Style-Text: diff --git a/docs/data_anno_synth/overview.md b/docs/data_anno_synth/overview.md new file mode 100644 index 0000000000..1068cda83e --- /dev/null +++ b/docs/data_anno_synth/overview.md @@ -0,0 +1,7 @@ +--- +comments: true +--- + + +- 半自动标注工具 PPOCRLabel: +- 数据合成工具 Style-Text: diff --git a/docs/datasets/datasets.en.md b/docs/datasets/datasets.en.md new file mode 100644 index 0000000000..f233ddfb84 --- /dev/null +++ b/docs/datasets/datasets.en.md @@ -0,0 +1,59 @@ +--- +comments: true +--- + +This is a collection of commonly used Chinese datasets, which is being updated continuously. You are welcome to contribute to this list~ + +In addition to opensource data, users can also use synthesis tools to synthesize data themselves. Current available synthesis tools include [text_renderer](https://github.com/Sanster/text_renderer), [SynthText](https://github.com/ankush-me/SynthText), [TextRecognitionDataGenerator](https://github.com/Belval/TextRecognitionDataGenerator), etc. + +#### 1. 
ICDAR2019-LSVT + +- **Data sources**: +- **Introduction**: A total of 45w Chinese street view images, including 5w (2w test + 3w training) fully labeled data (text coordinates + text content), 40w weakly labeled data (text content only), as shown in the following figure: + ![](./images/LSVT_1.jpg) + + (a) Fully labeled data + + ![](./images/LSVT_2.jpg) + + (b) Weakly labeled data +- **Download link**: + +#### 2. ICDAR2017-RCTW-17 + +- **Data sources**: +- **Introduction**:It contains 12000 + images, most of them are collected in the wild through mobile camera. Some are screenshots. These images show a variety of scenes, including street views, posters, menus, indoor scenes and screenshots of mobile applications. + ![](./images/rctw.jpg) +- **Download link**: + +#### 3. Chinese Street View Text Recognition + +- **Data sources**: +- **Introduction**:A total of 290000 pictures are included, of which 210000 are used as training sets (with labels) and 80000 are used as test sets (without labels). The dataset is collected from the Chinese street view, and is formed by by cutting out the text line area (such as shop signs, landmarks, etc.) in the street view picture. All the images are preprocessed: by using affine transform, the text area is proportionally mapped to a picture with a height of 48 pixels, as shown in the figure: + + ![](./images/ch_street_rec_1.png) + (a) Label: 魅派集成吊顶 + ![](./images/ch_street_rec_2.png) + (b) Label: 母婴用品连锁 +- **Download link** + + +#### 4. Chinese Document Text Recognition + +- **Data sources**: +- **Introduction**: + - A total of 3.64 million pictures are divided into training set and validation set according to 99:1. + - Using Chinese corpus (news + classical Chinese), the data is randomly generated through changes in font, size, grayscale, blur, perspective, stretching, etc. + - 5990 characters including Chinese characters, English letters, numbers and punctuation(Characters set: ) + - Each sample is fixed with 10 characters, and the characters are randomly intercepted from the sentences in the corpus + - Image resolution is 280x32 + ![](./images/ch_doc1.jpg) + ![](./images/ch_doc3.jpg) +- **Download link**: (Password: lu7m) + +#### 5、ICDAR2019-ArT + +- **Data source**: +- **Introduction**:It includes 10166 images, 5603 in training sets and 4563 in test sets. It is composed of three parts: total text, scut-ctw1500 and Baidu curved scene text, including text with various shapes such as horizontal, multi-directional and curved. 
+ ![](./images/ArT.jpg) +- **Download link**: diff --git a/docs/datasets/datasets.md b/docs/datasets/datasets.md new file mode 100644 index 0000000000..e7b55327c8 --- /dev/null +++ b/docs/datasets/datasets.md @@ -0,0 +1,88 @@ +--- +comments: true +--- + +这里整理了常用中文数据集,持续更新中,欢迎各位小伙伴贡献数据集~ + +除了开源数据,用户还可使用合成工具自行合成,可参考[数据合成工具](../data_anno_synth/data_synthesis.md); + +如果需要标注自己的数据,可参考[数据标注工具](../data_anno_synth/data_annotation.md)。 + +#### 1、ICDAR2019-LSVT + +- **数据来源**: +- **数据简介**: 共45w中文街景图像,包含5w(2w测试+3w训练)全标注数据(文本坐标+文本内容),40w弱标注数据(仅文本内容),如下图所示: + ![](./images/LSVT_1.jpg) + (a) 全标注数据 + ![](./images/LSVT_2.jpg) + (b) 弱标注数据 +- **下载地址**: +- **说明**:其中,test数据集的label目前没有开源,如要评估结果,可以去官网提交: + +#### 2、ICDAR2017-RCTW-17 + +- **数据来源**: +- **数据简介**:共包含12,000+图像,大部分图片是通过手机摄像头在野外采集的。有些是截图。这些图片展示了各种各样的场景,包括街景、海报、菜单、室内场景和手机应用程序的截图。 + ![](./images/rctw.jpg) +- **下载地址**: + +#### 3、中文街景文字识别 + +- **数据来源**: +- **数据简介**:ICDAR2019-LSVT行识别任务,共包括29万张图片,其中21万张图片作为训练集(带标注),8万张作为测试集(无标注)。数据集采自中国街景,并由街景图片中的文字行区域(例如店铺标牌、地标等等)截取出来而形成。所有图像都经过一些预处理,将文字区域利用仿射变化,等比映射为一张高为48像素的图片,如图所示: + ![](./images/ch_street_rec_1.png) + (a) 标注:魅派集成吊顶 + ![](./images/ch_street_rec_2.png) + (b) 标注:母婴用品连锁 +- **下载地址** + + +#### 4、中文文档文字识别 + +- **数据来源**: +- **数据简介**: + - 共约364万张图片,按照99:1划分成训练集和验证集。 + - 数据利用中文语料库(新闻 + 文言文),通过字体、大小、灰度、模糊、透视、拉伸等变化随机生成 + - 包含汉字、英文字母、数字和标点共5990个字符(字符集合: ) + - 每个样本固定10个字符,字符随机截取自语料库中的句子 + - 图片分辨率统一为280x32 + ![](./images/ch_doc1.jpg) + ![](./images/ch_doc3.jpg) +- **下载地址**: (密码:lu7m) + +#### 5、ICDAR2019-ArT + +- **数据来源**: +- **数据简介**:共包含10,166张图像,训练集5603图,测试集4563图。由Total-Text、SCUT-CTW1500、Baidu Curved Scene Text (ICDAR2019-LSVT部分弯曲数据) 三部分组成,包含水平、多方向和弯曲等多种形状的文本。 + ![](./images/ArT.jpg) +- **下载地址**: + +#### 6、电子印章数据集 + +- **数据来源**: +- **数据简介**:共包含10000张图像,训练集8000图,测试集2000图。数据集是用程序合成的,并不涉及隐私安全,主要用于印章弯曲文本的训练与检测。由开发者[jingsongliujing](https://github.com/jingsongliujing)贡献 +- **下载地址**: + +## 参考文献 + +**ICDAR 2019-LSVT Challenge** + +```bibtex +@article{sun2019icdar, + title={ICDAR 2019 Competition on Large-scale Street View Text with Partial Labeling--RRC-LSVT}, + author={Sun, Yipeng and Ni, Zihan and Chng, Chee-Kheng and Liu, Yuliang and Luo, Canjie and Ng, Chun Chet and Han, Junyu and Ding, Errui and Liu, Jingtuo and Karatzas, Dimosthenis and others}, + journal={arXiv preprint arXiv:1909.07741}, + year={2019} +} +``` + +**ICDAR 2019-ArT Challenge** + +```bibtex +@article{chng2019icdar2019, + title={ICDAR2019 Robust Reading Challenge on Arbitrary-Shaped Text (RRC-ArT)}, + author={Chng, Chee-Kheng and Liu, Yuliang and Sun, Yipeng and Ng, Chun Chet and Luo, Canjie and Ni, Zihan and Fang, ChuanMing and Zhang, Shuaitao and Han, Junyu and Ding, Errui and others}, + journal={arXiv preprint arXiv:1909.07145}, + year={2019} +} +``` diff --git a/docs/datasets/handwritten_datasets.en.md b/docs/datasets/handwritten_datasets.en.md new file mode 100644 index 0000000000..8ee1d0fb67 --- /dev/null +++ b/docs/datasets/handwritten_datasets.en.md @@ -0,0 +1,31 @@ +--- +comments: true +--- + + +# Handwritten OCR dataset + +Here we have sorted out the commonly used handwritten OCR dataset datasets, which are being updated continuously. 
We welcome you to contribute datasets ~ + +- [Institute of automation, Chinese Academy of Sciences - handwritten Chinese dataset](#Institute of automation, Chinese Academy of Sciences - handwritten Chinese dataset) +- [NIST handwritten single character dataset - English](#NIST handwritten single character dataset - English) + +## Institute of automation, Chinese Academy of Sciences - handwritten Chinese dataset + +- **Data source**: +- **Data introduction**: + - It includes online and offline handwritten data,`HWDB1.0~1.2` has totally 3895135 handwritten single character samples, which belong to 7356 categories (7185 Chinese characters and 171 English letters, numbers and symbols);`HWDB2.0~2.2` has totally 5091 pages of images, which are divided into 52230 text lines and 1349414 words. All text and text samples are stored as grayscale images. Some sample words are shown below. + + ![](./images/CASIA_0.jpg) + +- **Download address**: +- **使用建议**:Data for single character, white background, can form a large number of text lines for training. White background can be processed into transparent state, which is convenient to add various backgrounds. For the case of semantic needs, it is suggested to extract single character from real corpus to form text lines. + +## NIST handwritten single character dataset - English(NIST Handprinted Forms and Characters Database) + +- **Data source**: [https://www.nist.gov/srd/nist-special-database-19](https://www.nist.gov/srd/nist-special-database-19) +- **Data introduction**: NIST19 dataset is suitable for handwritten document and character recognition model training. It is extracted from the handwritten sample form of 3600 authors and contains 810000 character images in total. Nine of them are shown below. + + ![](./images/nist_demo.png) + +- **Download address**: [https://www.nist.gov/srd/nist-special-database-19](https://www.nist.gov/srd/nist-special-database-19) diff --git a/docs/datasets/handwritten_datasets.md b/docs/datasets/handwritten_datasets.md new file mode 100644 index 0000000000..e4adb10314 --- /dev/null +++ b/docs/datasets/handwritten_datasets.md @@ -0,0 +1,28 @@ +--- +comments: true +--- + + +# 手写OCR数据集 + +这里整理了常用手写数据集,持续更新中,欢迎各位小伙伴贡献数据集~ + +## 中科院自动化研究所-手写中文数据集 + +- **数据来源**: +- **数据简介**: + - 包含在线和离线两类手写数据,`HWDB1.0~1.2`总共有3895135个手写单字样本,分属7356类(7185个汉字和171个英文字母、数字、符号);`HWDB2.0~2.2`总共有5091页图像,分割为52230个文本行和1349414个文字。所有文字和文本样本均存为灰度图像。部分单字样本图片如下所示。 + + ![](./images/CASIA_0.jpg) + +- **下载地址**: +- **使用建议**:数据为单字,白色背景,可以大量合成文字行进行训练。白色背景可以处理成透明状态,方便添加各种背景。对于需要语义的情况,建议从真实语料出发,抽取单字组成文字行 + +## NIST手写单字数据集-英文(NIST Handprinted Forms and Characters Database) + +- **数据来源**: [https://www.nist.gov/srd/nist-special-database-19](https://www.nist.gov/srd/nist-special-database-19) +- **数据简介**: NIST19数据集适用于手写文档和字符识别的模型训练,从3600位作者的手写样本表格中提取得到,总共包含81万张字符图片。其中9张图片示例如下: + + ![](./images/nist_demo.png) + +- **下载地址**: [https://www.nist.gov/srd/nist-special-database-19](https://www.nist.gov/srd/nist-special-database-19) diff --git a/docs/datasets/images/20210816_210413.gif b/docs/datasets/images/20210816_210413.gif new file mode 100644 index 0000000000..288f615620 Binary files /dev/null and b/docs/datasets/images/20210816_210413.gif differ diff --git a/docs/datasets/images/ArT.png b/docs/datasets/images/ArT.png new file mode 100644 index 0000000000..dfb9d0a581 Binary files /dev/null and b/docs/datasets/images/ArT.png differ diff --git a/docs/datasets/images/CASIA_0.jpg b/docs/datasets/images/CASIA_0.jpg new file mode 100644 index 0000000000..d65924b2e0 Binary files 
/dev/null and b/docs/datasets/images/CASIA_0.jpg differ diff --git a/docs/datasets/images/CDLA_demo/val_0633.jpg b/docs/datasets/images/CDLA_demo/val_0633.jpg new file mode 100644 index 0000000000..834848547a Binary files /dev/null and b/docs/datasets/images/CDLA_demo/val_0633.jpg differ diff --git a/docs/datasets/images/CDLA_demo/val_0941.jpg b/docs/datasets/images/CDLA_demo/val_0941.jpg new file mode 100644 index 0000000000..f7d548e120 Binary files /dev/null and b/docs/datasets/images/CDLA_demo/val_0941.jpg differ diff --git a/docs/datasets/images/LSVT_1.jpg b/docs/datasets/images/LSVT_1.jpg new file mode 100644 index 0000000000..ea11a7da59 Binary files /dev/null and b/docs/datasets/images/LSVT_1.jpg differ diff --git a/docs/datasets/images/LSVT_2.jpg b/docs/datasets/images/LSVT_2.jpg new file mode 100644 index 0000000000..67bfbe5259 Binary files /dev/null and b/docs/datasets/images/LSVT_2.jpg differ diff --git a/docs/datasets/images/VoTT.jpg b/docs/datasets/images/VoTT.jpg new file mode 100644 index 0000000000..7c5c27ba84 Binary files /dev/null and b/docs/datasets/images/VoTT.jpg differ diff --git a/docs/datasets/images/captcha_demo.png b/docs/datasets/images/captcha_demo.png new file mode 100644 index 0000000000..047a72648c Binary files /dev/null and b/docs/datasets/images/captcha_demo.png differ diff --git a/docs/datasets/images/ccpd_demo.png b/docs/datasets/images/ccpd_demo.png new file mode 100644 index 0000000000..a750d054f6 Binary files /dev/null and b/docs/datasets/images/ccpd_demo.png differ diff --git a/docs/datasets/images/ch_doc1.jpg b/docs/datasets/images/ch_doc1.jpg new file mode 100644 index 0000000000..53534400ab Binary files /dev/null and b/docs/datasets/images/ch_doc1.jpg differ diff --git a/docs/datasets/images/ch_doc3.jpg b/docs/datasets/images/ch_doc3.jpg new file mode 100644 index 0000000000..c0c2053643 Binary files /dev/null and b/docs/datasets/images/ch_doc3.jpg differ diff --git a/docs/datasets/images/ch_street_rec_1.png b/docs/datasets/images/ch_street_rec_1.png new file mode 100644 index 0000000000..a0e158cbd1 Binary files /dev/null and b/docs/datasets/images/ch_street_rec_1.png differ diff --git a/docs/datasets/images/ch_street_rec_2.png b/docs/datasets/images/ch_street_rec_2.png new file mode 100644 index 0000000000..bfa0fd0188 Binary files /dev/null and b/docs/datasets/images/ch_street_rec_2.png differ diff --git a/docs/datasets/images/cmb_demo.jpg b/docs/datasets/images/cmb_demo.jpg new file mode 100644 index 0000000000..8299149a7c Binary files /dev/null and b/docs/datasets/images/cmb_demo.jpg differ diff --git a/docs/datasets/images/crohme_demo/hme_00.jpg b/docs/datasets/images/crohme_demo/hme_00.jpg new file mode 100644 index 0000000000..66ff27db26 Binary files /dev/null and b/docs/datasets/images/crohme_demo/hme_00.jpg differ diff --git a/docs/datasets/images/crohme_demo/hme_01.jpg b/docs/datasets/images/crohme_demo/hme_01.jpg new file mode 100644 index 0000000000..68b7f09fc2 Binary files /dev/null and b/docs/datasets/images/crohme_demo/hme_01.jpg differ diff --git a/docs/datasets/images/crohme_demo/hme_02.jpg b/docs/datasets/images/crohme_demo/hme_02.jpg new file mode 100644 index 0000000000..ecc760f538 Binary files /dev/null and b/docs/datasets/images/crohme_demo/hme_02.jpg differ diff --git a/docs/datasets/images/doc.jpg b/docs/datasets/images/doc.jpg new file mode 100644 index 0000000000..f57e62abe1 Binary files /dev/null and b/docs/datasets/images/doc.jpg differ diff --git a/docs/datasets/images/funsd_demo/gt_train_00040534.jpg 
b/docs/datasets/images/funsd_demo/gt_train_00040534.jpg new file mode 100644 index 0000000000..9f7cf4d497 Binary files /dev/null and b/docs/datasets/images/funsd_demo/gt_train_00040534.jpg differ diff --git a/docs/datasets/images/funsd_demo/gt_train_00070353.jpg b/docs/datasets/images/funsd_demo/gt_train_00070353.jpg new file mode 100644 index 0000000000..36d3345e5e Binary files /dev/null and b/docs/datasets/images/funsd_demo/gt_train_00070353.jpg differ diff --git a/docs/datasets/images/ic15_location_download.png b/docs/datasets/images/ic15_location_download.png new file mode 100644 index 0000000000..7cb8540e5e Binary files /dev/null and b/docs/datasets/images/ic15_location_download.png differ diff --git a/docs/datasets/images/icdar_rec.png b/docs/datasets/images/icdar_rec.png new file mode 100644 index 0000000000..a840d6af59 Binary files /dev/null and b/docs/datasets/images/icdar_rec.png differ diff --git a/docs/datasets/images/labelimg.jpg b/docs/datasets/images/labelimg.jpg new file mode 100644 index 0000000000..8d58a445ca Binary files /dev/null and b/docs/datasets/images/labelimg.jpg differ diff --git a/docs/datasets/images/labelme.jpg b/docs/datasets/images/labelme.jpg new file mode 100644 index 0000000000..97cdb237a0 Binary files /dev/null and b/docs/datasets/images/labelme.jpg differ diff --git a/docs/datasets/images/nist_demo.png b/docs/datasets/images/nist_demo.png new file mode 100644 index 0000000000..4c2ce11e26 Binary files /dev/null and b/docs/datasets/images/nist_demo.png differ diff --git a/docs/datasets/images/publaynet_demo/gt_PMC3724501_00006.jpg b/docs/datasets/images/publaynet_demo/gt_PMC3724501_00006.jpg new file mode 100644 index 0000000000..3b7ee8921e Binary files /dev/null and b/docs/datasets/images/publaynet_demo/gt_PMC3724501_00006.jpg differ diff --git a/docs/datasets/images/publaynet_demo/gt_PMC5086060_00002.jpg b/docs/datasets/images/publaynet_demo/gt_PMC5086060_00002.jpg new file mode 100644 index 0000000000..cad8f3035b Binary files /dev/null and b/docs/datasets/images/publaynet_demo/gt_PMC5086060_00002.jpg differ diff --git a/docs/datasets/images/rctw.jpg b/docs/datasets/images/rctw.jpg new file mode 100644 index 0000000000..1e1f945b10 Binary files /dev/null and b/docs/datasets/images/rctw.jpg differ diff --git a/docs/datasets/images/roLabelImg.png b/docs/datasets/images/roLabelImg.png new file mode 100644 index 0000000000..9e354b0bf6 Binary files /dev/null and b/docs/datasets/images/roLabelImg.png differ diff --git a/docs/datasets/images/table_PubTabNet_demo/PMC524509_007_00.png b/docs/datasets/images/table_PubTabNet_demo/PMC524509_007_00.png new file mode 100755 index 0000000000..5b9d631cba Binary files /dev/null and b/docs/datasets/images/table_PubTabNet_demo/PMC524509_007_00.png differ diff --git a/docs/datasets/images/table_PubTabNet_demo/PMC535543_007_01.png b/docs/datasets/images/table_PubTabNet_demo/PMC535543_007_01.png new file mode 100755 index 0000000000..e808de72d6 Binary files /dev/null and b/docs/datasets/images/table_PubTabNet_demo/PMC535543_007_01.png differ diff --git a/docs/datasets/images/table_tal_demo/1.jpg b/docs/datasets/images/table_tal_demo/1.jpg new file mode 100644 index 0000000000..e7ddd6d1db Binary files /dev/null and b/docs/datasets/images/table_tal_demo/1.jpg differ diff --git a/docs/datasets/images/table_tal_demo/2.jpg b/docs/datasets/images/table_tal_demo/2.jpg new file mode 100644 index 0000000000..e7ddd6d1db Binary files /dev/null and b/docs/datasets/images/table_tal_demo/2.jpg differ diff --git 
a/docs/datasets/images/tablebank_demo/004.png b/docs/datasets/images/tablebank_demo/004.png new file mode 100644 index 0000000000..c1a2d36dfe Binary files /dev/null and b/docs/datasets/images/tablebank_demo/004.png differ diff --git a/docs/datasets/images/tablebank_demo/005.png b/docs/datasets/images/tablebank_demo/005.png new file mode 100644 index 0000000000..0d4d6ab46a Binary files /dev/null and b/docs/datasets/images/tablebank_demo/005.png differ diff --git a/docs/datasets/images/wildreceipt_demo/1bbe854b8817dedb8585e0732089fd1f752d2cec.jpeg b/docs/datasets/images/wildreceipt_demo/1bbe854b8817dedb8585e0732089fd1f752d2cec.jpeg new file mode 100644 index 0000000000..dfed3a0c0e Binary files /dev/null and b/docs/datasets/images/wildreceipt_demo/1bbe854b8817dedb8585e0732089fd1f752d2cec.jpeg differ diff --git a/docs/datasets/images/wildreceipt_demo/2769.jpeg b/docs/datasets/images/wildreceipt_demo/2769.jpeg new file mode 100644 index 0000000000..d5a28763c9 Binary files /dev/null and b/docs/datasets/images/wildreceipt_demo/2769.jpeg differ diff --git a/docs/datasets/images/xfund_demo/gt_zh_train_0.jpg b/docs/datasets/images/xfund_demo/gt_zh_train_0.jpg new file mode 100644 index 0000000000..95e0cf8201 Binary files /dev/null and b/docs/datasets/images/xfund_demo/gt_zh_train_0.jpg differ diff --git a/docs/datasets/images/xfund_demo/gt_zh_train_1.jpg b/docs/datasets/images/xfund_demo/gt_zh_train_1.jpg new file mode 100644 index 0000000000..6a1e53a3ba Binary files /dev/null and b/docs/datasets/images/xfund_demo/gt_zh_train_1.jpg differ diff --git a/docs/datasets/kie_datasets.en.md b/docs/datasets/kie_datasets.en.md new file mode 100644 index 0000000000..7ac9705033 --- /dev/null +++ b/docs/datasets/kie_datasets.en.md @@ -0,0 +1,47 @@ +--- +comments: true +--- + + +## Key Information Extraction dataset + +Here are the common datasets key information extraction, which are being updated continuously. Welcome to contribute datasets. + +### 1. FUNSD dataset + +- **Data source**: +- **Data Introduction**: The FUNSD dataset is a dataset for form comprehension. It contains 199 real, fully annotated scanned images, including market reports, advertisements, and academic reports, etc., and is divided into 149 training set and 50 test set. The FUNSD dataset is suitable for many types of DocVQA tasks, such as field-level entity classification, field-level entity connection, etc. Part of the image and the annotation box visualization are shown below: + + ![](./images/funsd_demo/gt_train_00040534.jpg) + + ![](./images/funsd_demo/gt_train_00070353.jpg) + + In the figure, the orange area represents `header`, the light blue area represents `question`, the green area represents `answer`, and the pink area represents `other`. + +- **Download address**: + +### 2. XFUND dataset + +- **Data source**: +- **Data introduction**: XFUND is a multilingual form comprehension dataset, which contains form data in 7 different languages, and all are manually annotated in the form of key-value pairs. The data for each language contains 199 form data, which are divided into 149 training sets and 50 test sets. Part of the image and the annotation box visualization are shown below. + + ![](./images/xfund_demo/gt_zh_train_0.jpg) + + ![](./images/xfund_demo/gt_zh_train_1.jpg) + +- **Download address**: + +### 3. wildreceipt dataset + +- **Data source**: +- **Data introduction**: wildreceipt is an English receipt dataset, which contains 26 different categories. 
There are 1267 training images and 472 evaluation images, in which 50,000 textlines and boxes are annotated. Part of the image and the annotation box visualization are shown below. + + ![](./images/wildreceipt_demo/2769.jpeg) + + ![](./images/wildreceipt_demo/1bbe854b8817dedb8585e0732089fd1f752d2cec.jpeg) + +**Note:** Boxes with category `Ignore` or `Others` are not visualized here. + +- **Download address**: + - Offical dataset: [link](https://download.openmmlab.com/mmocr/data/wildreceipt.tar) + - Dataset converted for PaddleOCR training process: [link](https://paddleocr.bj.bcebos.com/ppstructure/dataset/wildreceipt.tar) diff --git a/docs/datasets/kie_datasets.md b/docs/datasets/kie_datasets.md new file mode 100644 index 0000000000..328ae28bf9 --- /dev/null +++ b/docs/datasets/kie_datasets.md @@ -0,0 +1,49 @@ +--- +comments: true +--- + + +# 关键信息抽取数据集 + +这里整理了常见的关键信息抽取数据集,持续更新中,欢迎各位小伙伴贡献数据集~ + +## 1. FUNSD数据集 + +- **数据来源**: +- **数据简介**:FUNSD数据集是一个用于表单理解的数据集,它包含199张真实的、完全标注的扫描版图片,类型包括市场报告、广告以及学术报告等,并分为149张训练集以及50张测试集。FUNSD数据集适用于多种类型的DocVQA任务,如字段级实体分类、字段级实体连接等。部分图像以及标注框可视化如下所示: + +
+ ![](./images/funsd_demo/gt_train_00040534.jpg) + + ![](./images/funsd_demo/gt_train_00070353.jpg)
+ 图中,橙色区域代表`header`,淡蓝色区域代表`question`, 绿色区域代表`answer`,粉红色区域代表`other`。 + +- **下载地址**: + +## 2. XFUND数据集 + +- **数据来源**: +- **数据简介**:XFUND是一个多语种表单理解数据集,它包含7种不同语种的表单数据,并且全部用人工进行了键-值对形式的标注。其中每个语种的数据都包含了199张表单数据,并分为149张训练集以及50张测试集。部分图像以及标注框可视化如下所示: + +
+ ![](./images/xfund_demo/gt_zh_train_0.jpg) + + ![](./images/xfund_demo/gt_zh_train_1.jpg)
+ +- **下载地址**: + +## 3. wildreceipt数据集 + +- **数据来源**: +- **数据简介**:wildreceipt数据集是英文发票数据集,包含26个类别(此处类别体系包含`Ignore`类别),共标注了50000个文本框。其中训练集包含1267张图片,测试集包含472张图片。部分图像以及标注框可视化如下所示: + +
+ ![](./images/wildreceipt_demo/2769.jpeg) + + ![](./images/wildreceipt_demo/1bbe854b8817dedb8585e0732089fd1f752d2cec.jpeg)
+ +**注:** 这里对于类别为`Ignore`或者`Others`的文本,没有进行可视化。 + +- **下载地址**: + - 原始数据下载地址:[链接](https://download.openmmlab.com/mmocr/data/wildreceipt.tar) + - 数据格式转换后适配于PaddleOCR训练的数据下载地址:[链接](https://paddleocr.bj.bcebos.com/ppstructure/dataset/wildreceipt.tar) diff --git a/docs/datasets/layout_datasets.en.md b/docs/datasets/layout_datasets.en.md new file mode 100644 index 0000000000..d3de87a537 --- /dev/null +++ b/docs/datasets/layout_datasets.en.md @@ -0,0 +1,46 @@ +--- +comments: true +--- + + +## Layout Analysis Dataset + +Here are the common datasets of layout anlysis, which are being updated continuously. Welcome to contribute datasets. + +Most of the layout analysis datasets are object detection datasets. In addition to open source datasets, you can also label or synthesize datasets using tools such as [labelme](https://github.com/wkentaro/labelme) and so on. + +### 1. PubLayNet dataset + +- **Data source**: +- **Data introduction**: The PubLayNet dataset contains 350000 training images and 11000 validation images. There are 5 categories in total, namely: `text, title, list, table, figure`. Some images and their annotations as shown below. + + ![](./images/publaynet_demo/gt_PMC3724501_00006.jpg) + + ![](./images/publaynet_demo/gt_PMC5086060_00002.jpg) + +- **Download address**: +- **Note**: When using this dataset, you need to follow [CDLA-Permissive](https://cdla.io/permissive-1-0/) license. + +### 2、CDLA dataset + +- **Data source**: +- **Data introduction**: CDLA dataset contains 5000 training images and 1000 validation images with 10 categories, which are `Text, Title, Figure, Figure caption, Table, Table caption, Header, Footer, Reference, Equation`. Some images and their annotations as shown below. + + ![](./images/CDLA_demo/val_0633.jpg) + + ![](./images/CDLA_demo/val_0941.jpg) + +- **Download address**: +- **Note**: When you train detection model on CDLA dataset using [PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection/tree/develop), you need to remove the label `__ignore__` and `_background_`. + +### 3、TableBank dataet + +- **Data source**: +- **Data introduction**: TableBank dataset contains 2 types of document: Latex (187199 training images, 7265 validation images and 5719 testing images) and Word (73383 training images 2735 validation images and 2281 testing images). Some images and their annotations as shown below. + + ![](./images/tablebank_demo/004.png) + + ![](./images/tablebank_demo/005.png) + +- **Data source**: +- **Note**: When using this dataset, you need to follow [Apache-2.0](https://github.com/doc-analysis/TableBank/blob/master/LICENSE) license. diff --git a/docs/datasets/layout_datasets.md b/docs/datasets/layout_datasets.md new file mode 100644 index 0000000000..7396048eda --- /dev/null +++ b/docs/datasets/layout_datasets.md @@ -0,0 +1,49 @@ +--- +comments: true +--- + + +## 版面分析数据集 + +这里整理了常用版面分析数据集,持续更新中,欢迎各位小伙伴贡献数据集~ + +版面分析数据集多为目标检测数据集,除了开源数据,用户还可使用合成工具自行合成,如[labelme](https://github.com/wkentaro/labelme)等。 + +### 1、publaynet数据集 + +- **数据来源**: +- **数据简介**:publaynet数据集的训练集合中包含35万张图像,验证集合中包含1.1万张图像。总共包含5个类别,分别是: `text, title, list, table, figure`。部分图像以及标注框可视化如下所示。 + +
+ ![](./images/publaynet_demo/gt_PMC3724501_00006.jpg) + + ![](./images/publaynet_demo/gt_PMC5086060_00002.jpg)
+ +- **下载地址**: +- **说明**:使用该数据集时,需要遵守[CDLA-Permissive](https://cdla.io/permissive-1-0/)协议。 + +### 2、CDLA数据集 + +- **数据来源**: +- **数据简介**:CDLA据集的训练集合中包含5000张图像,验证集合中包含1000张图像。总共包含10个类别,分别是: `Text, Title, Figure, Figure caption, Table, Table caption, Header, Footer, Reference, Equation`。部分图像以及标注框可视化如下所示。 + +
+ ![](./images/CDLA_demo/val_0633.jpg) + + ![](./images/CDLA_demo/val_0941.jpg)
+ +- **下载地址**: +- **说明**:基于[PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection/tree/develop)套件,在该数据集上训练目标检测模型时,在转换label时,需要将`label.txt`中的`__ignore__`与`_background_`去除。 + +### 3、TableBank数据集 + +- **数据来源**: +- **数据简介**:TableBank数据集包含Latex(训练集187199张,验证集7265张,测试集5719张)与Word(训练集73383张,验证集2735张,测试集2281张)两种类别的文档。仅包含`Table` 1个类别。部分图像以及标注框可视化如下所示。 + +
+ ![](./images/tablebank_demo/004.png) + + ![](./images/tablebank_demo/005.png)
+ +- **下载地址**: +- **说明**:使用该数据集时,需要遵守[Apache-2.0](https://github.com/doc-analysis/TableBank/blob/master/LICENSE)协议。 diff --git a/docs/datasets/ocr_datasets.en.md b/docs/datasets/ocr_datasets.en.md new file mode 100644 index 0000000000..11825d4fda --- /dev/null +++ b/docs/datasets/ocr_datasets.en.md @@ -0,0 +1,154 @@ +--- +comments: true +--- + + +# OCR datasets + +Here is a list of public datasets commonly used in OCR, which are being continuously updated. Welcome to contribute datasets~ + +## 1. Text detection + +### 1.1 PaddleOCR text detection format annotation + +The annotation file formats supported by the PaddleOCR text detection algorithm are as follows, separated by "\t": + +```text linenums="1" +" Image file name Image annotation information encoded by json.dumps" +ch4_test_images/img_61.jpg [{"transcription": "MASA", "points": [[310, 104], [416, 141], [418, 216], [312, 179]]}, {...}] +``` + +The image annotation after **json.dumps()** encoding is a list containing multiple dictionaries. + +The `points` in the dictionary represent the coordinates (x, y) of the four points of the text box, arranged clockwise from the point at the upper left corner. + +`transcription` represents the text of the current text box. **When its content is "###" it means that the text box is invalid and will be skipped during training.** + +If you want to train PaddleOCR on other datasets, please build the annotation file according to the above format. + +### 1.2 Public dataset + +| dataset | Image download link | PaddleOCR format annotation download link | +|---|---|---| +| ICDAR 2015 | | [train](https://paddleocr.bj.bcebos.com/dataset/train_icdar2015_label.txt) / [test](https://paddleocr.bj.bcebos.com/dataset/test_icdar2015_label.txt) | +| ctw1500 | | Included in the downloaded image zip | +| total text | | Included in the downloaded image zip | + +#### 1.2.1 ICDAR 2015 + +The icdar2015 dataset contains train set which has 1000 images obtained with wearable cameras and test set which has 500 images obtained with wearable cameras. The icdar2015 dataset can be downloaded from the link in the table above. Registration is required for downloading. + +After registering and logging in, download the part marked in the red box in the figure below. And, the content downloaded by `Training Set Images` should be saved as the folder `icdar_c4_train_imgs`, and the content downloaded by `Test Set Images` is saved as the folder `ch4_test_images` + +![](./images/ic15_location_download.png) + +Decompress the downloaded dataset to the working directory, assuming it is decompressed under PaddleOCR/train_data/. Then download the PaddleOCR format annotation file from the table above. + +PaddleOCR also provides a data format conversion script, which can convert the official website label to the PaddleOCR format. 
The data conversion tool is in `ppocr/utils/gen_label.py`, here is the training set as an example: + +```bash linenums="1" +# Convert the label file downloaded from the official website to train_icdar2015_label.txt +python gen_label.py --mode="det" --root_path="/path/to/icdar_c4_train_imgs/" \ + --input_path="/path/to/ch4_training_localization_transcription_gt" \ + --output_label="/path/to/train_icdar2015_label.txt" +``` + +After decompressing the data set and downloading the annotation file, PaddleOCR/train_data/ has two folders and two files, which are: + +```text linenums="1" +/PaddleOCR/train_data/icdar2015/text_localization/ + └─ icdar_c4_train_imgs/ Training data of icdar dataset + └─ ch4_test_images/ Testing data of icdar dataset + └─ train_icdar2015_label.txt Training annotation of icdar dataset + └─ test_icdar2015_label.txt Test annotation of icdar dataset +``` + +## 2. Text recognition + +### 2.1 PaddleOCR text recognition format annotation + +The text recognition algorithm in PaddleOCR supports two data formats: + +- `lmdb` is used to train data sets stored in lmdb format, use [lmdb_dataset.py](../../../ppocr/data/lmdb_dataset.py) to load; +- `common dataset` is used to train data sets stored in text files, use [simple_dataset.py](../../../ppocr/data/simple_dataset.py) to load. + +If you want to use your own data for training, please refer to the following to organize your data. + +#### Training set + +It is recommended to put the training images in the same folder, and use a txt file (rec_gt_train.txt) to store the image path and label. The contents of the txt file are as follows: + +- Note: by default, the image path and image label are split with \t, if you use other methods to split, it will cause training error + +```text linenums="1" +" Image file name Image annotation " + +train_data/rec/train/word_001.jpg 简单可依赖 +train_data/rec/train/word_002.jpg 用科技让复杂的世界更简单 +... +``` + +The final training set should have the following file structure: + +```text linenums="1" +|-train_data + |-rec + |- rec_gt_train.txt + |- train + |- word_001.png + |- word_002.jpg + |- word_003.jpg + | ... +``` + +#### Test set + +Similar to the training set, the test set also needs to be provided a folder containing all images (test) and a rec_gt_test.txt. The structure of the test set is as follows: + +```text linenums="1" +|-train_data + |-rec + |-ic15_data + |- rec_gt_test.txt + |- test + |- word_001.jpg + |- word_002.jpg + |- word_003.jpg + | ... +``` + +### 2.2 Public dataset + +| dataset | Image download link | PaddleOCR format annotation download link | +|---|---|---| +| en benchmark(MJ, SJ, IIIT, SVT, IC03, IC13, IC15, SVTP, and CUTE.) | [DTRB](https://github.com/clovaai/deep-text-recognition-benchmark#download-lmdb-dataset-for-traininig-and-evaluation-from-here) | LMDB format, which can be loaded directly with [lmdb_dataset.py](../../../ppocr/data/lmdb_dataset.py) | +|ICDAR 2015| | [train](https://paddleocr.bj.bcebos.com/dataset/rec_gt_train.txt)/ [test](https://paddleocr.bj.bcebos.com/dataset/rec_gt_test.txt) | +| Multilingual datasets |[Baidu network disk](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA) Extraction code: frgi
[google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view) | Included in the downloaded image zip | + +#### 2.1 ICDAR 2015 + +The ICDAR 2015 dataset can be downloaded from the link in the table above for quick validation. The lmdb format dataset required by en benchmark can also be downloaded from the table above. + +Then download the PaddleOCR format annotation file from the table above. + +PaddleOCR also provides a data format conversion script, which can convert the ICDAR official website label to the data format supported by PaddleOCR. The data conversion tool is in `ppocr/utils/gen_label.py`, here is the training set as an example: + +```bash linenums="1" +# Convert the label file downloaded from the official website to rec_gt_label.txt +python gen_label.py --mode="rec" --input_path="{path/of/origin/label}" --output_label="rec_gt_label.txt" +``` + +The data format is as follows, (a) is the original picture, (b) is the Ground Truth text file corresponding to each picture: + +![](./images/icdar_rec.png) + +## 3. Data storage path + +The default storage path for PaddleOCR training data is `PaddleOCR/train_data`, if you already have a dataset on your disk, just create a soft link to the dataset directory: + +```bash linenums="1" +# linux and mac os +ln -sf /train_data/dataset +# windows +mklink /d /train_data/dataset +``` diff --git a/docs/datasets/ocr_datasets.md b/docs/datasets/ocr_datasets.md new file mode 100644 index 0000000000..1925d847b2 --- /dev/null +++ b/docs/datasets/ocr_datasets.md @@ -0,0 +1,160 @@ +--- +comments: true +--- + + +# OCR数据集 + +这里整理了OCR中常用的公开数据集,持续更新中,欢迎各位小伙伴贡献数据集~ + +## 1. 文本检测 + +### 1.1 PaddleOCR 文字检测数据格式 + +PaddleOCR 中的文本检测算法支持的标注文件格式如下,中间用"\t"分隔: + +```text linenums="1" +" 图像文件名 json.dumps编码的图像标注信息" +ch4_test_images/img_61.jpg [{"transcription": "MASA", "points": [[310, 104], [416, 141], [418, 216], [312, 179]]}, {...}] +``` + +json.dumps编码前的图像标注信息是包含多个字典的list,字典中的 `points` 表示文本框的四个点的坐标(x, y),从左上角的点开始顺时针排列。 +`transcription` 表示当前文本框的文字,**当其内容为“###”时,表示该文本框无效,在训练时会跳过。** + +如果您想在我们未提供的数据集上训练,可以按照上述形式构建标注文件。 + +### 1.2 公开数据集 + +| 数据集名称 |图片下载地址| PaddleOCR 标注下载地址 | +|---|---|---| +| ICDAR 2015 || [train](https://paddleocr.bj.bcebos.com/dataset/train_icdar2015_label.txt) / [test](https://paddleocr.bj.bcebos.com/dataset/test_icdar2015_label.txt) | +| ctw1500 || 图片下载地址中已包含 | +| total text || 图片下载地址中已包含 | +| td tr || 图片下载地址中已包含 | + +#### 1.2.1 ICDAR 2015 + +ICDAR 2015 数据集包含1000张训练图像和500张测试图像。ICDAR 2015 数据集可以从上表中链接下载,首次下载需注册。 +注册完成登陆后,下载下图中红色框标出的部分,其中, `Training Set Images`下载的内容保存在`icdar_c4_train_imgs`文件夹下,`Test Set Images` 下载的内容保存早`ch4_test_images`文件夹下 + + + +将下载到的数据集解压到工作目录下,假设解压在 PaddleOCR/train_data/下。然后从上表中下载转换好的标注文件。 + +PaddleOCR 也提供了数据格式转换脚本,可以将官网 label 转换支持的数据格式。 数据转换工具在 `ppocr/utils/gen_label.py`, 这里以训练集为例: + +```bash linenums="1" +# 将官网下载的标签文件转换为 train_icdar2015_label.txt +python gen_label.py --mode="det" --root_path="/path/to/icdar_c4_train_imgs/" \ + --input_path="/path/to/ch4_training_localization_transcription_gt" \ + --output_label="/path/to/train_icdar2015_label.txt" +``` + +解压数据集和下载标注文件后,PaddleOCR/train_data/ 有两个文件夹和两个文件,按照如下方式组织icdar2015数据集: + +```text linenums="1" +/PaddleOCR/train_data/icdar2015/text_localization/ + └─ icdar_c4_train_imgs/ icdar 2015 数据集的训练数据 + └─ ch4_test_images/ icdar 2015 数据集的测试数据 + └─ train_icdar2015_label.txt icdar 2015 数据集的训练标注 + └─ test_icdar2015_label.txt icdar 2015 数据集的测试标注 +``` + +## 2. 
文本识别 + +### 2.1 PaddleOCR 文字识别数据格式 + +PaddleOCR 中的文字识别算法支持两种数据格式: + +- `lmdb` 用于训练以lmdb格式存储的数据集,使用 [lmdb_dataset.py](../../../ppocr/data/lmdb_dataset.py) 进行读取; +- `通用数据` 用于训练以文本文件存储的数据集,使用 [simple_dataset.py](../../../ppocr/data/simple_dataset.py)进行读取。 + +下面以通用数据集为例, 介绍如何准备数据集: + +#### 训练集 + +建议将训练图片放入同一个文件夹,并用一个txt文件(rec_gt_train.txt)记录图片路径和标签,txt文件里的内容如下: + +**注意:** txt文件中默认请将图片路径和图片标签用 \t 分割,如用其他方式分割将造成训练报错。 + +```text linenums="1" +" 图像文件名 图像标注信息 " + +train_data/rec/train/word_001.jpg 简单可依赖 +train_data/rec/train/word_002.jpg 用科技让复杂的世界更简单 +... +``` + +最终训练集应有如下文件结构: + +```text linenums="1" +|-train_data + |-rec + |- rec_gt_train.txt + |- train + |- word_001.png + |- word_002.jpg + |- word_003.jpg + | ... +``` + +除上述单张图像为一行格式之外,PaddleOCR也支持对离线增广后的数据进行训练,为了防止相同样本在同一个batch中被多次采样,我们可以将相同标签对应的图片路径写在一行中,以列表的形式给出,在训练中,PaddleOCR会随机选择列表中的一张图片进行训练。对应地,标注文件的格式如下: + +```text linenums="1" +["11.jpg", "12.jpg"] 简单可依赖 +["21.jpg", "22.jpg", "23.jpg"] 用科技让复杂的世界更简单 +3.jpg ocr +``` + +上述示例标注文件中,"11.jpg"和"12.jpg"的标签相同,都是`简单可依赖`,在训练的时候,对于该行标注,会随机选择其中的一张图片进行训练。 + +#### 验证集 + +同训练集类似,验证集也需要提供一个包含所有图片的文件夹(test)和一个rec_gt_test.txt,验证集的结构如下所示: + +```text linenums="1" +|-train_data + |-rec + |- rec_gt_test.txt + |- test + |- word_001.jpg + |- word_002.jpg + |- word_003.jpg + | ... +``` + +### 2.2 公开数据集 + +| 数据集名称 | 图片下载地址 | PaddleOCR 标注下载地址 | +|---|---|---------------------------------------------------------------------| +| en benchmark(MJ, SJ, IIIT, SVT, IC03, IC13, IC15, SVTP, and CUTE.) | [DTRB](https://github.com/clovaai/deep-text-recognition-benchmark#download-lmdb-dataset-for-traininig-and-evaluation-from-here) | LMDB格式,可直接用[lmdb_dataset.py](../../../ppocr/data/lmdb_dataset.py)加载 | +|ICDAR 2015| | [train](https://paddleocr.bj.bcebos.com/dataset/rec_gt_train.txt)/ [test](https://paddleocr.bj.bcebos.com/dataset/rec_gt_test.txt) | +| 多语言数据集 |[百度网盘](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA) 提取码:frgi
[google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view) | 图片下载地址中已包含 | + +#### 2.1 ICDAR 2015 + +ICDAR 2015 数据集可以在上表中链接下载,用于快速验证。也可以从上表中下载 en benchmark 所需的lmdb格式数据集。 + +下载完图片后从上表中下载转换好的标注文件。 + +PaddleOCR 也提供了数据格式转换脚本,可以将ICDAR官网 label 转换为PaddleOCR支持的数据格式。 数据转换工具在 `ppocr/utils/gen_label.py`, 这里以训练集为例: + +```bash linenums="1" +# 将官网下载的标签文件转换为 rec_gt_label.txt +python gen_label.py --mode="rec" --input_path="{path/of/origin/label}" --output_label="rec_gt_label.txt" +``` + +数据样式格式如下,(a)为原始图片,(b)为每张图片对应的 Ground Truth 文本文件: + +![](./images/icdar_rec.png) + +## 3. 数据存放路径 + +PaddleOCR训练数据的默认存储路径是 `PaddleOCR/train_data`,如果您的磁盘上已有数据集,只需创建软链接至数据集目录: + +```bash linenums="1" +# linux and mac os +ln -sf /train_data/dataset +# windows +mklink /d /train_data/dataset +``` diff --git a/docs/datasets/table_datasets.en.md b/docs/datasets/table_datasets.en.md new file mode 100644 index 0000000000..b4f1322cc7 --- /dev/null +++ b/docs/datasets/table_datasets.en.md @@ -0,0 +1,40 @@ +--- +comments: true +--- + + +# Table Recognition Datasets + +Here are the commonly used table recognition datasets, which are being updated continuously. Welcome to contribute datasets~ + +## Dataset Summary + +| dataset | Image download link | PPOCR format annotation download link | +|---|---|---| +| PubTabNet || jsonl format, which can be loaded directly with [pubtab_dataset.py](../../../ppocr/data/pubtab_dataset.py) | +| TAL Table Recognition Competition Dataset || jsonl format, which can be loaded directly with [pubtab_dataset.py](../../../ppocr/data/pubtab_dataset.py) | +| WTW Chinese scene table dataset || Conversion is required to load with [pubtab_dataset.py](../../../ppocr/data/pubtab_dataset.py)| + +## 1. PubTabNet + +- **Data Introduction**:The training set of the PubTabNet dataset contains 500,000 images and the validation set contains 9000 images. Part of the image visualization is shown below. + + ![](./images/table_PubTabNet_demo/PMC524509_007_00.png) + + ![](./images/table_PubTabNet_demo/PMC535543_007_01.png) + +- **illustrate**:When using this dataset, the [CDLA-Permissive](https://cdla.io/permissive-1-0/) protocol is required. + +## 2. TAL Table Recognition Competition Dataset + +- **Data Introduction**:The training set of the TAL table recognition competition dataset contains 16,000 images. The validation set does not give trainable annotations. + + ![](./images/table_tal_demo/1.jpg) + + ![](./images/table_tal_demo/2.jpg) + +## 3. WTW Chinese scene table dataset + +- **Data Introduction**:The WTW Chinese scene table dataset consists of two parts: table detection and table data. The dataset contains images of two scenes, scanned and photographed. + + ![img](./images/20210816_210413.gif) diff --git a/docs/datasets/table_datasets.md b/docs/datasets/table_datasets.md new file mode 100644 index 0000000000..f55a4847b9 --- /dev/null +++ b/docs/datasets/table_datasets.md @@ -0,0 +1,43 @@ +--- +comments: true +typora-copy-images-to: images +--- + + +# 表格识别数据集 + +这里整理了常用表格识别数据集,持续更新中,欢迎各位小伙伴贡献数据集~ + +## 数据集汇总 + +| 数据集名称 |图片下载地址| PPOCR标注下载地址 | +|---|---|---| +| PubTabNet || jsonl格式,可直接用[pubtab_dataset.py](../../../ppocr/data/pubtab_dataset.py)加载 | +| 好未来表格识别竞赛数据集 || jsonl格式,可直接用[pubtab_dataset.py](../../../ppocr/data/pubtab_dataset.py)加载 | +| WTW中文场景表格数据集 || 需要进行转换后才能用[pubtab_dataset.py](../../../ppocr/data/pubtab_dataset.py)加载 | + +## 1. PubTabNet数据集 + +- **数据简介**:PubTabNet数据集的训练集合中包含50万张图像,验证集合中包含0.9万张图像。部分图像可视化如下所示。 + +
+ ![](./images/table_PubTabNet_demo/PMC524509_007_00.png) + + ![](./images/table_PubTabNet_demo/PMC535543_007_01.png)
+ +- **说明**:使用该数据集时,需要遵守[CDLA-Permissive](https://cdla.io/permissive-1-0/)协议。 + +## 2. 好未来表格识别竞赛数据集 + +- **数据简介**:好未来表格识别竞赛数据集的训练集合中包含1.6万张图像。验证集未给出可训练的标注。 + +
+ ![](./images/table_tal_demo/1.jpg) + + ![](./images/table_tal_demo/2.jpg)
+ +## 3. WTW中文场景表格数据集 + +- **数据简介**:WTW中文场景表格数据集包含表格检测和表格数据两部分数据,数据集中同时包含扫描和拍照两张场景的图像。 + + ![img](./images/20210816_210413.gif) diff --git a/docs/datasets/vertical_and_multilingual_datasets.en.md b/docs/datasets/vertical_and_multilingual_datasets.en.md new file mode 100644 index 0000000000..321c66c51a --- /dev/null +++ b/docs/datasets/vertical_and_multilingual_datasets.en.md @@ -0,0 +1,72 @@ +--- +comments: true +--- + + +# Vertical multi-language OCR dataset + +Here we have sorted out the commonly used vertical multi-language OCR dataset datasets, which are being updated continuously. We welcome you to contribute datasets ~ + +- [Chinese urban license plate dataset](#Chinese urban license plate dataset) +- [Bank credit card dataset](#Bank credit card dataset) +- [Captcha dataset-Captcha](#Captcha dataset-Captcha) +- [multi-language dataset](#multi-language dataset) + +## Chinese urban license plate dataset + +- **Data source**:[CCPD](https://github.com/detectRecog/CCPD) + +- **Data introduction**: It contains more than 250000 vehicle license plate images and vehicle license plate detection and recognition information labeling. It contains the following license plate image information in different scenes. + + - CCPD-Base: General license plate picture + - CCPD-DB: The brightness of license plate area is bright, dark or uneven + - CCPD-FN: The license plate is farther or closer to the camera location + - CCPD-Rotate: License plate includes rotation (horizontal 20\~50 degrees, vertical-10\~10 degrees) + - CCPD-Tilt: License plate includes rotation (horizontal 15\~45 degrees, vertical 15\~45 degrees) + - CCPD-Blur: The license plate contains blurring due to camera lens jitter + - CCPD-Weather: The license plate is photographed on rainy, snowy or foggy days + - CCPD-Challenge: So far, some of the most challenging images in license plate detection and recognition tasks + - CCPD-NP: Pictures of new cars without license plates. + + ![](./images/ccpd_demo.png) + +- **Download address** + - Baidu cloud download address (extracted code is hm0U): [https://pan.baidu.com/s/1i5AOjAbtkwb17Zy-NQGqkw](https://pan.baidu.com/s/1i5AOjAbtkwb17Zy-NQGqkw) + - Google drive download address:[https://drive.google.com/file/d/1rdEsCUcIUaYOVRkx5IMTRNA7PcGMmSgc/view](https://drive.google.com/file/d/1rdEsCUcIUaYOVRkx5IMTRNA7PcGMmSgc/view) + +## Bank credit card dataset + +- **Data source**: [source](https://www.kesci.com/home/dataset/5954cf1372ead054a5e25870) + +- **Data introduction**: There are three types of training data + - 1.Sample card data of China Merchants Bank: including card image data and annotation data, a total of 618 pictures + - 2.Single character data: including pictures and annotation data, 37 pictures in total. + - 3.There are only other bank cards, no more detailed information, a total of 50 pictures. + + - The demo image is shown as follows. The annotation information is stored in excel, and the demo image below is marked as + - Top 8 card number: 62257583 + - Card type: card of our bank + - End of validity: 07/41 + - Chinese phonetic alphabet of card users: MICHAEL + + ![](./images/cmb_demo.jpg) + +- **Download address**: [cmb2017-2.zip](https://cdn.kesci.com/cmb2017-2.zip) + +## Captcha dataset-Captcha + +- **Data source**: [captcha](https://github.com/lepture/captcha) +- **Data introduction**: This is a toolkit for data synthesis. You can output captcha images according to the input text. Use the toolkit to generate several demo images as follows. 
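+
+  A minimal sketch of calling the toolkit from Python (the sample text "1234" and the output file name are only illustrative):
+
+  ```python linenums="1"
+  from captcha.image import ImageCaptcha
+
+  # create a generator with the default 160x60 image size
+  generator = ImageCaptcha()
+
+  # render the text "1234" as a captcha image and save it to disk
+  generator.write("1234", "captcha_demo.png")
+  ```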
+ + ![](./images/captcha_demo.png) + +- **Download address**: The dataset is generated and has no download address. + +## multi-language dataset(Multi-lingual scene text detection and recognition) + +- **Data source**: [source](https://rrc.cvc.uab.es/?ch=15&com=downloads) +- **Data introduction**: Multi language detection dataset MLT contains both language recognition and detection tasks. + - In the detection task, the training set contains 10000 images in 10 languages, and each language contains 1000 training images. The test set contains 10000 images. + - In the recognition task, the training set contains 111998 samples. +- **Download address**: The training set is large and can be downloaded in two parts. It can only be downloaded after registering on the website: +[source](https://rrc.cvc.uab.es/?ch=15&com=downloads) diff --git a/docs/datasets/vertical_and_multilingual_datasets.md b/docs/datasets/vertical_and_multilingual_datasets.md new file mode 100644 index 0000000000..f5386b7bb8 --- /dev/null +++ b/docs/datasets/vertical_and_multilingual_datasets.md @@ -0,0 +1,65 @@ +--- +comments: true +--- + + +# 垂类多语言OCR数据集 + +这里整理了常用垂类和多语言OCR数据集,持续更新中,欢迎各位小伙伴贡献数据集~ + +## 中国城市车牌数据集 + +- **数据来源**:[CCPD](https://github.com/detectRecog/CCPD) +- **数据简介**: 包含超过25万张中国城市车牌图片及车牌检测、识别信息的标注。包含以下几种不同场景中的车牌图片信息。 + - CCPD-Base: 通用车牌图片 + - CCPD-DB: 车牌区域亮度较亮、较暗或者不均匀 + - CCPD-FN: 车牌离摄像头拍摄位置相对更远或者更近 + - CCPD-Rotate: 车牌包含旋转(水平20\~50度,竖直-10\~10度) + - CCPD-Tilt: 车牌包含旋转(水平15\~45度,竖直15\~45度) + - CCPD-Blur: 车牌包含由于摄像机镜头抖动导致的模糊情况 + - CCPD-Weather: 车牌在雨天、雪天或者雾天拍摄得到 + - CCPD-Challenge: 至今在车牌检测识别任务中最有挑战性的一些图片 + - CCPD-NP: 没有安装车牌的新车图片。 + + ![](./images/ccpd_demo.png) + +- **下载地址** + - 百度云下载地址(提取码是hm0U): [link](https://pan.baidu.com/s/1i5AOjAbtkwb17Zy-NQGqkw) + - Google drive下载地址:[link](https://drive.google.com/file/d/1rdEsCUcIUaYOVRkx5IMTRNA7PcGMmSgc/view) + +## 银行信用卡数据集 + +- **数据来源**: [source](https://www.kesci.com/home/dataset/5954cf1372ead054a5e25870) + +- **数据简介**: 训练数据共提供了三类数据 + - 1.招行样卡数据: 包括卡面图片数据及标注数据,总共618张图片 + - 2.单字符数据: 包括图片及标注数据,总共37张图片。 + - 3.仅包含其他银行卡面,不具有更细致的信息,总共50张图片。 + + - demo图片展示如下,标注信息存储在excel表格中,下面的demo图片标注为 + - 前8位卡号:62257583 + - 卡片种类:本行卡 + - 有效期结束:07/41 + - 卡用户拼音:MICHAEL + + ![](./images/cmb_demo.jpg) + +- **下载地址**: [cmb2017-2.zip](https://cdn.kesci.com/cmb2017-2.zip) + +## 验证码数据集-Captcha + +- **数据来源**: [captcha](https://github.com/lepture/captcha) +- **数据简介**: 这是一个数据合成的工具包,可以根据输入的文本,输出验证码图片,使用该工具包生成几张demo图片如下: + + ![](./images/captcha_demo.png) + +- **下载地址**: 该数据集是生成得到,无下载地址。 + +## 多语言数据集(Multi-lingual scene text detection and recognition) + +- **数据来源**: [source](https://rrc.cvc.uab.es/?ch=15&com=downloads) +- **数据简介**: 多语言检测数据集MLT同时包含了语种识别和检测任务。 + - 在检测任务中,训练集包含10000张图片,共有10种语言,每种语言包含1000张训练图片。测试集包含10000张图片。 + - 在识别任务中,训练集包含111998个样本。 +- **下载地址**: 训练集较大,分2部分下载,需要在网站上注册之后才能下载: +[link](https://rrc.cvc.uab.es/?ch=15&com=downloads) diff --git a/docs/images/00006737.jpg b/docs/images/00006737.jpg new file mode 100644 index 0000000000..d7762d2e2c Binary files /dev/null and b/docs/images/00006737.jpg differ diff --git a/docs/images/185310636-6ce02f7c-790d-479f-b163-ea97a5a04808-20240708082238739.jpg b/docs/images/185310636-6ce02f7c-790d-479f-b163-ea97a5a04808-20240708082238739.jpg new file mode 100644 index 0000000000..6a5fd84c52 Binary files /dev/null and b/docs/images/185310636-6ce02f7c-790d-479f-b163-ea97a5a04808-20240708082238739.jpg differ diff --git a/docs/images/185393805-c67ff571-cf7e-4217-a4b0-8b396c4f22bb-20240708082310650.jpg 
b/docs/images/185393805-c67ff571-cf7e-4217-a4b0-8b396c4f22bb-20240708082310650.jpg new file mode 100644 index 0000000000..d406a52da8 Binary files /dev/null and b/docs/images/185393805-c67ff571-cf7e-4217-a4b0-8b396c4f22bb-20240708082310650.jpg differ diff --git a/docs/images/185539517-ccf2372a-f026-4a7c-ad28-c741c770f60a-20240708082247529.png b/docs/images/185539517-ccf2372a-f026-4a7c-ad28-c741c770f60a-20240708082247529.png new file mode 100644 index 0000000000..429343418e Binary files /dev/null and b/docs/images/185539517-ccf2372a-f026-4a7c-ad28-c741c770f60a-20240708082247529.png differ diff --git a/docs/images/185540080-0431e006-9235-4b6d-b63d-0b3c6e1de48f-20240708082316558.jpg b/docs/images/185540080-0431e006-9235-4b6d-b63d-0b3c6e1de48f-20240708082316558.jpg new file mode 100644 index 0000000000..bdef2ed23b Binary files /dev/null and b/docs/images/185540080-0431e006-9235-4b6d-b63d-0b3c6e1de48f-20240708082316558.jpg differ diff --git a/docs/images/186094813-3a8e16cc-42e5-4982-b9f4-0134dfb5688d-20240708082323916.png b/docs/images/186094813-3a8e16cc-42e5-4982-b9f4-0134dfb5688d-20240708082323916.png new file mode 100644 index 0000000000..44845fb5ce Binary files /dev/null and b/docs/images/186094813-3a8e16cc-42e5-4982-b9f4-0134dfb5688d-20240708082323916.png differ diff --git a/docs/images/186171245-40abc4d7-904f-4949-ade1-250f86ed3a90.jpg b/docs/images/186171245-40abc4d7-904f-4949-ade1-250f86ed3a90.jpg new file mode 100644 index 0000000000..3e3fde4e69 Binary files /dev/null and b/docs/images/186171245-40abc4d7-904f-4949-ade1-250f86ed3a90.jpg differ diff --git a/docs/images/197464552-69de557f-edff-4c7f-acbf-069df1ba097f-20240708082253634.png b/docs/images/197464552-69de557f-edff-4c7f-acbf-069df1ba097f-20240708082253634.png new file mode 100644 index 0000000000..7b4c271cf2 Binary files /dev/null and b/docs/images/197464552-69de557f-edff-4c7f-acbf-069df1ba097f-20240708082253634.png differ diff --git a/docs/images/PP-OCRv3-pic001.jpg b/docs/images/PP-OCRv3-pic001.jpg new file mode 100644 index 0000000000..c35936cc1a Binary files /dev/null and b/docs/images/PP-OCRv3-pic001.jpg differ diff --git a/docs/images/PP-OCRv3-pic002.jpg b/docs/images/PP-OCRv3-pic002.jpg new file mode 100644 index 0000000000..e5ad6a4b2a Binary files /dev/null and b/docs/images/PP-OCRv3-pic002.jpg differ diff --git a/docs/images/PP-OCRv3-pic003.jpg b/docs/images/PP-OCRv3-pic003.jpg new file mode 100644 index 0000000000..dc024296bd Binary files /dev/null and b/docs/images/PP-OCRv3-pic003.jpg differ diff --git a/docs/images/en_1.png b/docs/images/en_1.png new file mode 100644 index 0000000000..36245613e3 Binary files /dev/null and b/docs/images/en_1.png differ diff --git a/docs/images/en_2.png b/docs/images/en_2.png new file mode 100644 index 0000000000..d2df8556ad Binary files /dev/null and b/docs/images/en_2.png differ diff --git a/docs/images/en_3-0398013.png b/docs/images/en_3-0398013.png new file mode 100644 index 0000000000..baf146c010 Binary files /dev/null and b/docs/images/en_3-0398013.png differ diff --git a/docs/images/en_3.png b/docs/images/en_3.png new file mode 100644 index 0000000000..baf146c010 Binary files /dev/null and b/docs/images/en_3.png differ diff --git a/docs/images/japan_2.jpg b/docs/images/japan_2.jpg new file mode 100644 index 0000000000..076ced92ad Binary files /dev/null and b/docs/images/japan_2.jpg differ diff --git a/docs/images/korean_1.jpg b/docs/images/korean_1.jpg new file mode 100644 index 0000000000..f93de40e18 Binary files /dev/null and b/docs/images/korean_1.jpg differ diff --git 
a/docs/images/ppocrv4.png b/docs/images/ppocrv4.png new file mode 100644 index 0000000000..6be449cf53 Binary files /dev/null and b/docs/images/ppocrv4.png differ diff --git a/docs/images/ppstructure-20240708082235651.gif b/docs/images/ppstructure-20240708082235651.gif new file mode 100644 index 0000000000..bff836e3ea Binary files /dev/null and b/docs/images/ppstructure-20240708082235651.gif differ diff --git a/docs/images/test_add_91.jpg b/docs/images/test_add_91.jpg new file mode 100644 index 0000000000..b5ded6e1de Binary files /dev/null and b/docs/images/test_add_91.jpg differ diff --git a/docs/index.en.md b/docs/index.en.md new file mode 100644 index 0000000000..9c8baf14ad --- /dev/null +++ b/docs/index.en.md @@ -0,0 +1,154 @@ +--- +comments: true +typora-copy-images-to: images +hide: + - navigation + - toc +--- + +
+ <!-- PaddleOCR banner image and project badges (e.g. Chat) -->
+ +## Introduction + +PaddleOCR aims to create multilingual, awesome, leading, and practical OCR tools that help users train better models and apply them into practice. + +## 🚀 Community + +PaddleOCR is being oversight by a [PMC](https://github.com/PaddlePaddle/PaddleOCR/issues/12122). Issues and PRs will be reviewed on a best-effort basis. For a complete overview of PaddlePaddle community, please visit [community](https://github.com/PaddlePaddle/community). + +⚠️ Note: The [Issues](https://github.com/PaddlePaddle/PaddleOCR/issues) module is only for reporting program 🐞 bugs, for the rest of the questions, please move to the [Discussions](https://github.com/PaddlePaddle/PaddleOCR/discussions). Please note that if the Issue mentioned is not a bug, it will be moved to the Discussions module. + +## 📣 Recent updates + +- **🔥2023.8.7 Release PaddleOCR[release/2.7](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.7)** + + - Release [PP-OCRv4](./ppocr/overview.en.md), support mobile version and server version + + - PP-OCRv4-mobile:When the speed is comparable, the effect of the Chinese scene is improved by 4.5% compared with PP-OCRv3, the English scene is improved by 10%, and the average recognition accuracy of the 80-language multilingual model is increased by more than 8%. + - PP-OCRv4-server:Release the OCR model with the highest accuracy at present, the detection model accuracy increased by 4.9% in the Chinese and English scenes, and the recognition model accuracy increased by 2% + refer [quickstart](./quick_start.en.md) quick use by one line command, At the same time, the whole process of model training, reasoning, and high-performance deployment can also be completed with few code in the [General OCR Industry Solution](https://aistudio.baidu.com/aistudio/modelsdetail?modelId=286) in PaddleX. + + - Release[PP-ChatOCR](https://aistudio.baidu.com/aistudio/modelsdetail?modelId=332), a new scheme for extracting key information of general scenes using PP-OCR model and ERNIE LLM. + +- 🔨**2022.11 Add implementation of [4 cutting-edge algorithms](./algorithm/overview.en.md)**:Text Detection [DRRG](./algorithm/text_detection/algorithm_det_drrg.en.md), Text Recognition [RFL](./algorithm/text_recognition/algorithm_rec_rfl.en.md), Image Super-Resolution [Text Telescope](./algorithm/super_resolution/algorithm_sr_telescope.en.md),Handwritten Mathematical Expression Recognition [CAN](./algorithm/formula_recognition/algorithm_rec_can.en.md) + +- **2022.10 release [optimized JS version PP-OCRv3 model](./ppocr/infer_deploy/paddle_js.en.md)** with 4.3M model size, 8x faster inference time, and a ready-to-use web demo + + - 💥 **Live Playback: Introduction to PP-StructureV2 optimization strategy**. Scan [the QR code below](#Community) using WeChat, follow the PaddlePaddle official account and fill out the questionnaire to join the WeChat group, get the live link and 20G OCR learning materials (including PDF2Word application, 10 models in vertical scenarios, etc.) 
+
+## 📖 Technical exchange and cooperation
+
+- PaddleX is the one-stop, full-workflow, high-efficiency development platform of the PaddlePaddle ecosystem, covering model training, compression, and inference. Its mission is to help AI technology land quickly, and its vision is to make everyone an AI developer!
+
+  - PaddleX currently covers areas such as image classification, object detection, image segmentation, 3D, OCR, and time-series prediction, with 36 built-in basic single models such as RT-DETR, PP-YOLOE, PP-HGNet, PP-LCNet, and PP-LiteSeg, and 12 integrated practical industrial solutions such as PP-OCRv4, PP-ChatOCR, PP-ShiTu, PP-TS, vehicle-mounted road-waste detection, and identification of prohibited wildlife products.
+  - PaddleX provides two AI development modes: "Toolbox" and "Developer". The Toolbox mode tunes key hyperparameters without writing code, while the Developer mode supports low-code single-model training, compression, and inference as well as multi-model serial inference, on both cloud and local terminals.
+  - PaddleX also supports joint innovation, joint development, and profit sharing! PaddleX is iterating rapidly, and individual and enterprise developers are welcome to join in building a prosperous AI technology ecosystem!
+
+## 🇺🇳 Guideline for New Language Requests
+
+If you want to request support for a new language, a PR with the following file is needed:
+
+- In the folder [ppocr/utils/dict](./ppocr/utils/dict), submit a dictionary text file named `{language}_dict.txt` that contains a list of all characters for the language. Please follow the format of the other files in that folder (a small sketch follows below).
+
+If your language has unique elements, please let us know in advance in any convenient way, for example with useful links or Wikipedia articles.
+
+For more details, please refer to the [Multilingual OCR Development Plan](https://github.com/PaddlePaddle/PaddleOCR/issues/1048).
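+
+To make the expected dictionary layout concrete, the sketch below drafts such a file from a plain-text corpus. It is only an illustration: `corpus.txt` and `greek_dict.txt` are placeholder names, and the existing files in [ppocr/utils/dict](./ppocr/utils/dict) remain the authoritative reference for the exact format.
+
+```python
+# Illustrative helper for drafting a `{language}_dict.txt` file
+# (one character per line, mirroring the existing files in ppocr/utils/dict;
+#  "corpus.txt" and "greek_dict.txt" are placeholder names).
+chars = set()
+with open("corpus.txt", encoding="utf-8") as f:
+    for line in f:
+        chars.update(ch for ch in line if not ch.isspace())
+
+with open("greek_dict.txt", "w", encoding="utf-8") as f:
+    for ch in sorted(chars):
+        f.write(ch + "\n")
+```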
+
+## Visualization
+
+### PP-OCRv3
+
+#### PP-OCRv3 Chinese model
+
+![img](./images/test_add_91.jpg)
+
+![img](./images/00006737.jpg)
+
+![](./images/PP-OCRv3-pic001.jpg)
+
+![](./images/PP-OCRv3-pic002.jpg)
+
+![](./images/PP-OCRv3-pic003.jpg)
+
+#### PP-OCRv3 English model
+
+![](./images/en_1.png)
+
+![](./images/en_2.png)
+
+![](./images/en_3-0398013.png)
+
+#### PP-OCRv3 Multilingual model
+
+![img](./images/japan_2.jpg)
+
+![img](./images/korean_1.jpg)
+
+### PP-StructureV2
+
+- Layout analysis + table recognition
+
+  ![img](./images/ppstructure-20240708082235651.gif)
+
+- SER (semantic entity recognition)
+
+  ![img](./images/185310636-6ce02f7c-790d-479f-b163-ea97a5a04808-20240708082238739.jpg)
+
+  ![img](./images/185539517-ccf2372a-f026-4a7c-ad28-c741c770f60a-20240708082247529.png)
+
+  ![img](./images/197464552-69de557f-edff-4c7f-acbf-069df1ba097f-20240708082253634.png)
+
+- RE (relation extraction)
+
+  ![img](./images/185393805-c67ff571-cf7e-4217-a4b0-8b396c4f22bb-20240708082310650.jpg)
+
+  ![img](./images/185540080-0431e006-9235-4b6d-b63d-0b3c6e1de48f-20240708082316558.jpg)
+
+  ![img](./images/186094813-3a8e16cc-42e5-4982-b9f4-0134dfb5688d-20240708082323916.png)
+
+## 📄 License
+
+This project is released under the Apache 2.0 license.
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 0000000000..93e112a929
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,137 @@
+---
+comments: true
+typora-copy-images-to: images
+hide:
+  - navigation
+  - toc
+---
+
+ +## 简介 + +PaddleOCR 旨在打造一套丰富、领先、且实用的 OCR 工具库,助力开发者训练出更好的模型,并应用落地。 + +## 🚀 社区 + +PaddleOCR 由 [PMC](https://github.com/PaddlePaddle/PaddleOCR/issues/12122) 监督。Issues 和 PRs 将在尽力的基础上进行审查。欲了解 PaddlePaddle 社区的完整概况,请访问 [community](https://github.com/PaddlePaddle/community)。 + +⚠️注意:[Issues](https://github.com/PaddlePaddle/PaddleOCR/issues)模块仅用来报告程序🐞Bug,其余提问请移步[Discussions](https://github.com/PaddlePaddle/PaddleOCR/discussions)模块提问。如所提Issue不是Bug,会被移到Discussions模块,敬请谅解。 + +## 📣 近期更新 + +- **🔥2024.7 添加 PaddleOCR 算法模型挑战赛冠军方案**: + + - 赛题一:OCR 端到端识别任务冠军方案——[场景文本识别算法-SVTRv2](./algorithm/text_recognition/algorithm_rec_svtrv2.md); + - 赛题二:通用表格识别任务冠军方案——[表格识别算法-SLANet-LCNetV2](./algorithm/table_recognition/algorithm_table_slanet.md)。 + +- **💥2024.6.27 飞桨低代码开发工具 [PaddleX 3.0](https://github.com/paddlepaddle/paddlex) 重磅更新!** + + - 低代码开发范式:支持 OCR 模型全流程低代码开发,提供 Python API,支持用户自定义串联模型; + - 多硬件训推支持:支持英伟达 GPU、昆仑芯、昇腾和寒武纪等多种硬件进行模型训练与推理。PaddleOCR支持的模型见 [模型列表](./model/hardware/install_other_devices.md) + +- **📚直播和OCR实战打卡营预告**:《PP-ChatOCRv2赋能金融报告信息智能化抽取,新金融效率再升级》课程上线,破解复杂版面、表格识别、信息抽取OCR解析难题,直播时间:6月6日(周四)19:00。并于6月11日启动【政务采购合同信息抽取】实战打卡营。报名链接: + +- **🔥2024.5.10 上线星河零代码产线(OCR 相关)**:全面覆盖了以下四大 OCR 核心任务,提供极便捷的 Badcase 分析和实用的在线体验: + + - [通用 OCR](https://aistudio.baidu.com/community/app/91660) (PP-OCRv4)。 + - [通用表格识别](https://aistudio.baidu.com/community/app/91661) (SLANet)。 + - [通用图像信息抽取](https://aistudio.baidu.com/community/app/91662) (PP-ChatOCRv2-common)。 + - [文档场景信息抽取](https://aistudio.baidu.com/community/app/70303) (PP-ChatOCRv2-doc)。 + + 同时采用了 **[全新的场景任务开发范式](https://aistudio.baidu.com/pipeline/mine)** ,将模型统一汇聚,实现训练部署的零代码开发,并支持在线服务化部署和导出离线服务化部署包。 + +- **🔥2023.8.7 发布 PaddleOCR [release/2.7](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.7)** + + - 发布[PP-OCRv4](./ppocr/blog/PP-OCRv4_introduction.md),提供 mobile 和 server 两种模型 + - PP-OCRv4-mobile:速度可比情况下,中文场景效果相比于 PP-OCRv3 再提升 4.5%,英文场景提升 10%,80 语种多语言模型平均识别准确率提升 8%以上 + - PP-OCRv4-server:发布了目前精度最高的 OCR 模型,中英文场景上检测模型精度提升 4.9%, 识别模型精度提升 2% + 可参考[快速开始](./quick_start.md) 一行命令快速使用,同时也可在飞桨 AI 套件(PaddleX)中的[通用 OCR 产业方案](https://aistudio.baidu.com/aistudio/modelsdetail?modelId=286)中低代码完成模型训练、推理、高性能部署全流程 + +- 🔨**2022.11 新增实现[4 种前沿算法](./algorithm/overview.md)**:文本检测 [DRRG](./algorithm/text_detection/algorithm_det_drrg.md), 文本识别 [RFL](./algorithm/text_recognition/algorithm_rec_rfl.md), 文本超分[Text Telescope](./algorithm/super_resolution/algorithm_sr_telescope.md),公式识别[CAN](./algorithm/formula_recognition/algorithm_rec_can.md) +- **2022.10 优化[JS 版 PP-OCRv3 模型](./ppocr/infer_deploy/paddle_js.md)**:模型大小仅 4.3M,预测速度提升 8 倍,配套 web demo 开箱即用 + +- **💥 直播回放:PaddleOCR 研发团队详解 PP-StructureV2 优化策略**。微信扫描[下方二维码](#开源社区),关注公众号并填写问卷后进入官方交流群,获取直播回放链接与 20G 重磅 OCR 学习大礼包(内含 PDF 转 Word 应用程序、10 种垂类模型、《动手学 OCR》电子书等) + +- **🔥2022.8.24 发布 PaddleOCR [release/2.6](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.6)**: + + - 发布[PP-StructureV2](./ppstructure/overview.md),系统功能性能全面升级,适配中文场景,新增支持[版面复原](./ppstructure/model_train/recovery_to_doc.md),支持**一行命令完成 PDF 转 Word**; + - [版面分析](./ppstructure/model_train/train_layout.md)模型优化:模型存储减少 95%,速度提升 11 倍,平均 CPU 耗时仅需 41ms; + - [表格识别](./ppstructure/model_train/train_table.md)模型优化:设计 3 大优化策略,预测耗时不变情况下,模型精度提升 6%; + - [关键信息抽取](./ppstructure/model_train/train_kie.md)模型优化:设计视觉无关模型结构,语义实体识别精度提升 2.8%,关系抽取精度提升 9.1%。 +- 🔥**2022.8 发布 [OCR 场景应用集合](./applications/overview.md)**:包含数码管、液晶屏、车牌、高精度 SVTR 模型、手写体识别等**9 个垂类模型**,覆盖通用,制造、金融、交通行业的主要 OCR 垂类应用。 + +> [更多](./update.md) + +## 🌟 特性 + +支持多种 OCR 
相关前沿算法,在此基础上打造产业级特色模型[PP-OCR](./ppocr/overview.md)、[PP-Structure](./ppstructure/overview.md)和[PP-ChatOCRv2](https://aistudio.baidu.com/community/app/70303),并打通数据生产、模型训练、压缩、预测部署全流程。 + + + +## 效果展示 + +### 超轻量PP-OCRv3效果展示 + +#### PP-OCRv3中文模型 + +![img](./images/test_add_91.jpg) + + + + + + + + + +#### PP-OCRv3英文数字模型 + + + + + + + +#### PP-OCRv3多语言模型 + + + + + +#### PP-Structure 文档分析 + +- 版面分析+表格识别 + + + +- SER(语义实体识别) + + + + + + + +- RE(关系提取) + + + + + + + +## 许可证书 + +本项目的发布受Apache 2.0 license许可认证。 diff --git a/docs/javascripts/katex.min.js b/docs/javascripts/katex.min.js new file mode 100644 index 0000000000..fee59b98a5 --- /dev/null +++ b/docs/javascripts/katex.min.js @@ -0,0 +1,12 @@ +document$.subscribe(({ body }) => { + renderMathInElement(body, { + delimiters: [ + { left: "$$", right: "$$", display: true }, + { left: "$", right: "$", display: false }, + { left: "\\(", right: "\\)", display: false }, + { left: "\\[", right: "\\]", display: true } + ], + }) +}) + +!function (e, t) { "object" == typeof exports && "object" == typeof module ? module.exports = t() : "function" == typeof define && define.amd ? define([], t) : "object" == typeof exports ? exports.katex = t() : e.katex = t() }("undefined" != typeof self ? self : this, (function () { return function () { "use strict"; var e = { d: function (t, r) { for (var n in r) e.o(r, n) && !e.o(t, n) && Object.defineProperty(t, n, { enumerable: !0, get: r[n] }) }, o: function (e, t) { return Object.prototype.hasOwnProperty.call(e, t) } }, t = {}; e.d(t, { default: function () { return Yn } }); class r { constructor(e, t) { this.name = void 0, this.position = void 0, this.length = void 0, this.rawMessage = void 0; let n, o, s = "KaTeX parse error: " + e; const i = t && t.loc; if (i && i.start <= i.end) { const e = i.lexer.input; n = i.start, o = i.end, n === e.length ? s += " at end of input: " : s += " at position " + (n + 1) + ": "; const t = e.slice(n, o).replace(/[^]/g, "$&\u0332"); let r, a; r = n > 15 ? "\u2026" + e.slice(n - 15, n) : e.slice(0, n), a = o + 15 < e.length ? e.slice(o, o + 15) + "\u2026" : e.slice(o), s += r + t + a } const a = new Error(s); return a.name = "ParseError", a.__proto__ = r.prototype, a.position = n, null != n && null != o && (a.length = o - n), a.rawMessage = e, a } } r.prototype.__proto__ = Error.prototype; var n = r; const o = /([A-Z])/g, s = { "&": "&", ">": ">", "<": "<", '"': """, "'": "'" }, i = /[&><"']/g; const a = function (e) { return "ordgroup" === e.type || "color" === e.type ? 1 === e.body.length ? a(e.body[0]) : e : "font" === e.type ? a(e.body) : e }; var l = { contains: function (e, t) { return -1 !== e.indexOf(t) }, deflt: function (e, t) { return void 0 === e ? t : e }, escape: function (e) { return String(e).replace(i, (e => s[e])) }, hyphenate: function (e) { return e.replace(o, "-$1").toLowerCase() }, getBaseElem: a, isCharacterBox: function (e) { const t = a(e); return "mathord" === t.type || "textord" === t.type || "atom" === t.type }, protocolFromUrl: function (e) { const t = /^[\x00-\x20]*([^\\/#?]*?)(:|�*58|�*3a|&colon)/i.exec(e); return t ? ":" !== t[2] ? null : /^[a-zA-Z][a-zA-Z0-9+\-.]*$/.test(t[1]) ? 
t[1].toLowerCase() : null : "_relative" } }; const h = { displayMode: { type: "boolean", description: "Render math in display mode, which puts the math in display style (so \\int and \\sum are large, for example), and centers the math on the page on its own line.", cli: "-d, --display-mode" }, output: { type: { enum: ["htmlAndMathml", "html", "mathml"] }, description: "Determines the markup language of the output.", cli: "-F, --format " }, leqno: { type: "boolean", description: "Render display math in leqno style (left-justified tags)." }, fleqn: { type: "boolean", description: "Render display math flush left." }, throwOnError: { type: "boolean", default: !0, cli: "-t, --no-throw-on-error", cliDescription: "Render errors (in the color given by --error-color) instead of throwing a ParseError exception when encountering an error." }, errorColor: { type: "string", default: "#cc0000", cli: "-c, --error-color ", cliDescription: "A color string given in the format 'rgb' or 'rrggbb' (no #). This option determines the color of errors rendered by the -t option.", cliProcessor: e => "#" + e }, macros: { type: "object", cli: "-m, --macro ", cliDescription: "Define custom macro of the form '\\foo:expansion' (use multiple -m arguments for multiple macros).", cliDefault: [], cliProcessor: (e, t) => (t.push(e), t) }, minRuleThickness: { type: "number", description: "Specifies a minimum thickness, in ems, for fraction lines, `\\sqrt` top lines, `{array}` vertical lines, `\\hline`, `\\hdashline`, `\\underline`, `\\overline`, and the borders of `\\fbox`, `\\boxed`, and `\\fcolorbox`.", processor: e => Math.max(0, e), cli: "--min-rule-thickness ", cliProcessor: parseFloat }, colorIsTextColor: { type: "boolean", description: "Makes \\color behave like LaTeX's 2-argument \\textcolor, instead of LaTeX's one-argument \\color mode change.", cli: "-b, --color-is-text-color" }, strict: { type: [{ enum: ["warn", "ignore", "error"] }, "boolean", "function"], description: "Turn on strict / LaTeX faithfulness mode, which throws an error if the input uses features that are not supported by LaTeX.", cli: "-S, --strict", cliDefault: !1 }, trust: { type: ["boolean", "function"], description: "Trust the input, enabling all HTML features such as \\url.", cli: "-T, --trust" }, maxSize: { type: "number", default: 1 / 0, description: "If non-zero, all user-specified sizes, e.g. in \\rule{500em}{500em}, will be capped to maxSize ems. Otherwise, elements and spaces can be arbitrarily large", processor: e => Math.max(0, e), cli: "-s, --max-size ", cliProcessor: parseInt }, maxExpand: { type: "number", default: 1e3, description: "Limit the number of macro expansions to the specified number, to prevent e.g. infinite macro loops. If set to Infinity, the macro expander will try to fully expand as in LaTeX.", processor: e => Math.max(0, e), cli: "-e, --max-expand ", cliProcessor: e => "Infinity" === e ? 1 / 0 : parseInt(e) }, globalGroup: { type: "boolean", cli: !1 } }; function c(e) { if (e.default) return e.default; const t = e.type, r = Array.isArray(t) ? 
t[0] : t; if ("string" != typeof r) return r.enum[0]; switch (r) { case "boolean": return !1; case "string": return ""; case "number": return 0; case "object": return {} } } class m { constructor(e) { this.displayMode = void 0, this.output = void 0, this.leqno = void 0, this.fleqn = void 0, this.throwOnError = void 0, this.errorColor = void 0, this.macros = void 0, this.minRuleThickness = void 0, this.colorIsTextColor = void 0, this.strict = void 0, this.trust = void 0, this.maxSize = void 0, this.maxExpand = void 0, this.globalGroup = void 0, e = e || {}; for (const t in h) if (h.hasOwnProperty(t)) { const r = h[t]; this[t] = void 0 !== e[t] ? r.processor ? r.processor(e[t]) : e[t] : c(r) } } reportNonstrict(e, t, r) { let o = this.strict; if ("function" == typeof o && (o = o(e, t, r)), o && "ignore" !== o) { if (!0 === o || "error" === o) throw new n("LaTeX-incompatible input and strict mode is set to 'error': " + t + " [" + e + "]", r); "warn" === o ? "undefined" != typeof console && console.warn("LaTeX-incompatible input and strict mode is set to 'warn': " + t + " [" + e + "]") : "undefined" != typeof console && console.warn("LaTeX-incompatible input and strict mode is set to unrecognized '" + o + "': " + t + " [" + e + "]") } } useStrictBehavior(e, t, r) { let n = this.strict; if ("function" == typeof n) try { n = n(e, t, r) } catch (e) { n = "error" } return !(!n || "ignore" === n) && (!0 === n || "error" === n || ("warn" === n ? ("undefined" != typeof console && console.warn("LaTeX-incompatible input and strict mode is set to 'warn': " + t + " [" + e + "]"), !1) : ("undefined" != typeof console && console.warn("LaTeX-incompatible input and strict mode is set to unrecognized '" + n + "': " + t + " [" + e + "]"), !1))) } isTrusted(e) { if (e.url && !e.protocol) { const t = l.protocolFromUrl(e.url); if (null == t) return !1; e.protocol = t } const t = "function" == typeof this.trust ? 
this.trust(e) : this.trust; return Boolean(t) } } class p { constructor(e, t, r) { this.id = void 0, this.size = void 0, this.cramped = void 0, this.id = e, this.size = t, this.cramped = r } sup() { return u[d[this.id]] } sub() { return u[g[this.id]] } fracNum() { return u[f[this.id]] } fracDen() { return u[b[this.id]] } cramp() { return u[y[this.id]] } text() { return u[x[this.id]] } isTight() { return this.size >= 2 } } const u = [new p(0, 0, !1), new p(1, 0, !0), new p(2, 1, !1), new p(3, 1, !0), new p(4, 2, !1), new p(5, 2, !0), new p(6, 3, !1), new p(7, 3, !0)], d = [4, 5, 4, 5, 6, 7, 6, 7], g = [5, 5, 5, 5, 7, 7, 7, 7], f = [2, 3, 4, 5, 6, 7, 6, 7], b = [3, 3, 5, 5, 7, 7, 7, 7], y = [1, 1, 3, 3, 5, 5, 7, 7], x = [0, 1, 2, 3, 2, 3, 2, 3]; var w = { DISPLAY: u[0], TEXT: u[2], SCRIPT: u[4], SCRIPTSCRIPT: u[6] }; const v = [{ name: "latin", blocks: [[256, 591], [768, 879]] }, { name: "cyrillic", blocks: [[1024, 1279]] }, { name: "armenian", blocks: [[1328, 1423]] }, { name: "brahmic", blocks: [[2304, 4255]] }, { name: "georgian", blocks: [[4256, 4351]] }, { name: "cjk", blocks: [[12288, 12543], [19968, 40879], [65280, 65376]] }, { name: "hangul", blocks: [[44032, 55215]] }]; const k = []; function S(e) { for (let t = 0; t < k.length; t += 2)if (e >= k[t] && e <= k[t + 1]) return !0; return !1 } v.forEach((e => e.blocks.forEach((e => k.push(...e))))); const M = 80, z = { doubleleftarrow: "M262 157\nl10-10c34-36 62.7-77 86-123 3.3-8 5-13.3 5-16 0-5.3-6.7-8-20-8-7.3\n 0-12.2.5-14.5 1.5-2.3 1-4.8 4.5-7.5 10.5-49.3 97.3-121.7 169.3-217 216-28\n 14-57.3 25-88 33-6.7 2-11 3.8-13 5.5-2 1.7-3 4.2-3 7.5s1 5.8 3 7.5\nc2 1.7 6.3 3.5 13 5.5 68 17.3 128.2 47.8 180.5 91.5 52.3 43.7 93.8 96.2 124.5\n 157.5 9.3 8 15.3 12.3 18 13h6c12-.7 18-4 18-10 0-2-1.7-7-5-15-23.3-46-52-87\n-86-123l-10-10h399738v-40H218c328 0 0 0 0 0l-10-8c-26.7-20-65.7-43-117-69 2.7\n-2 6-3.7 10-5 36.7-16 72.3-37.3 107-64l10-8h399782v-40z\nm8 0v40h399730v-40zm0 194v40h399730v-40z", doublerightarrow: "M399738 392l\n-10 10c-34 36-62.7 77-86 123-3.3 8-5 13.3-5 16 0 5.3 6.7 8 20 8 7.3 0 12.2-.5\n 14.5-1.5 2.3-1 4.8-4.5 7.5-10.5 49.3-97.3 121.7-169.3 217-216 28-14 57.3-25 88\n-33 6.7-2 11-3.8 13-5.5 2-1.7 3-4.2 3-7.5s-1-5.8-3-7.5c-2-1.7-6.3-3.5-13-5.5-68\n-17.3-128.2-47.8-180.5-91.5-52.3-43.7-93.8-96.2-124.5-157.5-9.3-8-15.3-12.3-18\n-13h-6c-12 .7-18 4-18 10 0 2 1.7 7 5 15 23.3 46 52 87 86 123l10 10H0v40h399782\nc-328 0 0 0 0 0l10 8c26.7 20 65.7 43 117 69-2.7 2-6 3.7-10 5-36.7 16-72.3 37.3\n-107 64l-10 8H0v40zM0 157v40h399730v-40zm0 194v40h399730v-40z", leftarrow: "M400000 241H110l3-3c68.7-52.7 113.7-120\n 135-202 4-14.7 6-23 6-25 0-7.3-7-11-21-11-8 0-13.2.8-15.5 2.5-2.3 1.7-4.2 5.8\n-5.5 12.5-1.3 4.7-2.7 10.3-4 17-12 48.7-34.8 92-68.5 130S65.3 228.3 18 247\nc-10 4-16 7.7-18 11 0 8.7 6 14.3 18 17 47.3 18.7 87.8 47 121.5 85S196 441.3 208\n 490c.7 2 1.3 5 2 9s1.2 6.7 1.5 8c.3 1.3 1 3.3 2 6s2.2 4.5 3.5 5.5c1.3 1 3.3\n 1.8 6 2.5s6 1 10 1c14 0 21-3.7 21-11 0-2-2-10.3-6-25-20-79.3-65-146.7-135-202\n l-3-3h399890zM100 241v40h399900v-40z", leftbrace: "M6 548l-6-6v-35l6-11c56-104 135.3-181.3 238-232 57.3-28.7 117\n-45 179-50h399577v120H403c-43.3 7-81 15-113 26-100.7 33-179.7 91-237 174-2.7\n 5-6 9-10 13-.7 1-7.3 1-20 1H6z", leftbraceunder: "M0 6l6-6h17c12.688 0 19.313.3 20 1 4 4 7.313 8.3 10 13\n 35.313 51.3 80.813 93.8 136.5 127.5 55.688 33.7 117.188 55.8 184.5 66.5.688\n 0 2 .3 4 1 18.688 2.7 76 4.3 172 5h399450v120H429l-6-1c-124.688-8-235-61.7\n-331-161C60.687 138.7 32.312 99.3 7 54L0 41V6z", leftgroup: "M400000 80\nH435C64 80 168.3 229.4 21 
260c-5.9 1.2-18 0-18 0-2 0-3-1-3-3v-38C76 61 257 0\n 435 0h399565z", leftgroupunder: "M400000 262\nH435C64 262 168.3 112.6 21 82c-5.9-1.2-18 0-18 0-2 0-3 1-3 3v38c76 158 257 219\n 435 219h399565z", leftharpoon: "M0 267c.7 5.3 3 10 7 14h399993v-40H93c3.3\n-3.3 10.2-9.5 20.5-18.5s17.8-15.8 22.5-20.5c50.7-52 88-110.3 112-175 4-11.3 5\n-18.3 3-21-1.3-4-7.3-6-18-6-8 0-13 .7-15 2s-4.7 6.7-8 16c-42 98.7-107.3 174.7\n-196 228-6.7 4.7-10.7 8-12 10-1.3 2-2 5.7-2 11zm100-26v40h399900v-40z", leftharpoonplus: "M0 267c.7 5.3 3 10 7 14h399993v-40H93c3.3-3.3 10.2-9.5\n 20.5-18.5s17.8-15.8 22.5-20.5c50.7-52 88-110.3 112-175 4-11.3 5-18.3 3-21-1.3\n-4-7.3-6-18-6-8 0-13 .7-15 2s-4.7 6.7-8 16c-42 98.7-107.3 174.7-196 228-6.7 4.7\n-10.7 8-12 10-1.3 2-2 5.7-2 11zm100-26v40h399900v-40zM0 435v40h400000v-40z\nm0 0v40h400000v-40z", leftharpoondown: "M7 241c-4 4-6.333 8.667-7 14 0 5.333.667 9 2 11s5.333\n 5.333 12 10c90.667 54 156 130 196 228 3.333 10.667 6.333 16.333 9 17 2 .667 5\n 1 9 1h5c10.667 0 16.667-2 18-6 2-2.667 1-9.667-3-21-32-87.333-82.667-157.667\n-152-211l-3-3h399907v-40zM93 281 H400000 v-40L7 241z", leftharpoondownplus: "M7 435c-4 4-6.3 8.7-7 14 0 5.3.7 9 2 11s5.3 5.3 12\n 10c90.7 54 156 130 196 228 3.3 10.7 6.3 16.3 9 17 2 .7 5 1 9 1h5c10.7 0 16.7\n-2 18-6 2-2.7 1-9.7-3-21-32-87.3-82.7-157.7-152-211l-3-3h399907v-40H7zm93 0\nv40h399900v-40zM0 241v40h399900v-40zm0 0v40h399900v-40z", lefthook: "M400000 281 H103s-33-11.2-61-33.5S0 197.3 0 164s14.2-61.2 42.5\n-83.5C70.8 58.2 104 47 142 47 c16.7 0 25 6.7 25 20 0 12-8.7 18.7-26 20-40 3.3\n-68.7 15.7-86 37-10 12-15 25.3-15 40 0 22.7 9.8 40.7 29.5 54 19.7 13.3 43.5 21\n 71.5 23h399859zM103 281v-40h399897v40z", leftlinesegment: "M40 281 V428 H0 V94 H40 V241 H400000 v40z\nM40 281 V428 H0 V94 H40 V241 H400000 v40z", leftmapsto: "M40 281 V448H0V74H40V241H400000v40z\nM40 281 V448H0V74H40V241H400000v40z", leftToFrom: "M0 147h400000v40H0zm0 214c68 40 115.7 95.7 143 167h22c15.3 0 23\n-.3 23-1 0-1.3-5.3-13.7-16-37-18-35.3-41.3-69-70-101l-7-8h399905v-40H95l7-8\nc28.7-32 52-65.7 70-101 10.7-23.3 16-35.7 16-37 0-.7-7.7-1-23-1h-22C115.7 265.3\n 68 321 0 361zm0-174v-40h399900v40zm100 154v40h399900v-40z", longequal: "M0 50 h400000 v40H0z m0 194h40000v40H0z\nM0 50 h400000 v40H0z m0 194h40000v40H0z", midbrace: "M200428 334\nc-100.7-8.3-195.3-44-280-108-55.3-42-101.7-93-139-153l-9-14c-2.7 4-5.7 8.7-9 14\n-53.3 86.7-123.7 153-211 199-66.7 36-137.3 56.3-212 62H0V214h199568c178.3-11.7\n 311.7-78.3 403-201 6-8 9.7-12 11-12 .7-.7 6.7-1 18-1s17.3.3 18 1c1.3 0 5 4 11\n 12 44.7 59.3 101.3 106.3 170 141s145.3 54.3 229 60h199572v120z", midbraceunder: "M199572 214\nc100.7 8.3 195.3 44 280 108 55.3 42 101.7 93 139 153l9 14c2.7-4 5.7-8.7 9-14\n 53.3-86.7 123.7-153 211-199 66.7-36 137.3-56.3 212-62h199568v120H200432c-178.3\n 11.7-311.7 78.3-403 201-6 8-9.7 12-11 12-.7.7-6.7 1-18 1s-17.3-.3-18-1c-1.3 0\n-5-4-11-12-44.7-59.3-101.3-106.3-170-141s-145.3-54.3-229-60H0V214z", oiintSize1: "M512.6 71.6c272.6 0 320.3 106.8 320.3 178.2 0 70.8-47.7 177.6\n-320.3 177.6S193.1 320.6 193.1 249.8c0-71.4 46.9-178.2 319.5-178.2z\nm368.1 178.2c0-86.4-60.9-215.4-368.1-215.4-306.4 0-367.3 129-367.3 215.4 0 85.8\n60.9 214.8 367.3 214.8 307.2 0 368.1-129 368.1-214.8z", oiintSize2: "M757.8 100.1c384.7 0 451.1 137.6 451.1 230 0 91.3-66.4 228.8\n-451.1 228.8-386.3 0-452.7-137.5-452.7-228.8 0-92.4 66.4-230 452.7-230z\nm502.4 230c0-111.2-82.4-277.2-502.4-277.2s-504 166-504 277.2\nc0 110 84 276 504 276s502.4-166 502.4-276z", oiiintSize1: "M681.4 71.6c408.9 0 480.5 106.8 480.5 178.2 0 70.8-71.6 177.6\n-480.5 177.6S202.1 
320.6 202.1 249.8c0-71.4 70.5-178.2 479.3-178.2z\nm525.8 178.2c0-86.4-86.8-215.4-525.7-215.4-437.9 0-524.7 129-524.7 215.4 0\n85.8 86.8 214.8 524.7 214.8 438.9 0 525.7-129 525.7-214.8z", oiiintSize2: "M1021.2 53c603.6 0 707.8 165.8 707.8 277.2 0 110-104.2 275.8\n-707.8 275.8-606 0-710.2-165.8-710.2-275.8C311 218.8 415.2 53 1021.2 53z\nm770.4 277.1c0-131.2-126.4-327.6-770.5-327.6S248.4 198.9 248.4 330.1\nc0 130 128.8 326.4 772.7 326.4s770.5-196.4 770.5-326.4z", rightarrow: "M0 241v40h399891c-47.3 35.3-84 78-110 128\n-16.7 32-27.7 63.7-33 95 0 1.3-.2 2.7-.5 4-.3 1.3-.5 2.3-.5 3 0 7.3 6.7 11 20\n 11 8 0 13.2-.8 15.5-2.5 2.3-1.7 4.2-5.5 5.5-11.5 2-13.3 5.7-27 11-41 14.7-44.7\n 39-84.5 73-119.5s73.7-60.2 119-75.5c6-2 9-5.7 9-11s-3-9-9-11c-45.3-15.3-85\n-40.5-119-75.5s-58.3-74.8-73-119.5c-4.7-14-8.3-27.3-11-40-1.3-6.7-3.2-10.8-5.5\n-12.5-2.3-1.7-7.5-2.5-15.5-2.5-14 0-21 3.7-21 11 0 2 2 10.3 6 25 20.7 83.3 67\n 151.7 139 205zm0 0v40h399900v-40z", rightbrace: "M400000 542l\n-6 6h-17c-12.7 0-19.3-.3-20-1-4-4-7.3-8.3-10-13-35.3-51.3-80.8-93.8-136.5-127.5\ns-117.2-55.8-184.5-66.5c-.7 0-2-.3-4-1-18.7-2.7-76-4.3-172-5H0V214h399571l6 1\nc124.7 8 235 61.7 331 161 31.3 33.3 59.7 72.7 85 118l7 13v35z", rightbraceunder: "M399994 0l6 6v35l-6 11c-56 104-135.3 181.3-238 232-57.3\n 28.7-117 45-179 50H-300V214h399897c43.3-7 81-15 113-26 100.7-33 179.7-91 237\n-174 2.7-5 6-9 10-13 .7-1 7.3-1 20-1h17z", rightgroup: "M0 80h399565c371 0 266.7 149.4 414 180 5.9 1.2 18 0 18 0 2 0\n 3-1 3-3v-38c-76-158-257-219-435-219H0z", rightgroupunder: "M0 262h399565c371 0 266.7-149.4 414-180 5.9-1.2 18 0 18\n 0 2 0 3 1 3 3v38c-76 158-257 219-435 219H0z", rightharpoon: "M0 241v40h399993c4.7-4.7 7-9.3 7-14 0-9.3\n-3.7-15.3-11-18-92.7-56.7-159-133.7-199-231-3.3-9.3-6-14.7-8-16-2-1.3-7-2-15-2\n-10.7 0-16.7 2-18 6-2 2.7-1 9.7 3 21 15.3 42 36.7 81.8 64 119.5 27.3 37.7 58\n 69.2 92 94.5zm0 0v40h399900v-40z", rightharpoonplus: "M0 241v40h399993c4.7-4.7 7-9.3 7-14 0-9.3-3.7-15.3-11\n-18-92.7-56.7-159-133.7-199-231-3.3-9.3-6-14.7-8-16-2-1.3-7-2-15-2-10.7 0-16.7\n 2-18 6-2 2.7-1 9.7 3 21 15.3 42 36.7 81.8 64 119.5 27.3 37.7 58 69.2 92 94.5z\nm0 0v40h399900v-40z m100 194v40h399900v-40zm0 0v40h399900v-40z", rightharpoondown: "M399747 511c0 7.3 6.7 11 20 11 8 0 13-.8 15-2.5s4.7-6.8\n 8-15.5c40-94 99.3-166.3 178-217 13.3-8 20.3-12.3 21-13 5.3-3.3 8.5-5.8 9.5\n-7.5 1-1.7 1.5-5.2 1.5-10.5s-2.3-10.3-7-15H0v40h399908c-34 25.3-64.7 57-92 95\n-27.3 38-48.7 77.7-64 119-3.3 8.7-5 14-5 16zM0 241v40h399900v-40z", rightharpoondownplus: "M399747 705c0 7.3 6.7 11 20 11 8 0 13-.8\n 15-2.5s4.7-6.8 8-15.5c40-94 99.3-166.3 178-217 13.3-8 20.3-12.3 21-13 5.3-3.3\n 8.5-5.8 9.5-7.5 1-1.7 1.5-5.2 1.5-10.5s-2.3-10.3-7-15H0v40h399908c-34 25.3\n-64.7 57-92 95-27.3 38-48.7 77.7-64 119-3.3 8.7-5 14-5 16zM0 435v40h399900v-40z\nm0-194v40h400000v-40zm0 0v40h400000v-40z", righthook: "M399859 241c-764 0 0 0 0 0 40-3.3 68.7-15.7 86-37 10-12 15-25.3\n 15-40 0-22.7-9.8-40.7-29.5-54-19.7-13.3-43.5-21-71.5-23-17.3-1.3-26-8-26-20 0\n-13.3 8.7-20 26-20 38 0 71 11.2 99 33.5 0 0 7 5.6 21 16.7 14 11.2 21 33.5 21\n 66.8s-14 61.2-42 83.5c-28 22.3-61 33.5-99 33.5L0 241z M0 281v-40h399859v40z", rightlinesegment: "M399960 241 V94 h40 V428 h-40 V281 H0 v-40z\nM399960 241 V94 h40 V428 h-40 V281 H0 v-40z", rightToFrom: "M400000 167c-70.7-42-118-97.7-142-167h-23c-15.3 0-23 .3-23\n 1 0 1.3 5.3 13.7 16 37 18 35.3 41.3 69 70 101l7 8H0v40h399905l-7 8c-28.7 32\n-52 65.7-70 101-10.7 23.3-16 35.7-16 37 0 .7 7.7 1 23 1h23c24-69.3 71.3-125 142\n-167z M100 147v40h399900v-40zM0 341v40h399900v-40z", 
twoheadleftarrow: "M0 167c68 40\n 115.7 95.7 143 167h22c15.3 0 23-.3 23-1 0-1.3-5.3-13.7-16-37-18-35.3-41.3-69\n-70-101l-7-8h125l9 7c50.7 39.3 85 86 103 140h46c0-4.7-6.3-18.7-19-42-18-35.3\n-40-67.3-66-96l-9-9h399716v-40H284l9-9c26-28.7 48-60.7 66-96 12.7-23.333 19\n-37.333 19-42h-46c-18 54-52.3 100.7-103 140l-9 7H95l7-8c28.7-32 52-65.7 70-101\n 10.7-23.333 16-35.7 16-37 0-.7-7.7-1-23-1h-22C115.7 71.3 68 127 0 167z", twoheadrightarrow: "M400000 167\nc-68-40-115.7-95.7-143-167h-22c-15.3 0-23 .3-23 1 0 1.3 5.3 13.7 16 37 18 35.3\n 41.3 69 70 101l7 8h-125l-9-7c-50.7-39.3-85-86-103-140h-46c0 4.7 6.3 18.7 19 42\n 18 35.3 40 67.3 66 96l9 9H0v40h399716l-9 9c-26 28.7-48 60.7-66 96-12.7 23.333\n-19 37.333-19 42h46c18-54 52.3-100.7 103-140l9-7h125l-7 8c-28.7 32-52 65.7-70\n 101-10.7 23.333-16 35.7-16 37 0 .7 7.7 1 23 1h22c27.3-71.3 75-127 143-167z", tilde1: "M200 55.538c-77 0-168 73.953-177 73.953-3 0-7\n-2.175-9-5.437L2 97c-1-2-2-4-2-6 0-4 2-7 5-9l20-12C116 12 171 0 207 0c86 0\n 114 68 191 68 78 0 168-68 177-68 4 0 7 2 9 5l12 19c1 2.175 2 4.35 2 6.525 0\n 4.35-2 7.613-5 9.788l-19 13.05c-92 63.077-116.937 75.308-183 76.128\n-68.267.847-113-73.952-191-73.952z", tilde2: "M344 55.266c-142 0-300.638 81.316-311.5 86.418\n-8.01 3.762-22.5 10.91-23.5 5.562L1 120c-1-2-1-3-1-4 0-5 3-9 8-10l18.4-9C160.9\n 31.9 283 0 358 0c148 0 188 122 331 122s314-97 326-97c4 0 8 2 10 7l7 21.114\nc1 2.14 1 3.21 1 4.28 0 5.347-3 9.626-7 10.696l-22.3 12.622C852.6 158.372 751\n 181.476 676 181.476c-149 0-189-126.21-332-126.21z", tilde3: "M786 59C457 59 32 175.242 13 175.242c-6 0-10-3.457\n-11-10.37L.15 138c-1-7 3-12 10-13l19.2-6.4C378.4 40.7 634.3 0 804.3 0c337 0\n 411.8 157 746.8 157 328 0 754-112 773-112 5 0 10 3 11 9l1 14.075c1 8.066-.697\n 16.595-6.697 17.492l-21.052 7.31c-367.9 98.146-609.15 122.696-778.15 122.696\n -338 0-409-156.573-744-156.573z", tilde4: "M786 58C457 58 32 177.487 13 177.487c-6 0-10-3.345\n-11-10.035L.15 143c-1-7 3-12 10-13l22-6.7C381.2 35 637.15 0 807.15 0c337 0 409\n 177 744 177 328 0 754-127 773-127 5 0 10 3 11 9l1 14.794c1 7.805-3 13.38-9\n 14.495l-20.7 5.574c-366.85 99.79-607.3 139.372-776.3 139.372-338 0-409\n -175.236-744-175.236z", vec: "M377 20c0-5.333 1.833-10 5.5-14S391 0 397 0c4.667 0 8.667 1.667 12 5\n3.333 2.667 6.667 9 10 19 6.667 24.667 20.333 43.667 41 57 7.333 4.667 11\n10.667 11 18 0 6-1 10-3 12s-6.667 5-14 9c-28.667 14.667-53.667 35.667-75 63\n-1.333 1.333-3.167 3.5-5.5 6.5s-4 4.833-5 5.5c-1 .667-2.5 1.333-4.5 2s-4.333 1\n-7 1c-4.667 0-9.167-1.833-13.5-5.5S337 184 337 178c0-12.667 15.667-32.333 47-59\nH213l-171-1c-8.667-6-13-12.333-13-19 0-4.667 4.333-11.333 13-20h359\nc-16-25.333-24-45-24-59z", widehat1: "M529 0h5l519 115c5 1 9 5 9 10 0 1-1 2-1 3l-4 22\nc-1 5-5 9-11 9h-2L532 67 19 159h-2c-5 0-9-4-11-9l-5-22c-1-6 2-12 8-13z", widehat2: "M1181 0h2l1171 176c6 0 10 5 10 11l-2 23c-1 6-5 10\n-11 10h-1L1182 67 15 220h-1c-6 0-10-4-11-10l-2-23c-1-6 4-11 10-11z", widehat3: "M1181 0h2l1171 236c6 0 10 5 10 11l-2 23c-1 6-5 10\n-11 10h-1L1182 67 15 280h-1c-6 0-10-4-11-10l-2-23c-1-6 4-11 10-11z", widehat4: "M1181 0h2l1171 296c6 0 10 5 10 11l-2 23c-1 6-5 10\n-11 10h-1L1182 67 15 340h-1c-6 0-10-4-11-10l-2-23c-1-6 4-11 10-11z", widecheck1: "M529,159h5l519,-115c5,-1,9,-5,9,-10c0,-1,-1,-2,-1,-3l-4,-22c-1,\n-5,-5,-9,-11,-9h-2l-512,92l-513,-92h-2c-5,0,-9,4,-11,9l-5,22c-1,6,2,12,8,13z", widecheck2: "M1181,220h2l1171,-176c6,0,10,-5,10,-11l-2,-23c-1,-6,-5,-10,\n-11,-10h-1l-1168,153l-1167,-153h-1c-6,0,-10,4,-11,10l-2,23c-1,6,4,11,10,11z", widecheck3: 
"M1181,280h2l1171,-236c6,0,10,-5,10,-11l-2,-23c-1,-6,-5,-10,\n-11,-10h-1l-1168,213l-1167,-213h-1c-6,0,-10,4,-11,10l-2,23c-1,6,4,11,10,11z", widecheck4: "M1181,340h2l1171,-296c6,0,10,-5,10,-11l-2,-23c-1,-6,-5,-10,\n-11,-10h-1l-1168,273l-1167,-273h-1c-6,0,-10,4,-11,10l-2,23c-1,6,4,11,10,11z", baraboveleftarrow: "M400000 620h-399890l3 -3c68.7 -52.7 113.7 -120 135 -202\nc4 -14.7 6 -23 6 -25c0 -7.3 -7 -11 -21 -11c-8 0 -13.2 0.8 -15.5 2.5\nc-2.3 1.7 -4.2 5.8 -5.5 12.5c-1.3 4.7 -2.7 10.3 -4 17c-12 48.7 -34.8 92 -68.5 130\ns-74.2 66.3 -121.5 85c-10 4 -16 7.7 -18 11c0 8.7 6 14.3 18 17c47.3 18.7 87.8 47\n121.5 85s56.5 81.3 68.5 130c0.7 2 1.3 5 2 9s1.2 6.7 1.5 8c0.3 1.3 1 3.3 2 6\ns2.2 4.5 3.5 5.5c1.3 1 3.3 1.8 6 2.5s6 1 10 1c14 0 21 -3.7 21 -11\nc0 -2 -2 -10.3 -6 -25c-20 -79.3 -65 -146.7 -135 -202l-3 -3h399890z\nM100 620v40h399900v-40z M0 241v40h399900v-40zM0 241v40h399900v-40z", rightarrowabovebar: "M0 241v40h399891c-47.3 35.3-84 78-110 128-16.7 32\n-27.7 63.7-33 95 0 1.3-.2 2.7-.5 4-.3 1.3-.5 2.3-.5 3 0 7.3 6.7 11 20 11 8 0\n13.2-.8 15.5-2.5 2.3-1.7 4.2-5.5 5.5-11.5 2-13.3 5.7-27 11-41 14.7-44.7 39\n-84.5 73-119.5s73.7-60.2 119-75.5c6-2 9-5.7 9-11s-3-9-9-11c-45.3-15.3-85-40.5\n-119-75.5s-58.3-74.8-73-119.5c-4.7-14-8.3-27.3-11-40-1.3-6.7-3.2-10.8-5.5\n-12.5-2.3-1.7-7.5-2.5-15.5-2.5-14 0-21 3.7-21 11 0 2 2 10.3 6 25 20.7 83.3 67\n151.7 139 205zm96 379h399894v40H0zm0 0h399904v40H0z", baraboveshortleftharpoon: "M507,435c-4,4,-6.3,8.7,-7,14c0,5.3,0.7,9,2,11\nc1.3,2,5.3,5.3,12,10c90.7,54,156,130,196,228c3.3,10.7,6.3,16.3,9,17\nc2,0.7,5,1,9,1c0,0,5,0,5,0c10.7,0,16.7,-2,18,-6c2,-2.7,1,-9.7,-3,-21\nc-32,-87.3,-82.7,-157.7,-152,-211c0,0,-3,-3,-3,-3l399351,0l0,-40\nc-398570,0,-399437,0,-399437,0z M593 435 v40 H399500 v-40z\nM0 281 v-40 H399908 v40z M0 281 v-40 H399908 v40z", rightharpoonaboveshortbar: "M0,241 l0,40c399126,0,399993,0,399993,0\nc4.7,-4.7,7,-9.3,7,-14c0,-9.3,-3.7,-15.3,-11,-18c-92.7,-56.7,-159,-133.7,-199,\n-231c-3.3,-9.3,-6,-14.7,-8,-16c-2,-1.3,-7,-2,-15,-2c-10.7,0,-16.7,2,-18,6\nc-2,2.7,-1,9.7,3,21c15.3,42,36.7,81.8,64,119.5c27.3,37.7,58,69.2,92,94.5z\nM0 241 v40 H399908 v-40z M0 475 v-40 H399500 v40z M0 475 v-40 H399500 v40z", shortbaraboveleftharpoon: "M7,435c-4,4,-6.3,8.7,-7,14c0,5.3,0.7,9,2,11\nc1.3,2,5.3,5.3,12,10c90.7,54,156,130,196,228c3.3,10.7,6.3,16.3,9,17c2,0.7,5,1,9,\n1c0,0,5,0,5,0c10.7,0,16.7,-2,18,-6c2,-2.7,1,-9.7,-3,-21c-32,-87.3,-82.7,-157.7,\n-152,-211c0,0,-3,-3,-3,-3l399907,0l0,-40c-399126,0,-399993,0,-399993,0z\nM93 435 v40 H400000 v-40z M500 241 v40 H400000 v-40z M500 241 v40 H400000 v-40z", shortrightharpoonabovebar: "M53,241l0,40c398570,0,399437,0,399437,0\nc4.7,-4.7,7,-9.3,7,-14c0,-9.3,-3.7,-15.3,-11,-18c-92.7,-56.7,-159,-133.7,-199,\n-231c-3.3,-9.3,-6,-14.7,-8,-16c-2,-1.3,-7,-2,-15,-2c-10.7,0,-16.7,2,-18,6\nc-2,2.7,-1,9.7,3,21c15.3,42,36.7,81.8,64,119.5c27.3,37.7,58,69.2,92,94.5z\nM500 241 v40 H399408 v-40z M500 435 v40 H400000 v-40z" }; class A { constructor(e) { this.children = void 0, this.classes = void 0, this.height = void 0, this.depth = void 0, this.maxFontSize = void 0, this.style = void 0, this.children = e, this.classes = [], this.height = 0, this.depth = 0, this.maxFontSize = 0, this.style = {} } hasClass(e) { return l.contains(this.classes, e) } toNode() { const e = document.createDocumentFragment(); for (let t = 0; t < this.children.length; t++)e.appendChild(this.children[t].toNode()); return e } toMarkup() { let e = ""; for (let t = 0; t < this.children.length; t++)e += this.children[t].toMarkup(); return e } toText() { return this.children.map((e => 
e.toText())).join("") } } var T = { "AMS-Regular": { 32: [0, 0, 0, 0, .25], 65: [0, .68889, 0, 0, .72222], 66: [0, .68889, 0, 0, .66667], 67: [0, .68889, 0, 0, .72222], 68: [0, .68889, 0, 0, .72222], 69: [0, .68889, 0, 0, .66667], 70: [0, .68889, 0, 0, .61111], 71: [0, .68889, 0, 0, .77778], 72: [0, .68889, 0, 0, .77778], 73: [0, .68889, 0, 0, .38889], 74: [.16667, .68889, 0, 0, .5], 75: [0, .68889, 0, 0, .77778], 76: [0, .68889, 0, 0, .66667], 77: [0, .68889, 0, 0, .94445], 78: [0, .68889, 0, 0, .72222], 79: [.16667, .68889, 0, 0, .77778], 80: [0, .68889, 0, 0, .61111], 81: [.16667, .68889, 0, 0, .77778], 82: [0, .68889, 0, 0, .72222], 83: [0, .68889, 0, 0, .55556], 84: [0, .68889, 0, 0, .66667], 85: [0, .68889, 0, 0, .72222], 86: [0, .68889, 0, 0, .72222], 87: [0, .68889, 0, 0, 1], 88: [0, .68889, 0, 0, .72222], 89: [0, .68889, 0, 0, .72222], 90: [0, .68889, 0, 0, .66667], 107: [0, .68889, 0, 0, .55556], 160: [0, 0, 0, 0, .25], 165: [0, .675, .025, 0, .75], 174: [.15559, .69224, 0, 0, .94666], 240: [0, .68889, 0, 0, .55556], 295: [0, .68889, 0, 0, .54028], 710: [0, .825, 0, 0, 2.33334], 732: [0, .9, 0, 0, 2.33334], 770: [0, .825, 0, 0, 2.33334], 771: [0, .9, 0, 0, 2.33334], 989: [.08167, .58167, 0, 0, .77778], 1008: [0, .43056, .04028, 0, .66667], 8245: [0, .54986, 0, 0, .275], 8463: [0, .68889, 0, 0, .54028], 8487: [0, .68889, 0, 0, .72222], 8498: [0, .68889, 0, 0, .55556], 8502: [0, .68889, 0, 0, .66667], 8503: [0, .68889, 0, 0, .44445], 8504: [0, .68889, 0, 0, .66667], 8513: [0, .68889, 0, 0, .63889], 8592: [-.03598, .46402, 0, 0, .5], 8594: [-.03598, .46402, 0, 0, .5], 8602: [-.13313, .36687, 0, 0, 1], 8603: [-.13313, .36687, 0, 0, 1], 8606: [.01354, .52239, 0, 0, 1], 8608: [.01354, .52239, 0, 0, 1], 8610: [.01354, .52239, 0, 0, 1.11111], 8611: [.01354, .52239, 0, 0, 1.11111], 8619: [0, .54986, 0, 0, 1], 8620: [0, .54986, 0, 0, 1], 8621: [-.13313, .37788, 0, 0, 1.38889], 8622: [-.13313, .36687, 0, 0, 1], 8624: [0, .69224, 0, 0, .5], 8625: [0, .69224, 0, 0, .5], 8630: [0, .43056, 0, 0, 1], 8631: [0, .43056, 0, 0, 1], 8634: [.08198, .58198, 0, 0, .77778], 8635: [.08198, .58198, 0, 0, .77778], 8638: [.19444, .69224, 0, 0, .41667], 8639: [.19444, .69224, 0, 0, .41667], 8642: [.19444, .69224, 0, 0, .41667], 8643: [.19444, .69224, 0, 0, .41667], 8644: [.1808, .675, 0, 0, 1], 8646: [.1808, .675, 0, 0, 1], 8647: [.1808, .675, 0, 0, 1], 8648: [.19444, .69224, 0, 0, .83334], 8649: [.1808, .675, 0, 0, 1], 8650: [.19444, .69224, 0, 0, .83334], 8651: [.01354, .52239, 0, 0, 1], 8652: [.01354, .52239, 0, 0, 1], 8653: [-.13313, .36687, 0, 0, 1], 8654: [-.13313, .36687, 0, 0, 1], 8655: [-.13313, .36687, 0, 0, 1], 8666: [.13667, .63667, 0, 0, 1], 8667: [.13667, .63667, 0, 0, 1], 8669: [-.13313, .37788, 0, 0, 1], 8672: [-.064, .437, 0, 0, 1.334], 8674: [-.064, .437, 0, 0, 1.334], 8705: [0, .825, 0, 0, .5], 8708: [0, .68889, 0, 0, .55556], 8709: [.08167, .58167, 0, 0, .77778], 8717: [0, .43056, 0, 0, .42917], 8722: [-.03598, .46402, 0, 0, .5], 8724: [.08198, .69224, 0, 0, .77778], 8726: [.08167, .58167, 0, 0, .77778], 8733: [0, .69224, 0, 0, .77778], 8736: [0, .69224, 0, 0, .72222], 8737: [0, .69224, 0, 0, .72222], 8738: [.03517, .52239, 0, 0, .72222], 8739: [.08167, .58167, 0, 0, .22222], 8740: [.25142, .74111, 0, 0, .27778], 8741: [.08167, .58167, 0, 0, .38889], 8742: [.25142, .74111, 0, 0, .5], 8756: [0, .69224, 0, 0, .66667], 8757: [0, .69224, 0, 0, .66667], 8764: [-.13313, .36687, 0, 0, .77778], 8765: [-.13313, .37788, 0, 0, .77778], 8769: [-.13313, .36687, 0, 0, .77778], 8770: [-.03625, .46375, 0, 
0, .77778], 8774: [.30274, .79383, 0, 0, .77778], 8776: [-.01688, .48312, 0, 0, .77778], 8778: [.08167, .58167, 0, 0, .77778], 8782: [.06062, .54986, 0, 0, .77778], 8783: [.06062, .54986, 0, 0, .77778], 8785: [.08198, .58198, 0, 0, .77778], 8786: [.08198, .58198, 0, 0, .77778], 8787: [.08198, .58198, 0, 0, .77778], 8790: [0, .69224, 0, 0, .77778], 8791: [.22958, .72958, 0, 0, .77778], 8796: [.08198, .91667, 0, 0, .77778], 8806: [.25583, .75583, 0, 0, .77778], 8807: [.25583, .75583, 0, 0, .77778], 8808: [.25142, .75726, 0, 0, .77778], 8809: [.25142, .75726, 0, 0, .77778], 8812: [.25583, .75583, 0, 0, .5], 8814: [.20576, .70576, 0, 0, .77778], 8815: [.20576, .70576, 0, 0, .77778], 8816: [.30274, .79383, 0, 0, .77778], 8817: [.30274, .79383, 0, 0, .77778], 8818: [.22958, .72958, 0, 0, .77778], 8819: [.22958, .72958, 0, 0, .77778], 8822: [.1808, .675, 0, 0, .77778], 8823: [.1808, .675, 0, 0, .77778], 8828: [.13667, .63667, 0, 0, .77778], 8829: [.13667, .63667, 0, 0, .77778], 8830: [.22958, .72958, 0, 0, .77778], 8831: [.22958, .72958, 0, 0, .77778], 8832: [.20576, .70576, 0, 0, .77778], 8833: [.20576, .70576, 0, 0, .77778], 8840: [.30274, .79383, 0, 0, .77778], 8841: [.30274, .79383, 0, 0, .77778], 8842: [.13597, .63597, 0, 0, .77778], 8843: [.13597, .63597, 0, 0, .77778], 8847: [.03517, .54986, 0, 0, .77778], 8848: [.03517, .54986, 0, 0, .77778], 8858: [.08198, .58198, 0, 0, .77778], 8859: [.08198, .58198, 0, 0, .77778], 8861: [.08198, .58198, 0, 0, .77778], 8862: [0, .675, 0, 0, .77778], 8863: [0, .675, 0, 0, .77778], 8864: [0, .675, 0, 0, .77778], 8865: [0, .675, 0, 0, .77778], 8872: [0, .69224, 0, 0, .61111], 8873: [0, .69224, 0, 0, .72222], 8874: [0, .69224, 0, 0, .88889], 8876: [0, .68889, 0, 0, .61111], 8877: [0, .68889, 0, 0, .61111], 8878: [0, .68889, 0, 0, .72222], 8879: [0, .68889, 0, 0, .72222], 8882: [.03517, .54986, 0, 0, .77778], 8883: [.03517, .54986, 0, 0, .77778], 8884: [.13667, .63667, 0, 0, .77778], 8885: [.13667, .63667, 0, 0, .77778], 8888: [0, .54986, 0, 0, 1.11111], 8890: [.19444, .43056, 0, 0, .55556], 8891: [.19444, .69224, 0, 0, .61111], 8892: [.19444, .69224, 0, 0, .61111], 8901: [0, .54986, 0, 0, .27778], 8903: [.08167, .58167, 0, 0, .77778], 8905: [.08167, .58167, 0, 0, .77778], 8906: [.08167, .58167, 0, 0, .77778], 8907: [0, .69224, 0, 0, .77778], 8908: [0, .69224, 0, 0, .77778], 8909: [-.03598, .46402, 0, 0, .77778], 8910: [0, .54986, 0, 0, .76042], 8911: [0, .54986, 0, 0, .76042], 8912: [.03517, .54986, 0, 0, .77778], 8913: [.03517, .54986, 0, 0, .77778], 8914: [0, .54986, 0, 0, .66667], 8915: [0, .54986, 0, 0, .66667], 8916: [0, .69224, 0, 0, .66667], 8918: [.0391, .5391, 0, 0, .77778], 8919: [.0391, .5391, 0, 0, .77778], 8920: [.03517, .54986, 0, 0, 1.33334], 8921: [.03517, .54986, 0, 0, 1.33334], 8922: [.38569, .88569, 0, 0, .77778], 8923: [.38569, .88569, 0, 0, .77778], 8926: [.13667, .63667, 0, 0, .77778], 8927: [.13667, .63667, 0, 0, .77778], 8928: [.30274, .79383, 0, 0, .77778], 8929: [.30274, .79383, 0, 0, .77778], 8934: [.23222, .74111, 0, 0, .77778], 8935: [.23222, .74111, 0, 0, .77778], 8936: [.23222, .74111, 0, 0, .77778], 8937: [.23222, .74111, 0, 0, .77778], 8938: [.20576, .70576, 0, 0, .77778], 8939: [.20576, .70576, 0, 0, .77778], 8940: [.30274, .79383, 0, 0, .77778], 8941: [.30274, .79383, 0, 0, .77778], 8994: [.19444, .69224, 0, 0, .77778], 8995: [.19444, .69224, 0, 0, .77778], 9416: [.15559, .69224, 0, 0, .90222], 9484: [0, .69224, 0, 0, .5], 9488: [0, .69224, 0, 0, .5], 9492: [0, .37788, 0, 0, .5], 9496: [0, .37788, 0, 0, .5], 9585: [.19444, 
.68889, 0, 0, .88889], 9586: [.19444, .74111, 0, 0, .88889], 9632: [0, .675, 0, 0, .77778], 9633: [0, .675, 0, 0, .77778], 9650: [0, .54986, 0, 0, .72222], 9651: [0, .54986, 0, 0, .72222], 9654: [.03517, .54986, 0, 0, .77778], 9660: [0, .54986, 0, 0, .72222], 9661: [0, .54986, 0, 0, .72222], 9664: [.03517, .54986, 0, 0, .77778], 9674: [.11111, .69224, 0, 0, .66667], 9733: [.19444, .69224, 0, 0, .94445], 10003: [0, .69224, 0, 0, .83334], 10016: [0, .69224, 0, 0, .83334], 10731: [.11111, .69224, 0, 0, .66667], 10846: [.19444, .75583, 0, 0, .61111], 10877: [.13667, .63667, 0, 0, .77778], 10878: [.13667, .63667, 0, 0, .77778], 10885: [.25583, .75583, 0, 0, .77778], 10886: [.25583, .75583, 0, 0, .77778], 10887: [.13597, .63597, 0, 0, .77778], 10888: [.13597, .63597, 0, 0, .77778], 10889: [.26167, .75726, 0, 0, .77778], 10890: [.26167, .75726, 0, 0, .77778], 10891: [.48256, .98256, 0, 0, .77778], 10892: [.48256, .98256, 0, 0, .77778], 10901: [.13667, .63667, 0, 0, .77778], 10902: [.13667, .63667, 0, 0, .77778], 10933: [.25142, .75726, 0, 0, .77778], 10934: [.25142, .75726, 0, 0, .77778], 10935: [.26167, .75726, 0, 0, .77778], 10936: [.26167, .75726, 0, 0, .77778], 10937: [.26167, .75726, 0, 0, .77778], 10938: [.26167, .75726, 0, 0, .77778], 10949: [.25583, .75583, 0, 0, .77778], 10950: [.25583, .75583, 0, 0, .77778], 10955: [.28481, .79383, 0, 0, .77778], 10956: [.28481, .79383, 0, 0, .77778], 57350: [.08167, .58167, 0, 0, .22222], 57351: [.08167, .58167, 0, 0, .38889], 57352: [.08167, .58167, 0, 0, .77778], 57353: [0, .43056, .04028, 0, .66667], 57356: [.25142, .75726, 0, 0, .77778], 57357: [.25142, .75726, 0, 0, .77778], 57358: [.41951, .91951, 0, 0, .77778], 57359: [.30274, .79383, 0, 0, .77778], 57360: [.30274, .79383, 0, 0, .77778], 57361: [.41951, .91951, 0, 0, .77778], 57366: [.25142, .75726, 0, 0, .77778], 57367: [.25142, .75726, 0, 0, .77778], 57368: [.25142, .75726, 0, 0, .77778], 57369: [.25142, .75726, 0, 0, .77778], 57370: [.13597, .63597, 0, 0, .77778], 57371: [.13597, .63597, 0, 0, .77778] }, "Caligraphic-Regular": { 32: [0, 0, 0, 0, .25], 65: [0, .68333, 0, .19445, .79847], 66: [0, .68333, .03041, .13889, .65681], 67: [0, .68333, .05834, .13889, .52653], 68: [0, .68333, .02778, .08334, .77139], 69: [0, .68333, .08944, .11111, .52778], 70: [0, .68333, .09931, .11111, .71875], 71: [.09722, .68333, .0593, .11111, .59487], 72: [0, .68333, .00965, .11111, .84452], 73: [0, .68333, .07382, 0, .54452], 74: [.09722, .68333, .18472, .16667, .67778], 75: [0, .68333, .01445, .05556, .76195], 76: [0, .68333, 0, .13889, .68972], 77: [0, .68333, 0, .13889, 1.2009], 78: [0, .68333, .14736, .08334, .82049], 79: [0, .68333, .02778, .11111, .79611], 80: [0, .68333, .08222, .08334, .69556], 81: [.09722, .68333, 0, .11111, .81667], 82: [0, .68333, 0, .08334, .8475], 83: [0, .68333, .075, .13889, .60556], 84: [0, .68333, .25417, 0, .54464], 85: [0, .68333, .09931, .08334, .62583], 86: [0, .68333, .08222, 0, .61278], 87: [0, .68333, .08222, .08334, .98778], 88: [0, .68333, .14643, .13889, .7133], 89: [.09722, .68333, .08222, .08334, .66834], 90: [0, .68333, .07944, .13889, .72473], 160: [0, 0, 0, 0, .25] }, "Fraktur-Regular": { 32: [0, 0, 0, 0, .25], 33: [0, .69141, 0, 0, .29574], 34: [0, .69141, 0, 0, .21471], 38: [0, .69141, 0, 0, .73786], 39: [0, .69141, 0, 0, .21201], 40: [.24982, .74947, 0, 0, .38865], 41: [.24982, .74947, 0, 0, .38865], 42: [0, .62119, 0, 0, .27764], 43: [.08319, .58283, 0, 0, .75623], 44: [0, .10803, 0, 0, .27764], 45: [.08319, .58283, 0, 0, .75623], 46: [0, .10803, 0, 0, 
.27764], 47: [.24982, .74947, 0, 0, .50181], 48: [0, .47534, 0, 0, .50181], 49: [0, .47534, 0, 0, .50181], 50: [0, .47534, 0, 0, .50181], 51: [.18906, .47534, 0, 0, .50181], 52: [.18906, .47534, 0, 0, .50181], 53: [.18906, .47534, 0, 0, .50181], 54: [0, .69141, 0, 0, .50181], 55: [.18906, .47534, 0, 0, .50181], 56: [0, .69141, 0, 0, .50181], 57: [.18906, .47534, 0, 0, .50181], 58: [0, .47534, 0, 0, .21606], 59: [.12604, .47534, 0, 0, .21606], 61: [-.13099, .36866, 0, 0, .75623], 63: [0, .69141, 0, 0, .36245], 65: [0, .69141, 0, 0, .7176], 66: [0, .69141, 0, 0, .88397], 67: [0, .69141, 0, 0, .61254], 68: [0, .69141, 0, 0, .83158], 69: [0, .69141, 0, 0, .66278], 70: [.12604, .69141, 0, 0, .61119], 71: [0, .69141, 0, 0, .78539], 72: [.06302, .69141, 0, 0, .7203], 73: [0, .69141, 0, 0, .55448], 74: [.12604, .69141, 0, 0, .55231], 75: [0, .69141, 0, 0, .66845], 76: [0, .69141, 0, 0, .66602], 77: [0, .69141, 0, 0, 1.04953], 78: [0, .69141, 0, 0, .83212], 79: [0, .69141, 0, 0, .82699], 80: [.18906, .69141, 0, 0, .82753], 81: [.03781, .69141, 0, 0, .82699], 82: [0, .69141, 0, 0, .82807], 83: [0, .69141, 0, 0, .82861], 84: [0, .69141, 0, 0, .66899], 85: [0, .69141, 0, 0, .64576], 86: [0, .69141, 0, 0, .83131], 87: [0, .69141, 0, 0, 1.04602], 88: [0, .69141, 0, 0, .71922], 89: [.18906, .69141, 0, 0, .83293], 90: [.12604, .69141, 0, 0, .60201], 91: [.24982, .74947, 0, 0, .27764], 93: [.24982, .74947, 0, 0, .27764], 94: [0, .69141, 0, 0, .49965], 97: [0, .47534, 0, 0, .50046], 98: [0, .69141, 0, 0, .51315], 99: [0, .47534, 0, 0, .38946], 100: [0, .62119, 0, 0, .49857], 101: [0, .47534, 0, 0, .40053], 102: [.18906, .69141, 0, 0, .32626], 103: [.18906, .47534, 0, 0, .5037], 104: [.18906, .69141, 0, 0, .52126], 105: [0, .69141, 0, 0, .27899], 106: [0, .69141, 0, 0, .28088], 107: [0, .69141, 0, 0, .38946], 108: [0, .69141, 0, 0, .27953], 109: [0, .47534, 0, 0, .76676], 110: [0, .47534, 0, 0, .52666], 111: [0, .47534, 0, 0, .48885], 112: [.18906, .52396, 0, 0, .50046], 113: [.18906, .47534, 0, 0, .48912], 114: [0, .47534, 0, 0, .38919], 115: [0, .47534, 0, 0, .44266], 116: [0, .62119, 0, 0, .33301], 117: [0, .47534, 0, 0, .5172], 118: [0, .52396, 0, 0, .5118], 119: [0, .52396, 0, 0, .77351], 120: [.18906, .47534, 0, 0, .38865], 121: [.18906, .47534, 0, 0, .49884], 122: [.18906, .47534, 0, 0, .39054], 160: [0, 0, 0, 0, .25], 8216: [0, .69141, 0, 0, .21471], 8217: [0, .69141, 0, 0, .21471], 58112: [0, .62119, 0, 0, .49749], 58113: [0, .62119, 0, 0, .4983], 58114: [.18906, .69141, 0, 0, .33328], 58115: [.18906, .69141, 0, 0, .32923], 58116: [.18906, .47534, 0, 0, .50343], 58117: [0, .69141, 0, 0, .33301], 58118: [0, .62119, 0, 0, .33409], 58119: [0, .47534, 0, 0, .50073] }, "Main-Bold": { 32: [0, 0, 0, 0, .25], 33: [0, .69444, 0, 0, .35], 34: [0, .69444, 0, 0, .60278], 35: [.19444, .69444, 0, 0, .95833], 36: [.05556, .75, 0, 0, .575], 37: [.05556, .75, 0, 0, .95833], 38: [0, .69444, 0, 0, .89444], 39: [0, .69444, 0, 0, .31944], 40: [.25, .75, 0, 0, .44722], 41: [.25, .75, 0, 0, .44722], 42: [0, .75, 0, 0, .575], 43: [.13333, .63333, 0, 0, .89444], 44: [.19444, .15556, 0, 0, .31944], 45: [0, .44444, 0, 0, .38333], 46: [0, .15556, 0, 0, .31944], 47: [.25, .75, 0, 0, .575], 48: [0, .64444, 0, 0, .575], 49: [0, .64444, 0, 0, .575], 50: [0, .64444, 0, 0, .575], 51: [0, .64444, 0, 0, .575], 52: [0, .64444, 0, 0, .575], 53: [0, .64444, 0, 0, .575], 54: [0, .64444, 0, 0, .575], 55: [0, .64444, 0, 0, .575], 56: [0, .64444, 0, 0, .575], 57: [0, .64444, 0, 0, .575], 58: [0, .44444, 0, 0, .31944], 59: [.19444, .44444, 0, 
0, .31944], 60: [.08556, .58556, 0, 0, .89444], 61: [-.10889, .39111, 0, 0, .89444], 62: [.08556, .58556, 0, 0, .89444], 63: [0, .69444, 0, 0, .54305], 64: [0, .69444, 0, 0, .89444], 65: [0, .68611, 0, 0, .86944], 66: [0, .68611, 0, 0, .81805], 67: [0, .68611, 0, 0, .83055], 68: [0, .68611, 0, 0, .88194], 69: [0, .68611, 0, 0, .75555], 70: [0, .68611, 0, 0, .72361], 71: [0, .68611, 0, 0, .90416], 72: [0, .68611, 0, 0, .9], 73: [0, .68611, 0, 0, .43611], 74: [0, .68611, 0, 0, .59444], 75: [0, .68611, 0, 0, .90138], 76: [0, .68611, 0, 0, .69166], 77: [0, .68611, 0, 0, 1.09166], 78: [0, .68611, 0, 0, .9], 79: [0, .68611, 0, 0, .86388], 80: [0, .68611, 0, 0, .78611], 81: [.19444, .68611, 0, 0, .86388], 82: [0, .68611, 0, 0, .8625], 83: [0, .68611, 0, 0, .63889], 84: [0, .68611, 0, 0, .8], 85: [0, .68611, 0, 0, .88472], 86: [0, .68611, .01597, 0, .86944], 87: [0, .68611, .01597, 0, 1.18888], 88: [0, .68611, 0, 0, .86944], 89: [0, .68611, .02875, 0, .86944], 90: [0, .68611, 0, 0, .70277], 91: [.25, .75, 0, 0, .31944], 92: [.25, .75, 0, 0, .575], 93: [.25, .75, 0, 0, .31944], 94: [0, .69444, 0, 0, .575], 95: [.31, .13444, .03194, 0, .575], 97: [0, .44444, 0, 0, .55902], 98: [0, .69444, 0, 0, .63889], 99: [0, .44444, 0, 0, .51111], 100: [0, .69444, 0, 0, .63889], 101: [0, .44444, 0, 0, .52708], 102: [0, .69444, .10903, 0, .35139], 103: [.19444, .44444, .01597, 0, .575], 104: [0, .69444, 0, 0, .63889], 105: [0, .69444, 0, 0, .31944], 106: [.19444, .69444, 0, 0, .35139], 107: [0, .69444, 0, 0, .60694], 108: [0, .69444, 0, 0, .31944], 109: [0, .44444, 0, 0, .95833], 110: [0, .44444, 0, 0, .63889], 111: [0, .44444, 0, 0, .575], 112: [.19444, .44444, 0, 0, .63889], 113: [.19444, .44444, 0, 0, .60694], 114: [0, .44444, 0, 0, .47361], 115: [0, .44444, 0, 0, .45361], 116: [0, .63492, 0, 0, .44722], 117: [0, .44444, 0, 0, .63889], 118: [0, .44444, .01597, 0, .60694], 119: [0, .44444, .01597, 0, .83055], 120: [0, .44444, 0, 0, .60694], 121: [.19444, .44444, .01597, 0, .60694], 122: [0, .44444, 0, 0, .51111], 123: [.25, .75, 0, 0, .575], 124: [.25, .75, 0, 0, .31944], 125: [.25, .75, 0, 0, .575], 126: [.35, .34444, 0, 0, .575], 160: [0, 0, 0, 0, .25], 163: [0, .69444, 0, 0, .86853], 168: [0, .69444, 0, 0, .575], 172: [0, .44444, 0, 0, .76666], 176: [0, .69444, 0, 0, .86944], 177: [.13333, .63333, 0, 0, .89444], 184: [.17014, 0, 0, 0, .51111], 198: [0, .68611, 0, 0, 1.04166], 215: [.13333, .63333, 0, 0, .89444], 216: [.04861, .73472, 0, 0, .89444], 223: [0, .69444, 0, 0, .59722], 230: [0, .44444, 0, 0, .83055], 247: [.13333, .63333, 0, 0, .89444], 248: [.09722, .54167, 0, 0, .575], 305: [0, .44444, 0, 0, .31944], 338: [0, .68611, 0, 0, 1.16944], 339: [0, .44444, 0, 0, .89444], 567: [.19444, .44444, 0, 0, .35139], 710: [0, .69444, 0, 0, .575], 711: [0, .63194, 0, 0, .575], 713: [0, .59611, 0, 0, .575], 714: [0, .69444, 0, 0, .575], 715: [0, .69444, 0, 0, .575], 728: [0, .69444, 0, 0, .575], 729: [0, .69444, 0, 0, .31944], 730: [0, .69444, 0, 0, .86944], 732: [0, .69444, 0, 0, .575], 733: [0, .69444, 0, 0, .575], 915: [0, .68611, 0, 0, .69166], 916: [0, .68611, 0, 0, .95833], 920: [0, .68611, 0, 0, .89444], 923: [0, .68611, 0, 0, .80555], 926: [0, .68611, 0, 0, .76666], 928: [0, .68611, 0, 0, .9], 931: [0, .68611, 0, 0, .83055], 933: [0, .68611, 0, 0, .89444], 934: [0, .68611, 0, 0, .83055], 936: [0, .68611, 0, 0, .89444], 937: [0, .68611, 0, 0, .83055], 8211: [0, .44444, .03194, 0, .575], 8212: [0, .44444, .03194, 0, 1.14999], 8216: [0, .69444, 0, 0, .31944], 8217: [0, .69444, 0, 0, .31944], 8220: [0, .69444, 
0, 0, .60278], 8221: [0, .69444, 0, 0, .60278], 8224: [.19444, .69444, 0, 0, .51111], 8225: [.19444, .69444, 0, 0, .51111], 8242: [0, .55556, 0, 0, .34444], 8407: [0, .72444, .15486, 0, .575], 8463: [0, .69444, 0, 0, .66759], 8465: [0, .69444, 0, 0, .83055], 8467: [0, .69444, 0, 0, .47361], 8472: [.19444, .44444, 0, 0, .74027], 8476: [0, .69444, 0, 0, .83055], 8501: [0, .69444, 0, 0, .70277], 8592: [-.10889, .39111, 0, 0, 1.14999], 8593: [.19444, .69444, 0, 0, .575], 8594: [-.10889, .39111, 0, 0, 1.14999], 8595: [.19444, .69444, 0, 0, .575], 8596: [-.10889, .39111, 0, 0, 1.14999], 8597: [.25, .75, 0, 0, .575], 8598: [.19444, .69444, 0, 0, 1.14999], 8599: [.19444, .69444, 0, 0, 1.14999], 8600: [.19444, .69444, 0, 0, 1.14999], 8601: [.19444, .69444, 0, 0, 1.14999], 8636: [-.10889, .39111, 0, 0, 1.14999], 8637: [-.10889, .39111, 0, 0, 1.14999], 8640: [-.10889, .39111, 0, 0, 1.14999], 8641: [-.10889, .39111, 0, 0, 1.14999], 8656: [-.10889, .39111, 0, 0, 1.14999], 8657: [.19444, .69444, 0, 0, .70277], 8658: [-.10889, .39111, 0, 0, 1.14999], 8659: [.19444, .69444, 0, 0, .70277], 8660: [-.10889, .39111, 0, 0, 1.14999], 8661: [.25, .75, 0, 0, .70277], 8704: [0, .69444, 0, 0, .63889], 8706: [0, .69444, .06389, 0, .62847], 8707: [0, .69444, 0, 0, .63889], 8709: [.05556, .75, 0, 0, .575], 8711: [0, .68611, 0, 0, .95833], 8712: [.08556, .58556, 0, 0, .76666], 8715: [.08556, .58556, 0, 0, .76666], 8722: [.13333, .63333, 0, 0, .89444], 8723: [.13333, .63333, 0, 0, .89444], 8725: [.25, .75, 0, 0, .575], 8726: [.25, .75, 0, 0, .575], 8727: [-.02778, .47222, 0, 0, .575], 8728: [-.02639, .47361, 0, 0, .575], 8729: [-.02639, .47361, 0, 0, .575], 8730: [.18, .82, 0, 0, .95833], 8733: [0, .44444, 0, 0, .89444], 8734: [0, .44444, 0, 0, 1.14999], 8736: [0, .69224, 0, 0, .72222], 8739: [.25, .75, 0, 0, .31944], 8741: [.25, .75, 0, 0, .575], 8743: [0, .55556, 0, 0, .76666], 8744: [0, .55556, 0, 0, .76666], 8745: [0, .55556, 0, 0, .76666], 8746: [0, .55556, 0, 0, .76666], 8747: [.19444, .69444, .12778, 0, .56875], 8764: [-.10889, .39111, 0, 0, .89444], 8768: [.19444, .69444, 0, 0, .31944], 8771: [.00222, .50222, 0, 0, .89444], 8773: [.027, .638, 0, 0, .894], 8776: [.02444, .52444, 0, 0, .89444], 8781: [.00222, .50222, 0, 0, .89444], 8801: [.00222, .50222, 0, 0, .89444], 8804: [.19667, .69667, 0, 0, .89444], 8805: [.19667, .69667, 0, 0, .89444], 8810: [.08556, .58556, 0, 0, 1.14999], 8811: [.08556, .58556, 0, 0, 1.14999], 8826: [.08556, .58556, 0, 0, .89444], 8827: [.08556, .58556, 0, 0, .89444], 8834: [.08556, .58556, 0, 0, .89444], 8835: [.08556, .58556, 0, 0, .89444], 8838: [.19667, .69667, 0, 0, .89444], 8839: [.19667, .69667, 0, 0, .89444], 8846: [0, .55556, 0, 0, .76666], 8849: [.19667, .69667, 0, 0, .89444], 8850: [.19667, .69667, 0, 0, .89444], 8851: [0, .55556, 0, 0, .76666], 8852: [0, .55556, 0, 0, .76666], 8853: [.13333, .63333, 0, 0, .89444], 8854: [.13333, .63333, 0, 0, .89444], 8855: [.13333, .63333, 0, 0, .89444], 8856: [.13333, .63333, 0, 0, .89444], 8857: [.13333, .63333, 0, 0, .89444], 8866: [0, .69444, 0, 0, .70277], 8867: [0, .69444, 0, 0, .70277], 8868: [0, .69444, 0, 0, .89444], 8869: [0, .69444, 0, 0, .89444], 8900: [-.02639, .47361, 0, 0, .575], 8901: [-.02639, .47361, 0, 0, .31944], 8902: [-.02778, .47222, 0, 0, .575], 8968: [.25, .75, 0, 0, .51111], 8969: [.25, .75, 0, 0, .51111], 8970: [.25, .75, 0, 0, .51111], 8971: [.25, .75, 0, 0, .51111], 8994: [-.13889, .36111, 0, 0, 1.14999], 8995: [-.13889, .36111, 0, 0, 1.14999], 9651: [.19444, .69444, 0, 0, 1.02222], 9657: [-.02778, .47222, 0, 0, 
.575], 9661: [.19444, .69444, 0, 0, 1.02222], 9667: [-.02778, .47222, 0, 0, .575], 9711: [.19444, .69444, 0, 0, 1.14999], 9824: [.12963, .69444, 0, 0, .89444], 9825: [.12963, .69444, 0, 0, .89444], 9826: [.12963, .69444, 0, 0, .89444], 9827: [.12963, .69444, 0, 0, .89444], 9837: [0, .75, 0, 0, .44722], 9838: [.19444, .69444, 0, 0, .44722], 9839: [.19444, .69444, 0, 0, .44722], 10216: [.25, .75, 0, 0, .44722], 10217: [.25, .75, 0, 0, .44722], 10815: [0, .68611, 0, 0, .9], 10927: [.19667, .69667, 0, 0, .89444], 10928: [.19667, .69667, 0, 0, .89444], 57376: [.19444, .69444, 0, 0, 0] }, "Main-BoldItalic": { 32: [0, 0, 0, 0, .25], 33: [0, .69444, .11417, 0, .38611], 34: [0, .69444, .07939, 0, .62055], 35: [.19444, .69444, .06833, 0, .94444], 37: [.05556, .75, .12861, 0, .94444], 38: [0, .69444, .08528, 0, .88555], 39: [0, .69444, .12945, 0, .35555], 40: [.25, .75, .15806, 0, .47333], 41: [.25, .75, .03306, 0, .47333], 42: [0, .75, .14333, 0, .59111], 43: [.10333, .60333, .03306, 0, .88555], 44: [.19444, .14722, 0, 0, .35555], 45: [0, .44444, .02611, 0, .41444], 46: [0, .14722, 0, 0, .35555], 47: [.25, .75, .15806, 0, .59111], 48: [0, .64444, .13167, 0, .59111], 49: [0, .64444, .13167, 0, .59111], 50: [0, .64444, .13167, 0, .59111], 51: [0, .64444, .13167, 0, .59111], 52: [.19444, .64444, .13167, 0, .59111], 53: [0, .64444, .13167, 0, .59111], 54: [0, .64444, .13167, 0, .59111], 55: [.19444, .64444, .13167, 0, .59111], 56: [0, .64444, .13167, 0, .59111], 57: [0, .64444, .13167, 0, .59111], 58: [0, .44444, .06695, 0, .35555], 59: [.19444, .44444, .06695, 0, .35555], 61: [-.10889, .39111, .06833, 0, .88555], 63: [0, .69444, .11472, 0, .59111], 64: [0, .69444, .09208, 0, .88555], 65: [0, .68611, 0, 0, .86555], 66: [0, .68611, .0992, 0, .81666], 67: [0, .68611, .14208, 0, .82666], 68: [0, .68611, .09062, 0, .87555], 69: [0, .68611, .11431, 0, .75666], 70: [0, .68611, .12903, 0, .72722], 71: [0, .68611, .07347, 0, .89527], 72: [0, .68611, .17208, 0, .8961], 73: [0, .68611, .15681, 0, .47166], 74: [0, .68611, .145, 0, .61055], 75: [0, .68611, .14208, 0, .89499], 76: [0, .68611, 0, 0, .69777], 77: [0, .68611, .17208, 0, 1.07277], 78: [0, .68611, .17208, 0, .8961], 79: [0, .68611, .09062, 0, .85499], 80: [0, .68611, .0992, 0, .78721], 81: [.19444, .68611, .09062, 0, .85499], 82: [0, .68611, .02559, 0, .85944], 83: [0, .68611, .11264, 0, .64999], 84: [0, .68611, .12903, 0, .7961], 85: [0, .68611, .17208, 0, .88083], 86: [0, .68611, .18625, 0, .86555], 87: [0, .68611, .18625, 0, 1.15999], 88: [0, .68611, .15681, 0, .86555], 89: [0, .68611, .19803, 0, .86555], 90: [0, .68611, .14208, 0, .70888], 91: [.25, .75, .1875, 0, .35611], 93: [.25, .75, .09972, 0, .35611], 94: [0, .69444, .06709, 0, .59111], 95: [.31, .13444, .09811, 0, .59111], 97: [0, .44444, .09426, 0, .59111], 98: [0, .69444, .07861, 0, .53222], 99: [0, .44444, .05222, 0, .53222], 100: [0, .69444, .10861, 0, .59111], 101: [0, .44444, .085, 0, .53222], 102: [.19444, .69444, .21778, 0, .4], 103: [.19444, .44444, .105, 0, .53222], 104: [0, .69444, .09426, 0, .59111], 105: [0, .69326, .11387, 0, .35555], 106: [.19444, .69326, .1672, 0, .35555], 107: [0, .69444, .11111, 0, .53222], 108: [0, .69444, .10861, 0, .29666], 109: [0, .44444, .09426, 0, .94444], 110: [0, .44444, .09426, 0, .64999], 111: [0, .44444, .07861, 0, .59111], 112: [.19444, .44444, .07861, 0, .59111], 113: [.19444, .44444, .105, 0, .53222], 114: [0, .44444, .11111, 0, .50167], 115: [0, .44444, .08167, 0, .48694], 116: [0, .63492, .09639, 0, .385], 117: [0, .44444, .09426, 0, .62055], 
118: [0, .44444, .11111, 0, .53222], 119: [0, .44444, .11111, 0, .76777], 120: [0, .44444, .12583, 0, .56055], 121: [.19444, .44444, .105, 0, .56166], 122: [0, .44444, .13889, 0, .49055], 126: [.35, .34444, .11472, 0, .59111], 160: [0, 0, 0, 0, .25], 168: [0, .69444, .11473, 0, .59111], 176: [0, .69444, 0, 0, .94888], 184: [.17014, 0, 0, 0, .53222], 198: [0, .68611, .11431, 0, 1.02277], 216: [.04861, .73472, .09062, 0, .88555], 223: [.19444, .69444, .09736, 0, .665], 230: [0, .44444, .085, 0, .82666], 248: [.09722, .54167, .09458, 0, .59111], 305: [0, .44444, .09426, 0, .35555], 338: [0, .68611, .11431, 0, 1.14054], 339: [0, .44444, .085, 0, .82666], 567: [.19444, .44444, .04611, 0, .385], 710: [0, .69444, .06709, 0, .59111], 711: [0, .63194, .08271, 0, .59111], 713: [0, .59444, .10444, 0, .59111], 714: [0, .69444, .08528, 0, .59111], 715: [0, .69444, 0, 0, .59111], 728: [0, .69444, .10333, 0, .59111], 729: [0, .69444, .12945, 0, .35555], 730: [0, .69444, 0, 0, .94888], 732: [0, .69444, .11472, 0, .59111], 733: [0, .69444, .11472, 0, .59111], 915: [0, .68611, .12903, 0, .69777], 916: [0, .68611, 0, 0, .94444], 920: [0, .68611, .09062, 0, .88555], 923: [0, .68611, 0, 0, .80666], 926: [0, .68611, .15092, 0, .76777], 928: [0, .68611, .17208, 0, .8961], 931: [0, .68611, .11431, 0, .82666], 933: [0, .68611, .10778, 0, .88555], 934: [0, .68611, .05632, 0, .82666], 936: [0, .68611, .10778, 0, .88555], 937: [0, .68611, .0992, 0, .82666], 8211: [0, .44444, .09811, 0, .59111], 8212: [0, .44444, .09811, 0, 1.18221], 8216: [0, .69444, .12945, 0, .35555], 8217: [0, .69444, .12945, 0, .35555], 8220: [0, .69444, .16772, 0, .62055], 8221: [0, .69444, .07939, 0, .62055] }, "Main-Italic": { 32: [0, 0, 0, 0, .25], 33: [0, .69444, .12417, 0, .30667], 34: [0, .69444, .06961, 0, .51444], 35: [.19444, .69444, .06616, 0, .81777], 37: [.05556, .75, .13639, 0, .81777], 38: [0, .69444, .09694, 0, .76666], 39: [0, .69444, .12417, 0, .30667], 40: [.25, .75, .16194, 0, .40889], 41: [.25, .75, .03694, 0, .40889], 42: [0, .75, .14917, 0, .51111], 43: [.05667, .56167, .03694, 0, .76666], 44: [.19444, .10556, 0, 0, .30667], 45: [0, .43056, .02826, 0, .35778], 46: [0, .10556, 0, 0, .30667], 47: [.25, .75, .16194, 0, .51111], 48: [0, .64444, .13556, 0, .51111], 49: [0, .64444, .13556, 0, .51111], 50: [0, .64444, .13556, 0, .51111], 51: [0, .64444, .13556, 0, .51111], 52: [.19444, .64444, .13556, 0, .51111], 53: [0, .64444, .13556, 0, .51111], 54: [0, .64444, .13556, 0, .51111], 55: [.19444, .64444, .13556, 0, .51111], 56: [0, .64444, .13556, 0, .51111], 57: [0, .64444, .13556, 0, .51111], 58: [0, .43056, .0582, 0, .30667], 59: [.19444, .43056, .0582, 0, .30667], 61: [-.13313, .36687, .06616, 0, .76666], 63: [0, .69444, .1225, 0, .51111], 64: [0, .69444, .09597, 0, .76666], 65: [0, .68333, 0, 0, .74333], 66: [0, .68333, .10257, 0, .70389], 67: [0, .68333, .14528, 0, .71555], 68: [0, .68333, .09403, 0, .755], 69: [0, .68333, .12028, 0, .67833], 70: [0, .68333, .13305, 0, .65277], 71: [0, .68333, .08722, 0, .77361], 72: [0, .68333, .16389, 0, .74333], 73: [0, .68333, .15806, 0, .38555], 74: [0, .68333, .14028, 0, .525], 75: [0, .68333, .14528, 0, .76888], 76: [0, .68333, 0, 0, .62722], 77: [0, .68333, .16389, 0, .89666], 78: [0, .68333, .16389, 0, .74333], 79: [0, .68333, .09403, 0, .76666], 80: [0, .68333, .10257, 0, .67833], 81: [.19444, .68333, .09403, 0, .76666], 82: [0, .68333, .03868, 0, .72944], 83: [0, .68333, .11972, 0, .56222], 84: [0, .68333, .13305, 0, .71555], 85: [0, .68333, .16389, 0, .74333], 86: [0, .68333, 
.18361, 0, .74333], 87: [0, .68333, .18361, 0, .99888], 88: [0, .68333, .15806, 0, .74333], 89: [0, .68333, .19383, 0, .74333], 90: [0, .68333, .14528, 0, .61333], 91: [.25, .75, .1875, 0, .30667], 93: [.25, .75, .10528, 0, .30667], 94: [0, .69444, .06646, 0, .51111], 95: [.31, .12056, .09208, 0, .51111], 97: [0, .43056, .07671, 0, .51111], 98: [0, .69444, .06312, 0, .46], 99: [0, .43056, .05653, 0, .46], 100: [0, .69444, .10333, 0, .51111], 101: [0, .43056, .07514, 0, .46], 102: [.19444, .69444, .21194, 0, .30667], 103: [.19444, .43056, .08847, 0, .46], 104: [0, .69444, .07671, 0, .51111], 105: [0, .65536, .1019, 0, .30667], 106: [.19444, .65536, .14467, 0, .30667], 107: [0, .69444, .10764, 0, .46], 108: [0, .69444, .10333, 0, .25555], 109: [0, .43056, .07671, 0, .81777], 110: [0, .43056, .07671, 0, .56222], 111: [0, .43056, .06312, 0, .51111], 112: [.19444, .43056, .06312, 0, .51111], 113: [.19444, .43056, .08847, 0, .46], 114: [0, .43056, .10764, 0, .42166], 115: [0, .43056, .08208, 0, .40889], 116: [0, .61508, .09486, 0, .33222], 117: [0, .43056, .07671, 0, .53666], 118: [0, .43056, .10764, 0, .46], 119: [0, .43056, .10764, 0, .66444], 120: [0, .43056, .12042, 0, .46389], 121: [.19444, .43056, .08847, 0, .48555], 122: [0, .43056, .12292, 0, .40889], 126: [.35, .31786, .11585, 0, .51111], 160: [0, 0, 0, 0, .25], 168: [0, .66786, .10474, 0, .51111], 176: [0, .69444, 0, 0, .83129], 184: [.17014, 0, 0, 0, .46], 198: [0, .68333, .12028, 0, .88277], 216: [.04861, .73194, .09403, 0, .76666], 223: [.19444, .69444, .10514, 0, .53666], 230: [0, .43056, .07514, 0, .71555], 248: [.09722, .52778, .09194, 0, .51111], 338: [0, .68333, .12028, 0, .98499], 339: [0, .43056, .07514, 0, .71555], 710: [0, .69444, .06646, 0, .51111], 711: [0, .62847, .08295, 0, .51111], 713: [0, .56167, .10333, 0, .51111], 714: [0, .69444, .09694, 0, .51111], 715: [0, .69444, 0, 0, .51111], 728: [0, .69444, .10806, 0, .51111], 729: [0, .66786, .11752, 0, .30667], 730: [0, .69444, 0, 0, .83129], 732: [0, .66786, .11585, 0, .51111], 733: [0, .69444, .1225, 0, .51111], 915: [0, .68333, .13305, 0, .62722], 916: [0, .68333, 0, 0, .81777], 920: [0, .68333, .09403, 0, .76666], 923: [0, .68333, 0, 0, .69222], 926: [0, .68333, .15294, 0, .66444], 928: [0, .68333, .16389, 0, .74333], 931: [0, .68333, .12028, 0, .71555], 933: [0, .68333, .11111, 0, .76666], 934: [0, .68333, .05986, 0, .71555], 936: [0, .68333, .11111, 0, .76666], 937: [0, .68333, .10257, 0, .71555], 8211: [0, .43056, .09208, 0, .51111], 8212: [0, .43056, .09208, 0, 1.02222], 8216: [0, .69444, .12417, 0, .30667], 8217: [0, .69444, .12417, 0, .30667], 8220: [0, .69444, .1685, 0, .51444], 8221: [0, .69444, .06961, 0, .51444], 8463: [0, .68889, 0, 0, .54028] }, "Main-Regular": { 32: [0, 0, 0, 0, .25], 33: [0, .69444, 0, 0, .27778], 34: [0, .69444, 0, 0, .5], 35: [.19444, .69444, 0, 0, .83334], 36: [.05556, .75, 0, 0, .5], 37: [.05556, .75, 0, 0, .83334], 38: [0, .69444, 0, 0, .77778], 39: [0, .69444, 0, 0, .27778], 40: [.25, .75, 0, 0, .38889], 41: [.25, .75, 0, 0, .38889], 42: [0, .75, 0, 0, .5], 43: [.08333, .58333, 0, 0, .77778], 44: [.19444, .10556, 0, 0, .27778], 45: [0, .43056, 0, 0, .33333], 46: [0, .10556, 0, 0, .27778], 47: [.25, .75, 0, 0, .5], 48: [0, .64444, 0, 0, .5], 49: [0, .64444, 0, 0, .5], 50: [0, .64444, 0, 0, .5], 51: [0, .64444, 0, 0, .5], 52: [0, .64444, 0, 0, .5], 53: [0, .64444, 0, 0, .5], 54: [0, .64444, 0, 0, .5], 55: [0, .64444, 0, 0, .5], 56: [0, .64444, 0, 0, .5], 57: [0, .64444, 0, 0, .5], 58: [0, .43056, 0, 0, .27778], 59: [.19444, .43056, 
0, 0, .27778], 60: [.0391, .5391, 0, 0, .77778], 61: [-.13313, .36687, 0, 0, .77778], 62: [.0391, .5391, 0, 0, .77778], 63: [0, .69444, 0, 0, .47222], 64: [0, .69444, 0, 0, .77778], 65: [0, .68333, 0, 0, .75], 66: [0, .68333, 0, 0, .70834], 67: [0, .68333, 0, 0, .72222], 68: [0, .68333, 0, 0, .76389], 69: [0, .68333, 0, 0, .68056], 70: [0, .68333, 0, 0, .65278], 71: [0, .68333, 0, 0, .78472], 72: [0, .68333, 0, 0, .75], 73: [0, .68333, 0, 0, .36111], 74: [0, .68333, 0, 0, .51389], 75: [0, .68333, 0, 0, .77778], 76: [0, .68333, 0, 0, .625], 77: [0, .68333, 0, 0, .91667], 78: [0, .68333, 0, 0, .75], 79: [0, .68333, 0, 0, .77778], 80: [0, .68333, 0, 0, .68056], 81: [.19444, .68333, 0, 0, .77778], 82: [0, .68333, 0, 0, .73611], 83: [0, .68333, 0, 0, .55556], 84: [0, .68333, 0, 0, .72222], 85: [0, .68333, 0, 0, .75], 86: [0, .68333, .01389, 0, .75], 87: [0, .68333, .01389, 0, 1.02778], 88: [0, .68333, 0, 0, .75], 89: [0, .68333, .025, 0, .75], 90: [0, .68333, 0, 0, .61111], 91: [.25, .75, 0, 0, .27778], 92: [.25, .75, 0, 0, .5], 93: [.25, .75, 0, 0, .27778], 94: [0, .69444, 0, 0, .5], 95: [.31, .12056, .02778, 0, .5], 97: [0, .43056, 0, 0, .5], 98: [0, .69444, 0, 0, .55556], 99: [0, .43056, 0, 0, .44445], 100: [0, .69444, 0, 0, .55556], 101: [0, .43056, 0, 0, .44445], 102: [0, .69444, .07778, 0, .30556], 103: [.19444, .43056, .01389, 0, .5], 104: [0, .69444, 0, 0, .55556], 105: [0, .66786, 0, 0, .27778], 106: [.19444, .66786, 0, 0, .30556], 107: [0, .69444, 0, 0, .52778], 108: [0, .69444, 0, 0, .27778], 109: [0, .43056, 0, 0, .83334], 110: [0, .43056, 0, 0, .55556], 111: [0, .43056, 0, 0, .5], 112: [.19444, .43056, 0, 0, .55556], 113: [.19444, .43056, 0, 0, .52778], 114: [0, .43056, 0, 0, .39167], 115: [0, .43056, 0, 0, .39445], 116: [0, .61508, 0, 0, .38889], 117: [0, .43056, 0, 0, .55556], 118: [0, .43056, .01389, 0, .52778], 119: [0, .43056, .01389, 0, .72222], 120: [0, .43056, 0, 0, .52778], 121: [.19444, .43056, .01389, 0, .52778], 122: [0, .43056, 0, 0, .44445], 123: [.25, .75, 0, 0, .5], 124: [.25, .75, 0, 0, .27778], 125: [.25, .75, 0, 0, .5], 126: [.35, .31786, 0, 0, .5], 160: [0, 0, 0, 0, .25], 163: [0, .69444, 0, 0, .76909], 167: [.19444, .69444, 0, 0, .44445], 168: [0, .66786, 0, 0, .5], 172: [0, .43056, 0, 0, .66667], 176: [0, .69444, 0, 0, .75], 177: [.08333, .58333, 0, 0, .77778], 182: [.19444, .69444, 0, 0, .61111], 184: [.17014, 0, 0, 0, .44445], 198: [0, .68333, 0, 0, .90278], 215: [.08333, .58333, 0, 0, .77778], 216: [.04861, .73194, 0, 0, .77778], 223: [0, .69444, 0, 0, .5], 230: [0, .43056, 0, 0, .72222], 247: [.08333, .58333, 0, 0, .77778], 248: [.09722, .52778, 0, 0, .5], 305: [0, .43056, 0, 0, .27778], 338: [0, .68333, 0, 0, 1.01389], 339: [0, .43056, 0, 0, .77778], 567: [.19444, .43056, 0, 0, .30556], 710: [0, .69444, 0, 0, .5], 711: [0, .62847, 0, 0, .5], 713: [0, .56778, 0, 0, .5], 714: [0, .69444, 0, 0, .5], 715: [0, .69444, 0, 0, .5], 728: [0, .69444, 0, 0, .5], 729: [0, .66786, 0, 0, .27778], 730: [0, .69444, 0, 0, .75], 732: [0, .66786, 0, 0, .5], 733: [0, .69444, 0, 0, .5], 915: [0, .68333, 0, 0, .625], 916: [0, .68333, 0, 0, .83334], 920: [0, .68333, 0, 0, .77778], 923: [0, .68333, 0, 0, .69445], 926: [0, .68333, 0, 0, .66667], 928: [0, .68333, 0, 0, .75], 931: [0, .68333, 0, 0, .72222], 933: [0, .68333, 0, 0, .77778], 934: [0, .68333, 0, 0, .72222], 936: [0, .68333, 0, 0, .77778], 937: [0, .68333, 0, 0, .72222], 8211: [0, .43056, .02778, 0, .5], 8212: [0, .43056, .02778, 0, 1], 8216: [0, .69444, 0, 0, .27778], 8217: [0, .69444, 0, 0, .27778], 8220: [0, .69444, 
0, 0, .5], 8221: [0, .69444, 0, 0, .5], 8224: [.19444, .69444, 0, 0, .44445], 8225: [.19444, .69444, 0, 0, .44445], 8230: [0, .123, 0, 0, 1.172], 8242: [0, .55556, 0, 0, .275], 8407: [0, .71444, .15382, 0, .5], 8463: [0, .68889, 0, 0, .54028], 8465: [0, .69444, 0, 0, .72222], 8467: [0, .69444, 0, .11111, .41667], 8472: [.19444, .43056, 0, .11111, .63646], 8476: [0, .69444, 0, 0, .72222], 8501: [0, .69444, 0, 0, .61111], 8592: [-.13313, .36687, 0, 0, 1], 8593: [.19444, .69444, 0, 0, .5], 8594: [-.13313, .36687, 0, 0, 1], 8595: [.19444, .69444, 0, 0, .5], 8596: [-.13313, .36687, 0, 0, 1], 8597: [.25, .75, 0, 0, .5], 8598: [.19444, .69444, 0, 0, 1], 8599: [.19444, .69444, 0, 0, 1], 8600: [.19444, .69444, 0, 0, 1], 8601: [.19444, .69444, 0, 0, 1], 8614: [.011, .511, 0, 0, 1], 8617: [.011, .511, 0, 0, 1.126], 8618: [.011, .511, 0, 0, 1.126], 8636: [-.13313, .36687, 0, 0, 1], 8637: [-.13313, .36687, 0, 0, 1], 8640: [-.13313, .36687, 0, 0, 1], 8641: [-.13313, .36687, 0, 0, 1], 8652: [.011, .671, 0, 0, 1], 8656: [-.13313, .36687, 0, 0, 1], 8657: [.19444, .69444, 0, 0, .61111], 8658: [-.13313, .36687, 0, 0, 1], 8659: [.19444, .69444, 0, 0, .61111], 8660: [-.13313, .36687, 0, 0, 1], 8661: [.25, .75, 0, 0, .61111], 8704: [0, .69444, 0, 0, .55556], 8706: [0, .69444, .05556, .08334, .5309], 8707: [0, .69444, 0, 0, .55556], 8709: [.05556, .75, 0, 0, .5], 8711: [0, .68333, 0, 0, .83334], 8712: [.0391, .5391, 0, 0, .66667], 8715: [.0391, .5391, 0, 0, .66667], 8722: [.08333, .58333, 0, 0, .77778], 8723: [.08333, .58333, 0, 0, .77778], 8725: [.25, .75, 0, 0, .5], 8726: [.25, .75, 0, 0, .5], 8727: [-.03472, .46528, 0, 0, .5], 8728: [-.05555, .44445, 0, 0, .5], 8729: [-.05555, .44445, 0, 0, .5], 8730: [.2, .8, 0, 0, .83334], 8733: [0, .43056, 0, 0, .77778], 8734: [0, .43056, 0, 0, 1], 8736: [0, .69224, 0, 0, .72222], 8739: [.25, .75, 0, 0, .27778], 8741: [.25, .75, 0, 0, .5], 8743: [0, .55556, 0, 0, .66667], 8744: [0, .55556, 0, 0, .66667], 8745: [0, .55556, 0, 0, .66667], 8746: [0, .55556, 0, 0, .66667], 8747: [.19444, .69444, .11111, 0, .41667], 8764: [-.13313, .36687, 0, 0, .77778], 8768: [.19444, .69444, 0, 0, .27778], 8771: [-.03625, .46375, 0, 0, .77778], 8773: [-.022, .589, 0, 0, .778], 8776: [-.01688, .48312, 0, 0, .77778], 8781: [-.03625, .46375, 0, 0, .77778], 8784: [-.133, .673, 0, 0, .778], 8801: [-.03625, .46375, 0, 0, .77778], 8804: [.13597, .63597, 0, 0, .77778], 8805: [.13597, .63597, 0, 0, .77778], 8810: [.0391, .5391, 0, 0, 1], 8811: [.0391, .5391, 0, 0, 1], 8826: [.0391, .5391, 0, 0, .77778], 8827: [.0391, .5391, 0, 0, .77778], 8834: [.0391, .5391, 0, 0, .77778], 8835: [.0391, .5391, 0, 0, .77778], 8838: [.13597, .63597, 0, 0, .77778], 8839: [.13597, .63597, 0, 0, .77778], 8846: [0, .55556, 0, 0, .66667], 8849: [.13597, .63597, 0, 0, .77778], 8850: [.13597, .63597, 0, 0, .77778], 8851: [0, .55556, 0, 0, .66667], 8852: [0, .55556, 0, 0, .66667], 8853: [.08333, .58333, 0, 0, .77778], 8854: [.08333, .58333, 0, 0, .77778], 8855: [.08333, .58333, 0, 0, .77778], 8856: [.08333, .58333, 0, 0, .77778], 8857: [.08333, .58333, 0, 0, .77778], 8866: [0, .69444, 0, 0, .61111], 8867: [0, .69444, 0, 0, .61111], 8868: [0, .69444, 0, 0, .77778], 8869: [0, .69444, 0, 0, .77778], 8872: [.249, .75, 0, 0, .867], 8900: [-.05555, .44445, 0, 0, .5], 8901: [-.05555, .44445, 0, 0, .27778], 8902: [-.03472, .46528, 0, 0, .5], 8904: [.005, .505, 0, 0, .9], 8942: [.03, .903, 0, 0, .278], 8943: [-.19, .313, 0, 0, 1.172], 8945: [-.1, .823, 0, 0, 1.282], 8968: [.25, .75, 0, 0, .44445], 8969: [.25, .75, 0, 0, .44445], 8970: 
[.25, .75, 0, 0, .44445], 8971: [.25, .75, 0, 0, .44445], 8994: [-.14236, .35764, 0, 0, 1], 8995: [-.14236, .35764, 0, 0, 1], 9136: [.244, .744, 0, 0, .412], 9137: [.244, .745, 0, 0, .412], 9651: [.19444, .69444, 0, 0, .88889], 9657: [-.03472, .46528, 0, 0, .5], 9661: [.19444, .69444, 0, 0, .88889], 9667: [-.03472, .46528, 0, 0, .5], 9711: [.19444, .69444, 0, 0, 1], 9824: [.12963, .69444, 0, 0, .77778], 9825: [.12963, .69444, 0, 0, .77778], 9826: [.12963, .69444, 0, 0, .77778], 9827: [.12963, .69444, 0, 0, .77778], 9837: [0, .75, 0, 0, .38889], 9838: [.19444, .69444, 0, 0, .38889], 9839: [.19444, .69444, 0, 0, .38889], 10216: [.25, .75, 0, 0, .38889], 10217: [.25, .75, 0, 0, .38889], 10222: [.244, .744, 0, 0, .412], 10223: [.244, .745, 0, 0, .412], 10229: [.011, .511, 0, 0, 1.609], 10230: [.011, .511, 0, 0, 1.638], 10231: [.011, .511, 0, 0, 1.859], 10232: [.024, .525, 0, 0, 1.609], 10233: [.024, .525, 0, 0, 1.638], 10234: [.024, .525, 0, 0, 1.858], 10236: [.011, .511, 0, 0, 1.638], 10815: [0, .68333, 0, 0, .75], 10927: [.13597, .63597, 0, 0, .77778], 10928: [.13597, .63597, 0, 0, .77778], 57376: [.19444, .69444, 0, 0, 0] }, "Math-BoldItalic": { 32: [0, 0, 0, 0, .25], 48: [0, .44444, 0, 0, .575], 49: [0, .44444, 0, 0, .575], 50: [0, .44444, 0, 0, .575], 51: [.19444, .44444, 0, 0, .575], 52: [.19444, .44444, 0, 0, .575], 53: [.19444, .44444, 0, 0, .575], 54: [0, .64444, 0, 0, .575], 55: [.19444, .44444, 0, 0, .575], 56: [0, .64444, 0, 0, .575], 57: [.19444, .44444, 0, 0, .575], 65: [0, .68611, 0, 0, .86944], 66: [0, .68611, .04835, 0, .8664], 67: [0, .68611, .06979, 0, .81694], 68: [0, .68611, .03194, 0, .93812], 69: [0, .68611, .05451, 0, .81007], 70: [0, .68611, .15972, 0, .68889], 71: [0, .68611, 0, 0, .88673], 72: [0, .68611, .08229, 0, .98229], 73: [0, .68611, .07778, 0, .51111], 74: [0, .68611, .10069, 0, .63125], 75: [0, .68611, .06979, 0, .97118], 76: [0, .68611, 0, 0, .75555], 77: [0, .68611, .11424, 0, 1.14201], 78: [0, .68611, .11424, 0, .95034], 79: [0, .68611, .03194, 0, .83666], 80: [0, .68611, .15972, 0, .72309], 81: [.19444, .68611, 0, 0, .86861], 82: [0, .68611, .00421, 0, .87235], 83: [0, .68611, .05382, 0, .69271], 84: [0, .68611, .15972, 0, .63663], 85: [0, .68611, .11424, 0, .80027], 86: [0, .68611, .25555, 0, .67778], 87: [0, .68611, .15972, 0, 1.09305], 88: [0, .68611, .07778, 0, .94722], 89: [0, .68611, .25555, 0, .67458], 90: [0, .68611, .06979, 0, .77257], 97: [0, .44444, 0, 0, .63287], 98: [0, .69444, 0, 0, .52083], 99: [0, .44444, 0, 0, .51342], 100: [0, .69444, 0, 0, .60972], 101: [0, .44444, 0, 0, .55361], 102: [.19444, .69444, .11042, 0, .56806], 103: [.19444, .44444, .03704, 0, .5449], 104: [0, .69444, 0, 0, .66759], 105: [0, .69326, 0, 0, .4048], 106: [.19444, .69326, .0622, 0, .47083], 107: [0, .69444, .01852, 0, .6037], 108: [0, .69444, .0088, 0, .34815], 109: [0, .44444, 0, 0, 1.0324], 110: [0, .44444, 0, 0, .71296], 111: [0, .44444, 0, 0, .58472], 112: [.19444, .44444, 0, 0, .60092], 113: [.19444, .44444, .03704, 0, .54213], 114: [0, .44444, .03194, 0, .5287], 115: [0, .44444, 0, 0, .53125], 116: [0, .63492, 0, 0, .41528], 117: [0, .44444, 0, 0, .68102], 118: [0, .44444, .03704, 0, .56666], 119: [0, .44444, .02778, 0, .83148], 120: [0, .44444, 0, 0, .65903], 121: [.19444, .44444, .03704, 0, .59028], 122: [0, .44444, .04213, 0, .55509], 160: [0, 0, 0, 0, .25], 915: [0, .68611, .15972, 0, .65694], 916: [0, .68611, 0, 0, .95833], 920: [0, .68611, .03194, 0, .86722], 923: [0, .68611, 0, 0, .80555], 926: [0, .68611, .07458, 0, .84125], 928: [0, .68611, .08229, 
0, .98229], 931: [0, .68611, .05451, 0, .88507], 933: [0, .68611, .15972, 0, .67083], 934: [0, .68611, 0, 0, .76666], 936: [0, .68611, .11653, 0, .71402], 937: [0, .68611, .04835, 0, .8789], 945: [0, .44444, 0, 0, .76064], 946: [.19444, .69444, .03403, 0, .65972], 947: [.19444, .44444, .06389, 0, .59003], 948: [0, .69444, .03819, 0, .52222], 949: [0, .44444, 0, 0, .52882], 950: [.19444, .69444, .06215, 0, .50833], 951: [.19444, .44444, .03704, 0, .6], 952: [0, .69444, .03194, 0, .5618], 953: [0, .44444, 0, 0, .41204], 954: [0, .44444, 0, 0, .66759], 955: [0, .69444, 0, 0, .67083], 956: [.19444, .44444, 0, 0, .70787], 957: [0, .44444, .06898, 0, .57685], 958: [.19444, .69444, .03021, 0, .50833], 959: [0, .44444, 0, 0, .58472], 960: [0, .44444, .03704, 0, .68241], 961: [.19444, .44444, 0, 0, .6118], 962: [.09722, .44444, .07917, 0, .42361], 963: [0, .44444, .03704, 0, .68588], 964: [0, .44444, .13472, 0, .52083], 965: [0, .44444, .03704, 0, .63055], 966: [.19444, .44444, 0, 0, .74722], 967: [.19444, .44444, 0, 0, .71805], 968: [.19444, .69444, .03704, 0, .75833], 969: [0, .44444, .03704, 0, .71782], 977: [0, .69444, 0, 0, .69155], 981: [.19444, .69444, 0, 0, .7125], 982: [0, .44444, .03194, 0, .975], 1009: [.19444, .44444, 0, 0, .6118], 1013: [0, .44444, 0, 0, .48333], 57649: [0, .44444, 0, 0, .39352], 57911: [.19444, .44444, 0, 0, .43889] }, "Math-Italic": { 32: [0, 0, 0, 0, .25], 48: [0, .43056, 0, 0, .5], 49: [0, .43056, 0, 0, .5], 50: [0, .43056, 0, 0, .5], 51: [.19444, .43056, 0, 0, .5], 52: [.19444, .43056, 0, 0, .5], 53: [.19444, .43056, 0, 0, .5], 54: [0, .64444, 0, 0, .5], 55: [.19444, .43056, 0, 0, .5], 56: [0, .64444, 0, 0, .5], 57: [.19444, .43056, 0, 0, .5], 65: [0, .68333, 0, .13889, .75], 66: [0, .68333, .05017, .08334, .75851], 67: [0, .68333, .07153, .08334, .71472], 68: [0, .68333, .02778, .05556, .82792], 69: [0, .68333, .05764, .08334, .7382], 70: [0, .68333, .13889, .08334, .64306], 71: [0, .68333, 0, .08334, .78625], 72: [0, .68333, .08125, .05556, .83125], 73: [0, .68333, .07847, .11111, .43958], 74: [0, .68333, .09618, .16667, .55451], 75: [0, .68333, .07153, .05556, .84931], 76: [0, .68333, 0, .02778, .68056], 77: [0, .68333, .10903, .08334, .97014], 78: [0, .68333, .10903, .08334, .80347], 79: [0, .68333, .02778, .08334, .76278], 80: [0, .68333, .13889, .08334, .64201], 81: [.19444, .68333, 0, .08334, .79056], 82: [0, .68333, .00773, .08334, .75929], 83: [0, .68333, .05764, .08334, .6132], 84: [0, .68333, .13889, .08334, .58438], 85: [0, .68333, .10903, .02778, .68278], 86: [0, .68333, .22222, 0, .58333], 87: [0, .68333, .13889, 0, .94445], 88: [0, .68333, .07847, .08334, .82847], 89: [0, .68333, .22222, 0, .58056], 90: [0, .68333, .07153, .08334, .68264], 97: [0, .43056, 0, 0, .52859], 98: [0, .69444, 0, 0, .42917], 99: [0, .43056, 0, .05556, .43276], 100: [0, .69444, 0, .16667, .52049], 101: [0, .43056, 0, .05556, .46563], 102: [.19444, .69444, .10764, .16667, .48959], 103: [.19444, .43056, .03588, .02778, .47697], 104: [0, .69444, 0, 0, .57616], 105: [0, .65952, 0, 0, .34451], 106: [.19444, .65952, .05724, 0, .41181], 107: [0, .69444, .03148, 0, .5206], 108: [0, .69444, .01968, .08334, .29838], 109: [0, .43056, 0, 0, .87801], 110: [0, .43056, 0, 0, .60023], 111: [0, .43056, 0, .05556, .48472], 112: [.19444, .43056, 0, .08334, .50313], 113: [.19444, .43056, .03588, .08334, .44641], 114: [0, .43056, .02778, .05556, .45116], 115: [0, .43056, 0, .05556, .46875], 116: [0, .61508, 0, .08334, .36111], 117: [0, .43056, 0, .02778, .57246], 118: [0, .43056, .03588, .02778, 
.48472], 119: [0, .43056, .02691, .08334, .71592], 120: [0, .43056, 0, .02778, .57153], 121: [.19444, .43056, .03588, .05556, .49028], 122: [0, .43056, .04398, .05556, .46505], 160: [0, 0, 0, 0, .25], 915: [0, .68333, .13889, .08334, .61528], 916: [0, .68333, 0, .16667, .83334], 920: [0, .68333, .02778, .08334, .76278], 923: [0, .68333, 0, .16667, .69445], 926: [0, .68333, .07569, .08334, .74236], 928: [0, .68333, .08125, .05556, .83125], 931: [0, .68333, .05764, .08334, .77986], 933: [0, .68333, .13889, .05556, .58333], 934: [0, .68333, 0, .08334, .66667], 936: [0, .68333, .11, .05556, .61222], 937: [0, .68333, .05017, .08334, .7724], 945: [0, .43056, .0037, .02778, .6397], 946: [.19444, .69444, .05278, .08334, .56563], 947: [.19444, .43056, .05556, 0, .51773], 948: [0, .69444, .03785, .05556, .44444], 949: [0, .43056, 0, .08334, .46632], 950: [.19444, .69444, .07378, .08334, .4375], 951: [.19444, .43056, .03588, .05556, .49653], 952: [0, .69444, .02778, .08334, .46944], 953: [0, .43056, 0, .05556, .35394], 954: [0, .43056, 0, 0, .57616], 955: [0, .69444, 0, 0, .58334], 956: [.19444, .43056, 0, .02778, .60255], 957: [0, .43056, .06366, .02778, .49398], 958: [.19444, .69444, .04601, .11111, .4375], 959: [0, .43056, 0, .05556, .48472], 960: [0, .43056, .03588, 0, .57003], 961: [.19444, .43056, 0, .08334, .51702], 962: [.09722, .43056, .07986, .08334, .36285], 963: [0, .43056, .03588, 0, .57141], 964: [0, .43056, .1132, .02778, .43715], 965: [0, .43056, .03588, .02778, .54028], 966: [.19444, .43056, 0, .08334, .65417], 967: [.19444, .43056, 0, .05556, .62569], 968: [.19444, .69444, .03588, .11111, .65139], 969: [0, .43056, .03588, 0, .62245], 977: [0, .69444, 0, .08334, .59144], 981: [.19444, .69444, 0, .08334, .59583], 982: [0, .43056, .02778, 0, .82813], 1009: [.19444, .43056, 0, .08334, .51702], 1013: [0, .43056, 0, .05556, .4059], 57649: [0, .43056, 0, .02778, .32246], 57911: [.19444, .43056, 0, .08334, .38403] }, "SansSerif-Bold": { 32: [0, 0, 0, 0, .25], 33: [0, .69444, 0, 0, .36667], 34: [0, .69444, 0, 0, .55834], 35: [.19444, .69444, 0, 0, .91667], 36: [.05556, .75, 0, 0, .55], 37: [.05556, .75, 0, 0, 1.02912], 38: [0, .69444, 0, 0, .83056], 39: [0, .69444, 0, 0, .30556], 40: [.25, .75, 0, 0, .42778], 41: [.25, .75, 0, 0, .42778], 42: [0, .75, 0, 0, .55], 43: [.11667, .61667, 0, 0, .85556], 44: [.10556, .13056, 0, 0, .30556], 45: [0, .45833, 0, 0, .36667], 46: [0, .13056, 0, 0, .30556], 47: [.25, .75, 0, 0, .55], 48: [0, .69444, 0, 0, .55], 49: [0, .69444, 0, 0, .55], 50: [0, .69444, 0, 0, .55], 51: [0, .69444, 0, 0, .55], 52: [0, .69444, 0, 0, .55], 53: [0, .69444, 0, 0, .55], 54: [0, .69444, 0, 0, .55], 55: [0, .69444, 0, 0, .55], 56: [0, .69444, 0, 0, .55], 57: [0, .69444, 0, 0, .55], 58: [0, .45833, 0, 0, .30556], 59: [.10556, .45833, 0, 0, .30556], 61: [-.09375, .40625, 0, 0, .85556], 63: [0, .69444, 0, 0, .51945], 64: [0, .69444, 0, 0, .73334], 65: [0, .69444, 0, 0, .73334], 66: [0, .69444, 0, 0, .73334], 67: [0, .69444, 0, 0, .70278], 68: [0, .69444, 0, 0, .79445], 69: [0, .69444, 0, 0, .64167], 70: [0, .69444, 0, 0, .61111], 71: [0, .69444, 0, 0, .73334], 72: [0, .69444, 0, 0, .79445], 73: [0, .69444, 0, 0, .33056], 74: [0, .69444, 0, 0, .51945], 75: [0, .69444, 0, 0, .76389], 76: [0, .69444, 0, 0, .58056], 77: [0, .69444, 0, 0, .97778], 78: [0, .69444, 0, 0, .79445], 79: [0, .69444, 0, 0, .79445], 80: [0, .69444, 0, 0, .70278], 81: [.10556, .69444, 0, 0, .79445], 82: [0, .69444, 0, 0, .70278], 83: [0, .69444, 0, 0, .61111], 84: [0, .69444, 0, 0, .73334], 85: [0, .69444, 0, 
0, .76389], 86: [0, .69444, .01528, 0, .73334], 87: [0, .69444, .01528, 0, 1.03889], 88: [0, .69444, 0, 0, .73334], 89: [0, .69444, .0275, 0, .73334], 90: [0, .69444, 0, 0, .67223], 91: [.25, .75, 0, 0, .34306], 93: [.25, .75, 0, 0, .34306], 94: [0, .69444, 0, 0, .55], 95: [.35, .10833, .03056, 0, .55], 97: [0, .45833, 0, 0, .525], 98: [0, .69444, 0, 0, .56111], 99: [0, .45833, 0, 0, .48889], 100: [0, .69444, 0, 0, .56111], 101: [0, .45833, 0, 0, .51111], 102: [0, .69444, .07639, 0, .33611], 103: [.19444, .45833, .01528, 0, .55], 104: [0, .69444, 0, 0, .56111], 105: [0, .69444, 0, 0, .25556], 106: [.19444, .69444, 0, 0, .28611], 107: [0, .69444, 0, 0, .53056], 108: [0, .69444, 0, 0, .25556], 109: [0, .45833, 0, 0, .86667], 110: [0, .45833, 0, 0, .56111], 111: [0, .45833, 0, 0, .55], 112: [.19444, .45833, 0, 0, .56111], 113: [.19444, .45833, 0, 0, .56111], 114: [0, .45833, .01528, 0, .37222], 115: [0, .45833, 0, 0, .42167], 116: [0, .58929, 0, 0, .40417], 117: [0, .45833, 0, 0, .56111], 118: [0, .45833, .01528, 0, .5], 119: [0, .45833, .01528, 0, .74445], 120: [0, .45833, 0, 0, .5], 121: [.19444, .45833, .01528, 0, .5], 122: [0, .45833, 0, 0, .47639], 126: [.35, .34444, 0, 0, .55], 160: [0, 0, 0, 0, .25], 168: [0, .69444, 0, 0, .55], 176: [0, .69444, 0, 0, .73334], 180: [0, .69444, 0, 0, .55], 184: [.17014, 0, 0, 0, .48889], 305: [0, .45833, 0, 0, .25556], 567: [.19444, .45833, 0, 0, .28611], 710: [0, .69444, 0, 0, .55], 711: [0, .63542, 0, 0, .55], 713: [0, .63778, 0, 0, .55], 728: [0, .69444, 0, 0, .55], 729: [0, .69444, 0, 0, .30556], 730: [0, .69444, 0, 0, .73334], 732: [0, .69444, 0, 0, .55], 733: [0, .69444, 0, 0, .55], 915: [0, .69444, 0, 0, .58056], 916: [0, .69444, 0, 0, .91667], 920: [0, .69444, 0, 0, .85556], 923: [0, .69444, 0, 0, .67223], 926: [0, .69444, 0, 0, .73334], 928: [0, .69444, 0, 0, .79445], 931: [0, .69444, 0, 0, .79445], 933: [0, .69444, 0, 0, .85556], 934: [0, .69444, 0, 0, .79445], 936: [0, .69444, 0, 0, .85556], 937: [0, .69444, 0, 0, .79445], 8211: [0, .45833, .03056, 0, .55], 8212: [0, .45833, .03056, 0, 1.10001], 8216: [0, .69444, 0, 0, .30556], 8217: [0, .69444, 0, 0, .30556], 8220: [0, .69444, 0, 0, .55834], 8221: [0, .69444, 0, 0, .55834] }, "SansSerif-Italic": { 32: [0, 0, 0, 0, .25], 33: [0, .69444, .05733, 0, .31945], 34: [0, .69444, .00316, 0, .5], 35: [.19444, .69444, .05087, 0, .83334], 36: [.05556, .75, .11156, 0, .5], 37: [.05556, .75, .03126, 0, .83334], 38: [0, .69444, .03058, 0, .75834], 39: [0, .69444, .07816, 0, .27778], 40: [.25, .75, .13164, 0, .38889], 41: [.25, .75, .02536, 0, .38889], 42: [0, .75, .11775, 0, .5], 43: [.08333, .58333, .02536, 0, .77778], 44: [.125, .08333, 0, 0, .27778], 45: [0, .44444, .01946, 0, .33333], 46: [0, .08333, 0, 0, .27778], 47: [.25, .75, .13164, 0, .5], 48: [0, .65556, .11156, 0, .5], 49: [0, .65556, .11156, 0, .5], 50: [0, .65556, .11156, 0, .5], 51: [0, .65556, .11156, 0, .5], 52: [0, .65556, .11156, 0, .5], 53: [0, .65556, .11156, 0, .5], 54: [0, .65556, .11156, 0, .5], 55: [0, .65556, .11156, 0, .5], 56: [0, .65556, .11156, 0, .5], 57: [0, .65556, .11156, 0, .5], 58: [0, .44444, .02502, 0, .27778], 59: [.125, .44444, .02502, 0, .27778], 61: [-.13, .37, .05087, 0, .77778], 63: [0, .69444, .11809, 0, .47222], 64: [0, .69444, .07555, 0, .66667], 65: [0, .69444, 0, 0, .66667], 66: [0, .69444, .08293, 0, .66667], 67: [0, .69444, .11983, 0, .63889], 68: [0, .69444, .07555, 0, .72223], 69: [0, .69444, .11983, 0, .59722], 70: [0, .69444, .13372, 0, .56945], 71: [0, .69444, .11983, 0, .66667], 72: [0, .69444, 
.08094, 0, .70834], 73: [0, .69444, .13372, 0, .27778], 74: [0, .69444, .08094, 0, .47222], 75: [0, .69444, .11983, 0, .69445], 76: [0, .69444, 0, 0, .54167], 77: [0, .69444, .08094, 0, .875], 78: [0, .69444, .08094, 0, .70834], 79: [0, .69444, .07555, 0, .73611], 80: [0, .69444, .08293, 0, .63889], 81: [.125, .69444, .07555, 0, .73611], 82: [0, .69444, .08293, 0, .64584], 83: [0, .69444, .09205, 0, .55556], 84: [0, .69444, .13372, 0, .68056], 85: [0, .69444, .08094, 0, .6875], 86: [0, .69444, .1615, 0, .66667], 87: [0, .69444, .1615, 0, .94445], 88: [0, .69444, .13372, 0, .66667], 89: [0, .69444, .17261, 0, .66667], 90: [0, .69444, .11983, 0, .61111], 91: [.25, .75, .15942, 0, .28889], 93: [.25, .75, .08719, 0, .28889], 94: [0, .69444, .0799, 0, .5], 95: [.35, .09444, .08616, 0, .5], 97: [0, .44444, .00981, 0, .48056], 98: [0, .69444, .03057, 0, .51667], 99: [0, .44444, .08336, 0, .44445], 100: [0, .69444, .09483, 0, .51667], 101: [0, .44444, .06778, 0, .44445], 102: [0, .69444, .21705, 0, .30556], 103: [.19444, .44444, .10836, 0, .5], 104: [0, .69444, .01778, 0, .51667], 105: [0, .67937, .09718, 0, .23889], 106: [.19444, .67937, .09162, 0, .26667], 107: [0, .69444, .08336, 0, .48889], 108: [0, .69444, .09483, 0, .23889], 109: [0, .44444, .01778, 0, .79445], 110: [0, .44444, .01778, 0, .51667], 111: [0, .44444, .06613, 0, .5], 112: [.19444, .44444, .0389, 0, .51667], 113: [.19444, .44444, .04169, 0, .51667], 114: [0, .44444, .10836, 0, .34167], 115: [0, .44444, .0778, 0, .38333], 116: [0, .57143, .07225, 0, .36111], 117: [0, .44444, .04169, 0, .51667], 118: [0, .44444, .10836, 0, .46111], 119: [0, .44444, .10836, 0, .68334], 120: [0, .44444, .09169, 0, .46111], 121: [.19444, .44444, .10836, 0, .46111], 122: [0, .44444, .08752, 0, .43472], 126: [.35, .32659, .08826, 0, .5], 160: [0, 0, 0, 0, .25], 168: [0, .67937, .06385, 0, .5], 176: [0, .69444, 0, 0, .73752], 184: [.17014, 0, 0, 0, .44445], 305: [0, .44444, .04169, 0, .23889], 567: [.19444, .44444, .04169, 0, .26667], 710: [0, .69444, .0799, 0, .5], 711: [0, .63194, .08432, 0, .5], 713: [0, .60889, .08776, 0, .5], 714: [0, .69444, .09205, 0, .5], 715: [0, .69444, 0, 0, .5], 728: [0, .69444, .09483, 0, .5], 729: [0, .67937, .07774, 0, .27778], 730: [0, .69444, 0, 0, .73752], 732: [0, .67659, .08826, 0, .5], 733: [0, .69444, .09205, 0, .5], 915: [0, .69444, .13372, 0, .54167], 916: [0, .69444, 0, 0, .83334], 920: [0, .69444, .07555, 0, .77778], 923: [0, .69444, 0, 0, .61111], 926: [0, .69444, .12816, 0, .66667], 928: [0, .69444, .08094, 0, .70834], 931: [0, .69444, .11983, 0, .72222], 933: [0, .69444, .09031, 0, .77778], 934: [0, .69444, .04603, 0, .72222], 936: [0, .69444, .09031, 0, .77778], 937: [0, .69444, .08293, 0, .72222], 8211: [0, .44444, .08616, 0, .5], 8212: [0, .44444, .08616, 0, 1], 8216: [0, .69444, .07816, 0, .27778], 8217: [0, .69444, .07816, 0, .27778], 8220: [0, .69444, .14205, 0, .5], 8221: [0, .69444, .00316, 0, .5] }, "SansSerif-Regular": { 32: [0, 0, 0, 0, .25], 33: [0, .69444, 0, 0, .31945], 34: [0, .69444, 0, 0, .5], 35: [.19444, .69444, 0, 0, .83334], 36: [.05556, .75, 0, 0, .5], 37: [.05556, .75, 0, 0, .83334], 38: [0, .69444, 0, 0, .75834], 39: [0, .69444, 0, 0, .27778], 40: [.25, .75, 0, 0, .38889], 41: [.25, .75, 0, 0, .38889], 42: [0, .75, 0, 0, .5], 43: [.08333, .58333, 0, 0, .77778], 44: [.125, .08333, 0, 0, .27778], 45: [0, .44444, 0, 0, .33333], 46: [0, .08333, 0, 0, .27778], 47: [.25, .75, 0, 0, .5], 48: [0, .65556, 0, 0, .5], 49: [0, .65556, 0, 0, .5], 50: [0, .65556, 0, 0, .5], 51: [0, .65556, 0, 0, 
.5], 52: [0, .65556, 0, 0, .5], 53: [0, .65556, 0, 0, .5], 54: [0, .65556, 0, 0, .5], 55: [0, .65556, 0, 0, .5], 56: [0, .65556, 0, 0, .5], 57: [0, .65556, 0, 0, .5], 58: [0, .44444, 0, 0, .27778], 59: [.125, .44444, 0, 0, .27778], 61: [-.13, .37, 0, 0, .77778], 63: [0, .69444, 0, 0, .47222], 64: [0, .69444, 0, 0, .66667], 65: [0, .69444, 0, 0, .66667], 66: [0, .69444, 0, 0, .66667], 67: [0, .69444, 0, 0, .63889], 68: [0, .69444, 0, 0, .72223], 69: [0, .69444, 0, 0, .59722], 70: [0, .69444, 0, 0, .56945], 71: [0, .69444, 0, 0, .66667], 72: [0, .69444, 0, 0, .70834], 73: [0, .69444, 0, 0, .27778], 74: [0, .69444, 0, 0, .47222], 75: [0, .69444, 0, 0, .69445], 76: [0, .69444, 0, 0, .54167], 77: [0, .69444, 0, 0, .875], 78: [0, .69444, 0, 0, .70834], 79: [0, .69444, 0, 0, .73611], 80: [0, .69444, 0, 0, .63889], 81: [.125, .69444, 0, 0, .73611], 82: [0, .69444, 0, 0, .64584], 83: [0, .69444, 0, 0, .55556], 84: [0, .69444, 0, 0, .68056], 85: [0, .69444, 0, 0, .6875], 86: [0, .69444, .01389, 0, .66667], 87: [0, .69444, .01389, 0, .94445], 88: [0, .69444, 0, 0, .66667], 89: [0, .69444, .025, 0, .66667], 90: [0, .69444, 0, 0, .61111], 91: [.25, .75, 0, 0, .28889], 93: [.25, .75, 0, 0, .28889], 94: [0, .69444, 0, 0, .5], 95: [.35, .09444, .02778, 0, .5], 97: [0, .44444, 0, 0, .48056], 98: [0, .69444, 0, 0, .51667], 99: [0, .44444, 0, 0, .44445], 100: [0, .69444, 0, 0, .51667], 101: [0, .44444, 0, 0, .44445], 102: [0, .69444, .06944, 0, .30556], 103: [.19444, .44444, .01389, 0, .5], 104: [0, .69444, 0, 0, .51667], 105: [0, .67937, 0, 0, .23889], 106: [.19444, .67937, 0, 0, .26667], 107: [0, .69444, 0, 0, .48889], 108: [0, .69444, 0, 0, .23889], 109: [0, .44444, 0, 0, .79445], 110: [0, .44444, 0, 0, .51667], 111: [0, .44444, 0, 0, .5], 112: [.19444, .44444, 0, 0, .51667], 113: [.19444, .44444, 0, 0, .51667], 114: [0, .44444, .01389, 0, .34167], 115: [0, .44444, 0, 0, .38333], 116: [0, .57143, 0, 0, .36111], 117: [0, .44444, 0, 0, .51667], 118: [0, .44444, .01389, 0, .46111], 119: [0, .44444, .01389, 0, .68334], 120: [0, .44444, 0, 0, .46111], 121: [.19444, .44444, .01389, 0, .46111], 122: [0, .44444, 0, 0, .43472], 126: [.35, .32659, 0, 0, .5], 160: [0, 0, 0, 0, .25], 168: [0, .67937, 0, 0, .5], 176: [0, .69444, 0, 0, .66667], 184: [.17014, 0, 0, 0, .44445], 305: [0, .44444, 0, 0, .23889], 567: [.19444, .44444, 0, 0, .26667], 710: [0, .69444, 0, 0, .5], 711: [0, .63194, 0, 0, .5], 713: [0, .60889, 0, 0, .5], 714: [0, .69444, 0, 0, .5], 715: [0, .69444, 0, 0, .5], 728: [0, .69444, 0, 0, .5], 729: [0, .67937, 0, 0, .27778], 730: [0, .69444, 0, 0, .66667], 732: [0, .67659, 0, 0, .5], 733: [0, .69444, 0, 0, .5], 915: [0, .69444, 0, 0, .54167], 916: [0, .69444, 0, 0, .83334], 920: [0, .69444, 0, 0, .77778], 923: [0, .69444, 0, 0, .61111], 926: [0, .69444, 0, 0, .66667], 928: [0, .69444, 0, 0, .70834], 931: [0, .69444, 0, 0, .72222], 933: [0, .69444, 0, 0, .77778], 934: [0, .69444, 0, 0, .72222], 936: [0, .69444, 0, 0, .77778], 937: [0, .69444, 0, 0, .72222], 8211: [0, .44444, .02778, 0, .5], 8212: [0, .44444, .02778, 0, 1], 8216: [0, .69444, 0, 0, .27778], 8217: [0, .69444, 0, 0, .27778], 8220: [0, .69444, 0, 0, .5], 8221: [0, .69444, 0, 0, .5] }, "Script-Regular": { 32: [0, 0, 0, 0, .25], 65: [0, .7, .22925, 0, .80253], 66: [0, .7, .04087, 0, .90757], 67: [0, .7, .1689, 0, .66619], 68: [0, .7, .09371, 0, .77443], 69: [0, .7, .18583, 0, .56162], 70: [0, .7, .13634, 0, .89544], 71: [0, .7, .17322, 0, .60961], 72: [0, .7, .29694, 0, .96919], 73: [0, .7, .19189, 0, .80907], 74: [.27778, .7, .19189, 0, 
1.05159], 75: [0, .7, .31259, 0, .91364], 76: [0, .7, .19189, 0, .87373], 77: [0, .7, .15981, 0, 1.08031], 78: [0, .7, .3525, 0, .9015], 79: [0, .7, .08078, 0, .73787], 80: [0, .7, .08078, 0, 1.01262], 81: [0, .7, .03305, 0, .88282], 82: [0, .7, .06259, 0, .85], 83: [0, .7, .19189, 0, .86767], 84: [0, .7, .29087, 0, .74697], 85: [0, .7, .25815, 0, .79996], 86: [0, .7, .27523, 0, .62204], 87: [0, .7, .27523, 0, .80532], 88: [0, .7, .26006, 0, .94445], 89: [0, .7, .2939, 0, .70961], 90: [0, .7, .24037, 0, .8212], 160: [0, 0, 0, 0, .25] }, "Size1-Regular": { 32: [0, 0, 0, 0, .25], 40: [.35001, .85, 0, 0, .45834], 41: [.35001, .85, 0, 0, .45834], 47: [.35001, .85, 0, 0, .57778], 91: [.35001, .85, 0, 0, .41667], 92: [.35001, .85, 0, 0, .57778], 93: [.35001, .85, 0, 0, .41667], 123: [.35001, .85, 0, 0, .58334], 125: [.35001, .85, 0, 0, .58334], 160: [0, 0, 0, 0, .25], 710: [0, .72222, 0, 0, .55556], 732: [0, .72222, 0, 0, .55556], 770: [0, .72222, 0, 0, .55556], 771: [0, .72222, 0, 0, .55556], 8214: [-99e-5, .601, 0, 0, .77778], 8593: [1e-5, .6, 0, 0, .66667], 8595: [1e-5, .6, 0, 0, .66667], 8657: [1e-5, .6, 0, 0, .77778], 8659: [1e-5, .6, 0, 0, .77778], 8719: [.25001, .75, 0, 0, .94445], 8720: [.25001, .75, 0, 0, .94445], 8721: [.25001, .75, 0, 0, 1.05556], 8730: [.35001, .85, 0, 0, 1], 8739: [-.00599, .606, 0, 0, .33333], 8741: [-.00599, .606, 0, 0, .55556], 8747: [.30612, .805, .19445, 0, .47222], 8748: [.306, .805, .19445, 0, .47222], 8749: [.306, .805, .19445, 0, .47222], 8750: [.30612, .805, .19445, 0, .47222], 8896: [.25001, .75, 0, 0, .83334], 8897: [.25001, .75, 0, 0, .83334], 8898: [.25001, .75, 0, 0, .83334], 8899: [.25001, .75, 0, 0, .83334], 8968: [.35001, .85, 0, 0, .47222], 8969: [.35001, .85, 0, 0, .47222], 8970: [.35001, .85, 0, 0, .47222], 8971: [.35001, .85, 0, 0, .47222], 9168: [-99e-5, .601, 0, 0, .66667], 10216: [.35001, .85, 0, 0, .47222], 10217: [.35001, .85, 0, 0, .47222], 10752: [.25001, .75, 0, 0, 1.11111], 10753: [.25001, .75, 0, 0, 1.11111], 10754: [.25001, .75, 0, 0, 1.11111], 10756: [.25001, .75, 0, 0, .83334], 10758: [.25001, .75, 0, 0, .83334] }, "Size2-Regular": { 32: [0, 0, 0, 0, .25], 40: [.65002, 1.15, 0, 0, .59722], 41: [.65002, 1.15, 0, 0, .59722], 47: [.65002, 1.15, 0, 0, .81111], 91: [.65002, 1.15, 0, 0, .47222], 92: [.65002, 1.15, 0, 0, .81111], 93: [.65002, 1.15, 0, 0, .47222], 123: [.65002, 1.15, 0, 0, .66667], 125: [.65002, 1.15, 0, 0, .66667], 160: [0, 0, 0, 0, .25], 710: [0, .75, 0, 0, 1], 732: [0, .75, 0, 0, 1], 770: [0, .75, 0, 0, 1], 771: [0, .75, 0, 0, 1], 8719: [.55001, 1.05, 0, 0, 1.27778], 8720: [.55001, 1.05, 0, 0, 1.27778], 8721: [.55001, 1.05, 0, 0, 1.44445], 8730: [.65002, 1.15, 0, 0, 1], 8747: [.86225, 1.36, .44445, 0, .55556], 8748: [.862, 1.36, .44445, 0, .55556], 8749: [.862, 1.36, .44445, 0, .55556], 8750: [.86225, 1.36, .44445, 0, .55556], 8896: [.55001, 1.05, 0, 0, 1.11111], 8897: [.55001, 1.05, 0, 0, 1.11111], 8898: [.55001, 1.05, 0, 0, 1.11111], 8899: [.55001, 1.05, 0, 0, 1.11111], 8968: [.65002, 1.15, 0, 0, .52778], 8969: [.65002, 1.15, 0, 0, .52778], 8970: [.65002, 1.15, 0, 0, .52778], 8971: [.65002, 1.15, 0, 0, .52778], 10216: [.65002, 1.15, 0, 0, .61111], 10217: [.65002, 1.15, 0, 0, .61111], 10752: [.55001, 1.05, 0, 0, 1.51112], 10753: [.55001, 1.05, 0, 0, 1.51112], 10754: [.55001, 1.05, 0, 0, 1.51112], 10756: [.55001, 1.05, 0, 0, 1.11111], 10758: [.55001, 1.05, 0, 0, 1.11111] }, "Size3-Regular": { 32: [0, 0, 0, 0, .25], 40: [.95003, 1.45, 0, 0, .73611], 41: [.95003, 1.45, 0, 0, .73611], 47: [.95003, 1.45, 0, 0, 1.04445], 91: 
[.95003, 1.45, 0, 0, .52778], 92: [.95003, 1.45, 0, 0, 1.04445], 93: [.95003, 1.45, 0, 0, .52778], 123: [.95003, 1.45, 0, 0, .75], 125: [.95003, 1.45, 0, 0, .75], 160: [0, 0, 0, 0, .25], 710: [0, .75, 0, 0, 1.44445], 732: [0, .75, 0, 0, 1.44445], 770: [0, .75, 0, 0, 1.44445], 771: [0, .75, 0, 0, 1.44445], 8730: [.95003, 1.45, 0, 0, 1], 8968: [.95003, 1.45, 0, 0, .58334], 8969: [.95003, 1.45, 0, 0, .58334], 8970: [.95003, 1.45, 0, 0, .58334], 8971: [.95003, 1.45, 0, 0, .58334], 10216: [.95003, 1.45, 0, 0, .75], 10217: [.95003, 1.45, 0, 0, .75] }, "Size4-Regular": { 32: [0, 0, 0, 0, .25], 40: [1.25003, 1.75, 0, 0, .79167], 41: [1.25003, 1.75, 0, 0, .79167], 47: [1.25003, 1.75, 0, 0, 1.27778], 91: [1.25003, 1.75, 0, 0, .58334], 92: [1.25003, 1.75, 0, 0, 1.27778], 93: [1.25003, 1.75, 0, 0, .58334], 123: [1.25003, 1.75, 0, 0, .80556], 125: [1.25003, 1.75, 0, 0, .80556], 160: [0, 0, 0, 0, .25], 710: [0, .825, 0, 0, 1.8889], 732: [0, .825, 0, 0, 1.8889], 770: [0, .825, 0, 0, 1.8889], 771: [0, .825, 0, 0, 1.8889], 8730: [1.25003, 1.75, 0, 0, 1], 8968: [1.25003, 1.75, 0, 0, .63889], 8969: [1.25003, 1.75, 0, 0, .63889], 8970: [1.25003, 1.75, 0, 0, .63889], 8971: [1.25003, 1.75, 0, 0, .63889], 9115: [.64502, 1.155, 0, 0, .875], 9116: [1e-5, .6, 0, 0, .875], 9117: [.64502, 1.155, 0, 0, .875], 9118: [.64502, 1.155, 0, 0, .875], 9119: [1e-5, .6, 0, 0, .875], 9120: [.64502, 1.155, 0, 0, .875], 9121: [.64502, 1.155, 0, 0, .66667], 9122: [-99e-5, .601, 0, 0, .66667], 9123: [.64502, 1.155, 0, 0, .66667], 9124: [.64502, 1.155, 0, 0, .66667], 9125: [-99e-5, .601, 0, 0, .66667], 9126: [.64502, 1.155, 0, 0, .66667], 9127: [1e-5, .9, 0, 0, .88889], 9128: [.65002, 1.15, 0, 0, .88889], 9129: [.90001, 0, 0, 0, .88889], 9130: [0, .3, 0, 0, .88889], 9131: [1e-5, .9, 0, 0, .88889], 9132: [.65002, 1.15, 0, 0, .88889], 9133: [.90001, 0, 0, 0, .88889], 9143: [.88502, .915, 0, 0, 1.05556], 10216: [1.25003, 1.75, 0, 0, .80556], 10217: [1.25003, 1.75, 0, 0, .80556], 57344: [-.00499, .605, 0, 0, 1.05556], 57345: [-.00499, .605, 0, 0, 1.05556], 57680: [0, .12, 0, 0, .45], 57681: [0, .12, 0, 0, .45], 57682: [0, .12, 0, 0, .45], 57683: [0, .12, 0, 0, .45] }, "Typewriter-Regular": { 32: [0, 0, 0, 0, .525], 33: [0, .61111, 0, 0, .525], 34: [0, .61111, 0, 0, .525], 35: [0, .61111, 0, 0, .525], 36: [.08333, .69444, 0, 0, .525], 37: [.08333, .69444, 0, 0, .525], 38: [0, .61111, 0, 0, .525], 39: [0, .61111, 0, 0, .525], 40: [.08333, .69444, 0, 0, .525], 41: [.08333, .69444, 0, 0, .525], 42: [0, .52083, 0, 0, .525], 43: [-.08056, .53055, 0, 0, .525], 44: [.13889, .125, 0, 0, .525], 45: [-.08056, .53055, 0, 0, .525], 46: [0, .125, 0, 0, .525], 47: [.08333, .69444, 0, 0, .525], 48: [0, .61111, 0, 0, .525], 49: [0, .61111, 0, 0, .525], 50: [0, .61111, 0, 0, .525], 51: [0, .61111, 0, 0, .525], 52: [0, .61111, 0, 0, .525], 53: [0, .61111, 0, 0, .525], 54: [0, .61111, 0, 0, .525], 55: [0, .61111, 0, 0, .525], 56: [0, .61111, 0, 0, .525], 57: [0, .61111, 0, 0, .525], 58: [0, .43056, 0, 0, .525], 59: [.13889, .43056, 0, 0, .525], 60: [-.05556, .55556, 0, 0, .525], 61: [-.19549, .41562, 0, 0, .525], 62: [-.05556, .55556, 0, 0, .525], 63: [0, .61111, 0, 0, .525], 64: [0, .61111, 0, 0, .525], 65: [0, .61111, 0, 0, .525], 66: [0, .61111, 0, 0, .525], 67: [0, .61111, 0, 0, .525], 68: [0, .61111, 0, 0, .525], 69: [0, .61111, 0, 0, .525], 70: [0, .61111, 0, 0, .525], 71: [0, .61111, 0, 0, .525], 72: [0, .61111, 0, 0, .525], 73: [0, .61111, 0, 0, .525], 74: [0, .61111, 0, 0, .525], 75: [0, .61111, 0, 0, .525], 76: [0, .61111, 0, 0, .525], 77: [0, 
.61111, 0, 0, .525], 78: [0, .61111, 0, 0, .525], 79: [0, .61111, 0, 0, .525], 80: [0, .61111, 0, 0, .525], 81: [.13889, .61111, 0, 0, .525], 82: [0, .61111, 0, 0, .525], 83: [0, .61111, 0, 0, .525], 84: [0, .61111, 0, 0, .525], 85: [0, .61111, 0, 0, .525], 86: [0, .61111, 0, 0, .525], 87: [0, .61111, 0, 0, .525], 88: [0, .61111, 0, 0, .525], 89: [0, .61111, 0, 0, .525], 90: [0, .61111, 0, 0, .525], 91: [.08333, .69444, 0, 0, .525], 92: [.08333, .69444, 0, 0, .525], 93: [.08333, .69444, 0, 0, .525], 94: [0, .61111, 0, 0, .525], 95: [.09514, 0, 0, 0, .525], 96: [0, .61111, 0, 0, .525], 97: [0, .43056, 0, 0, .525], 98: [0, .61111, 0, 0, .525], 99: [0, .43056, 0, 0, .525], 100: [0, .61111, 0, 0, .525], 101: [0, .43056, 0, 0, .525], 102: [0, .61111, 0, 0, .525], 103: [.22222, .43056, 0, 0, .525], 104: [0, .61111, 0, 0, .525], 105: [0, .61111, 0, 0, .525], 106: [.22222, .61111, 0, 0, .525], 107: [0, .61111, 0, 0, .525], 108: [0, .61111, 0, 0, .525], 109: [0, .43056, 0, 0, .525], 110: [0, .43056, 0, 0, .525], 111: [0, .43056, 0, 0, .525], 112: [.22222, .43056, 0, 0, .525], 113: [.22222, .43056, 0, 0, .525], 114: [0, .43056, 0, 0, .525], 115: [0, .43056, 0, 0, .525], 116: [0, .55358, 0, 0, .525], 117: [0, .43056, 0, 0, .525], 118: [0, .43056, 0, 0, .525], 119: [0, .43056, 0, 0, .525], 120: [0, .43056, 0, 0, .525], 121: [.22222, .43056, 0, 0, .525], 122: [0, .43056, 0, 0, .525], 123: [.08333, .69444, 0, 0, .525], 124: [.08333, .69444, 0, 0, .525], 125: [.08333, .69444, 0, 0, .525], 126: [0, .61111, 0, 0, .525], 127: [0, .61111, 0, 0, .525], 160: [0, 0, 0, 0, .525], 176: [0, .61111, 0, 0, .525], 184: [.19445, 0, 0, 0, .525], 305: [0, .43056, 0, 0, .525], 567: [.22222, .43056, 0, 0, .525], 711: [0, .56597, 0, 0, .525], 713: [0, .56555, 0, 0, .525], 714: [0, .61111, 0, 0, .525], 715: [0, .61111, 0, 0, .525], 728: [0, .61111, 0, 0, .525], 730: [0, .61111, 0, 0, .525], 770: [0, .61111, 0, 0, .525], 771: [0, .61111, 0, 0, .525], 776: [0, .61111, 0, 0, .525], 915: [0, .61111, 0, 0, .525], 916: [0, .61111, 0, 0, .525], 920: [0, .61111, 0, 0, .525], 923: [0, .61111, 0, 0, .525], 926: [0, .61111, 0, 0, .525], 928: [0, .61111, 0, 0, .525], 931: [0, .61111, 0, 0, .525], 933: [0, .61111, 0, 0, .525], 934: [0, .61111, 0, 0, .525], 936: [0, .61111, 0, 0, .525], 937: [0, .61111, 0, 0, .525], 8216: [0, .61111, 0, 0, .525], 8217: [0, .61111, 0, 0, .525], 8242: [0, .61111, 0, 0, .525], 9251: [.11111, .21944, 0, 0, .525] } }; const B = { slant: [.25, .25, .25], space: [0, 0, 0], stretch: [0, 0, 0], shrink: [0, 0, 0], xHeight: [.431, .431, .431], quad: [1, 1.171, 1.472], extraSpace: [0, 0, 0], num1: [.677, .732, .925], num2: [.394, .384, .387], num3: [.444, .471, .504], denom1: [.686, .752, 1.025], denom2: [.345, .344, .532], sup1: [.413, .503, .504], sup2: [.363, .431, .404], sup3: [.289, .286, .294], sub1: [.15, .143, .2], sub2: [.247, .286, .4], supDrop: [.386, .353, .494], subDrop: [.05, .071, .1], delim1: [2.39, 1.7, 1.98], delim2: [1.01, 1.157, 1.42], axisHeight: [.25, .25, .25], defaultRuleThickness: [.04, .049, .049], bigOpSpacing1: [.111, .111, .111], bigOpSpacing2: [.166, .166, .166], bigOpSpacing3: [.2, .2, .2], bigOpSpacing4: [.6, .611, .611], bigOpSpacing5: [.1, .143, .143], sqrtRuleThickness: [.04, .04, .04], ptPerEm: [10, 10, 10], doubleRuleSep: [.2, .2, .2], arrayRuleWidth: [.04, .04, .04], fboxsep: [.3, .3, .3], fboxrule: [.04, .04, .04] }, C = { "\xc5": "A", "\xd0": "D", "\xde": "o", "\xe5": "a", "\xf0": "d", "\xfe": "o", "\u0410": "A", "\u0411": "B", "\u0412": "B", "\u0413": "F", "\u0414": "A", 
"\u0415": "E", "\u0416": "K", "\u0417": "3", "\u0418": "N", "\u0419": "N", "\u041a": "K", "\u041b": "N", "\u041c": "M", "\u041d": "H", "\u041e": "O", "\u041f": "N", "\u0420": "P", "\u0421": "C", "\u0422": "T", "\u0423": "y", "\u0424": "O", "\u0425": "X", "\u0426": "U", "\u0427": "h", "\u0428": "W", "\u0429": "W", "\u042a": "B", "\u042b": "X", "\u042c": "B", "\u042d": "3", "\u042e": "X", "\u042f": "R", "\u0430": "a", "\u0431": "b", "\u0432": "a", "\u0433": "r", "\u0434": "y", "\u0435": "e", "\u0436": "m", "\u0437": "e", "\u0438": "n", "\u0439": "n", "\u043a": "n", "\u043b": "n", "\u043c": "m", "\u043d": "n", "\u043e": "o", "\u043f": "n", "\u0440": "p", "\u0441": "c", "\u0442": "o", "\u0443": "y", "\u0444": "b", "\u0445": "x", "\u0446": "n", "\u0447": "n", "\u0448": "w", "\u0449": "w", "\u044a": "a", "\u044b": "m", "\u044c": "a", "\u044d": "e", "\u044e": "m", "\u044f": "r" }; function N(e, t, r) { if (!T[t]) throw new Error("Font metrics not found for font: " + t + "."); let n = e.charCodeAt(0), o = T[t][n]; if (!o && e[0] in C && (n = C[e[0]].charCodeAt(0), o = T[t][n]), o || "text" !== r || S(n) && (o = T[t][77]), o) return { depth: o[0], height: o[1], italic: o[2], skew: o[3], width: o[4] } } const q = {}; const I = [[1, 1, 1], [2, 1, 1], [3, 1, 1], [4, 2, 1], [5, 2, 1], [6, 3, 1], [7, 4, 2], [8, 6, 3], [9, 7, 6], [10, 8, 7], [11, 10, 9]], R = [.5, .6, .7, .8, .9, 1, 1.2, 1.44, 1.728, 2.074, 2.488], H = function (e, t) { return t.size < 2 ? e : I[e - 1][t.size - 1] }; class O { constructor(e) { this.style = void 0, this.color = void 0, this.size = void 0, this.textSize = void 0, this.phantom = void 0, this.font = void 0, this.fontFamily = void 0, this.fontWeight = void 0, this.fontShape = void 0, this.sizeMultiplier = void 0, this.maxSize = void 0, this.minRuleThickness = void 0, this._fontMetrics = void 0, this.style = e.style, this.color = e.color, this.size = e.size || O.BASESIZE, this.textSize = e.textSize || this.size, this.phantom = !!e.phantom, this.font = e.font || "", this.fontFamily = e.fontFamily || "", this.fontWeight = e.fontWeight || "", this.fontShape = e.fontShape || "", this.sizeMultiplier = R[this.size - 1], this.maxSize = e.maxSize, this.minRuleThickness = e.minRuleThickness, this._fontMetrics = void 0 } extend(e) { const t = { style: this.style, size: this.size, textSize: this.textSize, color: this.color, phantom: this.phantom, font: this.font, fontFamily: this.fontFamily, fontWeight: this.fontWeight, fontShape: this.fontShape, maxSize: this.maxSize, minRuleThickness: this.minRuleThickness }; for (const r in e) e.hasOwnProperty(r) && (t[r] = e[r]); return new O(t) } havingStyle(e) { return this.style === e ? this : this.extend({ style: e, size: H(this.textSize, e) }) } havingCrampedStyle() { return this.havingStyle(this.style.cramp()) } havingSize(e) { return this.size === e && this.textSize === e ? this : this.extend({ style: this.style.text(), size: e, textSize: e, sizeMultiplier: R[e - 1] }) } havingBaseStyle(e) { e = e || this.style.text(); const t = H(O.BASESIZE, e); return this.size === t && this.textSize === O.BASESIZE && this.style === e ? 
this : this.extend({ style: e, size: t }) } havingBaseSizing() { let e; switch (this.style.id) { case 4: case 5: e = 3; break; case 6: case 7: e = 1; break; default: e = 6 }return this.extend({ style: this.style.text(), size: e }) } withColor(e) { return this.extend({ color: e }) } withPhantom() { return this.extend({ phantom: !0 }) } withFont(e) { return this.extend({ font: e }) } withTextFontFamily(e) { return this.extend({ fontFamily: e, font: "" }) } withTextFontWeight(e) { return this.extend({ fontWeight: e, font: "" }) } withTextFontShape(e) { return this.extend({ fontShape: e, font: "" }) } sizingClasses(e) { return e.size !== this.size ? ["sizing", "reset-size" + e.size, "size" + this.size] : [] } baseSizingClasses() { return this.size !== O.BASESIZE ? ["sizing", "reset-size" + this.size, "size" + O.BASESIZE] : [] } fontMetrics() { return this._fontMetrics || (this._fontMetrics = function (e) { let t; if (t = e >= 5 ? 0 : e >= 3 ? 1 : 2, !q[t]) { const e = q[t] = { cssEmPerMu: B.quad[t] / 18 }; for (const r in B) B.hasOwnProperty(r) && (e[r] = B[r][t]) } return q[t] }(this.size)), this._fontMetrics } getColor() { return this.phantom ? "transparent" : this.color } } O.BASESIZE = 6; var E = O; const L = { pt: 1, mm: 7227 / 2540, cm: 7227 / 254, in: 72.27, bp: 1.00375, pc: 12, dd: 1238 / 1157, cc: 14856 / 1157, nd: 685 / 642, nc: 1370 / 107, sp: 1 / 65536, px: 1.00375 }, D = { ex: !0, em: !0, mu: !0 }, V = function (e) { return "string" != typeof e && (e = e.unit), e in L || e in D || "ex" === e }, P = function (e, t) { let r; if (e.unit in L) r = L[e.unit] / t.fontMetrics().ptPerEm / t.sizeMultiplier; else if ("mu" === e.unit) r = t.fontMetrics().cssEmPerMu; else { let o; if (o = t.style.isTight() ? t.havingStyle(t.style.text()) : t, "ex" === e.unit) r = o.fontMetrics().xHeight; else { if ("em" !== e.unit) throw new n("Invalid unit: '" + e.unit + "'"); r = o.fontMetrics().quad } o !== t && (r *= o.sizeMultiplier / t.sizeMultiplier) } return Math.min(e.number * r, t.maxSize) }, F = function (e) { return +e.toFixed(4) + "em" }, G = function (e) { return e.filter((e => e)).join(" ") }, U = function (e, t, r) { if (this.classes = e || [], this.attributes = {}, this.height = 0, this.depth = 0, this.maxFontSize = 0, this.style = r || {}, t) { t.style.isTight() && this.classes.push("mtight"); const e = t.getColor(); e && (this.style.color = e) } }, Y = function (e) { const t = document.createElement(e); t.className = G(this.classes); for (const e in this.style) this.style.hasOwnProperty(e) && (t.style[e] = this.style[e]); for (const e in this.attributes) this.attributes.hasOwnProperty(e) && t.setAttribute(e, this.attributes[e]); for (let e = 0; e < this.children.length; e++)t.appendChild(this.children[e].toNode()); return t }, X = function (e) { let t = "<" + e; this.classes.length && (t += ' class="' + l.escape(G(this.classes)) + '"'); let r = ""; for (const e in this.style) this.style.hasOwnProperty(e) && (r += l.hyphenate(e) + ":" + this.style[e] + ";"); r && (t += ' style="' + l.escape(r) + '"'); for (const e in this.attributes) this.attributes.hasOwnProperty(e) && (t += " " + e + '="' + l.escape(this.attributes[e]) + '"'); t += ">"; for (let e = 0; e < this.children.length; e++)t += this.children[e].toMarkup(); return t += "", t }; class W { constructor(e, t, r, n) { this.children = void 0, this.attributes = void 0, this.classes = void 0, this.height = void 0, this.depth = void 0, this.width = void 0, this.maxFontSize = void 0, this.style = void 0, U.call(this, e, r, n), this.children 
= t || [] } setAttribute(e, t) { this.attributes[e] = t } hasClass(e) { return l.contains(this.classes, e) } toNode() { return Y.call(this, "span") } toMarkup() { return X.call(this, "span") } } class _ { constructor(e, t, r, n) { this.children = void 0, this.attributes = void 0, this.classes = void 0, this.height = void 0, this.depth = void 0, this.maxFontSize = void 0, this.style = void 0, U.call(this, t, n), this.children = r || [], this.setAttribute("href", e) } setAttribute(e, t) { this.attributes[e] = t } hasClass(e) { return l.contains(this.classes, e) } toNode() { return Y.call(this, "a") } toMarkup() { return X.call(this, "a") } } class j { constructor(e, t, r) { this.src = void 0, this.alt = void 0, this.classes = void 0, this.height = void 0, this.depth = void 0, this.maxFontSize = void 0, this.style = void 0, this.alt = t, this.src = e, this.classes = ["mord"], this.style = r } hasClass(e) { return l.contains(this.classes, e) } toNode() { const e = document.createElement("img"); e.src = this.src, e.alt = this.alt, e.className = "mord"; for (const t in this.style) this.style.hasOwnProperty(t) && (e.style[t] = this.style[t]); return e } toMarkup() { let e = '' + l.escape(this.alt) + '= n[0] && e <= n[1]) return r.name } } return null }(this.text.charCodeAt(0)); l && this.classes.push(l + "_fallback"), /[\xee\xef\xed\xec]/.test(this.text) && (this.text = $[this.text]) } hasClass(e) { return l.contains(this.classes, e) } toNode() { const e = document.createTextNode(this.text); let t = null; this.italic > 0 && (t = document.createElement("span"), t.style.marginRight = F(this.italic)), this.classes.length > 0 && (t = t || document.createElement("span"), t.className = G(this.classes)); for (const e in this.style) this.style.hasOwnProperty(e) && (t = t || document.createElement("span"), t.style[e] = this.style[e]); return t ? (t.appendChild(e), t) : e } toMarkup() { let e = !1, t = " 0 && (r += "margin-right:" + this.italic + "em;"); for (const e in this.style) this.style.hasOwnProperty(e) && (r += l.hyphenate(e) + ":" + this.style[e] + ";"); r && (e = !0, t += ' style="' + l.escape(r) + '"'); const n = l.escape(this.text); return e ? 
(t += ">", t += n, t += "", t) : n } } class K { constructor(e, t) { this.children = void 0, this.attributes = void 0, this.children = e || [], this.attributes = t || {} } toNode() { const e = document.createElementNS("http://www.w3.org/2000/svg", "svg"); for (const t in this.attributes) Object.prototype.hasOwnProperty.call(this.attributes, t) && e.setAttribute(t, this.attributes[t]); for (let t = 0; t < this.children.length; t++)e.appendChild(this.children[t].toNode()); return e } toMarkup() { let e = '' : '' } } class Q { constructor(e) { this.attributes = void 0, this.attributes = e || {} } toNode() { const e = document.createElementNS("http://www.w3.org/2000/svg", "line"); for (const t in this.attributes) Object.prototype.hasOwnProperty.call(this.attributes, t) && e.setAttribute(t, this.attributes[t]); return e } toMarkup() { let e = "", "\\gt", !0), se(ie, le, ye, "\u2208", "\\in", !0), se(ie, le, ye, "\ue020", "\\@not"), se(ie, le, ye, "\u2282", "\\subset", !0), se(ie, le, ye, "\u2283", "\\supset", !0), se(ie, le, ye, "\u2286", "\\subseteq", !0), se(ie, le, ye, "\u2287", "\\supseteq", !0), se(ie, he, ye, "\u2288", "\\nsubseteq", !0), se(ie, he, ye, "\u2289", "\\nsupseteq", !0), se(ie, le, ye, "\u22a8", "\\models"), se(ie, le, ye, "\u2190", "\\leftarrow", !0), se(ie, le, ye, "\u2264", "\\le"), se(ie, le, ye, "\u2264", "\\leq", !0), se(ie, le, ye, "<", "\\lt", !0), se(ie, le, ye, "\u2192", "\\rightarrow", !0), se(ie, le, ye, "\u2192", "\\to"), se(ie, he, ye, "\u2271", "\\ngeq", !0), se(ie, he, ye, "\u2270", "\\nleq", !0), se(ie, le, xe, "\xa0", "\\ "), se(ie, le, xe, "\xa0", "\\space"), se(ie, le, xe, "\xa0", "\\nobreakspace"), se(ae, le, xe, "\xa0", "\\ "), se(ae, le, xe, "\xa0", " "), se(ae, le, xe, "\xa0", "\\space"), se(ae, le, xe, "\xa0", "\\nobreakspace"), se(ie, le, xe, null, "\\nobreak"), se(ie, le, xe, null, "\\allowbreak"), se(ie, le, be, ",", ","), se(ie, le, be, ";", ";"), se(ie, he, me, "\u22bc", "\\barwedge", !0), se(ie, he, me, "\u22bb", "\\veebar", !0), se(ie, le, me, "\u2299", "\\odot", !0), se(ie, le, me, "\u2295", "\\oplus", !0), se(ie, le, me, "\u2297", "\\otimes", !0), se(ie, le, we, "\u2202", "\\partial", !0), se(ie, le, me, "\u2298", "\\oslash", !0), se(ie, he, me, "\u229a", "\\circledcirc", !0), se(ie, he, me, "\u22a1", "\\boxdot", !0), se(ie, le, me, "\u25b3", "\\bigtriangleup"), se(ie, le, me, "\u25bd", "\\bigtriangledown"), se(ie, le, me, "\u2020", "\\dagger"), se(ie, le, me, "\u22c4", "\\diamond"), se(ie, le, me, "\u22c6", "\\star"), se(ie, le, me, "\u25c3", "\\triangleleft"), se(ie, le, me, "\u25b9", "\\triangleright"), se(ie, le, fe, "{", "\\{"), se(ae, le, we, "{", "\\{"), se(ae, le, we, "{", "\\textbraceleft"), se(ie, le, pe, "}", "\\}"), se(ae, le, we, "}", "\\}"), se(ae, le, we, "}", "\\textbraceright"), se(ie, le, fe, "{", "\\lbrace"), se(ie, le, pe, "}", "\\rbrace"), se(ie, le, fe, "[", "\\lbrack", !0), se(ae, le, we, "[", "\\lbrack", !0), se(ie, le, pe, "]", "\\rbrack", !0), se(ae, le, we, "]", "\\rbrack", !0), se(ie, le, fe, "(", "\\lparen", !0), se(ie, le, pe, ")", "\\rparen", !0), se(ae, le, we, "<", "\\textless", !0), se(ae, le, we, ">", "\\textgreater", !0), se(ie, le, fe, "\u230a", "\\lfloor", !0), se(ie, le, pe, "\u230b", "\\rfloor", !0), se(ie, le, fe, "\u2308", "\\lceil", !0), se(ie, le, pe, "\u2309", "\\rceil", !0), se(ie, le, we, "\\", "\\backslash"), se(ie, le, we, "\u2223", "|"), se(ie, le, we, "\u2223", "\\vert"), se(ae, le, we, "|", "\\textbar", !0), se(ie, le, we, "\u2225", "\\|"), se(ie, le, we, "\u2225", "\\Vert"), se(ae, le, we, 
"\u2225", "\\textbardbl"), se(ae, le, we, "~", "\\textasciitilde"), se(ae, le, we, "\\", "\\textbackslash"), se(ae, le, we, "^", "\\textasciicircum"), se(ie, le, ye, "\u2191", "\\uparrow", !0), se(ie, le, ye, "\u21d1", "\\Uparrow", !0), se(ie, le, ye, "\u2193", "\\downarrow", !0), se(ie, le, ye, "\u21d3", "\\Downarrow", !0), se(ie, le, ye, "\u2195", "\\updownarrow", !0), se(ie, le, ye, "\u21d5", "\\Updownarrow", !0), se(ie, le, ge, "\u2210", "\\coprod"), se(ie, le, ge, "\u22c1", "\\bigvee"), se(ie, le, ge, "\u22c0", "\\bigwedge"), se(ie, le, ge, "\u2a04", "\\biguplus"), se(ie, le, ge, "\u22c2", "\\bigcap"), se(ie, le, ge, "\u22c3", "\\bigcup"), se(ie, le, ge, "\u222b", "\\int"), se(ie, le, ge, "\u222b", "\\intop"), se(ie, le, ge, "\u222c", "\\iint"), se(ie, le, ge, "\u222d", "\\iiint"), se(ie, le, ge, "\u220f", "\\prod"), se(ie, le, ge, "\u2211", "\\sum"), se(ie, le, ge, "\u2a02", "\\bigotimes"), se(ie, le, ge, "\u2a01", "\\bigoplus"), se(ie, le, ge, "\u2a00", "\\bigodot"), se(ie, le, ge, "\u222e", "\\oint"), se(ie, le, ge, "\u222f", "\\oiint"), se(ie, le, ge, "\u2230", "\\oiiint"), se(ie, le, ge, "\u2a06", "\\bigsqcup"), se(ie, le, ge, "\u222b", "\\smallint"), se(ae, le, ue, "\u2026", "\\textellipsis"), se(ie, le, ue, "\u2026", "\\mathellipsis"), se(ae, le, ue, "\u2026", "\\ldots", !0), se(ie, le, ue, "\u2026", "\\ldots", !0), se(ie, le, ue, "\u22ef", "\\@cdots", !0), se(ie, le, ue, "\u22f1", "\\ddots", !0), se(ie, le, we, "\u22ee", "\\varvdots"), se(ie, le, ce, "\u02ca", "\\acute"), se(ie, le, ce, "\u02cb", "\\grave"), se(ie, le, ce, "\xa8", "\\ddot"), se(ie, le, ce, "~", "\\tilde"), se(ie, le, ce, "\u02c9", "\\bar"), se(ie, le, ce, "\u02d8", "\\breve"), se(ie, le, ce, "\u02c7", "\\check"), se(ie, le, ce, "^", "\\hat"), se(ie, le, ce, "\u20d7", "\\vec"), se(ie, le, ce, "\u02d9", "\\dot"), se(ie, le, ce, "\u02da", "\\mathring"), se(ie, le, de, "\ue131", "\\@imath"), se(ie, le, de, "\ue237", "\\@jmath"), se(ie, le, we, "\u0131", "\u0131"), se(ie, le, we, "\u0237", "\u0237"), se(ae, le, we, "\u0131", "\\i", !0), se(ae, le, we, "\u0237", "\\j", !0), se(ae, le, we, "\xdf", "\\ss", !0), se(ae, le, we, "\xe6", "\\ae", !0), se(ae, le, we, "\u0153", "\\oe", !0), se(ae, le, we, "\xf8", "\\o", !0), se(ae, le, we, "\xc6", "\\AE", !0), se(ae, le, we, "\u0152", "\\OE", !0), se(ae, le, we, "\xd8", "\\O", !0), se(ae, le, ce, "\u02ca", "\\'"), se(ae, le, ce, "\u02cb", "\\`"), se(ae, le, ce, "\u02c6", "\\^"), se(ae, le, ce, "\u02dc", "\\~"), se(ae, le, ce, "\u02c9", "\\="), se(ae, le, ce, "\u02d8", "\\u"), se(ae, le, ce, "\u02d9", "\\."), se(ae, le, ce, "\xb8", "\\c"), se(ae, le, ce, "\u02da", "\\r"), se(ae, le, ce, "\u02c7", "\\v"), se(ae, le, ce, "\xa8", '\\"'), se(ae, le, ce, "\u02dd", "\\H"), se(ae, le, ce, "\u25ef", "\\textcircled"); const ve = { "--": !0, "---": !0, "``": !0, "''": !0 }; se(ae, le, we, "\u2013", "--", !0), se(ae, le, we, "\u2013", "\\textendash"), se(ae, le, we, "\u2014", "---", !0), se(ae, le, we, "\u2014", "\\textemdash"), se(ae, le, we, "\u2018", "`", !0), se(ae, le, we, "\u2018", "\\textquoteleft"), se(ae, le, we, "\u2019", "'", !0), se(ae, le, we, "\u2019", "\\textquoteright"), se(ae, le, we, "\u201c", "``", !0), se(ae, le, we, "\u201c", "\\textquotedblleft"), se(ae, le, we, "\u201d", "''", !0), se(ae, le, we, "\u201d", "\\textquotedblright"), se(ie, le, we, "\xb0", "\\degree", !0), se(ae, le, we, "\xb0", "\\degree"), se(ae, le, we, "\xb0", "\\textdegree", !0), se(ie, le, we, "\xa3", "\\pounds"), se(ie, le, we, "\xa3", "\\mathsterling", !0), se(ae, le, we, "\xa3", "\\pounds"), 
se(ae, le, we, "\xa3", "\\textsterling", !0), se(ie, he, we, "\u2720", "\\maltese"), se(ae, he, we, "\u2720", "\\maltese"); const ke = '0123456789/@."'; for (let e = 0; e < ke.length; e++) { const t = ke.charAt(e); se(ie, le, we, t, t) } const Se = '0123456789!@*()-=+";:?/.,'; for (let e = 0; e < Se.length; e++) { const t = Se.charAt(e); se(ae, le, we, t, t) } const Me = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; for (let e = 0; e < Me.length; e++) { const t = Me.charAt(e); se(ie, le, de, t, t), se(ae, le, we, t, t) } se(ie, he, we, "C", "\u2102"), se(ae, he, we, "C", "\u2102"), se(ie, he, we, "H", "\u210d"), se(ae, he, we, "H", "\u210d"), se(ie, he, we, "N", "\u2115"), se(ae, he, we, "N", "\u2115"), se(ie, he, we, "P", "\u2119"), se(ae, he, we, "P", "\u2119"), se(ie, he, we, "Q", "\u211a"), se(ae, he, we, "Q", "\u211a"), se(ie, he, we, "R", "\u211d"), se(ae, he, we, "R", "\u211d"), se(ie, he, we, "Z", "\u2124"), se(ae, he, we, "Z", "\u2124"), se(ie, le, de, "h", "\u210e"), se(ae, le, de, "h", "\u210e"); let ze = ""; for (let e = 0; e < Me.length; e++) { const t = Me.charAt(e); ze = String.fromCharCode(55349, 56320 + e), se(ie, le, de, t, ze), se(ae, le, we, t, ze), ze = String.fromCharCode(55349, 56372 + e), se(ie, le, de, t, ze), se(ae, le, we, t, ze), ze = String.fromCharCode(55349, 56424 + e), se(ie, le, de, t, ze), se(ae, le, we, t, ze), ze = String.fromCharCode(55349, 56580 + e), se(ie, le, de, t, ze), se(ae, le, we, t, ze), ze = String.fromCharCode(55349, 56684 + e), se(ie, le, de, t, ze), se(ae, le, we, t, ze), ze = String.fromCharCode(55349, 56736 + e), se(ie, le, de, t, ze), se(ae, le, we, t, ze), ze = String.fromCharCode(55349, 56788 + e), se(ie, le, de, t, ze), se(ae, le, we, t, ze), ze = String.fromCharCode(55349, 56840 + e), se(ie, le, de, t, ze), se(ae, le, we, t, ze), ze = String.fromCharCode(55349, 56944 + e), se(ie, le, de, t, ze), se(ae, le, we, t, ze), e < 26 && (ze = String.fromCharCode(55349, 56632 + e), se(ie, le, de, t, ze), se(ae, le, we, t, ze), ze = String.fromCharCode(55349, 56476 + e), se(ie, le, de, t, ze), se(ae, le, we, t, ze)) } ze = String.fromCharCode(55349, 56668), se(ie, le, de, "k", ze), se(ae, le, we, "k", ze); for (let e = 0; e < 10; e++) { const t = e.toString(); ze = String.fromCharCode(55349, 57294 + e), se(ie, le, de, t, ze), se(ae, le, we, t, ze), ze = String.fromCharCode(55349, 57314 + e), se(ie, le, de, t, ze), se(ae, le, we, t, ze), ze = String.fromCharCode(55349, 57324 + e), se(ie, le, de, t, ze), se(ae, le, we, t, ze), ze = String.fromCharCode(55349, 57334 + e), se(ie, le, de, t, ze), se(ae, le, we, t, ze) } const Ae = "\xd0\xde\xfe"; for (let e = 0; e < Ae.length; e++) { const t = Ae.charAt(e); se(ie, le, de, t, t), se(ae, le, we, t, t) } const Te = [["mathbf", "textbf", "Main-Bold"], ["mathbf", "textbf", "Main-Bold"], ["mathnormal", "textit", "Math-Italic"], ["mathnormal", "textit", "Math-Italic"], ["boldsymbol", "boldsymbol", "Main-BoldItalic"], ["boldsymbol", "boldsymbol", "Main-BoldItalic"], ["mathscr", "textscr", "Script-Regular"], ["", "", ""], ["", "", ""], ["", "", ""], ["mathfrak", "textfrak", "Fraktur-Regular"], ["mathfrak", "textfrak", "Fraktur-Regular"], ["mathbb", "textbb", "AMS-Regular"], ["mathbb", "textbb", "AMS-Regular"], ["mathboldfrak", "textboldfrak", "Fraktur-Regular"], ["mathboldfrak", "textboldfrak", "Fraktur-Regular"], ["mathsf", "textsf", "SansSerif-Regular"], ["mathsf", "textsf", "SansSerif-Regular"], ["mathboldsf", "textboldsf", "SansSerif-Bold"], ["mathboldsf", "textboldsf", "SansSerif-Bold"], 
["mathitsf", "textitsf", "SansSerif-Italic"], ["mathitsf", "textitsf", "SansSerif-Italic"], ["", "", ""], ["", "", ""], ["mathtt", "texttt", "Typewriter-Regular"], ["mathtt", "texttt", "Typewriter-Regular"]], Be = [["mathbf", "textbf", "Main-Bold"], ["", "", ""], ["mathsf", "textsf", "SansSerif-Regular"], ["mathboldsf", "textboldsf", "SansSerif-Bold"], ["mathtt", "texttt", "Typewriter-Regular"]], Ce = function (e, t, r) { return oe[r][e] && oe[r][e].replace && (e = oe[r][e].replace), { value: e, metrics: N(e, t, r) } }, Ne = function (e, t, r, n, o) { const s = Ce(e, t, r), i = s.metrics; let a; if (e = s.value, i) { let t = i.italic; ("text" === r || n && "mathit" === n.font) && (t = 0), a = new Z(e, i.height, i.depth, t, i.skew, i.width, o) } else "undefined" != typeof console && console.warn("No character metrics for '" + e + "' in style '" + t + "' and mode '" + r + "'"), a = new Z(e, 0, 0, 0, 0, 0, o); if (n) { a.maxFontSize = n.sizeMultiplier, n.style.isTight() && a.classes.push("mtight"); const e = n.getColor(); e && (a.style.color = e) } return a }, qe = (e, t) => { if (G(e.classes) !== G(t.classes) || e.skew !== t.skew || e.maxFontSize !== t.maxFontSize) return !1; if (1 === e.classes.length) { const t = e.classes[0]; if ("mbin" === t || "mord" === t) return !1 } for (const r in e.style) if (e.style.hasOwnProperty(r) && e.style[r] !== t.style[r]) return !1; for (const r in t.style) if (t.style.hasOwnProperty(r) && e.style[r] !== t.style[r]) return !1; return !0 }, Ie = function (e) { let t = 0, r = 0, n = 0; for (let o = 0; o < e.children.length; o++) { const s = e.children[o]; s.height > t && (t = s.height), s.depth > r && (r = s.depth), s.maxFontSize > n && (n = s.maxFontSize) } e.height = t, e.depth = r, e.maxFontSize = n }, Re = function (e, t, r, n) { const o = new W(e, t, r, n); return Ie(o), o }, He = (e, t, r, n) => new W(e, t, r, n), Oe = function (e) { const t = new A(e); return Ie(t), t }, Ee = function (e, t, r) { let n, o = ""; switch (e) { case "amsrm": o = "AMS"; break; case "textrm": o = "Main"; break; case "textsf": o = "SansSerif"; break; case "texttt": o = "Typewriter"; break; default: o = e }return n = "textbf" === t && "textit" === r ? "BoldItalic" : "textbf" === t ? "Bold" : "textit" === t ? "Italic" : "Regular", o + "-" + n }, Le = { mathbf: { variant: "bold", fontName: "Main-Bold" }, mathrm: { variant: "normal", fontName: "Main-Regular" }, textit: { variant: "italic", fontName: "Main-Italic" }, mathit: { variant: "italic", fontName: "Main-Italic" }, mathnormal: { variant: "italic", fontName: "Math-Italic" }, mathbb: { variant: "double-struck", fontName: "AMS-Regular" }, mathcal: { variant: "script", fontName: "Caligraphic-Regular" }, mathfrak: { variant: "fraktur", fontName: "Fraktur-Regular" }, mathscr: { variant: "script", fontName: "Script-Regular" }, mathsf: { variant: "sans-serif", fontName: "SansSerif-Regular" }, mathtt: { variant: "monospace", fontName: "Typewriter-Regular" } }, De = { vec: ["vec", .471, .714], oiintSize1: ["oiintSize1", .957, .499], oiintSize2: ["oiintSize2", 1.472, .659], oiiintSize1: ["oiiintSize1", 1.304, .499], oiiintSize2: ["oiiintSize2", 1.98, .659] }; var Ve = { fontMap: Le, makeSymbol: Ne, mathsym: function (e, t, r, n) { return void 0 === n && (n = []), "boldsymbol" === r.font && Ce(e, "Main-Bold", t).metrics ? Ne(e, "Main-Bold", t, r, n.concat(["mathbf"])) : "\\" === e || "main" === oe[t][e].font ? 
Ne(e, "Main-Regular", t, r, n) : Ne(e, "AMS-Regular", t, r, n.concat(["amsrm"])) }, makeSpan: Re, makeSvgSpan: He, makeLineSpan: function (e, t, r) { const n = Re([e], [], t); return n.height = Math.max(r || t.fontMetrics().defaultRuleThickness, t.minRuleThickness), n.style.borderBottomWidth = F(n.height), n.maxFontSize = 1, n }, makeAnchor: function (e, t, r, n) { const o = new _(e, t, r, n); return Ie(o), o }, makeFragment: Oe, wrapFragment: function (e, t) { return e instanceof A ? Re([], [e], t) : e }, makeVList: function (e, t) { const { children: r, depth: n } = function (e) { if ("individualShift" === e.positionType) { const t = e.children, r = [t[0]], n = -t[0].shift - t[0].elem.depth; let o = n; for (let e = 1; e < t.length; e++) { const n = -t[e].shift - o - t[e].elem.depth, s = n - (t[e - 1].elem.height + t[e - 1].elem.depth); o += n, r.push({ type: "kern", size: s }), r.push(t[e]) } return { children: r, depth: n } } let t; if ("top" === e.positionType) { let r = e.positionData; for (let t = 0; t < e.children.length; t++) { const n = e.children[t]; r -= "kern" === n.type ? n.size : n.elem.height + n.elem.depth } t = r } else if ("bottom" === e.positionType) t = -e.positionData; else { const r = e.children[0]; if ("elem" !== r.type) throw new Error('First child must have type "elem".'); if ("shift" === e.positionType) t = -r.elem.depth - e.positionData; else { if ("firstBaseline" !== e.positionType) throw new Error("Invalid positionType " + e.positionType + "."); t = -r.elem.depth } } return { children: e.children, depth: t } }(e); let o = 0; for (let e = 0; e < r.length; e++) { const t = r[e]; if ("elem" === t.type) { const e = t.elem; o = Math.max(o, e.maxFontSize, e.height) } } o += 2; const s = Re(["pstrut"], []); s.style.height = F(o); const i = []; let a = n, l = n, h = n; for (let e = 0; e < r.length; e++) { const t = r[e]; if ("kern" === t.type) h += t.size; else { const e = t.elem, r = t.wrapperClasses || [], n = t.wrapperStyle || {}, a = Re(r, [s, e], void 0, n); a.style.top = F(-o - h - e.depth), t.marginLeft && (a.style.marginLeft = t.marginLeft), t.marginRight && (a.style.marginRight = t.marginRight), i.push(a), h += e.height + e.depth } a = Math.min(a, h), l = Math.max(l, h) } const c = Re(["vlist"], i); let m; if (c.style.height = F(l), a < 0) { const e = Re([], []), t = Re(["vlist"], [e]); t.style.height = F(-a); const r = Re(["vlist-s"], [new Z("\u200b")]); m = [Re(["vlist-r"], [c, r]), Re(["vlist-r"], [t])] } else m = [Re(["vlist-r"], [c])]; const p = Re(["vlist-t"], m); return 2 === m.length && p.classes.push("vlist-t2"), p.height = l, p.depth = -a, p }, makeOrd: function (e, t, r) { const o = e.mode, s = e.text, i = ["mord"], a = "math" === o || "text" === o && t.font, l = a ? t.font : t.fontFamily; let h = "", c = ""; if (55349 === s.charCodeAt(0) && ([h, c] = function (e, t) { const r = 1024 * (e.charCodeAt(0) - 55296) + (e.charCodeAt(1) - 56320) + 65536, o = "math" === t ? 0 : 1; if (119808 <= r && r < 120484) { const e = Math.floor((r - 119808) / 26); return [Te[e][2], Te[e][o]] } if (120782 <= r && r <= 120831) { const e = Math.floor((r - 120782) / 10); return [Be[e][2], Be[e][o]] } if (120485 === r || 120486 === r) return [Te[0][2], Te[0][o]]; if (120486 < r && r < 120782) return ["", ""]; throw new n("Unsupported character: " + e) }(s, o)), h.length > 0) return Ne(s, h, o, t, i.concat(c)); if (l) { let e, n; if ("boldsymbol" === l) { const t = function (e, t, r, n, o) { return "textord" !== o && Ce(e, "Math-BoldItalic", t).metrics ? 
{ fontName: "Math-BoldItalic", fontClass: "boldsymbol" } : { fontName: "Main-Bold", fontClass: "mathbf" } }(s, o, 0, 0, r); e = t.fontName, n = [t.fontClass] } else a ? (e = Le[l].fontName, n = [l]) : (e = Ee(l, t.fontWeight, t.fontShape), n = [l, t.fontWeight, t.fontShape]); if (Ce(s, e, o).metrics) return Ne(s, e, o, t, i.concat(n)); if (ve.hasOwnProperty(s) && "Typewriter" === e.slice(0, 10)) { const r = []; for (let a = 0; a < s.length; a++)r.push(Ne(s[a], e, o, t, i.concat(n))); return Oe(r) } } if ("mathord" === r) return Ne(s, "Math-Italic", o, t, i.concat(["mathnormal"])); if ("textord" === r) { const e = oe[o][s] && oe[o][s].font; if ("ams" === e) { const e = Ee("amsrm", t.fontWeight, t.fontShape); return Ne(s, e, o, t, i.concat("amsrm", t.fontWeight, t.fontShape)) } if ("main" !== e && e) { const r = Ee(e, t.fontWeight, t.fontShape); return Ne(s, r, o, t, i.concat(r, t.fontWeight, t.fontShape)) } { const e = Ee("textrm", t.fontWeight, t.fontShape); return Ne(s, e, o, t, i.concat(t.fontWeight, t.fontShape)) } } throw new Error("unexpected type: " + r + " in makeOrd") }, makeGlue: (e, t) => { const r = Re(["mspace"], [], t), n = P(e, t); return r.style.marginRight = F(n), r }, staticSvg: function (e, t) { const [r, n, o] = De[e], s = new J(r), i = new K([s], { width: F(n), height: F(o), style: "width:" + F(n), viewBox: "0 0 " + 1e3 * n + " " + 1e3 * o, preserveAspectRatio: "xMinYMin" }), a = He(["overlay"], [i], t); return a.height = o, a.style.height = F(o), a.style.width = F(n), a }, svgData: De, tryCombineChars: e => { for (let t = 0; t < e.length - 1; t++) { const r = e[t], n = e[t + 1]; r instanceof Z && n instanceof Z && qe(r, n) && (r.text += n.text, r.height = Math.max(r.height, n.height), r.depth = Math.max(r.depth, n.depth), r.italic = n.italic, e.splice(t + 1, 1), t--) } return e } }; const Pe = { number: 3, unit: "mu" }, Fe = { number: 4, unit: "mu" }, Ge = { number: 5, unit: "mu" }, Ue = { mord: { mop: Pe, mbin: Fe, mrel: Ge, minner: Pe }, mop: { mord: Pe, mop: Pe, mrel: Ge, minner: Pe }, mbin: { mord: Fe, mop: Fe, mopen: Fe, minner: Fe }, mrel: { mord: Ge, mop: Ge, mopen: Ge, minner: Ge }, mopen: {}, mclose: { mop: Pe, mbin: Fe, mrel: Ge, minner: Pe }, mpunct: { mord: Pe, mop: Pe, mrel: Ge, mopen: Pe, mclose: Pe, mpunct: Pe, minner: Pe }, minner: { mord: Pe, mop: Pe, mbin: Fe, mrel: Ge, mopen: Pe, mpunct: Pe, minner: Pe } }, Ye = { mord: { mop: Pe }, mop: { mord: Pe, mop: Pe }, mbin: {}, mrel: {}, mopen: {}, mclose: { mop: Pe }, mpunct: {}, minner: { mop: Pe } }, Xe = {}, We = {}, _e = {}; function je(e) { let { type: t, names: r, props: n, handler: o, htmlBuilder: s, mathmlBuilder: i } = e; const a = { type: t, numArgs: n.numArgs, argTypes: n.argTypes, allowedInArgument: !!n.allowedInArgument, allowedInText: !!n.allowedInText, allowedInMath: void 0 === n.allowedInMath || n.allowedInMath, numOptionalArgs: n.numOptionalArgs || 0, infix: !!n.infix, primitive: !!n.primitive, handler: o }; for (let e = 0; e < r.length; ++e)Xe[r[e]] = a; t && (s && (We[t] = s), i && (_e[t] = i)) } function $e(e) { let { type: t, htmlBuilder: r, mathmlBuilder: n } = e; je({ type: t, names: [], props: { numArgs: 0 }, handler() { throw new Error("Should never be called.") }, htmlBuilder: r, mathmlBuilder: n }) } const Ze = function (e) { return "ordgroup" === e.type && 1 === e.body.length ? e.body[0] : e }, Ke = function (e) { return "ordgroup" === e.type ? 
e.body : [e] }, Je = Ve.makeSpan, Qe = ["leftmost", "mbin", "mopen", "mrel", "mop", "mpunct"], et = ["rightmost", "mrel", "mclose", "mpunct"], tt = { display: w.DISPLAY, text: w.TEXT, script: w.SCRIPT, scriptscript: w.SCRIPTSCRIPT }, rt = { mord: "mord", mop: "mop", mbin: "mbin", mrel: "mrel", mopen: "mopen", mclose: "mclose", mpunct: "mpunct", minner: "minner" }, nt = function (e, t, r, n) { void 0 === n && (n = [null, null]); const o = []; for (let r = 0; r < e.length; r++) { const n = ht(e[r], t); if (n instanceof A) { const e = n.children; o.push(...e) } else o.push(n) } if (Ve.tryCombineChars(o), !r) return o; let s = t; if (1 === e.length) { const r = e[0]; "sizing" === r.type ? s = t.havingSize(r.size) : "styling" === r.type && (s = t.havingStyle(tt[r.style])) } const i = Je([n[0] || "leftmost"], [], t), a = Je([n[1] || "rightmost"], [], t), h = "root" === r; return ot(o, ((e, t) => { const r = t.classes[0], n = e.classes[0]; "mbin" === r && l.contains(et, n) ? t.classes[0] = "mord" : "mbin" === n && l.contains(Qe, r) && (e.classes[0] = "mord") }), { node: i }, a, h), ot(o, ((e, t) => { const r = at(t), n = at(e), o = r && n ? e.hasClass("mtight") ? Ye[r][n] : Ue[r][n] : null; if (o) return Ve.makeGlue(o, s) }), { node: i }, a, h), o }, ot = function (e, t, r, n, o) { n && e.push(n); let s = 0; for (; s < e.length; s++) { const n = e[s], i = st(n); if (i) { ot(i.children, t, r, null, o); continue } const a = !n.hasClass("mspace"); if (a) { const o = t(n, r.node); o && (r.insertAfter ? r.insertAfter(o) : (e.unshift(o), s++)) } a ? r.node = n : o && n.hasClass("newline") && (r.node = Je(["leftmost"])), r.insertAfter = (t => r => { e.splice(t + 1, 0, r), s++ })(s) } n && e.pop() }, st = function (e) { return e instanceof A || e instanceof _ || e instanceof W && e.hasClass("enclosing") ? e : null }, it = function (e, t) { const r = st(e); if (r) { const e = r.children; if (e.length) { if ("right" === t) return it(e[e.length - 1], "right"); if ("left" === t) return it(e[0], "left") } } return e }, at = function (e, t) { return e ? (t && (e = it(e, t)), rt[e.classes[0]] || null) : null }, lt = function (e, t) { const r = ["nulldelimiter"].concat(e.baseSizingClasses()); return Je(t.concat(r)) }, ht = function (e, t, r) { if (!e) return Je(); if (We[e.type]) { let n = We[e.type](e, t); if (r && t.size !== r.size) { n = Je(t.sizingClasses(r), [n], t); const e = t.sizeMultiplier / r.sizeMultiplier; n.height *= e, n.depth *= e } return n } throw new n("Got group of unknown type: '" + e.type + "'") }; function ct(e, t) { const r = Je(["base"], e, t), n = Je(["strut"]); return n.style.height = F(r.height + r.depth), r.depth && (n.style.verticalAlign = F(-r.depth)), r.children.unshift(n), r } function mt(e, t) { let r = null; 1 === e.length && "tag" === e[0].type && (r = e[0].tag, e = e[0].body); const n = nt(e, t, "root"); let o; 2 === n.length && n[1].hasClass("tag") && (o = n.pop()); const s = []; let i, a = []; for (let e = 0; e < n.length; e++)if (a.push(n[e]), n[e].hasClass("mbin") || n[e].hasClass("mrel") || n[e].hasClass("allowbreak")) { let r = !1; for (; e < n.length - 1 && n[e + 1].hasClass("mspace") && !n[e + 1].hasClass("newline");)e++, a.push(n[e]), n[e].hasClass("nobreak") && (r = !0); r || (s.push(ct(a, t)), a = []) } else n[e].hasClass("newline") && (a.pop(), a.length > 0 && (s.push(ct(a, t)), a = []), s.push(n[e])); a.length > 0 && s.push(ct(a, t)), r ? 
(i = ct(nt(r, t, !0)), i.classes = ["tag"], s.push(i)) : o && s.push(o); const l = Je(["katex-html"], s); if (l.setAttribute("aria-hidden", "true"), i) { const e = i.children[0]; e.style.height = F(l.height + l.depth), l.depth && (e.style.verticalAlign = F(-l.depth)) } return l } function pt(e) { return new A(e) } class ut { constructor(e, t, r) { this.type = void 0, this.attributes = void 0, this.children = void 0, this.classes = void 0, this.type = e, this.attributes = {}, this.children = t || [], this.classes = r || [] } setAttribute(e, t) { this.attributes[e] = t } getAttribute(e) { return this.attributes[e] } toNode() { const e = document.createElementNS("http://www.w3.org/1998/Math/MathML", this.type); for (const t in this.attributes) Object.prototype.hasOwnProperty.call(this.attributes, t) && e.setAttribute(t, this.attributes[t]); this.classes.length > 0 && (e.className = G(this.classes)); for (let t = 0; t < this.children.length; t++)e.appendChild(this.children[t].toNode()); return e } toMarkup() { let e = "<" + this.type; for (const t in this.attributes) Object.prototype.hasOwnProperty.call(this.attributes, t) && (e += " " + t + '="', e += l.escape(this.attributes[t]), e += '"'); this.classes.length > 0 && (e += ' class ="' + l.escape(G(this.classes)) + '"'), e += ">"; for (let t = 0; t < this.children.length; t++)e += this.children[t].toMarkup(); return e += "</" + this.type + ">", e } toText() { return this.children.map((e => e.toText())).join("") } } class dt { constructor(e) { this.text = void 0, this.text = e } toNode() { return document.createTextNode(this.text) } toMarkup() { return l.escape(this.toText()) } toText() { return this.text } } var gt = { MathNode: ut, TextNode: dt, SpaceNode: class { constructor(e) { this.width = void 0, this.character = void 0, this.width = e, this.character = e >= .05555 && e <= .05556 ? "\u200a" : e >= .1666 && e <= .1667 ? "\u2009" : e >= .2222 && e <= .2223 ? "\u2005" : e >= .2777 && e <= .2778 ? "\u2005\u200a" : e >= -.05556 && e <= -.05555 ? "\u200a\u2063" : e >= -.1667 && e <= -.1666 ? "\u2009\u2063" : e >= -.2223 && e <= -.2222 ? "\u205f\u2063" : e >= -.2778 && e <= -.2777 ? "\u2005\u2063" : null } toNode() { if (this.character) return document.createTextNode(this.character); { const e = document.createElementNS("http://www.w3.org/1998/Math/MathML", "mspace"); return e.setAttribute("width", F(this.width)), e } } toMarkup() { return this.character ? "<mtext>" + this.character + "</mtext>" : '<mspace width="' + F(this.width) + '"/>' } toText() { return this.character ? this.character : " " } }, newDocumentFragment: pt }; const ft = function (e, t, r) { return !oe[t][e] || !oe[t][e].replace || 55349 === e.charCodeAt(0) || ve.hasOwnProperty(e) && r && (r.fontFamily && "tt" === r.fontFamily.slice(4, 6) || r.font && "tt" === r.font.slice(4, 6)) || (e = oe[t][e].replace), new gt.TextNode(e) }, bt = function (e) { return 1 === e.length ? e[0] : new gt.MathNode("mrow", e) }, yt = function (e, t) { if ("texttt" === t.fontFamily) return "monospace"; if ("textsf" === t.fontFamily) return "textit" === t.fontShape && "textbf" === t.fontWeight ? "sans-serif-bold-italic" : "textit" === t.fontShape ? "sans-serif-italic" : "textbf" === t.fontWeight ? "bold-sans-serif" : "sans-serif"; if ("textit" === t.fontShape && "textbf" === t.fontWeight) return "bold-italic"; if ("textit" === t.fontShape) return "italic"; if ("textbf" === t.fontWeight) return "bold"; const r = t.font; if (!r || "mathnormal" === r) return null; const n = e.mode; if ("mathit" === r) return "italic"; if ("boldsymbol" === r) return "textord" === e.type ?
"bold" : "bold-italic"; if ("mathbf" === r) return "bold"; if ("mathbb" === r) return "double-struck"; if ("mathfrak" === r) return "fraktur"; if ("mathscr" === r || "mathcal" === r) return "script"; if ("mathsf" === r) return "sans-serif"; if ("mathtt" === r) return "monospace"; let o = e.text; if (l.contains(["\\imath", "\\jmath"], o)) return null; oe[n][o] && oe[n][o].replace && (o = oe[n][o].replace); return N(o, Ve.fontMap[r].fontName, n) ? Ve.fontMap[r].variant : null }, xt = function (e, t, r) { if (1 === e.length) { const n = vt(e[0], t); return r && n instanceof ut && "mo" === n.type && (n.setAttribute("lspace", "0em"), n.setAttribute("rspace", "0em")), [n] } const n = []; let o; for (let r = 0; r < e.length; r++) { const s = vt(e[r], t); if (s instanceof ut && o instanceof ut) { if ("mtext" === s.type && "mtext" === o.type && s.getAttribute("mathvariant") === o.getAttribute("mathvariant")) { o.children.push(...s.children); continue } if ("mn" === s.type && "mn" === o.type) { o.children.push(...s.children); continue } if ("mi" === s.type && 1 === s.children.length && "mn" === o.type) { const e = s.children[0]; if (e instanceof dt && "." === e.text) { o.children.push(...s.children); continue } } else if ("mi" === o.type && 1 === o.children.length) { const e = o.children[0]; if (e instanceof dt && "\u0338" === e.text && ("mo" === s.type || "mi" === s.type || "mn" === s.type)) { const e = s.children[0]; e instanceof dt && e.text.length > 0 && (e.text = e.text.slice(0, 1) + "\u0338" + e.text.slice(1), n.pop()) } } } n.push(s), o = s } return n }, wt = function (e, t, r) { return bt(xt(e, t, r)) }, vt = function (e, t) { if (!e) return new gt.MathNode("mrow"); if (_e[e.type]) { return _e[e.type](e, t) } throw new n("Got group of unknown type: '" + e.type + "'") }; function kt(e, t, r, n, o) { const s = xt(e, r); let i; i = 1 === s.length && s[0] instanceof ut && l.contains(["mrow", "mtable"], s[0].type) ? s[0] : new gt.MathNode("mrow", s); const a = new gt.MathNode("annotation", [new gt.TextNode(t)]); a.setAttribute("encoding", "application/x-tex"); const h = new gt.MathNode("semantics", [i, a]), c = new gt.MathNode("math", [h]); c.setAttribute("xmlns", "http://www.w3.org/1998/Math/MathML"), n && c.setAttribute("display", "block"); const m = o ? "katex" : "katex-mathml"; return Ve.makeSpan([m], [c]) } const St = function (e) { return new E({ style: e.displayMode ? 
w.DISPLAY : w.TEXT, maxSize: e.maxSize, minRuleThickness: e.minRuleThickness }) }, Mt = function (e, t) { if (t.displayMode) { const r = ["katex-display"]; t.leqno && r.push("leqno"), t.fleqn && r.push("fleqn"), e = Ve.makeSpan(r, [e]) } return e }, zt = function (e, t, r) { const n = St(r); let o; if ("mathml" === r.output) return kt(e, t, n, r.displayMode, !0); if ("html" === r.output) { const t = mt(e, n); o = Ve.makeSpan(["katex"], [t]) } else { const s = kt(e, t, n, r.displayMode, !1), i = mt(e, n); o = Ve.makeSpan(["katex"], [s, i]) } return Mt(o, r) }; const At = { widehat: "^", widecheck: "\u02c7", widetilde: "~", utilde: "~", overleftarrow: "\u2190", underleftarrow: "\u2190", xleftarrow: "\u2190", overrightarrow: "\u2192", underrightarrow: "\u2192", xrightarrow: "\u2192", underbrace: "\u23df", overbrace: "\u23de", overgroup: "\u23e0", undergroup: "\u23e1", overleftrightarrow: "\u2194", underleftrightarrow: "\u2194", xleftrightarrow: "\u2194", Overrightarrow: "\u21d2", xRightarrow: "\u21d2", overleftharpoon: "\u21bc", xleftharpoonup: "\u21bc", overrightharpoon: "\u21c0", xrightharpoonup: "\u21c0", xLeftarrow: "\u21d0", xLeftrightarrow: "\u21d4", xhookleftarrow: "\u21a9", xhookrightarrow: "\u21aa", xmapsto: "\u21a6", xrightharpoondown: "\u21c1", xleftharpoondown: "\u21bd", xrightleftharpoons: "\u21cc", xleftrightharpoons: "\u21cb", xtwoheadleftarrow: "\u219e", xtwoheadrightarrow: "\u21a0", xlongequal: "=", xtofrom: "\u21c4", xrightleftarrows: "\u21c4", xrightequilibrium: "\u21cc", xleftequilibrium: "\u21cb", "\\cdrightarrow": "\u2192", "\\cdleftarrow": "\u2190", "\\cdlongequal": "=" }, Tt = { overrightarrow: [["rightarrow"], .888, 522, "xMaxYMin"], overleftarrow: [["leftarrow"], .888, 522, "xMinYMin"], underrightarrow: [["rightarrow"], .888, 522, "xMaxYMin"], underleftarrow: [["leftarrow"], .888, 522, "xMinYMin"], xrightarrow: [["rightarrow"], 1.469, 522, "xMaxYMin"], "\\cdrightarrow": [["rightarrow"], 3, 522, "xMaxYMin"], xleftarrow: [["leftarrow"], 1.469, 522, "xMinYMin"], "\\cdleftarrow": [["leftarrow"], 3, 522, "xMinYMin"], Overrightarrow: [["doublerightarrow"], .888, 560, "xMaxYMin"], xRightarrow: [["doublerightarrow"], 1.526, 560, "xMaxYMin"], xLeftarrow: [["doubleleftarrow"], 1.526, 560, "xMinYMin"], overleftharpoon: [["leftharpoon"], .888, 522, "xMinYMin"], xleftharpoonup: [["leftharpoon"], .888, 522, "xMinYMin"], xleftharpoondown: [["leftharpoondown"], .888, 522, "xMinYMin"], overrightharpoon: [["rightharpoon"], .888, 522, "xMaxYMin"], xrightharpoonup: [["rightharpoon"], .888, 522, "xMaxYMin"], xrightharpoondown: [["rightharpoondown"], .888, 522, "xMaxYMin"], xlongequal: [["longequal"], .888, 334, "xMinYMin"], "\\cdlongequal": [["longequal"], 3, 334, "xMinYMin"], xtwoheadleftarrow: [["twoheadleftarrow"], .888, 334, "xMinYMin"], xtwoheadrightarrow: [["twoheadrightarrow"], .888, 334, "xMaxYMin"], overleftrightarrow: [["leftarrow", "rightarrow"], .888, 522], overbrace: [["leftbrace", "midbrace", "rightbrace"], 1.6, 548], underbrace: [["leftbraceunder", "midbraceunder", "rightbraceunder"], 1.6, 548], underleftrightarrow: [["leftarrow", "rightarrow"], .888, 522], xleftrightarrow: [["leftarrow", "rightarrow"], 1.75, 522], xLeftrightarrow: [["doubleleftarrow", "doublerightarrow"], 1.75, 560], xrightleftharpoons: [["leftharpoondownplus", "rightharpoonplus"], 1.75, 716], xleftrightharpoons: [["leftharpoonplus", "rightharpoondownplus"], 1.75, 716], xhookleftarrow: [["leftarrow", "righthook"], 1.08, 522], xhookrightarrow: [["lefthook", "rightarrow"], 1.08, 522], overlinesegment: 
[["leftlinesegment", "rightlinesegment"], .888, 522], underlinesegment: [["leftlinesegment", "rightlinesegment"], .888, 522], overgroup: [["leftgroup", "rightgroup"], .888, 342], undergroup: [["leftgroupunder", "rightgroupunder"], .888, 342], xmapsto: [["leftmapsto", "rightarrow"], 1.5, 522], xtofrom: [["leftToFrom", "rightToFrom"], 1.75, 528], xrightleftarrows: [["baraboveleftarrow", "rightarrowabovebar"], 1.75, 901], xrightequilibrium: [["baraboveshortleftharpoon", "rightharpoonaboveshortbar"], 1.75, 716], xleftequilibrium: [["shortbaraboveleftharpoon", "shortrightharpoonabovebar"], 1.75, 716] }; var Bt = function (e, t, r, n, o) { let s; const i = e.height + e.depth + r + n; if (/fbox|color|angl/.test(t)) { if (s = Ve.makeSpan(["stretchy", t], [], o), "fbox" === t) { const e = o.color && o.getColor(); e && (s.style.borderColor = e) } } else { const e = []; /^[bx]cancel$/.test(t) && e.push(new Q({ x1: "0", y1: "0", x2: "100%", y2: "100%", "stroke-width": "0.046em" })), /^x?cancel$/.test(t) && e.push(new Q({ x1: "0", y1: "100%", x2: "100%", y2: "0", "stroke-width": "0.046em" })); const r = new K(e, { width: "100%", height: F(i) }); s = Ve.makeSvgSpan([], [r], o) } return s.height = i, s.style.height = F(i), s }, Ct = function (e) { const t = new gt.MathNode("mo", [new gt.TextNode(At[e.replace(/^\\/, "")])]); return t.setAttribute("stretchy", "true"), t }, Nt = function (e, t) { const { span: r, minWidth: n, height: o } = function () { let r = 4e5; const n = e.label.slice(1); if (l.contains(["widehat", "widecheck", "widetilde", "utilde"], n)) { const s = "ordgroup" === (o = e.base).type ? o.body.length : 1; let i, a, l; if (s > 5) "widehat" === n || "widecheck" === n ? (i = 420, r = 2364, l = .42, a = n + "4") : (i = 312, r = 2340, l = .34, a = "tilde4"); else { const e = [1, 1, 2, 2, 3, 3][s]; "widehat" === n || "widecheck" === n ? (r = [0, 1062, 2364, 2364, 2364][e], i = [0, 239, 300, 360, 420][e], l = [0, .24, .3, .3, .36, .42][e], a = n + e) : (r = [0, 600, 1033, 2339, 2340][e], i = [0, 260, 286, 306, 312][e], l = [0, .26, .286, .3, .306, .34][e], a = "tilde" + e) } const h = new J(a), c = new K([h], { width: "100%", height: F(l), viewBox: "0 0 " + r + " " + i, preserveAspectRatio: "none" }); return { span: Ve.makeSvgSpan([], [c], t), minWidth: 0, height: l } } { const e = [], o = Tt[n], [s, i, a] = o, l = a / 1e3, h = s.length; let c, m; if (1 === h) { c = ["hide-tail"], m = [o[3]] } else if (2 === h) c = ["halfarrow-left", "halfarrow-right"], m = ["xMinYMin", "xMaxYMin"]; else { if (3 !== h) throw new Error("Correct katexImagesData or update code here to support\n " + h + " children."); c = ["brace-left", "brace-center", "brace-right"], m = ["xMinYMin", "xMidYMin", "xMaxYMin"] } for (let n = 0; n < h; n++) { const o = new J(s[n]), p = new K([o], { width: "400em", height: F(l), viewBox: "0 0 " + r + " " + a, preserveAspectRatio: m[n] + " slice" }), u = Ve.makeSvgSpan([c[n]], [p], t); if (1 === h) return { span: u, minWidth: i, height: l }; u.style.height = F(l), e.push(u) } return { span: Ve.makeSpan(["stretchy"], e, t), minWidth: i, height: l } } var o }(); return r.height = o, r.style.height = F(o), n > 0 && (r.style.minWidth = F(n)), r }; function qt(e, t) { if (!e || e.type !== t) throw new Error("Expected node of type " + t + ", but got " + (e ? "node of type " + e.type : String(e))); return e } function It(e) { const t = Rt(e); if (!t) throw new Error("Expected node of symbol group type, but got " + (e ? 
"node of type " + e.type : String(e))); return t } function Rt(e) { return e && ("atom" === e.type || re.hasOwnProperty(e.type)) ? e : null } const Ht = (e, t) => { let r, n, o; e && "supsub" === e.type ? (n = qt(e.base, "accent"), r = n.base, e.base = r, o = function (e) { if (e instanceof W) return e; throw new Error("Expected span but got " + String(e) + ".") }(ht(e, t)), e.base = n) : (n = qt(e, "accent"), r = n.base); const s = ht(r, t.havingCrampedStyle()); let i = 0; if (n.isShifty && l.isCharacterBox(r)) { const e = l.getBaseElem(r); i = ee(ht(e, t.havingCrampedStyle())).skew } const a = "\\c" === n.label; let h, c = a ? s.height + s.depth : Math.min(s.height, t.fontMetrics().xHeight); if (n.isStretchy) h = Nt(n, t), h = Ve.makeVList({ positionType: "firstBaseline", children: [{ type: "elem", elem: s }, { type: "elem", elem: h, wrapperClasses: ["svg-align"], wrapperStyle: i > 0 ? { width: "calc(100% - " + F(2 * i) + ")", marginLeft: F(2 * i) } : void 0 }] }, t); else { let e, r; "\\vec" === n.label ? (e = Ve.staticSvg("vec", t), r = Ve.svgData.vec[1]) : (e = Ve.makeOrd({ mode: n.mode, text: n.label }, t, "textord"), e = ee(e), e.italic = 0, r = e.width, a && (c += e.depth)), h = Ve.makeSpan(["accent-body"], [e]); const o = "\\textcircled" === n.label; o && (h.classes.push("accent-full"), c = s.height); let l = i; o || (l -= r / 2), h.style.left = F(l), "\\textcircled" === n.label && (h.style.top = ".2em"), h = Ve.makeVList({ positionType: "firstBaseline", children: [{ type: "elem", elem: s }, { type: "kern", size: -c }, { type: "elem", elem: h }] }, t) } const m = Ve.makeSpan(["mord", "accent"], [h], t); return o ? (o.children[0] = m, o.height = Math.max(m.height, o.height), o.classes[0] = "mord", o) : m }, Ot = (e, t) => { const r = e.isStretchy ? 
Ct(e.label) : new gt.MathNode("mo", [ft(e.label, e.mode)]), n = new gt.MathNode("mover", [vt(e.base, t), r]); return n.setAttribute("accent", "true"), n }, Et = new RegExp(["\\acute", "\\grave", "\\ddot", "\\tilde", "\\bar", "\\breve", "\\check", "\\hat", "\\vec", "\\dot", "\\mathring"].map((e => "\\" + e)).join("|")); je({ type: "accent", names: ["\\acute", "\\grave", "\\ddot", "\\tilde", "\\bar", "\\breve", "\\check", "\\hat", "\\vec", "\\dot", "\\mathring", "\\widecheck", "\\widehat", "\\widetilde", "\\overrightarrow", "\\overleftarrow", "\\Overrightarrow", "\\overleftrightarrow", "\\overgroup", "\\overlinesegment", "\\overleftharpoon", "\\overrightharpoon"], props: { numArgs: 1 }, handler: (e, t) => { const r = Ze(t[0]), n = !Et.test(e.funcName), o = !n || "\\widehat" === e.funcName || "\\widetilde" === e.funcName || "\\widecheck" === e.funcName; return { type: "accent", mode: e.parser.mode, label: e.funcName, isStretchy: n, isShifty: o, base: r } }, htmlBuilder: Ht, mathmlBuilder: Ot }), je({ type: "accent", names: ["\\'", "\\`", "\\^", "\\~", "\\=", "\\u", "\\.", '\\"', "\\c", "\\r", "\\H", "\\v", "\\textcircled"], props: { numArgs: 1, allowedInText: !0, allowedInMath: !0, argTypes: ["primitive"] }, handler: (e, t) => { const r = t[0]; let n = e.parser.mode; return "math" === n && (e.parser.settings.reportNonstrict("mathVsTextAccents", "LaTeX's accent " + e.funcName + " works only in text mode"), n = "text"), { type: "accent", mode: n, label: e.funcName, isStretchy: !1, isShifty: !0, base: r } }, htmlBuilder: Ht, mathmlBuilder: Ot }), je({ type: "accentUnder", names: ["\\underleftarrow", "\\underrightarrow", "\\underleftrightarrow", "\\undergroup", "\\underlinesegment", "\\utilde"], props: { numArgs: 1 }, handler: (e, t) => { let { parser: r, funcName: n } = e; const o = t[0]; return { type: "accentUnder", mode: r.mode, label: n, base: o } }, htmlBuilder: (e, t) => { const r = ht(e.base, t), n = Nt(e, t), o = "\\utilde" === e.label ? .12 : 0, s = Ve.makeVList({ positionType: "top", positionData: r.height, children: [{ type: "elem", elem: n, wrapperClasses: ["svg-align"] }, { type: "kern", size: o }, { type: "elem", elem: r }] }, t); return Ve.makeSpan(["mord", "accentunder"], [s], t) }, mathmlBuilder: (e, t) => { const r = Ct(e.label), n = new gt.MathNode("munder", [vt(e.base, t), r]); return n.setAttribute("accentunder", "true"), n } }); const Lt = e => { const t = new gt.MathNode("mpadded", e ? [e] : []); return t.setAttribute("width", "+0.6em"), t.setAttribute("lspace", "0.3em"), t }; je({ type: "xArrow", names: ["\\xleftarrow", "\\xrightarrow", "\\xLeftarrow", "\\xRightarrow", "\\xleftrightarrow", "\\xLeftrightarrow", "\\xhookleftarrow", "\\xhookrightarrow", "\\xmapsto", "\\xrightharpoondown", "\\xrightharpoonup", "\\xleftharpoondown", "\\xleftharpoonup", "\\xrightleftharpoons", "\\xleftrightharpoons", "\\xlongequal", "\\xtwoheadrightarrow", "\\xtwoheadleftarrow", "\\xtofrom", "\\xrightleftarrows", "\\xrightequilibrium", "\\xleftequilibrium", "\\\\cdrightarrow", "\\\\cdleftarrow", "\\\\cdlongequal"], props: { numArgs: 1, numOptionalArgs: 1 }, handler(e, t, r) { let { parser: n, funcName: o } = e; return { type: "xArrow", mode: n.mode, label: o, body: t[0], below: r[0] } }, htmlBuilder(e, t) { const r = t.style; let n = t.havingStyle(r.sup()); const o = Ve.wrapFragment(ht(e.body, n, t), t), s = "\\x" === e.label.slice(0, 2) ? 
"x" : "cd"; let i; o.classes.push(s + "-arrow-pad"), e.below && (n = t.havingStyle(r.sub()), i = Ve.wrapFragment(ht(e.below, n, t), t), i.classes.push(s + "-arrow-pad")); const a = Nt(e, t), l = -t.fontMetrics().axisHeight + .5 * a.height; let h, c = -t.fontMetrics().axisHeight - .5 * a.height - .111; if ((o.depth > .25 || "\\xleftequilibrium" === e.label) && (c -= o.depth), i) { const e = -t.fontMetrics().axisHeight + i.height + .5 * a.height + .111; h = Ve.makeVList({ positionType: "individualShift", children: [{ type: "elem", elem: o, shift: c }, { type: "elem", elem: a, shift: l }, { type: "elem", elem: i, shift: e }] }, t) } else h = Ve.makeVList({ positionType: "individualShift", children: [{ type: "elem", elem: o, shift: c }, { type: "elem", elem: a, shift: l }] }, t); return h.children[0].children[0].children[1].classes.push("svg-align"), Ve.makeSpan(["mrel", "x-arrow"], [h], t) }, mathmlBuilder(e, t) { const r = Ct(e.label); let n; if (r.setAttribute("minsize", "x" === e.label.charAt(0) ? "1.75em" : "3.0em"), e.body) { const o = Lt(vt(e.body, t)); if (e.below) { const s = Lt(vt(e.below, t)); n = new gt.MathNode("munderover", [r, s, o]) } else n = new gt.MathNode("mover", [r, o]) } else if (e.below) { const o = Lt(vt(e.below, t)); n = new gt.MathNode("munder", [r, o]) } else n = Lt(), n = new gt.MathNode("mover", [r, n]); return n } }); const Dt = Ve.makeSpan; function Vt(e, t) { const r = nt(e.body, t, !0); return Dt([e.mclass], r, t) } function Pt(e, t) { let r; const n = xt(e.body, t); return "minner" === e.mclass ? r = new gt.MathNode("mpadded", n) : "mord" === e.mclass ? e.isCharacterBox ? (r = n[0], r.type = "mi") : r = new gt.MathNode("mi", n) : (e.isCharacterBox ? (r = n[0], r.type = "mo") : r = new gt.MathNode("mo", n), "mbin" === e.mclass ? (r.attributes.lspace = "0.22em", r.attributes.rspace = "0.22em") : "mpunct" === e.mclass ? (r.attributes.lspace = "0em", r.attributes.rspace = "0.17em") : "mopen" === e.mclass || "mclose" === e.mclass ? (r.attributes.lspace = "0em", r.attributes.rspace = "0em") : "minner" === e.mclass && (r.attributes.lspace = "0.0556em", r.attributes.width = "+0.1111em")), r } je({ type: "mclass", names: ["\\mathord", "\\mathbin", "\\mathrel", "\\mathopen", "\\mathclose", "\\mathpunct", "\\mathinner"], props: { numArgs: 1, primitive: !0 }, handler(e, t) { let { parser: r, funcName: n } = e; const o = t[0]; return { type: "mclass", mode: r.mode, mclass: "m" + n.slice(5), body: Ke(o), isCharacterBox: l.isCharacterBox(o) } }, htmlBuilder: Vt, mathmlBuilder: Pt }); const Ft = e => { const t = "ordgroup" === e.type && e.body.length ? e.body[0] : e; return "atom" !== t.type || "bin" !== t.family && "rel" !== t.family ? "mord" : "m" + t.family }; je({ type: "mclass", names: ["\\@binrel"], props: { numArgs: 2 }, handler(e, t) { let { parser: r } = e; return { type: "mclass", mode: r.mode, mclass: Ft(t[0]), body: Ke(t[1]), isCharacterBox: l.isCharacterBox(t[1]) } } }), je({ type: "mclass", names: ["\\stackrel", "\\overset", "\\underset"], props: { numArgs: 2 }, handler(e, t) { let { parser: r, funcName: n } = e; const o = t[1], s = t[0]; let i; i = "\\stackrel" !== n ? Ft(o) : "mrel"; const a = { type: "op", mode: o.mode, limits: !0, alwaysHandleSupSub: !0, parentIsSupSub: !1, symbol: !1, suppressBaseShift: "\\stackrel" !== n, body: Ke(o) }, h = { type: "supsub", mode: s.mode, base: a, sup: "\\underset" === n ? null : s, sub: "\\underset" === n ? 
s : null }; return { type: "mclass", mode: r.mode, mclass: i, body: [h], isCharacterBox: l.isCharacterBox(h) } }, htmlBuilder: Vt, mathmlBuilder: Pt }), je({ type: "pmb", names: ["\\pmb"], props: { numArgs: 1, allowedInText: !0 }, handler(e, t) { let { parser: r } = e; return { type: "pmb", mode: r.mode, mclass: Ft(t[0]), body: Ke(t[0]) } }, htmlBuilder(e, t) { const r = nt(e.body, t, !0), n = Ve.makeSpan([e.mclass], r, t); return n.style.textShadow = "0.02em 0.01em 0.04px", n }, mathmlBuilder(e, t) { const r = xt(e.body, t), n = new gt.MathNode("mstyle", r); return n.setAttribute("style", "text-shadow: 0.02em 0.01em 0.04px"), n } }); const Gt = { ">": "\\\\cdrightarrow", "<": "\\\\cdleftarrow", "=": "\\\\cdlongequal", A: "\\uparrow", V: "\\downarrow", "|": "\\Vert", ".": "no arrow" }, Ut = e => "textord" === e.type && "@" === e.text; function Yt(e, t, r) { const n = Gt[e]; switch (n) { case "\\\\cdrightarrow": case "\\\\cdleftarrow": return r.callFunction(n, [t[0]], [t[1]]); case "\\uparrow": case "\\downarrow": { const e = { type: "atom", text: n, mode: "math", family: "rel" }, o = { type: "ordgroup", mode: "math", body: [r.callFunction("\\\\cdleft", [t[0]], []), r.callFunction("\\Big", [e], []), r.callFunction("\\\\cdright", [t[1]], [])] }; return r.callFunction("\\\\cdparent", [o], []) } case "\\\\cdlongequal": return r.callFunction("\\\\cdlongequal", [], []); case "\\Vert": { const e = { type: "textord", text: "\\Vert", mode: "math" }; return r.callFunction("\\Big", [e], []) } default: return { type: "textord", text: " ", mode: "math" } } } je({ type: "cdlabel", names: ["\\\\cdleft", "\\\\cdright"], props: { numArgs: 1 }, handler(e, t) { let { parser: r, funcName: n } = e; return { type: "cdlabel", mode: r.mode, side: n.slice(4), label: t[0] } }, htmlBuilder(e, t) { const r = t.havingStyle(t.style.sup()), n = Ve.wrapFragment(ht(e.label, r, t), t); return n.classes.push("cd-label-" + e.side), n.style.bottom = F(.8 - n.depth), n.height = 0, n.depth = 0, n }, mathmlBuilder(e, t) { let r = new gt.MathNode("mrow", [vt(e.label, t)]); return r = new gt.MathNode("mpadded", [r]), r.setAttribute("width", "0"), "left" === e.side && r.setAttribute("lspace", "-1width"), r.setAttribute("voffset", "0.7em"), r = new gt.MathNode("mstyle", [r]), r.setAttribute("displaystyle", "false"), r.setAttribute("scriptlevel", "1"), r } }), je({ type: "cdlabelparent", names: ["\\\\cdparent"], props: { numArgs: 1 }, handler(e, t) { let { parser: r } = e; return { type: "cdlabelparent", mode: r.mode, fragment: t[0] } }, htmlBuilder(e, t) { const r = Ve.wrapFragment(ht(e.fragment, t), t); return r.classes.push("cd-vert-arrow"), r }, mathmlBuilder(e, t) { return new gt.MathNode("mrow", [vt(e.fragment, t)]) } }), je({ type: "textord", names: ["\\@char"], props: { numArgs: 1, allowedInText: !0 }, handler(e, t) { let { parser: r } = e; const o = qt(t[0], "ordgroup").body; let s = ""; for (let e = 0; e < o.length; e++) { s += qt(o[e], "textord").text } let i, a = parseInt(s); if (isNaN(a)) throw new n("\\@char has non-numeric argument " + s); if (a < 0 || a >= 1114111) throw new n("\\@char with invalid code point " + s); return a <= 65535 ? 
i = String.fromCharCode(a) : (a -= 65536, i = String.fromCharCode(55296 + (a >> 10), 56320 + (1023 & a))), { type: "textord", mode: r.mode, text: i } } }); const Xt = (e, t) => { const r = nt(e.body, t.withColor(e.color), !1); return Ve.makeFragment(r) }, Wt = (e, t) => { const r = xt(e.body, t.withColor(e.color)), n = new gt.MathNode("mstyle", r); return n.setAttribute("mathcolor", e.color), n }; je({ type: "color", names: ["\\textcolor"], props: { numArgs: 2, allowedInText: !0, argTypes: ["color", "original"] }, handler(e, t) { let { parser: r } = e; const n = qt(t[0], "color-token").color, o = t[1]; return { type: "color", mode: r.mode, color: n, body: Ke(o) } }, htmlBuilder: Xt, mathmlBuilder: Wt }), je({ type: "color", names: ["\\color"], props: { numArgs: 1, allowedInText: !0, argTypes: ["color"] }, handler(e, t) { let { parser: r, breakOnTokenText: n } = e; const o = qt(t[0], "color-token").color; r.gullet.macros.set("\\current@color", o); const s = r.parseExpression(!0, n); return { type: "color", mode: r.mode, color: o, body: s } }, htmlBuilder: Xt, mathmlBuilder: Wt }), je({ type: "cr", names: ["\\\\"], props: { numArgs: 0, numOptionalArgs: 0, allowedInText: !0 }, handler(e, t, r) { let { parser: n } = e; const o = "[" === n.gullet.future().text ? n.parseSizeGroup(!0) : null, s = !n.settings.displayMode || !n.settings.useStrictBehavior("newLineInDisplayMode", "In LaTeX, \\\\ or \\newline does nothing in display mode"); return { type: "cr", mode: n.mode, newLine: s, size: o && qt(o, "size").value } }, htmlBuilder(e, t) { const r = Ve.makeSpan(["mspace"], [], t); return e.newLine && (r.classes.push("newline"), e.size && (r.style.marginTop = F(P(e.size, t)))), r }, mathmlBuilder(e, t) { const r = new gt.MathNode("mspace"); return e.newLine && (r.setAttribute("linebreak", "newline"), e.size && r.setAttribute("height", F(P(e.size, t)))), r } }); const _t = { "\\global": "\\global", "\\long": "\\\\globallong", "\\\\globallong": "\\\\globallong", "\\def": "\\gdef", "\\gdef": "\\gdef", "\\edef": "\\xdef", "\\xdef": "\\xdef", "\\let": "\\\\globallet", "\\futurelet": "\\\\globalfuture" }, jt = e => { const t = e.text; if (/^(?:[\\{}$&#^_]|EOF)$/.test(t)) throw new n("Expected a control sequence", e); return t }, $t = (e, t, r, n) => { let o = e.gullet.macros.get(r.text); null == o && (r.noexpand = !0, o = { tokens: [r], numArgs: 0, unexpandable: !e.gullet.isExpandable(r.text) }), e.gullet.macros.set(t, o, n) }; je({ type: "internal", names: ["\\global", "\\long", "\\\\globallong"], props: { numArgs: 0, allowedInText: !0 }, handler(e) { let { parser: t, funcName: r } = e; t.consumeSpaces(); const o = t.fetch(); if (_t[o.text]) return "\\global" !== r && "\\\\globallong" !== r || (o.text = _t[o.text]), qt(t.parseFunction(), "internal"); throw new n("Invalid token after macro prefix", o) } }), je({ type: "internal", names: ["\\def", "\\gdef", "\\edef", "\\xdef"], props: { numArgs: 0, allowedInText: !0, primitive: !0 }, handler(e) { let { parser: t, funcName: r } = e, o = t.gullet.popToken(); const s = o.text; if (/^(?:[\\{}$&#^_]|EOF)$/.test(s)) throw new n("Expected a control sequence", o); let i, a = 0; const l = [[]]; for (; "{" !== t.gullet.future().text;)if (o = t.gullet.popToken(), "#" === o.text) { if ("{" === t.gullet.future().text) { i = t.gullet.future(), l[a].push("{"); break } if (o = t.gullet.popToken(), !/^[1-9]$/.test(o.text)) throw new n('Invalid argument number "' + o.text + '"'); if (parseInt(o.text) !== a + 1) throw new n('Argument number "' + o.text + '" out of order'); 
a++, l.push([]) } else { if ("EOF" === o.text) throw new n("Expected a macro definition"); l[a].push(o.text) } let { tokens: h } = t.gullet.consumeArg(); return i && h.unshift(i), "\\edef" !== r && "\\xdef" !== r || (h = t.gullet.expandTokens(h), h.reverse()), t.gullet.macros.set(s, { tokens: h, numArgs: a, delimiters: l }, r === _t[r]), { type: "internal", mode: t.mode } } }), je({ type: "internal", names: ["\\let", "\\\\globallet"], props: { numArgs: 0, allowedInText: !0, primitive: !0 }, handler(e) { let { parser: t, funcName: r } = e; const n = jt(t.gullet.popToken()); t.gullet.consumeSpaces(); const o = (e => { let t = e.gullet.popToken(); return "=" === t.text && (t = e.gullet.popToken(), " " === t.text && (t = e.gullet.popToken())), t })(t); return $t(t, n, o, "\\\\globallet" === r), { type: "internal", mode: t.mode } } }), je({ type: "internal", names: ["\\futurelet", "\\\\globalfuture"], props: { numArgs: 0, allowedInText: !0, primitive: !0 }, handler(e) { let { parser: t, funcName: r } = e; const n = jt(t.gullet.popToken()), o = t.gullet.popToken(), s = t.gullet.popToken(); return $t(t, n, s, "\\\\globalfuture" === r), t.gullet.pushToken(s), t.gullet.pushToken(o), { type: "internal", mode: t.mode } } }); const Zt = function (e, t, r) { const n = N(oe.math[e] && oe.math[e].replace || e, t, r); if (!n) throw new Error("Unsupported symbol " + e + " and font size " + t + "."); return n }, Kt = function (e, t, r, n) { const o = r.havingBaseStyle(t), s = Ve.makeSpan(n.concat(o.sizingClasses(r)), [e], r), i = o.sizeMultiplier / r.sizeMultiplier; return s.height *= i, s.depth *= i, s.maxFontSize = o.sizeMultiplier, s }, Jt = function (e, t, r) { const n = t.havingBaseStyle(r), o = (1 - t.sizeMultiplier / n.sizeMultiplier) * t.fontMetrics().axisHeight; e.classes.push("delimcenter"), e.style.top = F(o), e.height -= o, e.depth += o }, Qt = function (e, t, r, n, o, s) { const i = function (e, t, r, n) { return Ve.makeSymbol(e, "Size" + t + "-Regular", r, n) }(e, t, o, n), a = Kt(Ve.makeSpan(["delimsizing", "size" + t], [i], n), w.TEXT, n, s); return r && Jt(a, n, w.TEXT), a }, er = function (e, t, r) { let n; n = "Size1-Regular" === t ? "delim-size1" : "delim-size4"; return { type: "elem", elem: Ve.makeSpan(["delimsizinginner", n], [Ve.makeSpan([], [Ve.makeSymbol(e, t, r)])]) } }, tr = function (e, t, r) { const n = T["Size4-Regular"][e.charCodeAt(0)] ? 
T["Size4-Regular"][e.charCodeAt(0)][4] : T["Size1-Regular"][e.charCodeAt(0)][4], o = new J("inner", function (e, t) { switch (e) { case "\u239c": return "M291 0 H417 V" + t + " H291z M291 0 H417 V" + t + " H291z"; case "\u2223": return "M145 0 H188 V" + t + " H145z M145 0 H188 V" + t + " H145z"; case "\u2225": return "M145 0 H188 V" + t + " H145z M145 0 H188 V" + t + " H145zM367 0 H410 V" + t + " H367z M367 0 H410 V" + t + " H367z"; case "\u239f": return "M457 0 H583 V" + t + " H457z M457 0 H583 V" + t + " H457z"; case "\u23a2": return "M319 0 H403 V" + t + " H319z M319 0 H403 V" + t + " H319z"; case "\u23a5": return "M263 0 H347 V" + t + " H263z M263 0 H347 V" + t + " H263z"; case "\u23aa": return "M384 0 H504 V" + t + " H384z M384 0 H504 V" + t + " H384z"; case "\u23d0": return "M312 0 H355 V" + t + " H312z M312 0 H355 V" + t + " H312z"; case "\u2016": return "M257 0 H300 V" + t + " H257z M257 0 H300 V" + t + " H257zM478 0 H521 V" + t + " H478z M478 0 H521 V" + t + " H478z"; default: return "" } }(e, Math.round(1e3 * t))), s = new K([o], { width: F(n), height: F(t), style: "width:" + F(n), viewBox: "0 0 " + 1e3 * n + " " + Math.round(1e3 * t), preserveAspectRatio: "xMinYMin" }), i = Ve.makeSvgSpan([], [s], r); return i.height = t, i.style.height = F(t), i.style.width = F(n), { type: "elem", elem: i } }, rr = { type: "kern", size: -.008 }, nr = ["|", "\\lvert", "\\rvert", "\\vert"], or = ["\\|", "\\lVert", "\\rVert", "\\Vert"], sr = function (e, t, r, n, o, s) { let i, a, h, c, m = "", p = 0; i = h = c = e, a = null; let u = "Size1-Regular"; "\\uparrow" === e ? h = c = "\u23d0" : "\\Uparrow" === e ? h = c = "\u2016" : "\\downarrow" === e ? i = h = "\u23d0" : "\\Downarrow" === e ? i = h = "\u2016" : "\\updownarrow" === e ? (i = "\\uparrow", h = "\u23d0", c = "\\downarrow") : "\\Updownarrow" === e ? (i = "\\Uparrow", h = "\u2016", c = "\\Downarrow") : l.contains(nr, e) ? (h = "\u2223", m = "vert", p = 333) : l.contains(or, e) ? (h = "\u2225", m = "doublevert", p = 556) : "[" === e || "\\lbrack" === e ? (i = "\u23a1", h = "\u23a2", c = "\u23a3", u = "Size4-Regular", m = "lbrack", p = 667) : "]" === e || "\\rbrack" === e ? (i = "\u23a4", h = "\u23a5", c = "\u23a6", u = "Size4-Regular", m = "rbrack", p = 667) : "\\lfloor" === e || "\u230a" === e ? (h = i = "\u23a2", c = "\u23a3", u = "Size4-Regular", m = "lfloor", p = 667) : "\\lceil" === e || "\u2308" === e ? (i = "\u23a1", h = c = "\u23a2", u = "Size4-Regular", m = "lceil", p = 667) : "\\rfloor" === e || "\u230b" === e ? (h = i = "\u23a5", c = "\u23a6", u = "Size4-Regular", m = "rfloor", p = 667) : "\\rceil" === e || "\u2309" === e ? (i = "\u23a4", h = c = "\u23a5", u = "Size4-Regular", m = "rceil", p = 667) : "(" === e || "\\lparen" === e ? (i = "\u239b", h = "\u239c", c = "\u239d", u = "Size4-Regular", m = "lparen", p = 875) : ")" === e || "\\rparen" === e ? (i = "\u239e", h = "\u239f", c = "\u23a0", u = "Size4-Regular", m = "rparen", p = 875) : "\\{" === e || "\\lbrace" === e ? (i = "\u23a7", a = "\u23a8", c = "\u23a9", h = "\u23aa", u = "Size4-Regular") : "\\}" === e || "\\rbrace" === e ? (i = "\u23ab", a = "\u23ac", c = "\u23ad", h = "\u23aa", u = "Size4-Regular") : "\\lgroup" === e || "\u27ee" === e ? (i = "\u23a7", c = "\u23a9", h = "\u23aa", u = "Size4-Regular") : "\\rgroup" === e || "\u27ef" === e ? (i = "\u23ab", c = "\u23ad", h = "\u23aa", u = "Size4-Regular") : "\\lmoustache" === e || "\u23b0" === e ? 
(i = "\u23a7", c = "\u23ad", h = "\u23aa", u = "Size4-Regular") : "\\rmoustache" !== e && "\u23b1" !== e || (i = "\u23ab", c = "\u23a9", h = "\u23aa", u = "Size4-Regular"); const d = Zt(i, u, o), g = d.height + d.depth, f = Zt(h, u, o), b = f.height + f.depth, y = Zt(c, u, o), x = y.height + y.depth; let v = 0, k = 1; if (null !== a) { const e = Zt(a, u, o); v = e.height + e.depth, k = 2 } const S = g + x + v, M = S + Math.max(0, Math.ceil((t - S) / (k * b))) * k * b; let z = n.fontMetrics().axisHeight; r && (z *= n.sizeMultiplier); const A = M / 2 - z, T = []; if (m.length > 0) { const e = M - g - x, t = Math.round(1e3 * M), r = function (e, t) { switch (e) { case "lbrack": return "M403 1759 V84 H666 V0 H319 V1759 v" + t + " v1759 h347 v-84\nH403z M403 1759 V0 H319 V1759 v" + t + " v1759 h84z"; case "rbrack": return "M347 1759 V0 H0 V84 H263 V1759 v" + t + " v1759 H0 v84 H347z\nM347 1759 V0 H263 V1759 v" + t + " v1759 h84z"; case "vert": return "M145 15 v585 v" + t + " v585 c2.667,10,9.667,15,21,15\nc10,0,16.667,-5,20,-15 v-585 v" + -t + " v-585 c-2.667,-10,-9.667,-15,-21,-15\nc-10,0,-16.667,5,-20,15z M188 15 H145 v585 v" + t + " v585 h43z"; case "doublevert": return "M145 15 v585 v" + t + " v585 c2.667,10,9.667,15,21,15\nc10,0,16.667,-5,20,-15 v-585 v" + -t + " v-585 c-2.667,-10,-9.667,-15,-21,-15\nc-10,0,-16.667,5,-20,15z M188 15 H145 v585 v" + t + " v585 h43z\nM367 15 v585 v" + t + " v585 c2.667,10,9.667,15,21,15\nc10,0,16.667,-5,20,-15 v-585 v" + -t + " v-585 c-2.667,-10,-9.667,-15,-21,-15\nc-10,0,-16.667,5,-20,15z M410 15 H367 v585 v" + t + " v585 h43z"; case "lfloor": return "M319 602 V0 H403 V602 v" + t + " v1715 h263 v84 H319z\nMM319 602 V0 H403 V602 v" + t + " v1715 H319z"; case "rfloor": return "M319 602 V0 H403 V602 v" + t + " v1799 H0 v-84 H319z\nMM319 602 V0 H403 V602 v" + t + " v1715 H319z"; case "lceil": return "M403 1759 V84 H666 V0 H319 V1759 v" + t + " v602 h84z\nM403 1759 V0 H319 V1759 v" + t + " v602 h84z"; case "rceil": return "M347 1759 V0 H0 V84 H263 V1759 v" + t + " v602 h84z\nM347 1759 V0 h-84 V1759 v" + t + " v602 h84z"; case "lparen": return "M863,9c0,-2,-2,-5,-6,-9c0,0,-17,0,-17,0c-12.7,0,-19.3,0.3,-20,1\nc-5.3,5.3,-10.3,11,-15,17c-242.7,294.7,-395.3,682,-458,1162c-21.3,163.3,-33.3,349,\n-36,557 l0," + (t + 84) + "c0.2,6,0,26,0,60c2,159.3,10,310.7,24,454c53.3,528,210,\n949.7,470,1265c4.7,6,9.7,11.7,15,17c0.7,0.7,7,1,19,1c0,0,18,0,18,0c4,-4,6,-7,6,-9\nc0,-2.7,-3.3,-8.7,-10,-18c-135.3,-192.7,-235.5,-414.3,-300.5,-665c-65,-250.7,-102.5,\n-544.7,-112.5,-882c-2,-104,-3,-167,-3,-189\nl0,-" + (t + 92) + "c0,-162.7,5.7,-314,17,-454c20.7,-272,63.7,-513,129,-723c65.3,\n-210,155.3,-396.3,270,-559c6.7,-9.3,10,-15.3,10,-18z"; case "rparen": return "M76,0c-16.7,0,-25,3,-25,9c0,2,2,6.3,6,13c21.3,28.7,42.3,60.3,\n63,95c96.7,156.7,172.8,332.5,228.5,527.5c55.7,195,92.8,416.5,111.5,664.5\nc11.3,139.3,17,290.7,17,454c0,28,1.7,43,3.3,45l0," + (t + 9) + "\nc-3,4,-3.3,16.7,-3.3,38c0,162,-5.7,313.7,-17,455c-18.7,248,-55.8,469.3,-111.5,664\nc-55.7,194.7,-131.8,370.3,-228.5,527c-20.7,34.7,-41.7,66.3,-63,95c-2,3.3,-4,7,-6,11\nc0,7.3,5.7,11,17,11c0,0,11,0,11,0c9.3,0,14.3,-0.3,15,-1c5.3,-5.3,10.3,-11,15,-17\nc242.7,-294.7,395.3,-681.7,458,-1161c21.3,-164.7,33.3,-350.7,36,-558\nl0,-" + (t + 144) + "c-2,-159.3,-10,-310.7,-24,-454c-53.3,-528,-210,-949.7,\n-470,-1265c-4.7,-6,-9.7,-11.7,-15,-17c-0.7,-0.7,-6.7,-1,-18,-1z"; default: throw new Error("Unknown stretchy delimiter.") } }(m, Math.round(1e3 * e)), o = new J(m, r), s = (p / 1e3).toFixed(3) + "em", i = (t / 1e3).toFixed(3) + "em", a = new 
K([o], { width: s, height: i, viewBox: "0 0 " + p + " " + t }), l = Ve.makeSvgSpan([], [a], n); l.height = t / 1e3, l.style.width = s, l.style.height = i, T.push({ type: "elem", elem: l }) } else { if (T.push(er(c, u, o)), T.push(rr), null === a) { const e = M - g - x + .016; T.push(tr(h, e, n)) } else { const e = (M - g - x - v) / 2 + .016; T.push(tr(h, e, n)), T.push(rr), T.push(er(a, u, o)), T.push(rr), T.push(tr(h, e, n)) } T.push(rr), T.push(er(i, u, o)) } const B = n.havingBaseStyle(w.TEXT), C = Ve.makeVList({ positionType: "bottom", positionData: A, children: T }, B); return Kt(Ve.makeSpan(["delimsizing", "mult"], [C], B), w.TEXT, n, s) }, ir = .08, ar = function (e, t, r, n, o) { const s = function (e, t, r) { t *= 1e3; let n = ""; switch (e) { case "sqrtMain": n = function (e, t) { return "M95," + (622 + e + t) + "\nc-2.7,0,-7.17,-2.7,-13.5,-8c-5.8,-5.3,-9.5,-10,-9.5,-14\nc0,-2,0.3,-3.3,1,-4c1.3,-2.7,23.83,-20.7,67.5,-54\nc44.2,-33.3,65.8,-50.3,66.5,-51c1.3,-1.3,3,-2,5,-2c4.7,0,8.7,3.3,12,10\ns173,378,173,378c0.7,0,35.3,-71,104,-213c68.7,-142,137.5,-285,206.5,-429\nc69,-144,104.5,-217.7,106.5,-221\nl" + e / 2.075 + " -" + e + "\nc5.3,-9.3,12,-14,20,-14\nH400000v" + (40 + e) + "H845.2724\ns-225.272,467,-225.272,467s-235,486,-235,486c-2.7,4.7,-9,7,-19,7\nc-6,0,-10,-1,-12,-3s-194,-422,-194,-422s-65,47,-65,47z\nM" + (834 + e) + " " + t + "h400000v" + (40 + e) + "h-400000z" }(t, M); break; case "sqrtSize1": n = function (e, t) { return "M263," + (601 + e + t) + "c0.7,0,18,39.7,52,119\nc34,79.3,68.167,158.7,102.5,238c34.3,79.3,51.8,119.3,52.5,120\nc340,-704.7,510.7,-1060.3,512,-1067\nl" + e / 2.084 + " -" + e + "\nc4.7,-7.3,11,-11,19,-11\nH40000v" + (40 + e) + "H1012.3\ns-271.3,567,-271.3,567c-38.7,80.7,-84,175,-136,283c-52,108,-89.167,185.3,-111.5,232\nc-22.3,46.7,-33.8,70.3,-34.5,71c-4.7,4.7,-12.3,7,-23,7s-12,-1,-12,-1\ns-109,-253,-109,-253c-72.7,-168,-109.3,-252,-110,-252c-10.7,8,-22,16.7,-34,26\nc-22,17.3,-33.3,26,-34,26s-26,-26,-26,-26s76,-59,76,-59s76,-60,76,-60z\nM" + (1001 + e) + " " + t + "h400000v" + (40 + e) + "h-400000z" }(t, M); break; case "sqrtSize2": n = function (e, t) { return "M983 " + (10 + e + t) + "\nl" + e / 3.13 + " -" + e + "\nc4,-6.7,10,-10,18,-10 H400000v" + (40 + e) + "\nH1013.1s-83.4,268,-264.1,840c-180.7,572,-277,876.3,-289,913c-4.7,4.7,-12.7,7,-24,7\ns-12,0,-12,0c-1.3,-3.3,-3.7,-11.7,-7,-25c-35.3,-125.3,-106.7,-373.3,-214,-744\nc-10,12,-21,25,-33,39s-32,39,-32,39c-6,-5.3,-15,-14,-27,-26s25,-30,25,-30\nc26.7,-32.7,52,-63,76,-91s52,-60,52,-60s208,722,208,722\nc56,-175.3,126.3,-397.3,211,-666c84.7,-268.7,153.8,-488.2,207.5,-658.5\nc53.7,-170.3,84.5,-266.8,92.5,-289.5z\nM" + (1001 + e) + " " + t + "h400000v" + (40 + e) + "h-400000z" }(t, M); break; case "sqrtSize3": n = function (e, t) { return "M424," + (2398 + e + t) + "\nc-1.3,-0.7,-38.5,-172,-111.5,-514c-73,-342,-109.8,-513.3,-110.5,-514\nc0,-2,-10.7,14.3,-32,49c-4.7,7.3,-9.8,15.7,-15.5,25c-5.7,9.3,-9.8,16,-12.5,20\ns-5,7,-5,7c-4,-3.3,-8.3,-7.7,-13,-13s-13,-13,-13,-13s76,-122,76,-122s77,-121,77,-121\ns209,968,209,968c0,-2,84.7,-361.7,254,-1079c169.3,-717.3,254.7,-1077.7,256,-1081\nl" + e / 4.223 + " -" + e + "c4,-6.7,10,-10,18,-10 H400000\nv" + (40 + e) + "H1014.6\ns-87.3,378.7,-272.6,1166c-185.3,787.3,-279.3,1182.3,-282,1185\nc-2,6,-10,9,-24,9\nc-8,0,-12,-0.7,-12,-2z M" + (1001 + e) + " " + t + "\nh400000v" + (40 + e) + "h-400000z" }(t, M); break; case "sqrtSize4": n = function (e, t) { return "M473," + (2713 + e + t) + "\nc339.3,-1799.3,509.3,-2700,510,-2702 l" + e / 5.298 + " -" + e + 
"\nc3.3,-7.3,9.3,-11,18,-11 H400000v" + (40 + e) + "H1017.7\ns-90.5,478,-276.2,1466c-185.7,988,-279.5,1483,-281.5,1485c-2,6,-10,9,-24,9\nc-8,0,-12,-0.7,-12,-2c0,-1.3,-5.3,-32,-16,-92c-50.7,-293.3,-119.7,-693.3,-207,-1200\nc0,-1.3,-5.3,8.7,-16,30c-10.7,21.3,-21.3,42.7,-32,64s-16,33,-16,33s-26,-26,-26,-26\ns76,-153,76,-153s77,-151,77,-151c0.7,0.7,35.7,202,105,604c67.3,400.7,102,602.7,104,\n606zM" + (1001 + e) + " " + t + "h400000v" + (40 + e) + "H1017.7z" }(t, M); break; case "sqrtTall": n = function (e, t, r) { return "M702 " + (e + t) + "H400000" + (40 + e) + "\nH742v" + (r - 54 - t - e) + "l-4 4-4 4c-.667.7 -2 1.5-4 2.5s-4.167 1.833-6.5 2.5-5.5 1-9.5 1\nh-12l-28-84c-16.667-52-96.667 -294.333-240-727l-212 -643 -85 170\nc-4-3.333-8.333-7.667-13 -13l-13-13l77-155 77-156c66 199.333 139 419.667\n219 661 l218 661zM702 " + t + "H400000v" + (40 + e) + "H742z" }(t, M, r) }return n }(e, n, r), i = new J(e, s), a = new K([i], { width: "400em", height: F(t), viewBox: "0 0 400000 " + r, preserveAspectRatio: "xMinYMin slice" }); return Ve.makeSvgSpan(["hide-tail"], [a], o) }, lr = ["(", "\\lparen", ")", "\\rparen", "[", "\\lbrack", "]", "\\rbrack", "\\{", "\\lbrace", "\\}", "\\rbrace", "\\lfloor", "\\rfloor", "\u230a", "\u230b", "\\lceil", "\\rceil", "\u2308", "\u2309", "\\surd"], hr = ["\\uparrow", "\\downarrow", "\\updownarrow", "\\Uparrow", "\\Downarrow", "\\Updownarrow", "|", "\\|", "\\vert", "\\Vert", "\\lvert", "\\rvert", "\\lVert", "\\rVert", "\\lgroup", "\\rgroup", "\u27ee", "\u27ef", "\\lmoustache", "\\rmoustache", "\u23b0", "\u23b1"], cr = ["<", ">", "\\langle", "\\rangle", "/", "\\backslash", "\\lt", "\\gt"], mr = [0, 1.2, 1.8, 2.4, 3], pr = [{ type: "small", style: w.SCRIPTSCRIPT }, { type: "small", style: w.SCRIPT }, { type: "small", style: w.TEXT }, { type: "large", size: 1 }, { type: "large", size: 2 }, { type: "large", size: 3 }, { type: "large", size: 4 }], ur = [{ type: "small", style: w.SCRIPTSCRIPT }, { type: "small", style: w.SCRIPT }, { type: "small", style: w.TEXT }, { type: "stack" }], dr = [{ type: "small", style: w.SCRIPTSCRIPT }, { type: "small", style: w.SCRIPT }, { type: "small", style: w.TEXT }, { type: "large", size: 1 }, { type: "large", size: 2 }, { type: "large", size: 3 }, { type: "large", size: 4 }, { type: "stack" }], gr = function (e) { if ("small" === e.type) return "Main-Regular"; if ("large" === e.type) return "Size" + e.size + "-Regular"; if ("stack" === e.type) return "Size4-Regular"; throw new Error("Add support for delim type '" + e.type + "' here.") }, fr = function (e, t, r, n) { for (let o = Math.min(2, 3 - n.style.size); o < r.length && "stack" !== r[o].type; o++) { const s = Zt(e, gr(r[o]), "math"); let i = s.height + s.depth; if ("small" === r[o].type) { i *= n.havingBaseStyle(r[o].style).sizeMultiplier } if (i > t) return r[o] } return r[r.length - 1] }, br = function (e, t, r, n, o, s) { let i; "<" === e || "\\lt" === e || "\u27e8" === e ? e = "\\langle" : ">" !== e && "\\gt" !== e && "\u27e9" !== e || (e = "\\rangle"), i = l.contains(cr, e) ? pr : l.contains(lr, e) ? dr : ur; const a = fr(e, t, i, n); return "small" === a.type ? function (e, t, r, n, o, s) { const i = Ve.makeSymbol(e, "Main-Regular", o, n), a = Kt(i, t, n, s); return r && Jt(a, n, t), a }(e, a.style, r, n, o, s) : "large" === a.type ? 
Qt(e, a.size, r, n, o, s) : sr(e, t, r, n, o, s) }; var yr = { sqrtImage: function (e, t) { const r = t.havingBaseSizing(), n = fr("\\surd", e * r.sizeMultiplier, dr, r); let o = r.sizeMultiplier; const s = Math.max(0, t.minRuleThickness - t.fontMetrics().sqrtRuleThickness); let i, a, l = 0, h = 0, c = 0; return "small" === n.type ? (c = 1e3 + 1e3 * s + 80, e < 1 ? o = 1 : e < 1.4 && (o = .7), l = (1 + s + ir) / o, h = (1 + s) / o, i = ar("sqrtMain", l, c, s, t), i.style.minWidth = "0.853em", a = .833 / o) : "large" === n.type ? (c = 1080 * mr[n.size], h = (mr[n.size] + s) / o, l = (mr[n.size] + s + ir) / o, i = ar("sqrtSize" + n.size, l, c, s, t), i.style.minWidth = "1.02em", a = 1 / o) : (l = e + s + ir, h = e + s, c = Math.floor(1e3 * e + s) + 80, i = ar("sqrtTall", l, c, s, t), i.style.minWidth = "0.742em", a = 1.056), i.height = h, i.style.height = F(l), { span: i, advanceWidth: a, ruleWidth: (t.fontMetrics().sqrtRuleThickness + s) * o } }, sizedDelim: function (e, t, r, o, s) { if ("<" === e || "\\lt" === e || "\u27e8" === e ? e = "\\langle" : ">" !== e && "\\gt" !== e && "\u27e9" !== e || (e = "\\rangle"), l.contains(lr, e) || l.contains(cr, e)) return Qt(e, t, !1, r, o, s); if (l.contains(hr, e)) return sr(e, mr[t], !1, r, o, s); throw new n("Illegal delimiter: '" + e + "'") }, sizeToMaxHeight: mr, customSizedDelim: br, leftRightDelim: function (e, t, r, n, o, s) { const i = n.fontMetrics().axisHeight * n.sizeMultiplier, a = 5 / n.fontMetrics().ptPerEm, l = Math.max(t - i, r + i), h = Math.max(l / 500 * 901, 2 * l - a); return br(e, h, !0, n, o, s) } }; const xr = { "\\bigl": { mclass: "mopen", size: 1 }, "\\Bigl": { mclass: "mopen", size: 2 }, "\\biggl": { mclass: "mopen", size: 3 }, "\\Biggl": { mclass: "mopen", size: 4 }, "\\bigr": { mclass: "mclose", size: 1 }, "\\Bigr": { mclass: "mclose", size: 2 }, "\\biggr": { mclass: "mclose", size: 3 }, "\\Biggr": { mclass: "mclose", size: 4 }, "\\bigm": { mclass: "mrel", size: 1 }, "\\Bigm": { mclass: "mrel", size: 2 }, "\\biggm": { mclass: "mrel", size: 3 }, "\\Biggm": { mclass: "mrel", size: 4 }, "\\big": { mclass: "mord", size: 1 }, "\\Big": { mclass: "mord", size: 2 }, "\\bigg": { mclass: "mord", size: 3 }, "\\Bigg": { mclass: "mord", size: 4 } }, wr = ["(", "\\lparen", ")", "\\rparen", "[", "\\lbrack", "]", "\\rbrack", "\\{", "\\lbrace", "\\}", "\\rbrace", "\\lfloor", "\\rfloor", "\u230a", "\u230b", "\\lceil", "\\rceil", "\u2308", "\u2309", "<", ">", "\\langle", "\u27e8", "\\rangle", "\u27e9", "\\lt", "\\gt", "\\lvert", "\\rvert", "\\lVert", "\\rVert", "\\lgroup", "\\rgroup", "\u27ee", "\u27ef", "\\lmoustache", "\\rmoustache", "\u23b0", "\u23b1", "/", "\\backslash", "|", "\\vert", "\\|", "\\Vert", "\\uparrow", "\\Uparrow", "\\downarrow", "\\Downarrow", "\\updownarrow", "\\Updownarrow", "."]; function vr(e, t) { const r = Rt(e); if (r && l.contains(wr, r.text)) return r; throw new n(r ? 
"Invalid delimiter '" + r.text + "' after '" + t.funcName + "'" : "Invalid delimiter type '" + e.type + "'", e) } function kr(e) { if (!e.body) throw new Error("Bug: The leftright ParseNode wasn't fully parsed.") } je({ type: "delimsizing", names: ["\\bigl", "\\Bigl", "\\biggl", "\\Biggl", "\\bigr", "\\Bigr", "\\biggr", "\\Biggr", "\\bigm", "\\Bigm", "\\biggm", "\\Biggm", "\\big", "\\Big", "\\bigg", "\\Bigg"], props: { numArgs: 1, argTypes: ["primitive"] }, handler: (e, t) => { const r = vr(t[0], e); return { type: "delimsizing", mode: e.parser.mode, size: xr[e.funcName].size, mclass: xr[e.funcName].mclass, delim: r.text } }, htmlBuilder: (e, t) => "." === e.delim ? Ve.makeSpan([e.mclass]) : yr.sizedDelim(e.delim, e.size, t, e.mode, [e.mclass]), mathmlBuilder: e => { const t = []; "." !== e.delim && t.push(ft(e.delim, e.mode)); const r = new gt.MathNode("mo", t); "mopen" === e.mclass || "mclose" === e.mclass ? r.setAttribute("fence", "true") : r.setAttribute("fence", "false"), r.setAttribute("stretchy", "true"); const n = F(yr.sizeToMaxHeight[e.size]); return r.setAttribute("minsize", n), r.setAttribute("maxsize", n), r } }), je({ type: "leftright-right", names: ["\\right"], props: { numArgs: 1, primitive: !0 }, handler: (e, t) => { const r = e.parser.gullet.macros.get("\\current@color"); if (r && "string" != typeof r) throw new n("\\current@color set to non-string in \\right"); return { type: "leftright-right", mode: e.parser.mode, delim: vr(t[0], e).text, color: r } } }), je({ type: "leftright", names: ["\\left"], props: { numArgs: 1, primitive: !0 }, handler: (e, t) => { const r = vr(t[0], e), n = e.parser; ++n.leftrightDepth; const o = n.parseExpression(!1); --n.leftrightDepth, n.expect("\\right", !1); const s = qt(n.parseFunction(), "leftright-right"); return { type: "leftright", mode: n.mode, body: o, left: r.text, right: s.delim, rightColor: s.color } }, htmlBuilder: (e, t) => { kr(e); const r = nt(e.body, t, !0, ["mopen", "mclose"]); let n, o, s = 0, i = 0, a = !1; for (let e = 0; e < r.length; e++)r[e].isMiddle ? a = !0 : (s = Math.max(r[e].height, s), i = Math.max(r[e].depth, i)); if (s *= t.sizeMultiplier, i *= t.sizeMultiplier, n = "." === e.left ? lt(t, ["mopen"]) : yr.leftRightDelim(e.left, s, i, t, e.mode, ["mopen"]), r.unshift(n), a) for (let t = 1; t < r.length; t++) { const n = r[t].isMiddle; n && (r[t] = yr.leftRightDelim(n.delim, s, i, n.options, e.mode, [])) } if ("." === e.right) o = lt(t, ["mclose"]); else { const r = e.rightColor ? t.withColor(e.rightColor) : t; o = yr.leftRightDelim(e.right, s, i, r, e.mode, ["mclose"]) } return r.push(o), Ve.makeSpan(["minner"], r, t) }, mathmlBuilder: (e, t) => { kr(e); const r = xt(e.body, t); if ("." !== e.left) { const t = new gt.MathNode("mo", [ft(e.left, e.mode)]); t.setAttribute("fence", "true"), r.unshift(t) } if ("." !== e.right) { const t = new gt.MathNode("mo", [ft(e.right, e.mode)]); t.setAttribute("fence", "true"), e.rightColor && t.setAttribute("mathcolor", e.rightColor), r.push(t) } return bt(r) } }), je({ type: "middle", names: ["\\middle"], props: { numArgs: 1, primitive: !0 }, handler: (e, t) => { const r = vr(t[0], e); if (!e.parser.leftrightDepth) throw new n("\\middle without preceding \\left", r); return { type: "middle", mode: e.parser.mode, delim: r.text } }, htmlBuilder: (e, t) => { let r; if ("." 
=== e.delim) r = lt(t, []); else { r = yr.sizedDelim(e.delim, 1, t, e.mode, []); const n = { delim: e.delim, options: t }; r.isMiddle = n } return r }, mathmlBuilder: (e, t) => { const r = "\\vert" === e.delim || "|" === e.delim ? ft("|", "text") : ft(e.delim, e.mode), n = new gt.MathNode("mo", [r]); return n.setAttribute("fence", "true"), n.setAttribute("lspace", "0.05em"), n.setAttribute("rspace", "0.05em"), n } }); const Sr = (e, t) => { const r = Ve.wrapFragment(ht(e.body, t), t), n = e.label.slice(1); let o, s = t.sizeMultiplier, i = 0; const a = l.isCharacterBox(e.body); if ("sout" === n) o = Ve.makeSpan(["stretchy", "sout"]), o.height = t.fontMetrics().defaultRuleThickness / s, i = -.5 * t.fontMetrics().xHeight; else if ("phase" === n) { const e = P({ number: .6, unit: "pt" }, t), n = P({ number: .35, unit: "ex" }, t); s /= t.havingBaseSizing().sizeMultiplier; const a = r.height + r.depth + e + n; r.style.paddingLeft = F(a / 2 + e); const l = Math.floor(1e3 * a * s), c = "M400000 " + (h = l) + " H0 L" + h / 2 + " 0 l65 45 L145 " + (h - 80) + " H400000z", m = new K([new J("phase", c)], { width: "400em", height: F(l / 1e3), viewBox: "0 0 400000 " + l, preserveAspectRatio: "xMinYMin slice" }); o = Ve.makeSvgSpan(["hide-tail"], [m], t), o.style.height = F(a), i = r.depth + e + n } else { /cancel/.test(n) ? a || r.classes.push("cancel-pad") : "angl" === n ? r.classes.push("anglpad") : r.classes.push("boxpad"); let s = 0, l = 0, h = 0; /box/.test(n) ? (h = Math.max(t.fontMetrics().fboxrule, t.minRuleThickness), s = t.fontMetrics().fboxsep + ("colorbox" === n ? 0 : h), l = s) : "angl" === n ? (h = Math.max(t.fontMetrics().defaultRuleThickness, t.minRuleThickness), s = 4 * h, l = Math.max(0, .25 - r.depth)) : (s = a ? .2 : 0, l = s), o = Bt(r, n, s, l, t), /fbox|boxed|fcolorbox/.test(n) ? (o.style.borderStyle = "solid", o.style.borderWidth = F(h)) : "angl" === n && .049 !== h && (o.style.borderTopWidth = F(h), o.style.borderRightWidth = F(h)), i = r.depth + l, e.backgroundColor && (o.style.backgroundColor = e.backgroundColor, e.borderColor && (o.style.borderColor = e.borderColor)) } var h; let c; if (e.backgroundColor) c = Ve.makeVList({ positionType: "individualShift", children: [{ type: "elem", elem: o, shift: i }, { type: "elem", elem: r, shift: 0 }] }, t); else { const e = /cancel|phase/.test(n) ? ["svg-align"] : []; c = Ve.makeVList({ positionType: "individualShift", children: [{ type: "elem", elem: r, shift: 0 }, { type: "elem", elem: o, shift: i, wrapperClasses: e }] }, t) } return /cancel/.test(n) && (c.height = r.height, c.depth = r.depth), /cancel/.test(n) && !a ? Ve.makeSpan(["mord", "cancel-lap"], [c], t) : Ve.makeSpan(["mord"], [c], t) }, Mr = (e, t) => { let r = 0; const n = new gt.MathNode(e.label.indexOf("colorbox") > -1 ? 
"mpadded" : "menclose", [vt(e.body, t)]); switch (e.label) { case "\\cancel": n.setAttribute("notation", "updiagonalstrike"); break; case "\\bcancel": n.setAttribute("notation", "downdiagonalstrike"); break; case "\\phase": n.setAttribute("notation", "phasorangle"); break; case "\\sout": n.setAttribute("notation", "horizontalstrike"); break; case "\\fbox": n.setAttribute("notation", "box"); break; case "\\angl": n.setAttribute("notation", "actuarial"); break; case "\\fcolorbox": case "\\colorbox": if (r = t.fontMetrics().fboxsep * t.fontMetrics().ptPerEm, n.setAttribute("width", "+" + 2 * r + "pt"), n.setAttribute("height", "+" + 2 * r + "pt"), n.setAttribute("lspace", r + "pt"), n.setAttribute("voffset", r + "pt"), "\\fcolorbox" === e.label) { const r = Math.max(t.fontMetrics().fboxrule, t.minRuleThickness); n.setAttribute("style", "border: " + r + "em solid " + String(e.borderColor)) } break; case "\\xcancel": n.setAttribute("notation", "updiagonalstrike downdiagonalstrike") }return e.backgroundColor && n.setAttribute("mathbackground", e.backgroundColor), n }; je({ type: "enclose", names: ["\\colorbox"], props: { numArgs: 2, allowedInText: !0, argTypes: ["color", "text"] }, handler(e, t, r) { let { parser: n, funcName: o } = e; const s = qt(t[0], "color-token").color, i = t[1]; return { type: "enclose", mode: n.mode, label: o, backgroundColor: s, body: i } }, htmlBuilder: Sr, mathmlBuilder: Mr }), je({ type: "enclose", names: ["\\fcolorbox"], props: { numArgs: 3, allowedInText: !0, argTypes: ["color", "color", "text"] }, handler(e, t, r) { let { parser: n, funcName: o } = e; const s = qt(t[0], "color-token").color, i = qt(t[1], "color-token").color, a = t[2]; return { type: "enclose", mode: n.mode, label: o, backgroundColor: i, borderColor: s, body: a } }, htmlBuilder: Sr, mathmlBuilder: Mr }), je({ type: "enclose", names: ["\\fbox"], props: { numArgs: 1, argTypes: ["hbox"], allowedInText: !0 }, handler(e, t) { let { parser: r } = e; return { type: "enclose", mode: r.mode, label: "\\fbox", body: t[0] } } }), je({ type: "enclose", names: ["\\cancel", "\\bcancel", "\\xcancel", "\\sout", "\\phase"], props: { numArgs: 1 }, handler(e, t) { let { parser: r, funcName: n } = e; const o = t[0]; return { type: "enclose", mode: r.mode, label: n, body: o } }, htmlBuilder: Sr, mathmlBuilder: Mr }), je({ type: "enclose", names: ["\\angl"], props: { numArgs: 1, argTypes: ["hbox"], allowedInText: !1 }, handler(e, t) { let { parser: r } = e; return { type: "enclose", mode: r.mode, label: "\\angl", body: t[0] } } }); const zr = {}; function Ar(e) { let { type: t, names: r, props: n, handler: o, htmlBuilder: s, mathmlBuilder: i } = e; const a = { type: t, numArgs: n.numArgs || 0, allowedInText: !1, numOptionalArgs: 0, handler: o }; for (let e = 0; e < r.length; ++e)zr[r[e]] = a; s && (We[t] = s), i && (_e[t] = i) } const Tr = {}; function Br(e, t) { Tr[e] = t } class Cr { constructor(e, t, r) { this.lexer = void 0, this.start = void 0, this.end = void 0, this.lexer = e, this.start = t, this.end = r } static range(e, t) { return t ? e && e.loc && t.loc && e.loc.lexer === t.loc.lexer ? 
new Cr(e.loc.lexer, e.loc.start, t.loc.end) : null : e && e.loc } } class Nr { constructor(e, t) { this.text = void 0, this.loc = void 0, this.noexpand = void 0, this.treatAsRelax = void 0, this.text = e, this.loc = t } range(e, t) { return new Nr(t, Cr.range(this, e)) } } function qr(e) { const t = []; e.consumeSpaces(); let r = e.fetch().text; for ("\\relax" === r && (e.consume(), e.consumeSpaces(), r = e.fetch().text); "\\hline" === r || "\\hdashline" === r;)e.consume(), t.push("\\hdashline" === r), e.consumeSpaces(), r = e.fetch().text; return t } const Ir = e => { if (!e.parser.settings.displayMode) throw new n("{" + e.envName + "} can be used only in display mode.") }; function Rr(e) { if (-1 === e.indexOf("ed")) return -1 === e.indexOf("*") } function Hr(e, t, r) { let { hskipBeforeAndAfter: o, addJot: s, cols: i, arraystretch: a, colSeparationType: l, autoTag: h, singleRow: c, emptySingleRow: m, maxNumCols: p, leqno: u } = t; if (e.gullet.beginGroup(), c || e.gullet.macros.set("\\cr", "\\\\\\relax"), !a) { const t = e.gullet.expandMacroAsText("\\arraystretch"); if (null == t) a = 1; else if (a = parseFloat(t), !a || a < 0) throw new n("Invalid \\arraystretch: " + t) } e.gullet.beginGroup(); let d = []; const g = [d], f = [], b = [], y = null != h ? [] : void 0; function x() { h && e.gullet.macros.set("\\@eqnsw", "1", !0) } function w() { y && (e.gullet.macros.get("\\df@tag") ? (y.push(e.subparse([new Nr("\\df@tag")])), e.gullet.macros.set("\\df@tag", void 0, !0)) : y.push(Boolean(h) && "1" === e.gullet.macros.get("\\@eqnsw"))) } for (x(), b.push(qr(e)); ;) { let t = e.parseExpression(!1, c ? "\\end" : "\\\\"); e.gullet.endGroup(), e.gullet.beginGroup(), t = { type: "ordgroup", mode: e.mode, body: t }, r && (t = { type: "styling", mode: e.mode, style: r, body: [t] }), d.push(t); const o = e.fetch().text; if ("&" === o) { if (p && d.length === p) { if (c || l) throw new n("Too many tab characters: &", e.nextToken); e.settings.reportNonstrict("textEnv", "Too few columns specified in the {array} column argument.") } e.consume() } else { if ("\\end" === o) { w(), 1 === d.length && "styling" === t.type && 0 === t.body[0].body.length && (g.length > 1 || !m) && g.pop(), b.length < g.length + 1 && b.push([]); break } if ("\\\\" !== o) throw new n("Expected & or \\\\ or \\cr or \\end", e.nextToken); { let t; e.consume(), " " !== e.gullet.future().text && (t = e.parseSizeGroup(!0)), f.push(t ? t.value : null), w(), b.push(qr(e)), d = [], g.push(d), x() } } } return e.gullet.endGroup(), e.gullet.endGroup(), { type: "array", mode: e.mode, addJot: s, arraystretch: a, body: g, cols: i, rowGaps: f, hskipBeforeAndAfter: o, hLinesBeforeRow: b, colSeparationType: l, tags: y, leqno: u } } function Or(e) { return "d" === e.slice(0, 1) ? "display" : "text" } const Er = function (e, t) { let r, o; const s = e.body.length, i = e.hLinesBeforeRow; let a = 0, h = new Array(s); const c = [], m = Math.max(t.fontMetrics().arrayRuleWidth, t.minRuleThickness), p = 1 / t.fontMetrics().ptPerEm; let u = 5 * p; if (e.colSeparationType && "small" === e.colSeparationType) { u = t.havingStyle(w.SCRIPT).sizeMultiplier / t.sizeMultiplier * .2778 } const d = "CD" === e.colSeparationType ? 
P({ number: 3, unit: "ex" }, t) : 12 * p, g = 3 * p, f = e.arraystretch * d, b = .7 * f, y = .3 * f; let x = 0; function v(e) { for (let t = 0; t < e.length; ++t)t > 0 && (x += .25), c.push({ pos: x, isDashed: e[t] }) } for (v(i[0]), r = 0; r < e.body.length; ++r) { const n = e.body[r]; let s = b, l = y; a < n.length && (a = n.length); const c = new Array(n.length); for (o = 0; o < n.length; ++o) { const e = ht(n[o], t); l < e.depth && (l = e.depth), s < e.height && (s = e.height), c[o] = e } const m = e.rowGaps[r]; let p = 0; m && (p = P(m, t), p > 0 && (p += y, l < p && (l = p), p = 0)), e.addJot && (l += g), c.height = s, c.depth = l, x += s, c.pos = x, x += l + p, h[r] = c, v(i[r + 1]) } const k = x / 2 + t.fontMetrics().axisHeight, S = e.cols || [], M = []; let z, A; const T = []; if (e.tags && e.tags.some((e => e))) for (r = 0; r < s; ++r) { const n = h[r], o = n.pos - k, s = e.tags[r]; let i; i = !0 === s ? Ve.makeSpan(["eqn-num"], [], t) : !1 === s ? Ve.makeSpan([], [], t) : Ve.makeSpan([], nt(s, t, !0), t), i.depth = n.depth, i.height = n.height, T.push({ type: "elem", elem: i, shift: o }) } for (o = 0, A = 0; o < a || A < S.length; ++o, ++A) { let i, c = S[A] || {}, p = !0; for (; "separator" === c.type;) { if (p || (z = Ve.makeSpan(["arraycolsep"], []), z.style.width = F(t.fontMetrics().doubleRuleSep), M.push(z)), "|" !== c.separator && ":" !== c.separator) throw new n("Invalid separator type: " + c.separator); { const e = "|" === c.separator ? "solid" : "dashed", r = Ve.makeSpan(["vertical-separator"], [], t); r.style.height = F(x), r.style.borderRightWidth = F(m), r.style.borderRightStyle = e, r.style.margin = "0 " + F(-m / 2); const n = x - k; n && (r.style.verticalAlign = F(-n)), M.push(r) } A++, c = S[A] || {}, p = !1 } if (o >= a) continue; (o > 0 || e.hskipBeforeAndAfter) && (i = l.deflt(c.pregap, u), 0 !== i && (z = Ve.makeSpan(["arraycolsep"], []), z.style.width = F(i), M.push(z))); let d = []; for (r = 0; r < s; ++r) { const e = h[r], t = e[o]; if (!t) continue; const n = e.pos - k; t.depth = e.depth, t.height = e.height, d.push({ type: "elem", elem: t, shift: n }) } d = Ve.makeVList({ positionType: "individualShift", children: d }, t), d = Ve.makeSpan(["col-align-" + (c.align || "c")], [d]), M.push(d), (o < a - 1 || e.hskipBeforeAndAfter) && (i = l.deflt(c.postgap, u), 0 !== i && (z = Ve.makeSpan(["arraycolsep"], []), z.style.width = F(i), M.push(z))) } if (h = Ve.makeSpan(["mtable"], M), c.length > 0) { const e = Ve.makeLineSpan("hline", t, m), r = Ve.makeLineSpan("hdashline", t, m), n = [{ type: "elem", elem: h, shift: 0 }]; for (; c.length > 0;) { const t = c.pop(), o = t.pos - k; t.isDashed ? n.push({ type: "elem", elem: r, shift: o }) : n.push({ type: "elem", elem: e, shift: o }) } h = Ve.makeVList({ positionType: "individualShift", children: n }, t) } if (0 === T.length) return Ve.makeSpan(["mord"], [h], t); { let e = Ve.makeVList({ positionType: "individualShift", children: T }, t); return e = Ve.makeSpan(["tag"], [e], t), Ve.makeFragment([h, e]) } }, Lr = { c: "center ", l: "left ", r: "right " }, Dr = function (e, t) { const r = [], n = new gt.MathNode("mtd", [], ["mtr-glue"]), o = new gt.MathNode("mtd", [], ["mml-eqn-num"]); for (let s = 0; s < e.body.length; s++) { const i = e.body[s], a = []; for (let e = 0; e < i.length; e++)a.push(new gt.MathNode("mtd", [vt(i[e], t)])); e.tags && e.tags[s] && (a.unshift(n), a.push(n), e.leqno ? 
a.unshift(o) : a.push(o)), r.push(new gt.MathNode("mtr", a)) } let s = new gt.MathNode("mtable", r); const i = .5 === e.arraystretch ? .1 : .16 + e.arraystretch - 1 + (e.addJot ? .09 : 0); s.setAttribute("rowspacing", F(i)); let a = "", l = ""; if (e.cols && e.cols.length > 0) { const t = e.cols; let r = "", n = !1, o = 0, i = t.length; "separator" === t[0].type && (a += "top ", o = 1), "separator" === t[t.length - 1].type && (a += "bottom ", i -= 1); for (let e = o; e < i; e++)"align" === t[e].type ? (l += Lr[t[e].align], n && (r += "none "), n = !0) : "separator" === t[e].type && n && (r += "|" === t[e].separator ? "solid " : "dashed ", n = !1); s.setAttribute("columnalign", l.trim()), /[sd]/.test(r) && s.setAttribute("columnlines", r.trim()) } if ("align" === e.colSeparationType) { const t = e.cols || []; let r = ""; for (let e = 1; e < t.length; e++)r += e % 2 ? "0em " : "1em "; s.setAttribute("columnspacing", r.trim()) } else "alignat" === e.colSeparationType || "gather" === e.colSeparationType ? s.setAttribute("columnspacing", "0em") : "small" === e.colSeparationType ? s.setAttribute("columnspacing", "0.2778em") : "CD" === e.colSeparationType ? s.setAttribute("columnspacing", "0.5em") : s.setAttribute("columnspacing", "1em"); let h = ""; const c = e.hLinesBeforeRow; a += c[0].length > 0 ? "left " : "", a += c[c.length - 1].length > 0 ? "right " : ""; for (let e = 1; e < c.length - 1; e++)h += 0 === c[e].length ? "none " : c[e][0] ? "dashed " : "solid "; return /[sd]/.test(h) && s.setAttribute("rowlines", h.trim()), "" !== a && (s = new gt.MathNode("menclose", [s]), s.setAttribute("notation", a.trim())), e.arraystretch && e.arraystretch < 1 && (s = new gt.MathNode("mstyle", [s]), s.setAttribute("scriptlevel", "1")), s }, Vr = function (e, t) { -1 === e.envName.indexOf("ed") && Ir(e); const r = [], o = e.envName.indexOf("at") > -1 ? "alignat" : "align", s = "split" === e.envName, i = Hr(e.parser, { cols: r, addJot: !0, autoTag: s ? void 0 : Rr(e.envName), emptySingleRow: !0, colSeparationType: o, maxNumCols: s ? 2 : void 0, leqno: e.parser.settings.leqno }, "display"); let a, l = 0; const h = { type: "ordgroup", mode: e.mode, body: [] }; if (t[0] && "ordgroup" === t[0].type) { let e = ""; for (let r = 0; r < t[0].body.length; r++) { e += qt(t[0].body[r], "textord").text } a = Number(e), l = 2 * a } const c = !l; i.body.forEach((function (e) { for (let t = 1; t < e.length; t += 2) { const r = qt(e[t], "styling"); qt(r.body[0], "ordgroup").body.unshift(h) } if (c) l < e.length && (l = e.length); else { const t = e.length / 2; if (a < t) throw new n("Too many math in a row: expected " + a + ", but got " + t, e[0]) } })); for (let e = 0; e < l; ++e) { let t = "r", n = 0; e % 2 == 1 ? t = "l" : e > 0 && c && (n = 1), r[e] = { type: "align", align: t, pregap: n, postgap: 0 } } return i.colSeparationType = c ? "align" : "alignat", i }; Ar({ type: "array", names: ["array", "darray"], props: { numArgs: 1 }, handler(e, t) { const r = (Rt(t[0]) ? 
[t[0]] : qt(t[0], "ordgroup").body).map((function (e) { const t = It(e).text; if (-1 !== "lcr".indexOf(t)) return { type: "align", align: t }; if ("|" === t) return { type: "separator", separator: "|" }; if (":" === t) return { type: "separator", separator: ":" }; throw new n("Unknown column alignment: " + t, e) })), o = { cols: r, hskipBeforeAndAfter: !0, maxNumCols: r.length }; return Hr(e.parser, o, Or(e.envName)) }, htmlBuilder: Er, mathmlBuilder: Dr }), Ar({ type: "array", names: ["matrix", "pmatrix", "bmatrix", "Bmatrix", "vmatrix", "Vmatrix", "matrix*", "pmatrix*", "bmatrix*", "Bmatrix*", "vmatrix*", "Vmatrix*"], props: { numArgs: 0 }, handler(e) { const t = { matrix: null, pmatrix: ["(", ")"], bmatrix: ["[", "]"], Bmatrix: ["\\{", "\\}"], vmatrix: ["|", "|"], Vmatrix: ["\\Vert", "\\Vert"] }[e.envName.replace("*", "")]; let r = "c"; const o = { hskipBeforeAndAfter: !1, cols: [{ type: "align", align: r }] }; if ("*" === e.envName.charAt(e.envName.length - 1)) { const t = e.parser; if (t.consumeSpaces(), "[" === t.fetch().text) { if (t.consume(), t.consumeSpaces(), r = t.fetch().text, -1 === "lcr".indexOf(r)) throw new n("Expected l or c or r", t.nextToken); t.consume(), t.consumeSpaces(), t.expect("]"), t.consume(), o.cols = [{ type: "align", align: r }] } } const s = Hr(e.parser, o, Or(e.envName)), i = Math.max(0, ...s.body.map((e => e.length))); return s.cols = new Array(i).fill({ type: "align", align: r }), t ? { type: "leftright", mode: e.mode, body: [s], left: t[0], right: t[1], rightColor: void 0 } : s }, htmlBuilder: Er, mathmlBuilder: Dr }), Ar({ type: "array", names: ["smallmatrix"], props: { numArgs: 0 }, handler(e) { const t = Hr(e.parser, { arraystretch: .5 }, "script"); return t.colSeparationType = "small", t }, htmlBuilder: Er, mathmlBuilder: Dr }), Ar({ type: "array", names: ["subarray"], props: { numArgs: 1 }, handler(e, t) { const r = (Rt(t[0]) ? [t[0]] : qt(t[0], "ordgroup").body).map((function (e) { const t = It(e).text; if (-1 !== "lc".indexOf(t)) return { type: "align", align: t }; throw new n("Unknown column alignment: " + t, e) })); if (r.length > 1) throw new n("{subarray} can contain only one column"); let o = { cols: r, hskipBeforeAndAfter: !1, arraystretch: .5 }; if (o = Hr(e.parser, o, "script"), o.body.length > 0 && o.body[0].length > 1) throw new n("{subarray} can contain only one column"); return o }, htmlBuilder: Er, mathmlBuilder: Dr }), Ar({ type: "array", names: ["cases", "dcases", "rcases", "drcases"], props: { numArgs: 0 }, handler(e) { const t = Hr(e.parser, { arraystretch: 1.2, cols: [{ type: "align", align: "l", pregap: 0, postgap: 1 }, { type: "align", align: "l", pregap: 0, postgap: 0 }] }, Or(e.envName)); return { type: "leftright", mode: e.mode, body: [t], left: e.envName.indexOf("r") > -1 ? "." : "\\{", right: e.envName.indexOf("r") > -1 ? 
"\\}" : ".", rightColor: void 0 } }, htmlBuilder: Er, mathmlBuilder: Dr }), Ar({ type: "array", names: ["align", "align*", "aligned", "split"], props: { numArgs: 0 }, handler: Vr, htmlBuilder: Er, mathmlBuilder: Dr }), Ar({ type: "array", names: ["gathered", "gather", "gather*"], props: { numArgs: 0 }, handler(e) { l.contains(["gather", "gather*"], e.envName) && Ir(e); const t = { cols: [{ type: "align", align: "c" }], addJot: !0, colSeparationType: "gather", autoTag: Rr(e.envName), emptySingleRow: !0, leqno: e.parser.settings.leqno }; return Hr(e.parser, t, "display") }, htmlBuilder: Er, mathmlBuilder: Dr }), Ar({ type: "array", names: ["alignat", "alignat*", "alignedat"], props: { numArgs: 1 }, handler: Vr, htmlBuilder: Er, mathmlBuilder: Dr }), Ar({ type: "array", names: ["equation", "equation*"], props: { numArgs: 0 }, handler(e) { Ir(e); const t = { autoTag: Rr(e.envName), emptySingleRow: !0, singleRow: !0, maxNumCols: 1, leqno: e.parser.settings.leqno }; return Hr(e.parser, t, "display") }, htmlBuilder: Er, mathmlBuilder: Dr }), Ar({ type: "array", names: ["CD"], props: { numArgs: 0 }, handler(e) { return Ir(e), function (e) { const t = []; for (e.gullet.beginGroup(), e.gullet.macros.set("\\cr", "\\\\\\relax"), e.gullet.beginGroup(); ;) { t.push(e.parseExpression(!1, "\\\\")), e.gullet.endGroup(), e.gullet.beginGroup(); const r = e.fetch().text; if ("&" !== r && "\\\\" !== r) { if ("\\end" === r) { 0 === t[t.length - 1].length && t.pop(); break } throw new n("Expected \\\\ or \\cr or \\end", e.nextToken) } e.consume() } let r = []; const o = [r]; for (let a = 0; a < t.length; a++) { const l = t[a]; let h = { type: "styling", body: [], mode: "math", style: "display" }; for (let t = 0; t < l.length; t++)if (Ut(l[t])) { r.push(h), t += 1; const o = It(l[t]).text, a = new Array(2); if (a[0] = { type: "ordgroup", mode: "math", body: [] }, a[1] = { type: "ordgroup", mode: "math", body: [] }, "=|.".indexOf(o) > -1); else { if (!("<>AV".indexOf(o) > -1)) throw new n('Expected one of "<>AV=|." after @', l[t]); for (let e = 0; e < 2; e++) { let r = !0; for (let h = t + 1; h < l.length; h++) { if (i = o, ("mathord" === (s = l[h]).type || "atom" === s.type) && s.text === i) { r = !1, t = h; break } if (Ut(l[h])) throw new n("Missing a " + o + " character to complete a CD arrow.", l[h]); a[e].body.push(l[h]) } if (r) throw new n("Missing a " + o + " character to complete a CD arrow.", l[t]) } } const c = { type: "styling", body: [Yt(o, a, e)], mode: "math", style: "display" }; r.push(c), h = { type: "styling", body: [], mode: "math", style: "display" } } else h.body.push(l[t]); a % 2 == 0 ? 
r.push(h) : r.shift(), r = [], o.push(r) } var s, i; return e.gullet.endGroup(), e.gullet.endGroup(), { type: "array", mode: "math", body: o, arraystretch: 1, addJot: !0, rowGaps: [null], cols: new Array(o[0].length).fill({ type: "align", align: "c", pregap: .25, postgap: .25 }), colSeparationType: "CD", hLinesBeforeRow: new Array(o.length + 1).fill([]) } }(e.parser) }, htmlBuilder: Er, mathmlBuilder: Dr }), Br("\\nonumber", "\\gdef\\@eqnsw{0}"), Br("\\notag", "\\nonumber"), je({ type: "text", names: ["\\hline", "\\hdashline"], props: { numArgs: 0, allowedInText: !0, allowedInMath: !0 }, handler(e, t) { throw new n(e.funcName + " valid only within array environment") } }); var Pr = zr; je({ type: "environment", names: ["\\begin", "\\end"], props: { numArgs: 1, argTypes: ["text"] }, handler(e, t) { let { parser: r, funcName: o } = e; const s = t[0]; if ("ordgroup" !== s.type) throw new n("Invalid environment name", s); let i = ""; for (let e = 0; e < s.body.length; ++e)i += qt(s.body[e], "textord").text; if ("\\begin" === o) { if (!Pr.hasOwnProperty(i)) throw new n("No such environment: " + i, s); const e = Pr[i], { args: t, optArgs: o } = r.parseArguments("\\begin{" + i + "}", e), a = { mode: r.mode, envName: i, parser: r }, l = e.handler(a, t, o); r.expect("\\end", !1); const h = r.nextToken, c = qt(r.parseFunction(), "environment"); if (c.name !== i) throw new n("Mismatch: \\begin{" + i + "} matched by \\end{" + c.name + "}", h); return l } return { type: "environment", mode: r.mode, name: i, nameGroup: s } } }); const Fr = (e, t) => { const r = e.font, n = t.withFont(r); return ht(e.body, n) }, Gr = (e, t) => { const r = e.font, n = t.withFont(r); return vt(e.body, n) }, Ur = { "\\Bbb": "\\mathbb", "\\bold": "\\mathbf", "\\frak": "\\mathfrak", "\\bm": "\\boldsymbol" }; je({ type: "font", names: ["\\mathrm", "\\mathit", "\\mathbf", "\\mathnormal", "\\mathbb", "\\mathcal", "\\mathfrak", "\\mathscr", "\\mathsf", "\\mathtt", "\\Bbb", "\\bold", "\\frak"], props: { numArgs: 1, allowedInArgument: !0 }, handler: (e, t) => { let { parser: r, funcName: n } = e; const o = Ze(t[0]); let s = n; return s in Ur && (s = Ur[s]), { type: "font", mode: r.mode, font: s.slice(1), body: o } }, htmlBuilder: Fr, mathmlBuilder: Gr }), je({ type: "mclass", names: ["\\boldsymbol", "\\bm"], props: { numArgs: 1 }, handler: (e, t) => { let { parser: r } = e; const n = t[0], o = l.isCharacterBox(n); return { type: "mclass", mode: r.mode, mclass: Ft(n), body: [{ type: "font", mode: r.mode, font: "boldsymbol", body: n }], isCharacterBox: o } } }), je({ type: "font", names: ["\\rm", "\\sf", "\\tt", "\\bf", "\\it", "\\cal"], props: { numArgs: 0, allowedInText: !0 }, handler: (e, t) => { let { parser: r, funcName: n, breakOnTokenText: o } = e; const { mode: s } = r, i = r.parseExpression(!0, o); return { type: "font", mode: s, font: "math" + n.slice(1), body: { type: "ordgroup", mode: r.mode, body: i } } }, htmlBuilder: Fr, mathmlBuilder: Gr }); const Yr = (e, t) => { let r = t; return "display" === e ? r = r.id >= w.SCRIPT.id ? r.text() : w.DISPLAY : "text" === e && r.size === w.DISPLAY.size ? r = w.TEXT : "script" === e ? r = w.SCRIPT : "scriptscript" === e && (r = w.SCRIPTSCRIPT), r }, Xr = (e, t) => { const r = Yr(e.size, t.style), n = r.fracNum(), o = r.fracDen(); let s; s = t.havingStyle(n); const i = ht(e.numer, s, t); if (e.continued) { const e = 8.5 / t.fontMetrics().ptPerEm, r = 3.5 / t.fontMetrics().ptPerEm; i.height = i.height < e ? e : i.height, i.depth = i.depth < r ? 
r : i.depth } s = t.havingStyle(o); const a = ht(e.denom, s, t); let l, h, c, m, p, u, d, g, f, b; if (e.hasBarLine ? (e.barSize ? (h = P(e.barSize, t), l = Ve.makeLineSpan("frac-line", t, h)) : l = Ve.makeLineSpan("frac-line", t), h = l.height, c = l.height) : (l = null, h = 0, c = t.fontMetrics().defaultRuleThickness), r.size === w.DISPLAY.size || "display" === e.size ? (m = t.fontMetrics().num1, p = h > 0 ? 3 * c : 7 * c, u = t.fontMetrics().denom1) : (h > 0 ? (m = t.fontMetrics().num2, p = c) : (m = t.fontMetrics().num3, p = 3 * c), u = t.fontMetrics().denom2), l) { const e = t.fontMetrics().axisHeight; m - i.depth - (e + .5 * h) < p && (m += p - (m - i.depth - (e + .5 * h))), e - .5 * h - (a.height - u) < p && (u += p - (e - .5 * h - (a.height - u))); const r = -(e - .5 * h); d = Ve.makeVList({ positionType: "individualShift", children: [{ type: "elem", elem: a, shift: u }, { type: "elem", elem: l, shift: r }, { type: "elem", elem: i, shift: -m }] }, t) } else { const e = m - i.depth - (a.height - u); e < p && (m += .5 * (p - e), u += .5 * (p - e)), d = Ve.makeVList({ positionType: "individualShift", children: [{ type: "elem", elem: a, shift: u }, { type: "elem", elem: i, shift: -m }] }, t) } return s = t.havingStyle(r), d.height *= s.sizeMultiplier / t.sizeMultiplier, d.depth *= s.sizeMultiplier / t.sizeMultiplier, g = r.size === w.DISPLAY.size ? t.fontMetrics().delim1 : r.size === w.SCRIPTSCRIPT.size ? t.havingStyle(w.SCRIPT).fontMetrics().delim2 : t.fontMetrics().delim2, f = null == e.leftDelim ? lt(t, ["mopen"]) : yr.customSizedDelim(e.leftDelim, g, !0, t.havingStyle(r), e.mode, ["mopen"]), b = e.continued ? Ve.makeSpan([]) : null == e.rightDelim ? lt(t, ["mclose"]) : yr.customSizedDelim(e.rightDelim, g, !0, t.havingStyle(r), e.mode, ["mclose"]), Ve.makeSpan(["mord"].concat(s.sizingClasses(t)), [f, Ve.makeSpan(["mfrac"], [d]), b], t) }, Wr = (e, t) => { let r = new gt.MathNode("mfrac", [vt(e.numer, t), vt(e.denom, t)]); if (e.hasBarLine) { if (e.barSize) { const n = P(e.barSize, t); r.setAttribute("linethickness", F(n)) } } else r.setAttribute("linethickness", "0px"); const n = Yr(e.size, t.style); if (n.size !== t.style.size) { r = new gt.MathNode("mstyle", [r]); const e = n.size === w.DISPLAY.size ? 
"true" : "false"; r.setAttribute("displaystyle", e), r.setAttribute("scriptlevel", "0") } if (null != e.leftDelim || null != e.rightDelim) { const t = []; if (null != e.leftDelim) { const r = new gt.MathNode("mo", [new gt.TextNode(e.leftDelim.replace("\\", ""))]); r.setAttribute("fence", "true"), t.push(r) } if (t.push(r), null != e.rightDelim) { const r = new gt.MathNode("mo", [new gt.TextNode(e.rightDelim.replace("\\", ""))]); r.setAttribute("fence", "true"), t.push(r) } return bt(t) } return r }; je({ type: "genfrac", names: ["\\dfrac", "\\frac", "\\tfrac", "\\dbinom", "\\binom", "\\tbinom", "\\\\atopfrac", "\\\\bracefrac", "\\\\brackfrac"], props: { numArgs: 2, allowedInArgument: !0 }, handler: (e, t) => { let { parser: r, funcName: n } = e; const o = t[0], s = t[1]; let i, a = null, l = null, h = "auto"; switch (n) { case "\\dfrac": case "\\frac": case "\\tfrac": i = !0; break; case "\\\\atopfrac": i = !1; break; case "\\dbinom": case "\\binom": case "\\tbinom": i = !1, a = "(", l = ")"; break; case "\\\\bracefrac": i = !1, a = "\\{", l = "\\}"; break; case "\\\\brackfrac": i = !1, a = "[", l = "]"; break; default: throw new Error("Unrecognized genfrac command") }switch (n) { case "\\dfrac": case "\\dbinom": h = "display"; break; case "\\tfrac": case "\\tbinom": h = "text" }return { type: "genfrac", mode: r.mode, continued: !1, numer: o, denom: s, hasBarLine: i, leftDelim: a, rightDelim: l, size: h, barSize: null } }, htmlBuilder: Xr, mathmlBuilder: Wr }), je({ type: "genfrac", names: ["\\cfrac"], props: { numArgs: 2 }, handler: (e, t) => { let { parser: r, funcName: n } = e; const o = t[0], s = t[1]; return { type: "genfrac", mode: r.mode, continued: !0, numer: o, denom: s, hasBarLine: !0, leftDelim: null, rightDelim: null, size: "display", barSize: null } } }), je({ type: "infix", names: ["\\over", "\\choose", "\\atop", "\\brace", "\\brack"], props: { numArgs: 0, infix: !0 }, handler(e) { let t, { parser: r, funcName: n, token: o } = e; switch (n) { case "\\over": t = "\\frac"; break; case "\\choose": t = "\\binom"; break; case "\\atop": t = "\\\\atopfrac"; break; case "\\brace": t = "\\\\bracefrac"; break; case "\\brack": t = "\\\\brackfrac"; break; default: throw new Error("Unrecognized infix genfrac command") }return { type: "infix", mode: r.mode, replaceWith: t, token: o } } }); const _r = ["display", "text", "script", "scriptscript"], jr = function (e) { let t = null; return e.length > 0 && (t = e, t = "." === t ? null : t), t }; je({ type: "genfrac", names: ["\\genfrac"], props: { numArgs: 6, allowedInArgument: !0, argTypes: ["math", "math", "size", "text", "math", "math"] }, handler(e, t) { let { parser: r } = e; const n = t[4], o = t[5], s = Ze(t[0]), i = "atom" === s.type && "open" === s.family ? jr(s.text) : null, a = Ze(t[1]), l = "atom" === a.type && "close" === a.family ? jr(a.text) : null, h = qt(t[2], "size"); let c, m = null; h.isBlank ? 
c = !0 : (m = h.value, c = m.number > 0); let p = "auto", u = t[3]; if ("ordgroup" === u.type) { if (u.body.length > 0) { const e = qt(u.body[0], "textord"); p = _r[Number(e.text)] } } else u = qt(u, "textord"), p = _r[Number(u.text)]; return { type: "genfrac", mode: r.mode, numer: n, denom: o, continued: !1, hasBarLine: c, barSize: m, leftDelim: i, rightDelim: l, size: p } }, htmlBuilder: Xr, mathmlBuilder: Wr }), je({ type: "infix", names: ["\\above"], props: { numArgs: 1, argTypes: ["size"], infix: !0 }, handler(e, t) { let { parser: r, funcName: n, token: o } = e; return { type: "infix", mode: r.mode, replaceWith: "\\\\abovefrac", size: qt(t[0], "size").value, token: o } } }), je({ type: "genfrac", names: ["\\\\abovefrac"], props: { numArgs: 3, argTypes: ["math", "size", "math"] }, handler: (e, t) => { let { parser: r, funcName: n } = e; const o = t[0], s = function (e) { if (!e) throw new Error("Expected non-null, but got " + String(e)); return e }(qt(t[1], "infix").size), i = t[2], a = s.number > 0; return { type: "genfrac", mode: r.mode, numer: o, denom: i, continued: !1, hasBarLine: a, barSize: s, leftDelim: null, rightDelim: null, size: "auto" } }, htmlBuilder: Xr, mathmlBuilder: Wr }); const $r = (e, t) => { const r = t.style; let n, o; "supsub" === e.type ? (n = e.sup ? ht(e.sup, t.havingStyle(r.sup()), t) : ht(e.sub, t.havingStyle(r.sub()), t), o = qt(e.base, "horizBrace")) : o = qt(e, "horizBrace"); const s = ht(o.base, t.havingBaseStyle(w.DISPLAY)), i = Nt(o, t); let a; if (o.isOver ? (a = Ve.makeVList({ positionType: "firstBaseline", children: [{ type: "elem", elem: s }, { type: "kern", size: .1 }, { type: "elem", elem: i }] }, t), a.children[0].children[0].children[1].classes.push("svg-align")) : (a = Ve.makeVList({ positionType: "bottom", positionData: s.depth + .1 + i.height, children: [{ type: "elem", elem: i }, { type: "kern", size: .1 }, { type: "elem", elem: s }] }, t), a.children[0].children[0].children[0].classes.push("svg-align")), n) { const e = Ve.makeSpan(["mord", o.isOver ? "mover" : "munder"], [a], t); a = o.isOver ? Ve.makeVList({ positionType: "firstBaseline", children: [{ type: "elem", elem: e }, { type: "kern", size: .2 }, { type: "elem", elem: n }] }, t) : Ve.makeVList({ positionType: "bottom", positionData: e.depth + .2 + n.height + n.depth, children: [{ type: "elem", elem: n }, { type: "kern", size: .2 }, { type: "elem", elem: e }] }, t) } return Ve.makeSpan(["mord", o.isOver ? "mover" : "munder"], [a], t) }; je({ type: "horizBrace", names: ["\\overbrace", "\\underbrace"], props: { numArgs: 1 }, handler(e, t) { let { parser: r, funcName: n } = e; return { type: "horizBrace", mode: r.mode, label: n, isOver: /^\\over/.test(n), base: t[0] } }, htmlBuilder: $r, mathmlBuilder: (e, t) => { const r = Ct(e.label); return new gt.MathNode(e.isOver ? "mover" : "munder", [vt(e.base, t), r]) } }), je({ type: "href", names: ["\\href"], props: { numArgs: 2, argTypes: ["url", "original"], allowedInText: !0 }, handler: (e, t) => { let { parser: r } = e; const n = t[1], o = qt(t[0], "url").url; return r.settings.isTrusted({ command: "\\href", url: o }) ? 
{ type: "href", mode: r.mode, href: o, body: Ke(n) } : r.formatUnsupportedCmd("\\href") }, htmlBuilder: (e, t) => { const r = nt(e.body, t, !1); return Ve.makeAnchor(e.href, [], r, t) }, mathmlBuilder: (e, t) => { let r = wt(e.body, t); return r instanceof ut || (r = new ut("mrow", [r])), r.setAttribute("href", e.href), r } }), je({ type: "href", names: ["\\url"], props: { numArgs: 1, argTypes: ["url"], allowedInText: !0 }, handler: (e, t) => { let { parser: r } = e; const n = qt(t[0], "url").url; if (!r.settings.isTrusted({ command: "\\url", url: n })) return r.formatUnsupportedCmd("\\url"); const o = []; for (let e = 0; e < n.length; e++) { let t = n[e]; "~" === t && (t = "\\textasciitilde"), o.push({ type: "textord", mode: "text", text: t }) } const s = { type: "text", mode: r.mode, font: "\\texttt", body: o }; return { type: "href", mode: r.mode, href: n, body: Ke(s) } } }), je({ type: "hbox", names: ["\\hbox"], props: { numArgs: 1, argTypes: ["text"], allowedInText: !0, primitive: !0 }, handler(e, t) { let { parser: r } = e; return { type: "hbox", mode: r.mode, body: Ke(t[0]) } }, htmlBuilder(e, t) { const r = nt(e.body, t, !1); return Ve.makeFragment(r) }, mathmlBuilder(e, t) { return new gt.MathNode("mrow", xt(e.body, t)) } }), je({ type: "html", names: ["\\htmlClass", "\\htmlId", "\\htmlStyle", "\\htmlData"], props: { numArgs: 2, argTypes: ["raw", "original"], allowedInText: !0 }, handler: (e, t) => { let { parser: r, funcName: o, token: s } = e; const i = qt(t[0], "raw").string, a = t[1]; let l; r.settings.strict && r.settings.reportNonstrict("htmlExtension", "HTML extension is disabled on strict mode"); const h = {}; switch (o) { case "\\htmlClass": h.class = i, l = { command: "\\htmlClass", class: i }; break; case "\\htmlId": h.id = i, l = { command: "\\htmlId", id: i }; break; case "\\htmlStyle": h.style = i, l = { command: "\\htmlStyle", style: i }; break; case "\\htmlData": { const e = i.split(","); for (let t = 0; t < e.length; t++) { const r = e[t].split("="); if (2 !== r.length) throw new n("Error parsing key-value for \\htmlData"); h["data-" + r[0].trim()] = r[1].trim() } l = { command: "\\htmlData", attributes: h }; break } default: throw new Error("Unrecognized html command") }return r.settings.isTrusted(l) ? { type: "html", mode: r.mode, attributes: h, body: Ke(a) } : r.formatUnsupportedCmd(o) }, htmlBuilder: (e, t) => { const r = nt(e.body, t, !1), n = ["enclosing"]; e.attributes.class && n.push(...e.attributes.class.trim().split(/\s+/)); const o = Ve.makeSpan(n, r, t); for (const t in e.attributes) "class" !== t && e.attributes.hasOwnProperty(t) && o.setAttribute(t, e.attributes[t]); return o }, mathmlBuilder: (e, t) => wt(e.body, t) }), je({ type: "htmlmathml", names: ["\\html@mathml"], props: { numArgs: 2, allowedInText: !0 }, handler: (e, t) => { let { parser: r } = e; return { type: "htmlmathml", mode: r.mode, html: Ke(t[0]), mathml: Ke(t[1]) } }, htmlBuilder: (e, t) => { const r = nt(e.html, t, !1); return Ve.makeFragment(r) }, mathmlBuilder: (e, t) => wt(e.mathml, t) }); const Zr = function (e) { if (/^[-+]? *(\d+(\.\d*)?|\.\d+)$/.test(e)) return { number: +e, unit: "bp" }; { const t = /([-+]?) 
*(\d+(?:\.\d*)?|\.\d+) *([a-z]{2})/.exec(e); if (!t) throw new n("Invalid size: '" + e + "' in \\includegraphics"); const r = { number: +(t[1] + t[2]), unit: t[3] }; if (!V(r)) throw new n("Invalid unit: '" + r.unit + "' in \\includegraphics."); return r } }; je({ type: "includegraphics", names: ["\\includegraphics"], props: { numArgs: 1, numOptionalArgs: 1, argTypes: ["raw", "url"], allowedInText: !1 }, handler: (e, t, r) => { let { parser: o } = e, s = { number: 0, unit: "em" }, i = { number: .9, unit: "em" }, a = { number: 0, unit: "em" }, l = ""; if (r[0]) { const e = qt(r[0], "raw").string.split(","); for (let t = 0; t < e.length; t++) { const r = e[t].split("="); if (2 === r.length) { const e = r[1].trim(); switch (r[0].trim()) { case "alt": l = e; break; case "width": s = Zr(e); break; case "height": i = Zr(e); break; case "totalheight": a = Zr(e); break; default: throw new n("Invalid key: '" + r[0] + "' in \\includegraphics.") } } } } const h = qt(t[0], "url").url; return "" === l && (l = h, l = l.replace(/^.*[\\/]/, ""), l = l.substring(0, l.lastIndexOf("."))), o.settings.isTrusted({ command: "\\includegraphics", url: h }) ? { type: "includegraphics", mode: o.mode, alt: l, width: s, height: i, totalheight: a, src: h } : o.formatUnsupportedCmd("\\includegraphics") }, htmlBuilder: (e, t) => { const r = P(e.height, t); let n = 0; e.totalheight.number > 0 && (n = P(e.totalheight, t) - r); let o = 0; e.width.number > 0 && (o = P(e.width, t)); const s = { height: F(r + n) }; o > 0 && (s.width = F(o)), n > 0 && (s.verticalAlign = F(-n)); const i = new j(e.src, e.alt, s); return i.height = r, i.depth = n, i }, mathmlBuilder: (e, t) => { const r = new gt.MathNode("mglyph", []); r.setAttribute("alt", e.alt); const n = P(e.height, t); let o = 0; if (e.totalheight.number > 0 && (o = P(e.totalheight, t) - n, r.setAttribute("valign", F(-o))), r.setAttribute("height", F(n + o)), e.width.number > 0) { const n = P(e.width, t); r.setAttribute("width", F(n)) } return r.setAttribute("src", e.src), r } }), je({ type: "kern", names: ["\\kern", "\\mkern", "\\hskip", "\\mskip"], props: { numArgs: 1, argTypes: ["size"], primitive: !0, allowedInText: !0 }, handler(e, t) { let { parser: r, funcName: n } = e; const o = qt(t[0], "size"); if (r.settings.strict) { const e = "m" === n[1], t = "mu" === o.value.unit; e ? (t || r.settings.reportNonstrict("mathVsTextUnits", "LaTeX's " + n + " supports only mu units, not " + o.value.unit + " units"), "math" !== r.mode && r.settings.reportNonstrict("mathVsTextUnits", "LaTeX's " + n + " works only in math mode")) : t && r.settings.reportNonstrict("mathVsTextUnits", "LaTeX's " + n + " doesn't support mu units") } return { type: "kern", mode: r.mode, dimension: o.value } }, htmlBuilder(e, t) { return Ve.makeGlue(e.dimension, t) }, mathmlBuilder(e, t) { const r = P(e.dimension, t); return new gt.SpaceNode(r) } }), je({ type: "lap", names: ["\\mathllap", "\\mathrlap", "\\mathclap"], props: { numArgs: 1, allowedInText: !0 }, handler: (e, t) => { let { parser: r, funcName: n } = e; const o = t[0]; return { type: "lap", mode: r.mode, alignment: n.slice(5), body: o } }, htmlBuilder: (e, t) => { let r; "clap" === e.alignment ? 
(r = Ve.makeSpan([], [ht(e.body, t)]), r = Ve.makeSpan(["inner"], [r], t)) : r = Ve.makeSpan(["inner"], [ht(e.body, t)]); const n = Ve.makeSpan(["fix"], []); let o = Ve.makeSpan([e.alignment], [r, n], t); const s = Ve.makeSpan(["strut"]); return s.style.height = F(o.height + o.depth), o.depth && (s.style.verticalAlign = F(-o.depth)), o.children.unshift(s), o = Ve.makeSpan(["thinbox"], [o], t), Ve.makeSpan(["mord", "vbox"], [o], t) }, mathmlBuilder: (e, t) => { const r = new gt.MathNode("mpadded", [vt(e.body, t)]); if ("rlap" !== e.alignment) { const t = "llap" === e.alignment ? "-1" : "-0.5"; r.setAttribute("lspace", t + "width") } return r.setAttribute("width", "0px"), r } }), je({ type: "styling", names: ["\\(", "$"], props: { numArgs: 0, allowedInText: !0, allowedInMath: !1 }, handler(e, t) { let { funcName: r, parser: n } = e; const o = n.mode; n.switchMode("math"); const s = "\\(" === r ? "\\)" : "$", i = n.parseExpression(!1, s); return n.expect(s), n.switchMode(o), { type: "styling", mode: n.mode, style: "text", body: i } } }), je({ type: "text", names: ["\\)", "\\]"], props: { numArgs: 0, allowedInText: !0, allowedInMath: !1 }, handler(e, t) { throw new n("Mismatched " + e.funcName) } }); const Kr = (e, t) => { switch (t.style.size) { case w.DISPLAY.size: return e.display; case w.TEXT.size: return e.text; case w.SCRIPT.size: return e.script; case w.SCRIPTSCRIPT.size: return e.scriptscript; default: return e.text } }; je({ type: "mathchoice", names: ["\\mathchoice"], props: { numArgs: 4, primitive: !0 }, handler: (e, t) => { let { parser: r } = e; return { type: "mathchoice", mode: r.mode, display: Ke(t[0]), text: Ke(t[1]), script: Ke(t[2]), scriptscript: Ke(t[3]) } }, htmlBuilder: (e, t) => { const r = Kr(e, t), n = nt(r, t, !1); return Ve.makeFragment(n) }, mathmlBuilder: (e, t) => { const r = Kr(e, t); return wt(r, t) } }); const Jr = (e, t, r, n, o, s, i) => { e = Ve.makeSpan([], [e]); const a = r && l.isCharacterBox(r); let h, c, m; if (t) { const e = ht(t, n.havingStyle(o.sup()), n); c = { elem: e, kern: Math.max(n.fontMetrics().bigOpSpacing1, n.fontMetrics().bigOpSpacing3 - e.depth) } } if (r) { const e = ht(r, n.havingStyle(o.sub()), n); h = { elem: e, kern: Math.max(n.fontMetrics().bigOpSpacing2, n.fontMetrics().bigOpSpacing4 - e.height) } } if (c && h) { const t = n.fontMetrics().bigOpSpacing5 + h.elem.height + h.elem.depth + h.kern + e.depth + i; m = Ve.makeVList({ positionType: "bottom", positionData: t, children: [{ type: "kern", size: n.fontMetrics().bigOpSpacing5 }, { type: "elem", elem: h.elem, marginLeft: F(-s) }, { type: "kern", size: h.kern }, { type: "elem", elem: e }, { type: "kern", size: c.kern }, { type: "elem", elem: c.elem, marginLeft: F(s) }, { type: "kern", size: n.fontMetrics().bigOpSpacing5 }] }, n) } else if (h) { const t = e.height - i; m = Ve.makeVList({ positionType: "top", positionData: t, children: [{ type: "kern", size: n.fontMetrics().bigOpSpacing5 }, { type: "elem", elem: h.elem, marginLeft: F(-s) }, { type: "kern", size: h.kern }, { type: "elem", elem: e }] }, n) } else { if (!c) return e; { const t = e.depth + i; m = Ve.makeVList({ positionType: "bottom", positionData: t, children: [{ type: "elem", elem: e }, { type: "kern", size: c.kern }, { type: "elem", elem: c.elem, marginLeft: F(s) }, { type: "kern", size: n.fontMetrics().bigOpSpacing5 }] }, n) } } const p = [m]; if (h && 0 !== s && !a) { const e = Ve.makeSpan(["mspace"], [], n); e.style.marginRight = F(s), p.unshift(e) } return Ve.makeSpan(["mop", "op-limits"], p, n) }, Qr = 
["\\smallint"], en = (e, t) => { let r, n, o, s = !1; "supsub" === e.type ? (r = e.sup, n = e.sub, o = qt(e.base, "op"), s = !0) : o = qt(e, "op"); const i = t.style; let a, h = !1; if (i.size === w.DISPLAY.size && o.symbol && !l.contains(Qr, o.name) && (h = !0), o.symbol) { const e = h ? "Size2-Regular" : "Size1-Regular"; let r = ""; if ("\\oiint" !== o.name && "\\oiiint" !== o.name || (r = o.name.slice(1), o.name = "oiint" === r ? "\\iint" : "\\iiint"), a = Ve.makeSymbol(o.name, e, "math", t, ["mop", "op-symbol", h ? "large-op" : "small-op"]), r.length > 0) { const e = a.italic, n = Ve.staticSvg(r + "Size" + (h ? "2" : "1"), t); a = Ve.makeVList({ positionType: "individualShift", children: [{ type: "elem", elem: a, shift: 0 }, { type: "elem", elem: n, shift: h ? .08 : 0 }] }, t), o.name = "\\" + r, a.classes.unshift("mop"), a.italic = e } } else if (o.body) { const e = nt(o.body, t, !0); 1 === e.length && e[0] instanceof Z ? (a = e[0], a.classes[0] = "mop") : a = Ve.makeSpan(["mop"], e, t) } else { const e = []; for (let r = 1; r < o.name.length; r++)e.push(Ve.mathsym(o.name[r], o.mode, t)); a = Ve.makeSpan(["mop"], e, t) } let c = 0, m = 0; return (a instanceof Z || "\\oiint" === o.name || "\\oiiint" === o.name) && !o.suppressBaseShift && (c = (a.height - a.depth) / 2 - t.fontMetrics().axisHeight, m = a.italic), s ? Jr(a, r, n, t, i, m, c) : (c && (a.style.position = "relative", a.style.top = F(c)), a) }, tn = (e, t) => { let r; if (e.symbol) r = new ut("mo", [ft(e.name, e.mode)]), l.contains(Qr, e.name) && r.setAttribute("largeop", "false"); else if (e.body) r = new ut("mo", xt(e.body, t)); else { r = new ut("mi", [new dt(e.name.slice(1))]); const t = new ut("mo", [ft("\u2061", "text")]); r = e.parentIsSupSub ? new ut("mrow", [r, t]) : pt([r, t]) } return r }, rn = { "\u220f": "\\prod", "\u2210": "\\coprod", "\u2211": "\\sum", "\u22c0": "\\bigwedge", "\u22c1": "\\bigvee", "\u22c2": "\\bigcap", "\u22c3": "\\bigcup", "\u2a00": "\\bigodot", "\u2a01": "\\bigoplus", "\u2a02": "\\bigotimes", "\u2a04": "\\biguplus", "\u2a06": "\\bigsqcup" }; je({ type: "op", names: ["\\coprod", "\\bigvee", "\\bigwedge", "\\biguplus", "\\bigcap", "\\bigcup", "\\intop", "\\prod", "\\sum", "\\bigotimes", "\\bigoplus", "\\bigodot", "\\bigsqcup", "\\smallint", "\u220f", "\u2210", "\u2211", "\u22c0", "\u22c1", "\u22c2", "\u22c3", "\u2a00", "\u2a01", "\u2a02", "\u2a04", "\u2a06"], props: { numArgs: 0 }, handler: (e, t) => { let { parser: r, funcName: n } = e, o = n; return 1 === o.length && (o = rn[o]), { type: "op", mode: r.mode, limits: !0, parentIsSupSub: !1, symbol: !0, name: o } }, htmlBuilder: en, mathmlBuilder: tn }), je({ type: "op", names: ["\\mathop"], props: { numArgs: 1, primitive: !0 }, handler: (e, t) => { let { parser: r } = e; const n = t[0]; return { type: "op", mode: r.mode, limits: !1, parentIsSupSub: !1, symbol: !1, body: Ke(n) } }, htmlBuilder: en, mathmlBuilder: tn }); const nn = { "\u222b": "\\int", "\u222c": "\\iint", "\u222d": "\\iiint", "\u222e": "\\oint", "\u222f": "\\oiint", "\u2230": "\\oiiint" }; je({ type: "op", names: ["\\arcsin", "\\arccos", "\\arctan", "\\arctg", "\\arcctg", "\\arg", "\\ch", "\\cos", "\\cosec", "\\cosh", "\\cot", "\\cotg", "\\coth", "\\csc", "\\ctg", "\\cth", "\\deg", "\\dim", "\\exp", "\\hom", "\\ker", "\\lg", "\\ln", "\\log", "\\sec", "\\sin", "\\sinh", "\\sh", "\\tan", "\\tanh", "\\tg", "\\th"], props: { numArgs: 0 }, handler(e) { let { parser: t, funcName: r } = e; return { type: "op", mode: t.mode, limits: !1, parentIsSupSub: !1, symbol: !1, name: r } }, 
htmlBuilder: en, mathmlBuilder: tn }), je({ type: "op", names: ["\\det", "\\gcd", "\\inf", "\\lim", "\\max", "\\min", "\\Pr", "\\sup"], props: { numArgs: 0 }, handler(e) { let { parser: t, funcName: r } = e; return { type: "op", mode: t.mode, limits: !0, parentIsSupSub: !1, symbol: !1, name: r } }, htmlBuilder: en, mathmlBuilder: tn }), je({ type: "op", names: ["\\int", "\\iint", "\\iiint", "\\oint", "\\oiint", "\\oiiint", "\u222b", "\u222c", "\u222d", "\u222e", "\u222f", "\u2230"], props: { numArgs: 0 }, handler(e) { let { parser: t, funcName: r } = e, n = r; return 1 === n.length && (n = nn[n]), { type: "op", mode: t.mode, limits: !1, parentIsSupSub: !1, symbol: !0, name: n } }, htmlBuilder: en, mathmlBuilder: tn }); const on = (e, t) => { let r, n, o, s, i = !1; if ("supsub" === e.type ? (r = e.sup, n = e.sub, o = qt(e.base, "operatorname"), i = !0) : o = qt(e, "operatorname"), o.body.length > 0) { const e = o.body.map((e => { const t = e.text; return "string" == typeof t ? { type: "textord", mode: e.mode, text: t } : e })), r = nt(e, t.withFont("mathrm"), !0); for (let e = 0; e < r.length; e++) { const t = r[e]; t instanceof Z && (t.text = t.text.replace(/\u2212/, "-").replace(/\u2217/, "*")) } s = Ve.makeSpan(["mop"], r, t) } else s = Ve.makeSpan(["mop"], [], t); return i ? Jr(s, r, n, t, t.style, 0, 0) : s }; function sn(e, t, r) { const n = nt(e, t, !1), o = t.sizeMultiplier / r.sizeMultiplier; for (let e = 0; e < n.length; e++) { const s = n[e].classes.indexOf("sizing"); s < 0 ? Array.prototype.push.apply(n[e].classes, t.sizingClasses(r)) : n[e].classes[s + 1] === "reset-size" + t.size && (n[e].classes[s + 1] = "reset-size" + r.size), n[e].height *= o, n[e].depth *= o } return Ve.makeFragment(n) } je({ type: "operatorname", names: ["\\operatorname@", "\\operatornamewithlimits"], props: { numArgs: 1 }, handler: (e, t) => { let { parser: r, funcName: n } = e; const o = t[0]; return { type: "operatorname", mode: r.mode, body: Ke(o), alwaysHandleSupSub: "\\operatornamewithlimits" === n, limits: !1, parentIsSupSub: !1 } }, htmlBuilder: on, mathmlBuilder: (e, t) => { let r = xt(e.body, t.withFont("mathrm")), n = !0; for (let e = 0; e < r.length; e++) { const t = r[e]; if (t instanceof gt.SpaceNode); else if (t instanceof gt.MathNode) switch (t.type) { case "mi": case "mn": case "ms": case "mspace": case "mtext": break; case "mo": { const e = t.children[0]; 1 === t.children.length && e instanceof gt.TextNode ? e.text = e.text.replace(/\u2212/, "-").replace(/\u2217/, "*") : n = !1; break } default: n = !1 } else n = !1 } if (n) { const e = r.map((e => e.toText())).join(""); r = [new gt.TextNode(e)] } const o = new gt.MathNode("mi", r); o.setAttribute("mathvariant", "normal"); const s = new gt.MathNode("mo", [ft("\u2061", "text")]); return e.parentIsSupSub ? new gt.MathNode("mrow", [o, s]) : gt.newDocumentFragment([o, s]) } }), Br("\\operatorname", "\\@ifstar\\operatornamewithlimits\\operatorname@"), $e({ type: "ordgroup", htmlBuilder(e, t) { return e.semisimple ? 
Ve.makeFragment(nt(e.body, t, !1)) : Ve.makeSpan(["mord"], nt(e.body, t, !0), t) }, mathmlBuilder(e, t) { return wt(e.body, t, !0) } }), je({ type: "overline", names: ["\\overline"], props: { numArgs: 1 }, handler(e, t) { let { parser: r } = e; const n = t[0]; return { type: "overline", mode: r.mode, body: n } }, htmlBuilder(e, t) { const r = ht(e.body, t.havingCrampedStyle()), n = Ve.makeLineSpan("overline-line", t), o = t.fontMetrics().defaultRuleThickness, s = Ve.makeVList({ positionType: "firstBaseline", children: [{ type: "elem", elem: r }, { type: "kern", size: 3 * o }, { type: "elem", elem: n }, { type: "kern", size: o }] }, t); return Ve.makeSpan(["mord", "overline"], [s], t) }, mathmlBuilder(e, t) { const r = new gt.MathNode("mo", [new gt.TextNode("\u203e")]); r.setAttribute("stretchy", "true"); const n = new gt.MathNode("mover", [vt(e.body, t), r]); return n.setAttribute("accent", "true"), n } }), je({ type: "phantom", names: ["\\phantom"], props: { numArgs: 1, allowedInText: !0 }, handler: (e, t) => { let { parser: r } = e; const n = t[0]; return { type: "phantom", mode: r.mode, body: Ke(n) } }, htmlBuilder: (e, t) => { const r = nt(e.body, t.withPhantom(), !1); return Ve.makeFragment(r) }, mathmlBuilder: (e, t) => { const r = xt(e.body, t); return new gt.MathNode("mphantom", r) } }), je({ type: "hphantom", names: ["\\hphantom"], props: { numArgs: 1, allowedInText: !0 }, handler: (e, t) => { let { parser: r } = e; const n = t[0]; return { type: "hphantom", mode: r.mode, body: n } }, htmlBuilder: (e, t) => { let r = Ve.makeSpan([], [ht(e.body, t.withPhantom())]); if (r.height = 0, r.depth = 0, r.children) for (let e = 0; e < r.children.length; e++)r.children[e].height = 0, r.children[e].depth = 0; return r = Ve.makeVList({ positionType: "firstBaseline", children: [{ type: "elem", elem: r }] }, t), Ve.makeSpan(["mord"], [r], t) }, mathmlBuilder: (e, t) => { const r = xt(Ke(e.body), t), n = new gt.MathNode("mphantom", r), o = new gt.MathNode("mpadded", [n]); return o.setAttribute("height", "0px"), o.setAttribute("depth", "0px"), o } }), je({ type: "vphantom", names: ["\\vphantom"], props: { numArgs: 1, allowedInText: !0 }, handler: (e, t) => { let { parser: r } = e; const n = t[0]; return { type: "vphantom", mode: r.mode, body: n } }, htmlBuilder: (e, t) => { const r = Ve.makeSpan(["inner"], [ht(e.body, t.withPhantom())]), n = Ve.makeSpan(["fix"], []); return Ve.makeSpan(["mord", "rlap"], [r, n], t) }, mathmlBuilder: (e, t) => { const r = xt(Ke(e.body), t), n = new gt.MathNode("mphantom", r), o = new gt.MathNode("mpadded", [n]); return o.setAttribute("width", "0px"), o } }), je({ type: "raisebox", names: ["\\raisebox"], props: { numArgs: 2, argTypes: ["size", "hbox"], allowedInText: !0 }, handler(e, t) { let { parser: r } = e; const n = qt(t[0], "size").value, o = t[1]; return { type: "raisebox", mode: r.mode, dy: n, body: o } }, htmlBuilder(e, t) { const r = ht(e.body, t), n = P(e.dy, t); return Ve.makeVList({ positionType: "shift", positionData: -n, children: [{ type: "elem", elem: r }] }, t) }, mathmlBuilder(e, t) { const r = new gt.MathNode("mpadded", [vt(e.body, t)]), n = e.dy.number + e.dy.unit; return r.setAttribute("voffset", n), r } }), je({ type: "internal", names: ["\\relax"], props: { numArgs: 0, allowedInText: !0 }, handler(e) { let { parser: t } = e; return { type: "internal", mode: t.mode } } }), je({ type: "rule", names: ["\\rule"], props: { numArgs: 2, numOptionalArgs: 1, argTypes: ["size", "size", "size"] }, handler(e, t, r) { let { parser: n } = e; const o = 
r[0], s = qt(t[0], "size"), i = qt(t[1], "size"); return { type: "rule", mode: n.mode, shift: o && qt(o, "size").value, width: s.value, height: i.value } }, htmlBuilder(e, t) { const r = Ve.makeSpan(["mord", "rule"], [], t), n = P(e.width, t), o = P(e.height, t), s = e.shift ? P(e.shift, t) : 0; return r.style.borderRightWidth = F(n), r.style.borderTopWidth = F(o), r.style.bottom = F(s), r.width = n, r.height = o + s, r.depth = -s, r.maxFontSize = 1.125 * o * t.sizeMultiplier, r }, mathmlBuilder(e, t) { const r = P(e.width, t), n = P(e.height, t), o = e.shift ? P(e.shift, t) : 0, s = t.color && t.getColor() || "black", i = new gt.MathNode("mspace"); i.setAttribute("mathbackground", s), i.setAttribute("width", F(r)), i.setAttribute("height", F(n)); const a = new gt.MathNode("mpadded", [i]); return o >= 0 ? a.setAttribute("height", F(o)) : (a.setAttribute("height", F(o)), a.setAttribute("depth", F(-o))), a.setAttribute("voffset", F(o)), a } }); const an = ["\\tiny", "\\sixptsize", "\\scriptsize", "\\footnotesize", "\\small", "\\normalsize", "\\large", "\\Large", "\\LARGE", "\\huge", "\\Huge"]; je({ type: "sizing", names: an, props: { numArgs: 0, allowedInText: !0 }, handler: (e, t) => { let { breakOnTokenText: r, funcName: n, parser: o } = e; const s = o.parseExpression(!1, r); return { type: "sizing", mode: o.mode, size: an.indexOf(n) + 1, body: s } }, htmlBuilder: (e, t) => { const r = t.havingSize(e.size); return sn(e.body, r, t) }, mathmlBuilder: (e, t) => { const r = t.havingSize(e.size), n = xt(e.body, r), o = new gt.MathNode("mstyle", n); return o.setAttribute("mathsize", F(r.sizeMultiplier)), o } }), je({ type: "smash", names: ["\\smash"], props: { numArgs: 1, numOptionalArgs: 1, allowedInText: !0 }, handler: (e, t, r) => { let { parser: n } = e, o = !1, s = !1; const i = r[0] && qt(r[0], "ordgroup"); if (i) { let e = ""; for (let t = 0; t < i.body.length; ++t) { if (e = i.body[t].text, "t" === e) o = !0; else { if ("b" !== e) { o = !1, s = !1; break } s = !0 } } } else o = !0, s = !0; const a = t[0]; return { type: "smash", mode: n.mode, body: a, smashHeight: o, smashDepth: s } }, htmlBuilder: (e, t) => { const r = Ve.makeSpan([], [ht(e.body, t)]); if (!e.smashHeight && !e.smashDepth) return r; if (e.smashHeight && (r.height = 0, r.children)) for (let e = 0; e < r.children.length; e++)r.children[e].height = 0; if (e.smashDepth && (r.depth = 0, r.children)) for (let e = 0; e < r.children.length; e++)r.children[e].depth = 0; const n = Ve.makeVList({ positionType: "firstBaseline", children: [{ type: "elem", elem: r }] }, t); return Ve.makeSpan(["mord"], [n], t) }, mathmlBuilder: (e, t) => { const r = new gt.MathNode("mpadded", [vt(e.body, t)]); return e.smashHeight && r.setAttribute("height", "0px"), e.smashDepth && r.setAttribute("depth", "0px"), r } }), je({ type: "sqrt", names: ["\\sqrt"], props: { numArgs: 1, numOptionalArgs: 1 }, handler(e, t, r) { let { parser: n } = e; const o = r[0], s = t[0]; return { type: "sqrt", mode: n.mode, body: s, index: o } }, htmlBuilder(e, t) { let r = ht(e.body, t.havingCrampedStyle()); 0 === r.height && (r.height = t.fontMetrics().xHeight), r = Ve.wrapFragment(r, t); const n = t.fontMetrics().defaultRuleThickness; let o = n; t.style.id < w.TEXT.id && (o = t.fontMetrics().xHeight); let s = n + o / 4; const i = r.height + r.depth + s + n, { span: a, ruleWidth: l, advanceWidth: h } = yr.sqrtImage(i, t), c = a.height - l; c > r.height + r.depth + s && (s = (s + c - r.height - r.depth) / 2); const m = a.height - r.height - s - l; r.style.paddingLeft = 
F(h); const p = Ve.makeVList({ positionType: "firstBaseline", children: [{ type: "elem", elem: r, wrapperClasses: ["svg-align"] }, { type: "kern", size: -(r.height + m) }, { type: "elem", elem: a }, { type: "kern", size: l }] }, t); if (e.index) { const r = t.havingStyle(w.SCRIPTSCRIPT), n = ht(e.index, r, t), o = .6 * (p.height - p.depth), s = Ve.makeVList({ positionType: "shift", positionData: -o, children: [{ type: "elem", elem: n }] }, t), i = Ve.makeSpan(["root"], [s]); return Ve.makeSpan(["mord", "sqrt"], [i, p], t) } return Ve.makeSpan(["mord", "sqrt"], [p], t) }, mathmlBuilder(e, t) { const { body: r, index: n } = e; return n ? new gt.MathNode("mroot", [vt(r, t), vt(n, t)]) : new gt.MathNode("msqrt", [vt(r, t)]) } }); const ln = { display: w.DISPLAY, text: w.TEXT, script: w.SCRIPT, scriptscript: w.SCRIPTSCRIPT }; je({ type: "styling", names: ["\\displaystyle", "\\textstyle", "\\scriptstyle", "\\scriptscriptstyle"], props: { numArgs: 0, allowedInText: !0, primitive: !0 }, handler(e, t) { let { breakOnTokenText: r, funcName: n, parser: o } = e; const s = o.parseExpression(!0, r), i = n.slice(1, n.length - 5); return { type: "styling", mode: o.mode, style: i, body: s } }, htmlBuilder(e, t) { const r = ln[e.style], n = t.havingStyle(r).withFont(""); return sn(e.body, n, t) }, mathmlBuilder(e, t) { const r = ln[e.style], n = t.havingStyle(r), o = xt(e.body, n), s = new gt.MathNode("mstyle", o), i = { display: ["0", "true"], text: ["0", "false"], script: ["1", "false"], scriptscript: ["2", "false"] }[e.style]; return s.setAttribute("scriptlevel", i[0]), s.setAttribute("displaystyle", i[1]), s } }); $e({ type: "supsub", htmlBuilder(e, t) { const r = function (e, t) { const r = e.base; if (r) return "op" === r.type ? r.limits && (t.style.size === w.DISPLAY.size || r.alwaysHandleSupSub) ? en : null : "operatorname" === r.type ? r.alwaysHandleSupSub && (t.style.size === w.DISPLAY.size || r.limits) ? on : null : "accent" === r.type ? l.isCharacterBox(r.base) ? Ht : null : "horizBrace" === r.type && !e.sub === r.isOver ? $r : null; return null }(e, t); if (r) return r(e, t); const { base: n, sup: o, sub: s } = e, i = ht(n, t); let a, h; const c = t.fontMetrics(); let m = 0, p = 0; const u = n && l.isCharacterBox(n); if (o) { const e = t.havingStyle(t.style.sup()); a = ht(o, e, t), u || (m = i.height - e.fontMetrics().supDrop * e.sizeMultiplier / t.sizeMultiplier) } if (s) { const e = t.havingStyle(t.style.sub()); h = ht(s, e, t), u || (p = i.depth + e.fontMetrics().subDrop * e.sizeMultiplier / t.sizeMultiplier) } let d; d = t.style === w.DISPLAY ? c.sup1 : t.style.cramped ? 
c.sup3 : c.sup2; const g = t.sizeMultiplier, f = F(.5 / c.ptPerEm / g); let b, y = null; if (h) { const t = e.base && "op" === e.base.type && e.base.name && ("\\oiint" === e.base.name || "\\oiiint" === e.base.name); (i instanceof Z || t) && (y = F(-i.italic)) } if (a && h) { m = Math.max(m, d, a.depth + .25 * c.xHeight), p = Math.max(p, c.sub2); const e = 4 * c.defaultRuleThickness; if (m - a.depth - (h.height - p) < e) { p = e - (m - a.depth) + h.height; const t = .8 * c.xHeight - (m - a.depth); t > 0 && (m += t, p -= t) } const r = [{ type: "elem", elem: h, shift: p, marginRight: f, marginLeft: y }, { type: "elem", elem: a, shift: -m, marginRight: f }]; b = Ve.makeVList({ positionType: "individualShift", children: r }, t) } else if (h) { p = Math.max(p, c.sub1, h.height - .8 * c.xHeight); const e = [{ type: "elem", elem: h, marginLeft: y, marginRight: f }]; b = Ve.makeVList({ positionType: "shift", positionData: p, children: e }, t) } else { if (!a) throw new Error("supsub must have either sup or sub."); m = Math.max(m, d, a.depth + .25 * c.xHeight), b = Ve.makeVList({ positionType: "shift", positionData: -m, children: [{ type: "elem", elem: a, marginRight: f }] }, t) } const x = at(i, "right") || "mord"; return Ve.makeSpan([x], [i, Ve.makeSpan(["msupsub"], [b])], t) }, mathmlBuilder(e, t) { let r, n, o = !1; e.base && "horizBrace" === e.base.type && (n = !!e.sup, n === e.base.isOver && (o = !0, r = e.base.isOver)), !e.base || "op" !== e.base.type && "operatorname" !== e.base.type || (e.base.parentIsSupSub = !0); const s = [vt(e.base, t)]; let i; if (e.sub && s.push(vt(e.sub, t)), e.sup && s.push(vt(e.sup, t)), o) i = r ? "mover" : "munder"; else if (e.sub) if (e.sup) { const r = e.base; i = r && "op" === r.type && r.limits && t.style === w.DISPLAY || r && "operatorname" === r.type && r.alwaysHandleSupSub && (t.style === w.DISPLAY || r.limits) ? "munderover" : "msubsup" } else { const r = e.base; i = r && "op" === r.type && r.limits && (t.style === w.DISPLAY || r.alwaysHandleSupSub) || r && "operatorname" === r.type && r.alwaysHandleSupSub && (r.limits || t.style === w.DISPLAY) ? "munder" : "msub" } else { const r = e.base; i = r && "op" === r.type && r.limits && (t.style === w.DISPLAY || r.alwaysHandleSupSub) || r && "operatorname" === r.type && r.alwaysHandleSupSub && (r.limits || t.style === w.DISPLAY) ? "mover" : "msup" } return new gt.MathNode(i, s) } }), $e({ type: "atom", htmlBuilder(e, t) { return Ve.mathsym(e.text, e.mode, t, ["m" + e.family]) }, mathmlBuilder(e, t) { const r = new gt.MathNode("mo", [ft(e.text, e.mode)]); if ("bin" === e.family) { const n = yt(e, t); "bold-italic" === n && r.setAttribute("mathvariant", n) } else "punct" === e.family ? r.setAttribute("separator", "true") : "open" !== e.family && "close" !== e.family || r.setAttribute("stretchy", "false"); return r } }); const hn = { mi: "italic", mn: "normal", mtext: "normal" }; $e({ type: "mathord", htmlBuilder(e, t) { return Ve.makeOrd(e, t, "mathord") }, mathmlBuilder(e, t) { const r = new gt.MathNode("mi", [ft(e.text, e.mode, t)]), n = yt(e, t) || "italic"; return n !== hn[r.type] && r.setAttribute("mathvariant", n), r } }), $e({ type: "textord", htmlBuilder(e, t) { return Ve.makeOrd(e, t, "textord") }, mathmlBuilder(e, t) { const r = ft(e.text, e.mode, t), n = yt(e, t) || "normal"; let o; return o = "text" === e.mode ? new gt.MathNode("mtext", [r]) : /[0-9]/.test(e.text) ? new gt.MathNode("mn", [r]) : "\\prime" === e.text ? 
new gt.MathNode("mo", [r]) : new gt.MathNode("mi", [r]), n !== hn[o.type] && o.setAttribute("mathvariant", n), o } }); const cn = { "\\nobreak": "nobreak", "\\allowbreak": "allowbreak" }, mn = { " ": {}, "\\ ": {}, "~": { className: "nobreak" }, "\\space": {}, "\\nobreakspace": { className: "nobreak" } }; $e({ type: "spacing", htmlBuilder(e, t) { if (mn.hasOwnProperty(e.text)) { const r = mn[e.text].className || ""; if ("text" === e.mode) { const n = Ve.makeOrd(e, t, "textord"); return n.classes.push(r), n } return Ve.makeSpan(["mspace", r], [Ve.mathsym(e.text, e.mode, t)], t) } if (cn.hasOwnProperty(e.text)) return Ve.makeSpan(["mspace", cn[e.text]], [], t); throw new n('Unknown type of space "' + e.text + '"') }, mathmlBuilder(e, t) { let r; if (!mn.hasOwnProperty(e.text)) { if (cn.hasOwnProperty(e.text)) return new gt.MathNode("mspace"); throw new n('Unknown type of space "' + e.text + '"') } return r = new gt.MathNode("mtext", [new gt.TextNode("\xa0")]), r } }); const pn = () => { const e = new gt.MathNode("mtd", []); return e.setAttribute("width", "50%"), e }; $e({ type: "tag", mathmlBuilder(e, t) { const r = new gt.MathNode("mtable", [new gt.MathNode("mtr", [pn(), new gt.MathNode("mtd", [wt(e.body, t)]), pn(), new gt.MathNode("mtd", [wt(e.tag, t)])])]); return r.setAttribute("width", "100%"), r } }); const un = { "\\text": void 0, "\\textrm": "textrm", "\\textsf": "textsf", "\\texttt": "texttt", "\\textnormal": "textrm" }, dn = { "\\textbf": "textbf", "\\textmd": "textmd" }, gn = { "\\textit": "textit", "\\textup": "textup" }, fn = (e, t) => { const r = e.font; return r ? un[r] ? t.withTextFontFamily(un[r]) : dn[r] ? t.withTextFontWeight(dn[r]) : "\\emph" === r ? "textit" === t.fontShape ? t.withTextFontShape("textup") : t.withTextFontShape("textit") : t.withTextFontShape(gn[r]) : t }; je({ type: "text", names: ["\\text", "\\textrm", "\\textsf", "\\texttt", "\\textnormal", "\\textbf", "\\textmd", "\\textit", "\\textup", "\\emph"], props: { numArgs: 1, argTypes: ["text"], allowedInArgument: !0, allowedInText: !0 }, handler(e, t) { let { parser: r, funcName: n } = e; const o = t[0]; return { type: "text", mode: r.mode, body: Ke(o), font: n } }, htmlBuilder(e, t) { const r = fn(e, t), n = nt(e.body, r, !0); return Ve.makeSpan(["mord", "text"], n, r) }, mathmlBuilder(e, t) { const r = fn(e, t); return wt(e.body, r) } }), je({ type: "underline", names: ["\\underline"], props: { numArgs: 1, allowedInText: !0 }, handler(e, t) { let { parser: r } = e; return { type: "underline", mode: r.mode, body: t[0] } }, htmlBuilder(e, t) { const r = ht(e.body, t), n = Ve.makeLineSpan("underline-line", t), o = t.fontMetrics().defaultRuleThickness, s = Ve.makeVList({ positionType: "top", positionData: r.height, children: [{ type: "kern", size: o }, { type: "elem", elem: n }, { type: "kern", size: 3 * o }, { type: "elem", elem: r }] }, t); return Ve.makeSpan(["mord", "underline"], [s], t) }, mathmlBuilder(e, t) { const r = new gt.MathNode("mo", [new gt.TextNode("\u203e")]); r.setAttribute("stretchy", "true"); const n = new gt.MathNode("munder", [vt(e.body, t), r]); return n.setAttribute("accentunder", "true"), n } }), je({ type: "vcenter", names: ["\\vcenter"], props: { numArgs: 1, argTypes: ["original"], allowedInText: !1 }, handler(e, t) { let { parser: r } = e; return { type: "vcenter", mode: r.mode, body: t[0] } }, htmlBuilder(e, t) { const r = ht(e.body, t), n = t.fontMetrics().axisHeight, o = .5 * (r.height - n - (r.depth + n)); return Ve.makeVList({ positionType: "shift", positionData: o, children: 
[{ type: "elem", elem: r }] }, t) }, mathmlBuilder(e, t) { return new gt.MathNode("mpadded", [vt(e.body, t)], ["vcenter"]) } }), je({ type: "verb", names: ["\\verb"], props: { numArgs: 0, allowedInText: !0 }, handler(e, t, r) { throw new n("\\verb ended by end of line instead of matching delimiter") }, htmlBuilder(e, t) { const r = bn(e), n = [], o = t.havingStyle(t.style.text()); for (let t = 0; t < r.length; t++) { let s = r[t]; "~" === s && (s = "\\textasciitilde"), n.push(Ve.makeSymbol(s, "Typewriter-Regular", e.mode, o, ["mord", "texttt"])) } return Ve.makeSpan(["mord", "text"].concat(o.sizingClasses(t)), Ve.tryCombineChars(n), o) }, mathmlBuilder(e, t) { const r = new gt.TextNode(bn(e)), n = new gt.MathNode("mtext", [r]); return n.setAttribute("mathvariant", "monospace"), n } }); const bn = e => e.body.replace(/ /g, e.star ? "\u2423" : "\xa0"); var yn = Xe; const xn = "[ \r\n\t]", wn = "(\\\\[a-zA-Z@]+)" + xn + "*", vn = "[\u0300-\u036f]", kn = new RegExp(vn + "+$"), Sn = "(" + xn + "+)|\\\\(\n|[ \r\t]+\n?)[ \r\t]*|([!-\\[\\]-\u2027\u202a-\ud7ff\uf900-\uffff]" + vn + "*|[\ud800-\udbff][\udc00-\udfff]" + vn + "*|\\\\verb\\*([^]).*?\\4|\\\\verb([^*a-zA-Z]).*?\\5|" + wn + "|\\\\[^\ud800-\udfff])"; class Mn { constructor(e, t) { this.input = void 0, this.settings = void 0, this.tokenRegex = void 0, this.catcodes = void 0, this.input = e, this.settings = t, this.tokenRegex = new RegExp(Sn, "g"), this.catcodes = { "%": 14, "~": 13 } } setCatcode(e, t) { this.catcodes[e] = t } lex() { const e = this.input, t = this.tokenRegex.lastIndex; if (t === e.length) return new Nr("EOF", new Cr(this, t, t)); const r = this.tokenRegex.exec(e); if (null === r || r.index !== t) throw new n("Unexpected character: '" + e[t] + "'", new Nr(e[t], new Cr(this, t, t + 1))); const o = r[6] || r[3] || (r[2] ? "\\ " : " "); if (14 === this.catcodes[o]) { const t = e.indexOf("\n", this.tokenRegex.lastIndex); return -1 === t ? (this.tokenRegex.lastIndex = e.length, this.settings.reportNonstrict("commentAtEnd", "% comment has no terminating newline; LaTeX would fail because of commenting the end of math mode (e.g. $)")) : this.tokenRegex.lastIndex = t + 1, this.lex() } return new Nr(o, new Cr(this, t, this.tokenRegex.lastIndex)) } } class zn { constructor(e, t) { void 0 === e && (e = {}), void 0 === t && (t = {}), this.current = void 0, this.builtins = void 0, this.undefStack = void 0, this.current = t, this.builtins = e, this.undefStack = [] } beginGroup() { this.undefStack.push({}) } endGroup() { if (0 === this.undefStack.length) throw new n("Unbalanced namespace destruction: attempt to pop global namespace; please report this as a bug"); const e = this.undefStack.pop(); for (const t in e) e.hasOwnProperty(t) && (null == e[t] ? delete this.current[t] : this.current[t] = e[t]) } endGroups() { for (; this.undefStack.length > 0;)this.endGroup() } has(e) { return this.current.hasOwnProperty(e) || this.builtins.hasOwnProperty(e) } get(e) { return this.current.hasOwnProperty(e) ? this.current[e] : this.builtins[e] } set(e, t, r) { if (void 0 === r && (r = !1), r) { for (let t = 0; t < this.undefStack.length; t++)delete this.undefStack[t][e]; this.undefStack.length > 0 && (this.undefStack[this.undefStack.length - 1][e] = t) } else { const t = this.undefStack[this.undefStack.length - 1]; t && !t.hasOwnProperty(e) && (t[e] = this.current[e]) } null == t ? 
delete this.current[e] : this.current[e] = t } } var An = Tr; Br("\\noexpand", (function (e) { const t = e.popToken(); return e.isExpandable(t.text) && (t.noexpand = !0, t.treatAsRelax = !0), { tokens: [t], numArgs: 0 } })), Br("\\expandafter", (function (e) { const t = e.popToken(); return e.expandOnce(!0), { tokens: [t], numArgs: 0 } })), Br("\\@firstoftwo", (function (e) { return { tokens: e.consumeArgs(2)[0], numArgs: 0 } })), Br("\\@secondoftwo", (function (e) { return { tokens: e.consumeArgs(2)[1], numArgs: 0 } })), Br("\\@ifnextchar", (function (e) { const t = e.consumeArgs(3); e.consumeSpaces(); const r = e.future(); return 1 === t[0].length && t[0][0].text === r.text ? { tokens: t[1], numArgs: 0 } : { tokens: t[2], numArgs: 0 } })), Br("\\@ifstar", "\\@ifnextchar *{\\@firstoftwo{#1}}"), Br("\\TextOrMath", (function (e) { const t = e.consumeArgs(2); return "text" === e.mode ? { tokens: t[0], numArgs: 0 } : { tokens: t[1], numArgs: 0 } })); const Tn = { 0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, a: 10, A: 10, b: 11, B: 11, c: 12, C: 12, d: 13, D: 13, e: 14, E: 14, f: 15, F: 15 }; Br("\\char", (function (e) { let t, r = e.popToken(), o = ""; if ("'" === r.text) t = 8, r = e.popToken(); else if ('"' === r.text) t = 16, r = e.popToken(); else if ("`" === r.text) if (r = e.popToken(), "\\" === r.text[0]) o = r.text.charCodeAt(1); else { if ("EOF" === r.text) throw new n("\\char` missing argument"); o = r.text.charCodeAt(0) } else t = 10; if (t) { if (o = Tn[r.text], null == o || o >= t) throw new n("Invalid base-" + t + " digit " + r.text); let s; for (; null != (s = Tn[e.future().text]) && s < t;)o *= t, o += s, e.popToken() } return "\\@char{" + o + "}" })); const Bn = (e, t, r) => { let o = e.consumeArg().tokens; if (1 !== o.length) throw new n("\\newcommand's first argument must be a macro name"); const s = o[0].text, i = e.isDefined(s); if (i && !t) throw new n("\\newcommand{" + s + "} attempting to redefine " + s + "; use \\renewcommand"); if (!i && !r) throw new n("\\renewcommand{" + s + "} when command " + s + " does not yet exist; use \\newcommand"); let a = 0; if (o = e.consumeArg().tokens, 1 === o.length && "[" === o[0].text) { let t = "", r = e.expandNextToken(); for (; "]" !== r.text && "EOF" !== r.text;)t += r.text, r = e.expandNextToken(); if (!t.match(/^\s*[0-9]+\s*$/)) throw new n("Invalid number of arguments: " + t); a = parseInt(t), o = e.consumeArg().tokens } return e.macros.set(s, { tokens: o, numArgs: a }), "" }; Br("\\newcommand", (e => Bn(e, !1, !0))), Br("\\renewcommand", (e => Bn(e, !0, !1))), Br("\\providecommand", (e => Bn(e, !0, !0))), Br("\\message", (e => { const t = e.consumeArgs(1)[0]; return console.log(t.reverse().map((e => e.text)).join("")), "" })), Br("\\errmessage", (e => { const t = e.consumeArgs(1)[0]; return console.error(t.reverse().map((e => e.text)).join("")), "" })), Br("\\show", (e => { const t = e.popToken(), r = t.text; return console.log(t, e.macros.get(r), yn[r], oe.math[r], oe.text[r]), "" })), Br("\\bgroup", "{"), Br("\\egroup", "}"), Br("~", "\\nobreakspace"), Br("\\lq", "`"), Br("\\rq", "'"), Br("\\aa", "\\r a"), Br("\\AA", "\\r A"), Br("\\textcopyright", "\\html@mathml{\\textcircled{c}}{\\char`\xa9}"), Br("\\copyright", "\\TextOrMath{\\textcopyright}{\\text{\\textcopyright}}"), Br("\\textregistered", "\\html@mathml{\\textcircled{\\scriptsize R}}{\\char`\xae}"), Br("\u212c", "\\mathscr{B}"), Br("\u2130", "\\mathscr{E}"), Br("\u2131", "\\mathscr{F}"), Br("\u210b", "\\mathscr{H}"), Br("\u2110", "\\mathscr{I}"), 
Br("\u2112", "\\mathscr{L}"), Br("\u2133", "\\mathscr{M}"), Br("\u211b", "\\mathscr{R}"), Br("\u212d", "\\mathfrak{C}"), Br("\u210c", "\\mathfrak{H}"), Br("\u2128", "\\mathfrak{Z}"), Br("\\Bbbk", "\\Bbb{k}"), Br("\xb7", "\\cdotp"), Br("\\llap", "\\mathllap{\\textrm{#1}}"), Br("\\rlap", "\\mathrlap{\\textrm{#1}}"), Br("\\clap", "\\mathclap{\\textrm{#1}}"), Br("\\mathstrut", "\\vphantom{(}"), Br("\\underbar", "\\underline{\\text{#1}}"), Br("\\not", '\\html@mathml{\\mathrel{\\mathrlap\\@not}}{\\char"338}'), Br("\\neq", "\\html@mathml{\\mathrel{\\not=}}{\\mathrel{\\char`\u2260}}"), Br("\\ne", "\\neq"), Br("\u2260", "\\neq"), Br("\\notin", "\\html@mathml{\\mathrel{{\\in}\\mathllap{/\\mskip1mu}}}{\\mathrel{\\char`\u2209}}"), Br("\u2209", "\\notin"), Br("\u2258", "\\html@mathml{\\mathrel{=\\kern{-1em}\\raisebox{0.4em}{$\\scriptsize\\frown$}}}{\\mathrel{\\char`\u2258}}"), Br("\u2259", "\\html@mathml{\\stackrel{\\tiny\\wedge}{=}}{\\mathrel{\\char`\u2258}}"), Br("\u225a", "\\html@mathml{\\stackrel{\\tiny\\vee}{=}}{\\mathrel{\\char`\u225a}}"), Br("\u225b", "\\html@mathml{\\stackrel{\\scriptsize\\star}{=}}{\\mathrel{\\char`\u225b}}"), Br("\u225d", "\\html@mathml{\\stackrel{\\tiny\\mathrm{def}}{=}}{\\mathrel{\\char`\u225d}}"), Br("\u225e", "\\html@mathml{\\stackrel{\\tiny\\mathrm{m}}{=}}{\\mathrel{\\char`\u225e}}"), Br("\u225f", "\\html@mathml{\\stackrel{\\tiny?}{=}}{\\mathrel{\\char`\u225f}}"), Br("\u27c2", "\\perp"), Br("\u203c", "\\mathclose{!\\mkern-0.8mu!}"), Br("\u220c", "\\notni"), Br("\u231c", "\\ulcorner"), Br("\u231d", "\\urcorner"), Br("\u231e", "\\llcorner"), Br("\u231f", "\\lrcorner"), Br("\xa9", "\\copyright"), Br("\xae", "\\textregistered"), Br("\ufe0f", "\\textregistered"), Br("\\ulcorner", '\\html@mathml{\\@ulcorner}{\\mathop{\\char"231c}}'), Br("\\urcorner", '\\html@mathml{\\@urcorner}{\\mathop{\\char"231d}}'), Br("\\llcorner", '\\html@mathml{\\@llcorner}{\\mathop{\\char"231e}}'), Br("\\lrcorner", '\\html@mathml{\\@lrcorner}{\\mathop{\\char"231f}}'), Br("\\vdots", "\\mathord{\\varvdots\\rule{0pt}{15pt}}"), Br("\u22ee", "\\vdots"), Br("\\varGamma", "\\mathit{\\Gamma}"), Br("\\varDelta", "\\mathit{\\Delta}"), Br("\\varTheta", "\\mathit{\\Theta}"), Br("\\varLambda", "\\mathit{\\Lambda}"), Br("\\varXi", "\\mathit{\\Xi}"), Br("\\varPi", "\\mathit{\\Pi}"), Br("\\varSigma", "\\mathit{\\Sigma}"), Br("\\varUpsilon", "\\mathit{\\Upsilon}"), Br("\\varPhi", "\\mathit{\\Phi}"), Br("\\varPsi", "\\mathit{\\Psi}"), Br("\\varOmega", "\\mathit{\\Omega}"), Br("\\substack", "\\begin{subarray}{c}#1\\end{subarray}"), Br("\\colon", "\\nobreak\\mskip2mu\\mathpunct{}\\mathchoice{\\mkern-3mu}{\\mkern-3mu}{}{}{:}\\mskip6mu\\relax"), Br("\\boxed", "\\fbox{$\\displaystyle{#1}$}"), Br("\\iff", "\\DOTSB\\;\\Longleftrightarrow\\;"), Br("\\implies", "\\DOTSB\\;\\Longrightarrow\\;"), Br("\\impliedby", "\\DOTSB\\;\\Longleftarrow\\;"); const Cn = { ",": "\\dotsc", "\\not": "\\dotsb", "+": "\\dotsb", "=": "\\dotsb", "<": "\\dotsb", ">": "\\dotsb", "-": "\\dotsb", "*": "\\dotsb", ":": "\\dotsb", "\\DOTSB": "\\dotsb", "\\coprod": "\\dotsb", "\\bigvee": "\\dotsb", "\\bigwedge": "\\dotsb", "\\biguplus": "\\dotsb", "\\bigcap": "\\dotsb", "\\bigcup": "\\dotsb", "\\prod": "\\dotsb", "\\sum": "\\dotsb", "\\bigotimes": "\\dotsb", "\\bigoplus": "\\dotsb", "\\bigodot": "\\dotsb", "\\bigsqcup": "\\dotsb", "\\And": "\\dotsb", "\\longrightarrow": "\\dotsb", "\\Longrightarrow": "\\dotsb", "\\longleftarrow": "\\dotsb", "\\Longleftarrow": "\\dotsb", "\\longleftrightarrow": "\\dotsb", "\\Longleftrightarrow": "\\dotsb", "\\mapsto": 
"\\dotsb", "\\longmapsto": "\\dotsb", "\\hookrightarrow": "\\dotsb", "\\doteq": "\\dotsb", "\\mathbin": "\\dotsb", "\\mathrel": "\\dotsb", "\\relbar": "\\dotsb", "\\Relbar": "\\dotsb", "\\xrightarrow": "\\dotsb", "\\xleftarrow": "\\dotsb", "\\DOTSI": "\\dotsi", "\\int": "\\dotsi", "\\oint": "\\dotsi", "\\iint": "\\dotsi", "\\iiint": "\\dotsi", "\\iiiint": "\\dotsi", "\\idotsint": "\\dotsi", "\\DOTSX": "\\dotsx" }; Br("\\dots", (function (e) { let t = "\\dotso"; const r = e.expandAfterFuture().text; return r in Cn ? t = Cn[r] : ("\\not" === r.slice(0, 4) || r in oe.math && l.contains(["bin", "rel"], oe.math[r].group)) && (t = "\\dotsb"), t })); const Nn = { ")": !0, "]": !0, "\\rbrack": !0, "\\}": !0, "\\rbrace": !0, "\\rangle": !0, "\\rceil": !0, "\\rfloor": !0, "\\rgroup": !0, "\\rmoustache": !0, "\\right": !0, "\\bigr": !0, "\\biggr": !0, "\\Bigr": !0, "\\Biggr": !0, $: !0, ";": !0, ".": !0, ",": !0 }; Br("\\dotso", (function (e) { return e.future().text in Nn ? "\\ldots\\," : "\\ldots" })), Br("\\dotsc", (function (e) { const t = e.future().text; return t in Nn && "," !== t ? "\\ldots\\," : "\\ldots" })), Br("\\cdots", (function (e) { return e.future().text in Nn ? "\\@cdots\\," : "\\@cdots" })), Br("\\dotsb", "\\cdots"), Br("\\dotsm", "\\cdots"), Br("\\dotsi", "\\!\\cdots"), Br("\\dotsx", "\\ldots\\,"), Br("\\DOTSI", "\\relax"), Br("\\DOTSB", "\\relax"), Br("\\DOTSX", "\\relax"), Br("\\tmspace", "\\TextOrMath{\\kern#1#3}{\\mskip#1#2}\\relax"), Br("\\,", "\\tmspace+{3mu}{.1667em}"), Br("\\thinspace", "\\,"), Br("\\>", "\\mskip{4mu}"), Br("\\:", "\\tmspace+{4mu}{.2222em}"), Br("\\medspace", "\\:"), Br("\\;", "\\tmspace+{5mu}{.2777em}"), Br("\\thickspace", "\\;"), Br("\\!", "\\tmspace-{3mu}{.1667em}"), Br("\\negthinspace", "\\!"), Br("\\negmedspace", "\\tmspace-{4mu}{.2222em}"), Br("\\negthickspace", "\\tmspace-{5mu}{.277em}"), Br("\\enspace", "\\kern.5em "), Br("\\enskip", "\\hskip.5em\\relax"), Br("\\quad", "\\hskip1em\\relax"), Br("\\qquad", "\\hskip2em\\relax"), Br("\\tag", "\\@ifstar\\tag@literal\\tag@paren"), Br("\\tag@paren", "\\tag@literal{({#1})}"), Br("\\tag@literal", (e => { if (e.macros.get("\\df@tag")) throw new n("Multiple \\tag"); return "\\gdef\\df@tag{\\text{#1}}" })), Br("\\bmod", "\\mathchoice{\\mskip1mu}{\\mskip1mu}{\\mskip5mu}{\\mskip5mu}\\mathbin{\\rm mod}\\mathchoice{\\mskip1mu}{\\mskip1mu}{\\mskip5mu}{\\mskip5mu}"), Br("\\pod", "\\allowbreak\\mathchoice{\\mkern18mu}{\\mkern8mu}{\\mkern8mu}{\\mkern8mu}(#1)"), Br("\\pmod", "\\pod{{\\rm mod}\\mkern6mu#1}"), Br("\\mod", "\\allowbreak\\mathchoice{\\mkern18mu}{\\mkern12mu}{\\mkern12mu}{\\mkern12mu}{\\rm mod}\\,\\,#1"), Br("\\newline", "\\\\\\relax"), Br("\\TeX", "\\textrm{\\html@mathml{T\\kern-.1667em\\raisebox{-.5ex}{E}\\kern-.125emX}{TeX}}"); const qn = F(T["Main-Regular"]["T".charCodeAt(0)][1] - .7 * T["Main-Regular"]["A".charCodeAt(0)][1]); Br("\\LaTeX", "\\textrm{\\html@mathml{L\\kern-.36em\\raisebox{" + qn + "}{\\scriptstyle A}\\kern-.15em\\TeX}{LaTeX}}"), Br("\\KaTeX", "\\textrm{\\html@mathml{K\\kern-.17em\\raisebox{" + qn + "}{\\scriptstyle A}\\kern-.15em\\TeX}{KaTeX}}"), Br("\\hspace", "\\@ifstar\\@hspacer\\@hspace"), Br("\\@hspace", "\\hskip #1\\relax"), Br("\\@hspacer", "\\rule{0pt}{0pt}\\hskip #1\\relax"), Br("\\ordinarycolon", ":"), Br("\\vcentcolon", "\\mathrel{\\mathop\\ordinarycolon}"), Br("\\dblcolon", '\\html@mathml{\\mathrel{\\vcentcolon\\mathrel{\\mkern-.9mu}\\vcentcolon}}{\\mathop{\\char"2237}}'), Br("\\coloneqq", 
'\\html@mathml{\\mathrel{\\vcentcolon\\mathrel{\\mkern-1.2mu}=}}{\\mathop{\\char"2254}}'), Br("\\Coloneqq", '\\html@mathml{\\mathrel{\\dblcolon\\mathrel{\\mkern-1.2mu}=}}{\\mathop{\\char"2237\\char"3d}}'), Br("\\coloneq", '\\html@mathml{\\mathrel{\\vcentcolon\\mathrel{\\mkern-1.2mu}\\mathrel{-}}}{\\mathop{\\char"3a\\char"2212}}'), Br("\\Coloneq", '\\html@mathml{\\mathrel{\\dblcolon\\mathrel{\\mkern-1.2mu}\\mathrel{-}}}{\\mathop{\\char"2237\\char"2212}}'), Br("\\eqqcolon", '\\html@mathml{\\mathrel{=\\mathrel{\\mkern-1.2mu}\\vcentcolon}}{\\mathop{\\char"2255}}'), Br("\\Eqqcolon", '\\html@mathml{\\mathrel{=\\mathrel{\\mkern-1.2mu}\\dblcolon}}{\\mathop{\\char"3d\\char"2237}}'), Br("\\eqcolon", '\\html@mathml{\\mathrel{\\mathrel{-}\\mathrel{\\mkern-1.2mu}\\vcentcolon}}{\\mathop{\\char"2239}}'), Br("\\Eqcolon", '\\html@mathml{\\mathrel{\\mathrel{-}\\mathrel{\\mkern-1.2mu}\\dblcolon}}{\\mathop{\\char"2212\\char"2237}}'), Br("\\colonapprox", '\\html@mathml{\\mathrel{\\vcentcolon\\mathrel{\\mkern-1.2mu}\\approx}}{\\mathop{\\char"3a\\char"2248}}'), Br("\\Colonapprox", '\\html@mathml{\\mathrel{\\dblcolon\\mathrel{\\mkern-1.2mu}\\approx}}{\\mathop{\\char"2237\\char"2248}}'), Br("\\colonsim", '\\html@mathml{\\mathrel{\\vcentcolon\\mathrel{\\mkern-1.2mu}\\sim}}{\\mathop{\\char"3a\\char"223c}}'), Br("\\Colonsim", '\\html@mathml{\\mathrel{\\dblcolon\\mathrel{\\mkern-1.2mu}\\sim}}{\\mathop{\\char"2237\\char"223c}}'), Br("\u2237", "\\dblcolon"), Br("\u2239", "\\eqcolon"), Br("\u2254", "\\coloneqq"), Br("\u2255", "\\eqqcolon"), Br("\u2a74", "\\Coloneqq"), Br("\\ratio", "\\vcentcolon"), Br("\\coloncolon", "\\dblcolon"), Br("\\colonequals", "\\coloneqq"), Br("\\coloncolonequals", "\\Coloneqq"), Br("\\equalscolon", "\\eqqcolon"), Br("\\equalscoloncolon", "\\Eqqcolon"), Br("\\colonminus", "\\coloneq"), Br("\\coloncolonminus", "\\Coloneq"), Br("\\minuscolon", "\\eqcolon"), Br("\\minuscoloncolon", "\\Eqcolon"), Br("\\coloncolonapprox", "\\Colonapprox"), Br("\\coloncolonsim", "\\Colonsim"), Br("\\simcolon", "\\mathrel{\\sim\\mathrel{\\mkern-1.2mu}\\vcentcolon}"), Br("\\simcoloncolon", "\\mathrel{\\sim\\mathrel{\\mkern-1.2mu}\\dblcolon}"), Br("\\approxcolon", "\\mathrel{\\approx\\mathrel{\\mkern-1.2mu}\\vcentcolon}"), Br("\\approxcoloncolon", "\\mathrel{\\approx\\mathrel{\\mkern-1.2mu}\\dblcolon}"), Br("\\notni", "\\html@mathml{\\not\\ni}{\\mathrel{\\char`\u220c}}"), Br("\\limsup", "\\DOTSB\\operatorname*{lim\\,sup}"), Br("\\liminf", "\\DOTSB\\operatorname*{lim\\,inf}"), Br("\\injlim", "\\DOTSB\\operatorname*{inj\\,lim}"), Br("\\projlim", "\\DOTSB\\operatorname*{proj\\,lim}"), Br("\\varlimsup", "\\DOTSB\\operatorname*{\\overline{lim}}"), Br("\\varliminf", "\\DOTSB\\operatorname*{\\underline{lim}}"), Br("\\varinjlim", "\\DOTSB\\operatorname*{\\underrightarrow{lim}}"), Br("\\varprojlim", "\\DOTSB\\operatorname*{\\underleftarrow{lim}}"), Br("\\gvertneqq", "\\html@mathml{\\@gvertneqq}{\u2269}"), Br("\\lvertneqq", "\\html@mathml{\\@lvertneqq}{\u2268}"), Br("\\ngeqq", "\\html@mathml{\\@ngeqq}{\u2271}"), Br("\\ngeqslant", "\\html@mathml{\\@ngeqslant}{\u2271}"), Br("\\nleqq", "\\html@mathml{\\@nleqq}{\u2270}"), Br("\\nleqslant", "\\html@mathml{\\@nleqslant}{\u2270}"), Br("\\nshortmid", "\\html@mathml{\\@nshortmid}{\u2224}"), Br("\\nshortparallel", "\\html@mathml{\\@nshortparallel}{\u2226}"), Br("\\nsubseteqq", "\\html@mathml{\\@nsubseteqq}{\u2288}"), Br("\\nsupseteqq", "\\html@mathml{\\@nsupseteqq}{\u2289}"), Br("\\varsubsetneq", "\\html@mathml{\\@varsubsetneq}{\u228a}"), Br("\\varsubsetneqq", 
"\\html@mathml{\\@varsubsetneqq}{\u2acb}"), Br("\\varsupsetneq", "\\html@mathml{\\@varsupsetneq}{\u228b}"), Br("\\varsupsetneqq", "\\html@mathml{\\@varsupsetneqq}{\u2acc}"), Br("\\imath", "\\html@mathml{\\@imath}{\u0131}"), Br("\\jmath", "\\html@mathml{\\@jmath}{\u0237}"), Br("\\llbracket", "\\html@mathml{\\mathopen{[\\mkern-3.2mu[}}{\\mathopen{\\char`\u27e6}}"), Br("\\rrbracket", "\\html@mathml{\\mathclose{]\\mkern-3.2mu]}}{\\mathclose{\\char`\u27e7}}"), Br("\u27e6", "\\llbracket"), Br("\u27e7", "\\rrbracket"), Br("\\lBrace", "\\html@mathml{\\mathopen{\\{\\mkern-3.2mu[}}{\\mathopen{\\char`\u2983}}"), Br("\\rBrace", "\\html@mathml{\\mathclose{]\\mkern-3.2mu\\}}}{\\mathclose{\\char`\u2984}}"), Br("\u2983", "\\lBrace"), Br("\u2984", "\\rBrace"), Br("\\minuso", "\\mathbin{\\html@mathml{{\\mathrlap{\\mathchoice{\\kern{0.145em}}{\\kern{0.145em}}{\\kern{0.1015em}}{\\kern{0.0725em}}\\circ}{-}}}{\\char`\u29b5}}"), Br("\u29b5", "\\minuso"), Br("\\darr", "\\downarrow"), Br("\\dArr", "\\Downarrow"), Br("\\Darr", "\\Downarrow"), Br("\\lang", "\\langle"), Br("\\rang", "\\rangle"), Br("\\uarr", "\\uparrow"), Br("\\uArr", "\\Uparrow"), Br("\\Uarr", "\\Uparrow"), Br("\\N", "\\mathbb{N}"), Br("\\R", "\\mathbb{R}"), Br("\\Z", "\\mathbb{Z}"), Br("\\alef", "\\aleph"), Br("\\alefsym", "\\aleph"), Br("\\Alpha", "\\mathrm{A}"), Br("\\Beta", "\\mathrm{B}"), Br("\\bull", "\\bullet"), Br("\\Chi", "\\mathrm{X}"), Br("\\clubs", "\\clubsuit"), Br("\\cnums", "\\mathbb{C}"), Br("\\Complex", "\\mathbb{C}"), Br("\\Dagger", "\\ddagger"), Br("\\diamonds", "\\diamondsuit"), Br("\\empty", "\\emptyset"), Br("\\Epsilon", "\\mathrm{E}"), Br("\\Eta", "\\mathrm{H}"), Br("\\exist", "\\exists"), Br("\\harr", "\\leftrightarrow"), Br("\\hArr", "\\Leftrightarrow"), Br("\\Harr", "\\Leftrightarrow"), Br("\\hearts", "\\heartsuit"), Br("\\image", "\\Im"), Br("\\infin", "\\infty"), Br("\\Iota", "\\mathrm{I}"), Br("\\isin", "\\in"), Br("\\Kappa", "\\mathrm{K}"), Br("\\larr", "\\leftarrow"), Br("\\lArr", "\\Leftarrow"), Br("\\Larr", "\\Leftarrow"), Br("\\lrarr", "\\leftrightarrow"), Br("\\lrArr", "\\Leftrightarrow"), Br("\\Lrarr", "\\Leftrightarrow"), Br("\\Mu", "\\mathrm{M}"), Br("\\natnums", "\\mathbb{N}"), Br("\\Nu", "\\mathrm{N}"), Br("\\Omicron", "\\mathrm{O}"), Br("\\plusmn", "\\pm"), Br("\\rarr", "\\rightarrow"), Br("\\rArr", "\\Rightarrow"), Br("\\Rarr", "\\Rightarrow"), Br("\\real", "\\Re"), Br("\\reals", "\\mathbb{R}"), Br("\\Reals", "\\mathbb{R}"), Br("\\Rho", "\\mathrm{P}"), Br("\\sdot", "\\cdot"), Br("\\sect", "\\S"), Br("\\spades", "\\spadesuit"), Br("\\sub", "\\subset"), Br("\\sube", "\\subseteq"), Br("\\supe", "\\supseteq"), Br("\\Tau", "\\mathrm{T}"), Br("\\thetasym", "\\vartheta"), Br("\\weierp", "\\wp"), Br("\\Zeta", "\\mathrm{Z}"), Br("\\argmin", "\\DOTSB\\operatorname*{arg\\,min}"), Br("\\argmax", "\\DOTSB\\operatorname*{arg\\,max}"), Br("\\plim", "\\DOTSB\\mathop{\\operatorname{plim}}\\limits"), Br("\\bra", "\\mathinner{\\langle{#1}|}"), Br("\\ket", "\\mathinner{|{#1}\\rangle}"), Br("\\braket", "\\mathinner{\\langle{#1}\\rangle}"), Br("\\Bra", "\\left\\langle#1\\right|"), Br("\\Ket", "\\left|#1\\right\\rangle"); const In = e => t => { const r = t.consumeArg().tokens, n = t.consumeArg().tokens, o = t.consumeArg().tokens, s = t.consumeArg().tokens, i = t.macros.get("|"), a = t.macros.get("\\|"); t.macros.beginGroup(); const l = t => r => { e && (r.macros.set("|", i), o.length && r.macros.set("\\|", a)); let s = t; if (!t && o.length) { "|" === r.future().text && (r.popToken(), s = !0) } return { tokens: s ? 
o : n, numArgs: 0 } }; t.macros.set("|", l(!1)), o.length && t.macros.set("\\|", l(!0)); const h = t.consumeArg().tokens, c = t.expandTokens([...s, ...h, ...r]); return t.macros.endGroup(), { tokens: c.reverse(), numArgs: 0 } }; Br("\\bra@ket", In(!1)), Br("\\bra@set", In(!0)), Br("\\Braket", "\\bra@ket{\\left\\langle}{\\,\\middle\\vert\\,}{\\,\\middle\\vert\\,}{\\right\\rangle}"), Br("\\Set", "\\bra@set{\\left\\{\\:}{\\;\\middle\\vert\\;}{\\;\\middle\\Vert\\;}{\\:\\right\\}}"), Br("\\set", "\\bra@set{\\{\\,}{\\mid}{}{\\,\\}}"), Br("\\angln", "{\\angl n}"), Br("\\blue", "\\textcolor{##6495ed}{#1}"), Br("\\orange", "\\textcolor{##ffa500}{#1}"), Br("\\pink", "\\textcolor{##ff00af}{#1}"), Br("\\red", "\\textcolor{##df0030}{#1}"), Br("\\green", "\\textcolor{##28ae7b}{#1}"), Br("\\gray", "\\textcolor{gray}{#1}"), Br("\\purple", "\\textcolor{##9d38bd}{#1}"), Br("\\blueA", "\\textcolor{##ccfaff}{#1}"), Br("\\blueB", "\\textcolor{##80f6ff}{#1}"), Br("\\blueC", "\\textcolor{##63d9ea}{#1}"), Br("\\blueD", "\\textcolor{##11accd}{#1}"), Br("\\blueE", "\\textcolor{##0c7f99}{#1}"), Br("\\tealA", "\\textcolor{##94fff5}{#1}"), Br("\\tealB", "\\textcolor{##26edd5}{#1}"), Br("\\tealC", "\\textcolor{##01d1c1}{#1}"), Br("\\tealD", "\\textcolor{##01a995}{#1}"), Br("\\tealE", "\\textcolor{##208170}{#1}"), Br("\\greenA", "\\textcolor{##b6ffb0}{#1}"), Br("\\greenB", "\\textcolor{##8af281}{#1}"), Br("\\greenC", "\\textcolor{##74cf70}{#1}"), Br("\\greenD", "\\textcolor{##1fab54}{#1}"), Br("\\greenE", "\\textcolor{##0d923f}{#1}"), Br("\\goldA", "\\textcolor{##ffd0a9}{#1}"), Br("\\goldB", "\\textcolor{##ffbb71}{#1}"), Br("\\goldC", "\\textcolor{##ff9c39}{#1}"), Br("\\goldD", "\\textcolor{##e07d10}{#1}"), Br("\\goldE", "\\textcolor{##a75a05}{#1}"), Br("\\redA", "\\textcolor{##fca9a9}{#1}"), Br("\\redB", "\\textcolor{##ff8482}{#1}"), Br("\\redC", "\\textcolor{##f9685d}{#1}"), Br("\\redD", "\\textcolor{##e84d39}{#1}"), Br("\\redE", "\\textcolor{##bc2612}{#1}"), Br("\\maroonA", "\\textcolor{##ffbde0}{#1}"), Br("\\maroonB", "\\textcolor{##ff92c6}{#1}"), Br("\\maroonC", "\\textcolor{##ed5fa6}{#1}"), Br("\\maroonD", "\\textcolor{##ca337c}{#1}"), Br("\\maroonE", "\\textcolor{##9e034e}{#1}"), Br("\\purpleA", "\\textcolor{##ddd7ff}{#1}"), Br("\\purpleB", "\\textcolor{##c6b9fc}{#1}"), Br("\\purpleC", "\\textcolor{##aa87ff}{#1}"), Br("\\purpleD", "\\textcolor{##7854ab}{#1}"), Br("\\purpleE", "\\textcolor{##543b78}{#1}"), Br("\\mintA", "\\textcolor{##f5f9e8}{#1}"), Br("\\mintB", "\\textcolor{##edf2df}{#1}"), Br("\\mintC", "\\textcolor{##e0e5cc}{#1}"), Br("\\grayA", "\\textcolor{##f6f7f7}{#1}"), Br("\\grayB", "\\textcolor{##f0f1f2}{#1}"), Br("\\grayC", "\\textcolor{##e3e5e6}{#1}"), Br("\\grayD", "\\textcolor{##d6d8da}{#1}"), Br("\\grayE", "\\textcolor{##babec2}{#1}"), Br("\\grayF", "\\textcolor{##888d93}{#1}"), Br("\\grayG", "\\textcolor{##626569}{#1}"), Br("\\grayH", "\\textcolor{##3b3e40}{#1}"), Br("\\grayI", "\\textcolor{##21242c}{#1}"), Br("\\kaBlue", "\\textcolor{##314453}{#1}"), Br("\\kaGreen", "\\textcolor{##71B307}{#1}"); const Rn = { "^": !0, _: !0, "\\limits": !0, "\\nolimits": !0 }; class Hn { constructor(e, t, r) { this.settings = void 0, this.expansionCount = void 0, this.lexer = void 0, this.macros = void 0, this.stack = void 0, this.mode = void 0, this.settings = t, this.expansionCount = 0, this.feed(e), this.macros = new zn(An, t.macros), this.mode = r, this.stack = [] } feed(e) { this.lexer = new Mn(e, this.settings) } switchMode(e) { this.mode = e } beginGroup() { this.macros.beginGroup() } endGroup() { 
this.macros.endGroup() } endGroups() { this.macros.endGroups() } future() { return 0 === this.stack.length && this.pushToken(this.lexer.lex()), this.stack[this.stack.length - 1] } popToken() { return this.future(), this.stack.pop() } pushToken(e) { this.stack.push(e) } pushTokens(e) { this.stack.push(...e) } scanArgument(e) { let t, r, n; if (e) { if (this.consumeSpaces(), "[" !== this.future().text) return null; t = this.popToken(), ({ tokens: n, end: r } = this.consumeArg(["]"])) } else ({ tokens: n, start: t, end: r } = this.consumeArg()); return this.pushToken(new Nr("EOF", r.loc)), this.pushTokens(n), t.range(r, "") } consumeSpaces() { for (; ;) { if (" " !== this.future().text) break; this.stack.pop() } } consumeArg(e) { const t = [], r = e && e.length > 0; r || this.consumeSpaces(); const o = this.future(); let s, i = 0, a = 0; do { if (s = this.popToken(), t.push(s), "{" === s.text) ++i; else if ("}" === s.text) { if (--i, -1 === i) throw new n("Extra }", s) } else if ("EOF" === s.text) throw new n("Unexpected end of input in a macro argument, expected '" + (e && r ? e[a] : "}") + "'", s); if (e && r) if ((0 === i || 1 === i && "{" === e[a]) && s.text === e[a]) { if (++a, a === e.length) { t.splice(-a, a); break } } else a = 0 } while (0 !== i || r); return "{" === o.text && "}" === t[t.length - 1].text && (t.pop(), t.shift()), t.reverse(), { tokens: t, start: o, end: s } } consumeArgs(e, t) { if (t) { if (t.length !== e + 1) throw new n("The length of delimiters doesn't match the number of args!"); const r = t[0]; for (let e = 0; e < r.length; e++) { const t = this.popToken(); if (r[e] !== t.text) throw new n("Use of the macro doesn't match its definition", t) } } const r = []; for (let n = 0; n < e; n++)r.push(this.consumeArg(t && t[n + 1]).tokens); return r } countExpansion(e) { if (this.expansionCount += e, this.expansionCount > this.settings.maxExpand) throw new n("Too many expansions: infinite loop or need to increase maxExpand setting") } expandOnce(e) { const t = this.popToken(), r = t.text, o = t.noexpand ? null : this._getExpansion(r); if (null == o || e && o.unexpandable) { if (e && null == o && "\\" === r[0] && !this.isDefined(r)) throw new n("Undefined control sequence: " + r); return this.pushToken(t), !1 } this.countExpansion(1); let s = o.tokens; const i = this.consumeArgs(o.numArgs, o.delimiters); if (o.numArgs) { s = s.slice(); for (let e = s.length - 1; e >= 0; --e) { let t = s[e]; if ("#" === t.text) { if (0 === e) throw new n("Incomplete placeholder at end of macro body", t); if (t = s[--e], "#" === t.text) s.splice(e + 1, 1); else { if (!/^[1-9]$/.test(t.text)) throw new n("Not a valid argument number", t); s.splice(e, 2, ...i[+t.text - 1]) } } } } return this.pushTokens(s), s.length } expandAfterFuture() { return this.expandOnce(), this.future() } expandNextToken() { for (; ;)if (!1 === this.expandOnce()) { const e = this.stack.pop(); return e.treatAsRelax && (e.text = "\\relax"), e } throw new Error } expandMacro(e) { return this.macros.has(e) ? this.expandTokens([new Nr(e)]) : void 0 } expandTokens(e) { const t = [], r = this.stack.length; for (this.pushTokens(e); this.stack.length > r;)if (!1 === this.expandOnce(!0)) { const e = this.stack.pop(); e.treatAsRelax && (e.noexpand = !1, e.treatAsRelax = !1), t.push(e) } return this.countExpansion(t.length), t } expandMacroAsText(e) { const t = this.expandMacro(e); return t ? 
t.map((e => e.text)).join("") : t } _getExpansion(e) { const t = this.macros.get(e); if (null == t) return t; if (1 === e.length) { const t = this.lexer.catcodes[e]; if (null != t && 13 !== t) return } const r = "function" == typeof t ? t(this) : t; if ("string" == typeof r) { let e = 0; if (-1 !== r.indexOf("#")) { const t = r.replace(/##/g, ""); for (; -1 !== t.indexOf("#" + (e + 1));)++e } const t = new Mn(r, this.settings), n = []; let o = t.lex(); for (; "EOF" !== o.text;)n.push(o), o = t.lex(); n.reverse(); return { tokens: n, numArgs: e } } return r } isDefined(e) { return this.macros.has(e) || yn.hasOwnProperty(e) || oe.math.hasOwnProperty(e) || oe.text.hasOwnProperty(e) || Rn.hasOwnProperty(e) } isExpandable(e) { const t = this.macros.get(e); return null != t ? "string" == typeof t || "function" == typeof t || !t.unexpandable : yn.hasOwnProperty(e) && !yn[e].primitive } } const On = /^[\u208a\u208b\u208c\u208d\u208e\u2080\u2081\u2082\u2083\u2084\u2085\u2086\u2087\u2088\u2089\u2090\u2091\u2095\u1d62\u2c7c\u2096\u2097\u2098\u2099\u2092\u209a\u1d63\u209b\u209c\u1d64\u1d65\u2093\u1d66\u1d67\u1d68\u1d69\u1d6a]/, En = Object.freeze({ "\u208a": "+", "\u208b": "-", "\u208c": "=", "\u208d": "(", "\u208e": ")", "\u2080": "0", "\u2081": "1", "\u2082": "2", "\u2083": "3", "\u2084": "4", "\u2085": "5", "\u2086": "6", "\u2087": "7", "\u2088": "8", "\u2089": "9", "\u2090": "a", "\u2091": "e", "\u2095": "h", "\u1d62": "i", "\u2c7c": "j", "\u2096": "k", "\u2097": "l", "\u2098": "m", "\u2099": "n", "\u2092": "o", "\u209a": "p", "\u1d63": "r", "\u209b": "s", "\u209c": "t", "\u1d64": "u", "\u1d65": "v", "\u2093": "x", "\u1d66": "\u03b2", "\u1d67": "\u03b3", "\u1d68": "\u03c1", "\u1d69": "\u03d5", "\u1d6a": "\u03c7", "\u207a": "+", "\u207b": "-", "\u207c": "=", "\u207d": "(", "\u207e": ")", "\u2070": "0", "\xb9": "1", "\xb2": "2", "\xb3": "3", "\u2074": "4", "\u2075": "5", "\u2076": "6", "\u2077": "7", "\u2078": "8", "\u2079": "9", "\u1d2c": "A", "\u1d2e": "B", "\u1d30": "D", "\u1d31": "E", "\u1d33": "G", "\u1d34": "H", "\u1d35": "I", "\u1d36": "J", "\u1d37": "K", "\u1d38": "L", "\u1d39": "M", "\u1d3a": "N", "\u1d3c": "O", "\u1d3e": "P", "\u1d3f": "R", "\u1d40": "T", "\u1d41": "U", "\u2c7d": "V", "\u1d42": "W", "\u1d43": "a", "\u1d47": "b", "\u1d9c": "c", "\u1d48": "d", "\u1d49": "e", "\u1da0": "f", "\u1d4d": "g", "\u02b0": "h", "\u2071": "i", "\u02b2": "j", "\u1d4f": "k", "\u02e1": "l", "\u1d50": "m", "\u207f": "n", "\u1d52": "o", "\u1d56": "p", "\u02b3": "r", "\u02e2": "s", "\u1d57": "t", "\u1d58": "u", "\u1d5b": "v", "\u02b7": "w", "\u02e3": "x", "\u02b8": "y", "\u1dbb": "z", "\u1d5d": "\u03b2", "\u1d5e": "\u03b3", "\u1d5f": "\u03b4", "\u1d60": "\u03d5", "\u1d61": "\u03c7", "\u1dbf": "\u03b8" }), Ln = { "\u0301": { text: "\\'", math: "\\acute" }, "\u0300": { text: "\\`", math: "\\grave" }, "\u0308": { text: '\\"', math: "\\ddot" }, "\u0303": { text: "\\~", math: "\\tilde" }, "\u0304": { text: "\\=", math: "\\bar" }, "\u0306": { text: "\\u", math: "\\breve" }, "\u030c": { text: "\\v", math: "\\check" }, "\u0302": { text: "\\^", math: "\\hat" }, "\u0307": { text: "\\.", math: "\\dot" }, "\u030a": { text: "\\r", math: "\\mathring" }, "\u030b": { text: "\\H" }, "\u0327": { text: "\\c" } }, Dn = { "\xe1": "a\u0301", "\xe0": "a\u0300", "\xe4": "a\u0308", "\u01df": "a\u0308\u0304", "\xe3": "a\u0303", "\u0101": "a\u0304", "\u0103": "a\u0306", "\u1eaf": "a\u0306\u0301", "\u1eb1": "a\u0306\u0300", "\u1eb5": "a\u0306\u0303", "\u01ce": "a\u030c", "\xe2": "a\u0302", "\u1ea5": "a\u0302\u0301", "\u1ea7": 
"a\u0302\u0300", "\u1eab": "a\u0302\u0303", "\u0227": "a\u0307", "\u01e1": "a\u0307\u0304", "\xe5": "a\u030a", "\u01fb": "a\u030a\u0301", "\u1e03": "b\u0307", "\u0107": "c\u0301", "\u1e09": "c\u0327\u0301", "\u010d": "c\u030c", "\u0109": "c\u0302", "\u010b": "c\u0307", "\xe7": "c\u0327", "\u010f": "d\u030c", "\u1e0b": "d\u0307", "\u1e11": "d\u0327", "\xe9": "e\u0301", "\xe8": "e\u0300", "\xeb": "e\u0308", "\u1ebd": "e\u0303", "\u0113": "e\u0304", "\u1e17": "e\u0304\u0301", "\u1e15": "e\u0304\u0300", "\u0115": "e\u0306", "\u1e1d": "e\u0327\u0306", "\u011b": "e\u030c", "\xea": "e\u0302", "\u1ebf": "e\u0302\u0301", "\u1ec1": "e\u0302\u0300", "\u1ec5": "e\u0302\u0303", "\u0117": "e\u0307", "\u0229": "e\u0327", "\u1e1f": "f\u0307", "\u01f5": "g\u0301", "\u1e21": "g\u0304", "\u011f": "g\u0306", "\u01e7": "g\u030c", "\u011d": "g\u0302", "\u0121": "g\u0307", "\u0123": "g\u0327", "\u1e27": "h\u0308", "\u021f": "h\u030c", "\u0125": "h\u0302", "\u1e23": "h\u0307", "\u1e29": "h\u0327", "\xed": "i\u0301", "\xec": "i\u0300", "\xef": "i\u0308", "\u1e2f": "i\u0308\u0301", "\u0129": "i\u0303", "\u012b": "i\u0304", "\u012d": "i\u0306", "\u01d0": "i\u030c", "\xee": "i\u0302", "\u01f0": "j\u030c", "\u0135": "j\u0302", "\u1e31": "k\u0301", "\u01e9": "k\u030c", "\u0137": "k\u0327", "\u013a": "l\u0301", "\u013e": "l\u030c", "\u013c": "l\u0327", "\u1e3f": "m\u0301", "\u1e41": "m\u0307", "\u0144": "n\u0301", "\u01f9": "n\u0300", "\xf1": "n\u0303", "\u0148": "n\u030c", "\u1e45": "n\u0307", "\u0146": "n\u0327", "\xf3": "o\u0301", "\xf2": "o\u0300", "\xf6": "o\u0308", "\u022b": "o\u0308\u0304", "\xf5": "o\u0303", "\u1e4d": "o\u0303\u0301", "\u1e4f": "o\u0303\u0308", "\u022d": "o\u0303\u0304", "\u014d": "o\u0304", "\u1e53": "o\u0304\u0301", "\u1e51": "o\u0304\u0300", "\u014f": "o\u0306", "\u01d2": "o\u030c", "\xf4": "o\u0302", "\u1ed1": "o\u0302\u0301", "\u1ed3": "o\u0302\u0300", "\u1ed7": "o\u0302\u0303", "\u022f": "o\u0307", "\u0231": "o\u0307\u0304", "\u0151": "o\u030b", "\u1e55": "p\u0301", "\u1e57": "p\u0307", "\u0155": "r\u0301", "\u0159": "r\u030c", "\u1e59": "r\u0307", "\u0157": "r\u0327", "\u015b": "s\u0301", "\u1e65": "s\u0301\u0307", "\u0161": "s\u030c", "\u1e67": "s\u030c\u0307", "\u015d": "s\u0302", "\u1e61": "s\u0307", "\u015f": "s\u0327", "\u1e97": "t\u0308", "\u0165": "t\u030c", "\u1e6b": "t\u0307", "\u0163": "t\u0327", "\xfa": "u\u0301", "\xf9": "u\u0300", "\xfc": "u\u0308", "\u01d8": "u\u0308\u0301", "\u01dc": "u\u0308\u0300", "\u01d6": "u\u0308\u0304", "\u01da": "u\u0308\u030c", "\u0169": "u\u0303", "\u1e79": "u\u0303\u0301", "\u016b": "u\u0304", "\u1e7b": "u\u0304\u0308", "\u016d": "u\u0306", "\u01d4": "u\u030c", "\xfb": "u\u0302", "\u016f": "u\u030a", "\u0171": "u\u030b", "\u1e7d": "v\u0303", "\u1e83": "w\u0301", "\u1e81": "w\u0300", "\u1e85": "w\u0308", "\u0175": "w\u0302", "\u1e87": "w\u0307", "\u1e98": "w\u030a", "\u1e8d": "x\u0308", "\u1e8b": "x\u0307", "\xfd": "y\u0301", "\u1ef3": "y\u0300", "\xff": "y\u0308", "\u1ef9": "y\u0303", "\u0233": "y\u0304", "\u0177": "y\u0302", "\u1e8f": "y\u0307", "\u1e99": "y\u030a", "\u017a": "z\u0301", "\u017e": "z\u030c", "\u1e91": "z\u0302", "\u017c": "z\u0307", "\xc1": "A\u0301", "\xc0": "A\u0300", "\xc4": "A\u0308", "\u01de": "A\u0308\u0304", "\xc3": "A\u0303", "\u0100": "A\u0304", "\u0102": "A\u0306", "\u1eae": "A\u0306\u0301", "\u1eb0": "A\u0306\u0300", "\u1eb4": "A\u0306\u0303", "\u01cd": "A\u030c", "\xc2": "A\u0302", "\u1ea4": "A\u0302\u0301", "\u1ea6": "A\u0302\u0300", "\u1eaa": "A\u0302\u0303", "\u0226": "A\u0307", "\u01e0": "A\u0307\u0304", "\xc5": 
"A\u030a", "\u01fa": "A\u030a\u0301", "\u1e02": "B\u0307", "\u0106": "C\u0301", "\u1e08": "C\u0327\u0301", "\u010c": "C\u030c", "\u0108": "C\u0302", "\u010a": "C\u0307", "\xc7": "C\u0327", "\u010e": "D\u030c", "\u1e0a": "D\u0307", "\u1e10": "D\u0327", "\xc9": "E\u0301", "\xc8": "E\u0300", "\xcb": "E\u0308", "\u1ebc": "E\u0303", "\u0112": "E\u0304", "\u1e16": "E\u0304\u0301", "\u1e14": "E\u0304\u0300", "\u0114": "E\u0306", "\u1e1c": "E\u0327\u0306", "\u011a": "E\u030c", "\xca": "E\u0302", "\u1ebe": "E\u0302\u0301", "\u1ec0": "E\u0302\u0300", "\u1ec4": "E\u0302\u0303", "\u0116": "E\u0307", "\u0228": "E\u0327", "\u1e1e": "F\u0307", "\u01f4": "G\u0301", "\u1e20": "G\u0304", "\u011e": "G\u0306", "\u01e6": "G\u030c", "\u011c": "G\u0302", "\u0120": "G\u0307", "\u0122": "G\u0327", "\u1e26": "H\u0308", "\u021e": "H\u030c", "\u0124": "H\u0302", "\u1e22": "H\u0307", "\u1e28": "H\u0327", "\xcd": "I\u0301", "\xcc": "I\u0300", "\xcf": "I\u0308", "\u1e2e": "I\u0308\u0301", "\u0128": "I\u0303", "\u012a": "I\u0304", "\u012c": "I\u0306", "\u01cf": "I\u030c", "\xce": "I\u0302", "\u0130": "I\u0307", "\u0134": "J\u0302", "\u1e30": "K\u0301", "\u01e8": "K\u030c", "\u0136": "K\u0327", "\u0139": "L\u0301", "\u013d": "L\u030c", "\u013b": "L\u0327", "\u1e3e": "M\u0301", "\u1e40": "M\u0307", "\u0143": "N\u0301", "\u01f8": "N\u0300", "\xd1": "N\u0303", "\u0147": "N\u030c", "\u1e44": "N\u0307", "\u0145": "N\u0327", "\xd3": "O\u0301", "\xd2": "O\u0300", "\xd6": "O\u0308", "\u022a": "O\u0308\u0304", "\xd5": "O\u0303", "\u1e4c": "O\u0303\u0301", "\u1e4e": "O\u0303\u0308", "\u022c": "O\u0303\u0304", "\u014c": "O\u0304", "\u1e52": "O\u0304\u0301", "\u1e50": "O\u0304\u0300", "\u014e": "O\u0306", "\u01d1": "O\u030c", "\xd4": "O\u0302", "\u1ed0": "O\u0302\u0301", "\u1ed2": "O\u0302\u0300", "\u1ed6": "O\u0302\u0303", "\u022e": "O\u0307", "\u0230": "O\u0307\u0304", "\u0150": "O\u030b", "\u1e54": "P\u0301", "\u1e56": "P\u0307", "\u0154": "R\u0301", "\u0158": "R\u030c", "\u1e58": "R\u0307", "\u0156": "R\u0327", "\u015a": "S\u0301", "\u1e64": "S\u0301\u0307", "\u0160": "S\u030c", "\u1e66": "S\u030c\u0307", "\u015c": "S\u0302", "\u1e60": "S\u0307", "\u015e": "S\u0327", "\u0164": "T\u030c", "\u1e6a": "T\u0307", "\u0162": "T\u0327", "\xda": "U\u0301", "\xd9": "U\u0300", "\xdc": "U\u0308", "\u01d7": "U\u0308\u0301", "\u01db": "U\u0308\u0300", "\u01d5": "U\u0308\u0304", "\u01d9": "U\u0308\u030c", "\u0168": "U\u0303", "\u1e78": "U\u0303\u0301", "\u016a": "U\u0304", "\u1e7a": "U\u0304\u0308", "\u016c": "U\u0306", "\u01d3": "U\u030c", "\xdb": "U\u0302", "\u016e": "U\u030a", "\u0170": "U\u030b", "\u1e7c": "V\u0303", "\u1e82": "W\u0301", "\u1e80": "W\u0300", "\u1e84": "W\u0308", "\u0174": "W\u0302", "\u1e86": "W\u0307", "\u1e8c": "X\u0308", "\u1e8a": "X\u0307", "\xdd": "Y\u0301", "\u1ef2": "Y\u0300", "\u0178": "Y\u0308", "\u1ef8": "Y\u0303", "\u0232": "Y\u0304", "\u0176": "Y\u0302", "\u1e8e": "Y\u0307", "\u0179": "Z\u0301", "\u017d": "Z\u030c", "\u1e90": "Z\u0302", "\u017b": "Z\u0307", "\u03ac": "\u03b1\u0301", "\u1f70": "\u03b1\u0300", "\u1fb1": "\u03b1\u0304", "\u1fb0": "\u03b1\u0306", "\u03ad": "\u03b5\u0301", "\u1f72": "\u03b5\u0300", "\u03ae": "\u03b7\u0301", "\u1f74": "\u03b7\u0300", "\u03af": "\u03b9\u0301", "\u1f76": "\u03b9\u0300", "\u03ca": "\u03b9\u0308", "\u0390": "\u03b9\u0308\u0301", "\u1fd2": "\u03b9\u0308\u0300", "\u1fd1": "\u03b9\u0304", "\u1fd0": "\u03b9\u0306", "\u03cc": "\u03bf\u0301", "\u1f78": "\u03bf\u0300", "\u03cd": "\u03c5\u0301", "\u1f7a": "\u03c5\u0300", "\u03cb": "\u03c5\u0308", "\u03b0": "\u03c5\u0308\u0301", 
"\u1fe2": "\u03c5\u0308\u0300", "\u1fe1": "\u03c5\u0304", "\u1fe0": "\u03c5\u0306", "\u03ce": "\u03c9\u0301", "\u1f7c": "\u03c9\u0300", "\u038e": "\u03a5\u0301", "\u1fea": "\u03a5\u0300", "\u03ab": "\u03a5\u0308", "\u1fe9": "\u03a5\u0304", "\u1fe8": "\u03a5\u0306", "\u038f": "\u03a9\u0301", "\u1ffa": "\u03a9\u0300" }; class Vn { constructor(e, t) { this.mode = void 0, this.gullet = void 0, this.settings = void 0, this.leftrightDepth = void 0, this.nextToken = void 0, this.mode = "math", this.gullet = new Hn(e, t, this.mode), this.settings = t, this.leftrightDepth = 0 } expect(e, t) { if (void 0 === t && (t = !0), this.fetch().text !== e) throw new n("Expected '" + e + "', got '" + this.fetch().text + "'", this.fetch()); t && this.consume() } consume() { this.nextToken = null } fetch() { return null == this.nextToken && (this.nextToken = this.gullet.expandNextToken()), this.nextToken } switchMode(e) { this.mode = e, this.gullet.switchMode(e) } parse() { this.settings.globalGroup || this.gullet.beginGroup(), this.settings.colorIsTextColor && this.gullet.macros.set("\\color", "\\textcolor"); try { const e = this.parseExpression(!1); return this.expect("EOF"), this.settings.globalGroup || this.gullet.endGroup(), e } finally { this.gullet.endGroups() } } subparse(e) { const t = this.nextToken; this.consume(), this.gullet.pushToken(new Nr("}")), this.gullet.pushTokens(e); const r = this.parseExpression(!1); return this.expect("}"), this.nextToken = t, r } parseExpression(e, t) { const r = []; for (; ;) { "math" === this.mode && this.consumeSpaces(); const n = this.fetch(); if (-1 !== Vn.endOfExpression.indexOf(n.text)) break; if (t && n.text === t) break; if (e && yn[n.text] && yn[n.text].infix) break; const o = this.parseAtom(t); if (!o) break; "internal" !== o.type && r.push(o) } return "text" === this.mode && this.formLigatures(r), this.handleInfixNodes(r) } handleInfixNodes(e) { let t, r = -1; for (let o = 0; o < e.length; o++)if ("infix" === e[o].type) { if (-1 !== r) throw new n("only one infix operator per group", e[o].token); r = o, t = e[o].replaceWith } if (-1 !== r && t) { let n, o; const s = e.slice(0, r), i = e.slice(r + 1); let a; return n = 1 === s.length && "ordgroup" === s[0].type ? s[0] : { type: "ordgroup", mode: this.mode, body: s }, o = 1 === i.length && "ordgroup" === i[0].type ? i[0] : { type: "ordgroup", mode: this.mode, body: i }, a = "\\\\abovefrac" === t ? 
this.callFunction(t, [n, e[r], o], []) : this.callFunction(t, [n, o], []), [a] } return e } handleSupSubscript(e) { const t = this.fetch(), r = t.text; this.consume(), this.consumeSpaces(); const o = this.parseGroup(e); if (!o) throw new n("Expected group after '" + r + "'", t); return o } formatUnsupportedCmd(e) { const t = []; for (let r = 0; r < e.length; r++)t.push({ type: "textord", mode: "text", text: e[r] }); const r = { type: "text", mode: this.mode, body: t }; return { type: "color", mode: this.mode, color: this.settings.errorColor, body: [r] } } parseAtom(e) { const t = this.parseGroup("atom", e); if ("text" === this.mode) return t; let r, o; for (; ;) { this.consumeSpaces(); const e = this.fetch(); if ("\\limits" === e.text || "\\nolimits" === e.text) { if (t && "op" === t.type) { const r = "\\limits" === e.text; t.limits = r, t.alwaysHandleSupSub = !0 } else { if (!t || "operatorname" !== t.type) throw new n("Limit controls must follow a math operator", e); t.alwaysHandleSupSub && (t.limits = "\\limits" === e.text) } this.consume() } else if ("^" === e.text) { if (r) throw new n("Double superscript", e); r = this.handleSupSubscript("superscript") } else if ("_" === e.text) { if (o) throw new n("Double subscript", e); o = this.handleSupSubscript("subscript") } else if ("'" === e.text) { if (r) throw new n("Double superscript", e); const t = { type: "textord", mode: this.mode, text: "\\prime" }, o = [t]; for (this.consume(); "'" === this.fetch().text;)o.push(t), this.consume(); "^" === this.fetch().text && o.push(this.handleSupSubscript("superscript")), r = { type: "ordgroup", mode: this.mode, body: o } } else { if (!En[e.text]) break; { const t = On.test(e.text), n = []; for (n.push(new Nr(En[e.text])), this.consume(); ;) { const e = this.fetch().text; if (!En[e]) break; if (On.test(e) !== t) break; n.unshift(new Nr(En[e])), this.consume() } const s = this.subparse(n); t ? o = { type: "ordgroup", mode: "math", body: s } : r = { type: "ordgroup", mode: "math", body: s } } } } return r || o ? { type: "supsub", mode: this.mode, base: t, sup: r, sub: o } : t } parseFunction(e, t) { const r = this.fetch(), o = r.text, s = yn[o]; if (!s) return null; if (this.consume(), t && "atom" !== t && !s.allowedInArgument) throw new n("Got function '" + o + "' with no arguments" + (t ? 
" as " + t : ""), r); if ("text" === this.mode && !s.allowedInText) throw new n("Can't use function '" + o + "' in text mode", r); if ("math" === this.mode && !1 === s.allowedInMath) throw new n("Can't use function '" + o + "' in math mode", r); const { args: i, optArgs: a } = this.parseArguments(o, s); return this.callFunction(o, i, a, r, e) } callFunction(e, t, r, o, s) { const i = { funcName: e, parser: this, token: o, breakOnTokenText: s }, a = yn[e]; if (a && a.handler) return a.handler(i, t, r); throw new n("No function handler for " + e) } parseArguments(e, t) { const r = t.numArgs + t.numOptionalArgs; if (0 === r) return { args: [], optArgs: [] }; const o = [], s = []; for (let i = 0; i < r; i++) { let r = t.argTypes && t.argTypes[i]; const a = i < t.numOptionalArgs; (t.primitive && null == r || "sqrt" === t.type && 1 === i && null == s[0]) && (r = "primitive"); const l = this.parseGroupOfType("argument to '" + e + "'", r, a); if (a) s.push(l); else { if (null == l) throw new n("Null argument, please report this as a bug"); o.push(l) } } return { args: o, optArgs: s } } parseGroupOfType(e, t, r) { switch (t) { case "color": return this.parseColorGroup(r); case "size": return this.parseSizeGroup(r); case "url": return this.parseUrlGroup(r); case "math": case "text": return this.parseArgumentGroup(r, t); case "hbox": { const e = this.parseArgumentGroup(r, "text"); return null != e ? { type: "styling", mode: e.mode, body: [e], style: "text" } : null } case "raw": { const e = this.parseStringGroup("raw", r); return null != e ? { type: "raw", mode: "text", string: e.text } : null } case "primitive": { if (r) throw new n("A primitive argument cannot be optional"); const t = this.parseGroup(e); if (null == t) throw new n("Expected group as " + e, this.fetch()); return t } case "original": case null: case void 0: return this.parseArgumentGroup(r); default: throw new n("Unknown group type as " + e, this.fetch()) } } consumeSpaces() { for (; " " === this.fetch().text;)this.consume() } parseStringGroup(e, t) { const r = this.gullet.scanArgument(t); if (null == r) return null; let n, o = ""; for (; "EOF" !== (n = this.fetch()).text;)o += n.text, this.consume(); return this.consume(), r.text = o, r } parseRegexGroup(e, t) { const r = this.fetch(); let o, s = r, i = ""; for (; "EOF" !== (o = this.fetch()).text && e.test(i + o.text);)s = o, i += s.text, this.consume(); if ("" === i) throw new n("Invalid " + t + ": '" + r.text + "'", r); return r.range(s, i) } parseColorGroup(e) { const t = this.parseStringGroup("color", e); if (null == t) return null; const r = /^(#[a-f0-9]{3}|#?[a-f0-9]{6}|[a-z]+)$/i.exec(t.text); if (!r) throw new n("Invalid color: '" + t.text + "'", t); let o = r[0]; return /^[0-9a-f]{6}$/i.test(o) && (o = "#" + o), { type: "color-token", mode: this.mode, color: o } } parseSizeGroup(e) { let t, r = !1; if (this.gullet.consumeSpaces(), t = e || "{" === this.gullet.future().text ? this.parseStringGroup("size", e) : this.parseRegexGroup(/^[-+]? *(?:$|\d+|\d+\.\d*|\.\d*) *[a-z]{0,2} *$/, "size"), !t) return null; e || 0 !== t.text.length || (t.text = "0pt", r = !0); const o = /([-+]?) 
*(\d+(?:\.\d*)?|\.\d+) *([a-z]{2})/.exec(t.text); if (!o) throw new n("Invalid size: '" + t.text + "'", t); const s = { number: +(o[1] + o[2]), unit: o[3] }; if (!V(s)) throw new n("Invalid unit: '" + s.unit + "'", t); return { type: "size", mode: this.mode, value: s, isBlank: r } } parseUrlGroup(e) { this.gullet.lexer.setCatcode("%", 13), this.gullet.lexer.setCatcode("~", 12); const t = this.parseStringGroup("url", e); if (this.gullet.lexer.setCatcode("%", 14), this.gullet.lexer.setCatcode("~", 13), null == t) return null; const r = t.text.replace(/\\([#$%&~_^{}])/g, "$1"); return { type: "url", mode: this.mode, url: r } } parseArgumentGroup(e, t) { const r = this.gullet.scanArgument(e); if (null == r) return null; const n = this.mode; t && this.switchMode(t), this.gullet.beginGroup(); const o = this.parseExpression(!1, "EOF"); this.expect("EOF"), this.gullet.endGroup(); const s = { type: "ordgroup", mode: this.mode, loc: r.loc, body: o }; return t && this.switchMode(n), s } parseGroup(e, t) { const r = this.fetch(), o = r.text; let s; if ("{" === o || "\\begingroup" === o) { this.consume(); const e = "{" === o ? "}" : "\\endgroup"; this.gullet.beginGroup(); const t = this.parseExpression(!1, e), n = this.fetch(); this.expect(e), this.gullet.endGroup(), s = { type: "ordgroup", mode: this.mode, loc: Cr.range(r, n), body: t, semisimple: "\\begingroup" === o || void 0 } } else if (s = this.parseFunction(t, e) || this.parseSymbol(), null == s && "\\" === o[0] && !Rn.hasOwnProperty(o)) { if (this.settings.throwOnError) throw new n("Undefined control sequence: " + o, r); s = this.formatUnsupportedCmd(o), this.consume() } return s } formLigatures(e) { let t = e.length - 1; for (let r = 0; r < t; ++r) { const n = e[r], o = n.text; "-" === o && "-" === e[r + 1].text && (r + 1 < t && "-" === e[r + 2].text ? (e.splice(r, 3, { type: "textord", mode: "text", loc: Cr.range(n, e[r + 2]), text: "---" }), t -= 2) : (e.splice(r, 2, { type: "textord", mode: "text", loc: Cr.range(n, e[r + 1]), text: "--" }), t -= 1)), "'" !== o && "`" !== o || e[r + 1].text !== o || (e.splice(r, 2, { type: "textord", mode: "text", loc: Cr.range(n, e[r + 1]), text: o + o }), t -= 1) } } parseSymbol() { const e = this.fetch(); let t = e.text; if (/^\\verb[^a-zA-Z]/.test(t)) { this.consume(); let e = t.slice(5); const r = "*" === e.charAt(0); if (r && (e = e.slice(1)), e.length < 2 || e.charAt(0) !== e.slice(-1)) throw new n("\\verb assertion failed --\n please report what input caused this bug"); return e = e.slice(1, -1), { type: "verb", mode: "text", body: e, star: r } } Dn.hasOwnProperty(t[0]) && !oe[this.mode][t[0]] && (this.settings.strict && "math" === this.mode && this.settings.reportNonstrict("unicodeTextInMathMode", 'Accented Unicode text character "' + t[0] + '" used in math mode', e), t = Dn[t[0]] + t.slice(1)); const r = kn.exec(t); let o; if (r && (t = t.substring(0, r.index), "i" === t ? t = "\u0131" : "j" === t && (t = "\u0237")), oe[this.mode][t]) { this.settings.strict && "math" === this.mode && Ae.indexOf(t) >= 0 && this.settings.reportNonstrict("unicodeTextInMathMode", 'Latin-1/Unicode text character "' + t[0] + '" used in math mode', e); const r = oe[this.mode][t].group, n = Cr.range(e); let s; if (te.hasOwnProperty(r)) { const e = r; s = { type: "atom", mode: this.mode, family: e, loc: n, text: t } } else s = { type: r, mode: this.mode, loc: n, text: t }; o = s } else { if (!(t.charCodeAt(0) >= 128)) return null; this.settings.strict && (S(t.charCodeAt(0)) ? 
"math" === this.mode && this.settings.reportNonstrict("unicodeTextInMathMode", 'Unicode text character "' + t[0] + '" used in math mode', e) : this.settings.reportNonstrict("unknownSymbol", 'Unrecognized Unicode character "' + t[0] + '" (' + t.charCodeAt(0) + ")", e)), o = { type: "textord", mode: "text", loc: Cr.range(e), text: t } } if (this.consume(), r) for (let t = 0; t < r[0].length; t++) { const s = r[0][t]; if (!Ln[s]) throw new n("Unknown accent ' " + s + "'", e); const i = Ln[s][this.mode] || Ln[s].text; if (!i) throw new n("Accent " + s + " unsupported in " + this.mode + " mode", e); o = { type: "accent", mode: this.mode, loc: Cr.range(e), label: i, isStretchy: !1, isShifty: !0, base: o } } return o } } Vn.endOfExpression = ["}", "\\endgroup", "\\end", "\\right", "&"]; var Pn = function (e, t) { if (!("string" == typeof e || e instanceof String)) throw new TypeError("KaTeX can only parse string typed expression"); const r = new Vn(e, t); delete r.gullet.macros.current["\\df@tag"]; let o = r.parse(); if (delete r.gullet.macros.current["\\current@color"], delete r.gullet.macros.current["\\color"], r.gullet.macros.get("\\df@tag")) { if (!t.displayMode) throw new n("\\tag works only in display equations"); o = [{ type: "tag", mode: "text", body: o, tag: r.subparse([new Nr("\\df@tag")]) }] } return o }; let Fn = function (e, t, r) { t.textContent = ""; const n = Un(e, r).toNode(); t.appendChild(n) }; "undefined" != typeof document && "CSS1Compat" !== document.compatMode && ("undefined" != typeof console && console.warn("Warning: KaTeX doesn't work in quirks mode. Make sure your website has a suitable doctype."), Fn = function () { throw new n("KaTeX doesn't work in quirks mode.") }); const Gn = function (e, t, r) { if (r.throwOnError || !(e instanceof n)) throw e; const o = Ve.makeSpan(["katex-error"], [new Z(t)]); return o.setAttribute("title", e.toString()), o.setAttribute("style", "color:" + r.errorColor), o }, Un = function (e, t) { const r = new m(t); try { const t = Pn(e, r); return zt(t, e, r) } catch (t) { return Gn(t, e, r) } }; var Yn = { version: "0.16.11", render: Fn, renderToString: function (e, t) { return Un(e, t).toMarkup() }, ParseError: n, SETTINGS_SCHEMA: h, __parse: function (e, t) { const r = new m(t); return Pn(e, r) }, __renderToDomTree: Un, __renderToHTMLTree: function (e, t) { const r = new m(t); try { return function (e, t, r) { const n = mt(e, St(r)), o = Ve.makeSpan(["katex"], [n]); return Mt(o, r) }(Pn(e, r), 0, r) } catch (t) { return Gn(t, e, r) } }, __setFontMetrics: function (e, t) { T[e] = t }, __defineSymbol: se, __defineFunction: je, __defineMacro: Br, __domTree: { Span: W, Anchor: _, SymbolNode: Z, SvgNode: K, PathNode: J, LineNode: Q } }; return t = t.default }() })); diff --git a/docs/model/hardware/install_other_devices.md b/docs/model/hardware/install_other_devices.md new file mode 100644 index 0000000000..f5fbb4b723 --- /dev/null +++ b/docs/model/hardware/install_other_devices.md @@ -0,0 +1,66 @@ +--- +comments: true +typora-copy-images-to: images +--- + +# 多硬件安装飞桨 + +本文档主要针对昇腾 NPU 硬件平台,介绍如何安装飞桨。 + +## 1. 
昇腾 NPU 飞桨安装 + +### 1.1 环境准备 + +当前 PaddleOCR 支持昇腾 910B 芯片,昇腾驱动版本为 23.0.3。考虑到环境差异性,我们推荐使用飞桨官方提供的标准镜像完成环境准备。 + +#### 拉取镜像 + +此镜像仅为开发环境,镜像中不包含预编译的飞桨安装包,镜像中已经默认安装了昇腾算子库 CANN-8.0.RC1。 + +```bash linenums="1" +# 适用于 X86 架构,暂时不提供 Arch64 架构镜像 +docker pull registry.baidubce.com/device/paddle-npu:cann80RC1-ubuntu20-x86_64-gcc84-py39 +``` + +#### 启动容器 + +ASCEND_RT_VISIBLE_DEVICES 指定可见的 NPU 卡号 + +```bash linenums="1" +docker run -it --name paddle-npu-dev -v $(pwd):/work \ + --privileged --network=host --shm-size=128G -w=/work \ + -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ + -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ + -v /usr/local/dcmi:/usr/local/dcmi \ + -e ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ + registry.baidubce.com/device/paddle-npu:cann80RC1-ubuntu20-x86_64-gcc84-py39 /bin/bash +``` + +### 1.2 安装 paddle 包 + +当前提供 Python3.9 的 wheel 安装包。如有其他 Python 版本需求,可以参考[飞桨官方文档](https://www.paddlepaddle.org.cn/install/quick)自行编译安装。 + +#### 1. 下载安装 Python3.9 的 wheel 安装包 + +```bash linenums="1" +# 注意需要先安装飞桨 cpu 版本 +pip install https://paddle-model-ecology.bj.bcebos.com/paddlex/whl/paddle-device/npu/paddlepaddle-0.0.0-cp39-cp39-linux_x86_64.whl +pip install https://paddle-model-ecology.bj.bcebos.com/paddlex/whl/paddle-device/npu/paddle_custom_npu-0.0.0-cp39-cp39-linux_x86_64.whl +``` + +#### 2. 验证安装包 + +安装完成之后,运行如下命令。 + +```bash linenums="1" +python -c "import paddle; paddle.utils.run_check()" +``` + +预期得到如下输出结果 + +```bash linenums="1" +Running verify PaddlePaddle program ... +PaddlePaddle works well on 1 npu. +PaddlePaddle works well on 8 npus. +PaddlePaddle is installed successfully! Let's start deep learning with PaddlePaddle now. +``` diff --git a/docs/model/hardware/supported_models.md b/docs/model/hardware/supported_models.md new file mode 100644 index 0000000000..9ca4bd5f97 --- /dev/null +++ b/docs/model/hardware/supported_models.md @@ -0,0 +1,12 @@ +--- +comments: true +typora-copy-images-to: images +--- + +# PaddleOCR模型列表 + +*多硬件安装方式请参考[多硬件安装文档](./install_other_devices.md)* + +| 模型名称 | 昇腾NPU | +| ---------------- | -------- | +| PP-OCRv4 | √ | diff --git a/docs/model/index.md b/docs/model/index.md new file mode 100644 index 0000000000..7929863cfb --- /dev/null +++ b/docs/model/index.md @@ -0,0 +1,26 @@ +--- +comments: true +hide: + - toc +# - navigation +--- + +## PP-OCR 系列模型列表(更新中) + +| 模型简介 | 模型名称 | 推荐场景 | 检测模型| 方向分类器 | 识别模型| +| ----- | -------------- | --------------- | ------- | ------- | ---------- | +| 中英文超轻量 PP-OCRv4 模型(15.8M) | ch_PP-OCRv4_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_train.tar) | +| 中英文超轻量 PP-OCRv3 模型(16.2M) | ch_PP-OCRv3_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / 
[训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) | +| 英文超轻量 PP-OCRv3 模型(13.4M) | en_PP-OCRv3_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) | + +- 超轻量 OCR 系列更多模型下载(包括多语言),可以参考[PP-OCR 系列模型下载](../ppocr/model_list.md),文档分析相关模型参考[PP-Structure 系列模型下载](../ppstructure/models_list.md) + +### PaddleOCR 场景应用模型 + +| 行业 | 类别 | 亮点| 文档说明| 模型下载 | +| ---- | ---- | -------- | ---- | ----- | +| 制造 | 数码管识别 | 数码管数据合成、漏识别调优 | [光功率计数码管字符识别](../applications/光功率计数码管字符识别.md) | [下载链接](../applications/overview.md) | +| 金融 | 通用表单识别 | 多模态通用表单结构化提取 | [多模态表单识别](../applications/多模态表单识别.md) | [下载链接](../applications/overview.md) | +| 交通 | 车牌识别 | 多角度图像处理、轻量模型、端侧部署 | [轻量级车牌识别](../applications/轻量级车牌识别.md) | [下载链接](../applications/overview.md) | + +- 更多制造、金融、交通行业的主要 OCR 垂类应用模型(如电表、液晶屏、高精度 SVTR 模型等),可参考[场景应用模型下载](../applications/overview.md) diff --git a/docs/ppocr/blog/PP-OCRv3_introduction.md b/docs/ppocr/blog/PP-OCRv3_introduction.md new file mode 100644 index 0000000000..dd4257a24f --- /dev/null +++ b/docs/ppocr/blog/PP-OCRv3_introduction.md @@ -0,0 +1,183 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# PP-OCRv3 + +## 1. 简介 + +PP-OCRv3在PP-OCRv2的基础上进一步升级。整体的框架图保持了与PP-OCRv2相同的pipeline,针对检测模型和识别模型进行了优化。其中,检测模块仍基于DB算法优化,而识别模块不再采用CRNN,换成了IJCAI 2022最新收录的文本识别算法[SVTR](https://arxiv.org/abs/2205.00159),并对其进行产业适配。PP-OCRv3系统框图如下所示(粉色框中为PP-OCRv3新增策略): + +![img](./images/ppocrv3_framework-0052468.png) + +从算法改进思路上看,分别针对检测和识别模型,进行了共9个方面的改进: + +- 检测模块: + - LK-PAN:大感受野的PAN结构; + - DML:教师模型互学习策略; + - RSE-FPN:残差注意力机制的FPN结构; + +- 识别模块: + - SVTR_LCNet:轻量级文本识别网络; + - GTC:Attention指导CTC训练策略; + - TextConAug:挖掘文字上下文信息的数据增广策略; + - TextRotNet:自监督的预训练模型; + - UDML:联合互学习策略; + - UIM:无标注数据挖掘方案。 + +从效果上看,速度可比情况下,多种场景精度均有大幅提升: + +- 中文场景,相对于PP-OCRv2中文模型提升超5%; +- 英文数字场景,相比于PP-OCRv2英文模型提升11%; +- 多语言场景,优化80+语种识别效果,平均准确率提升超5%。 + +## 2. 
检测优化 + +PP-OCRv3检测模型是对PP-OCRv2中的[CML](https://arxiv.org/pdf/2109.03144.pdf)(Collaborative Mutual Learning) 协同互学习文本检测蒸馏策略进行了升级。如下图所示,CML的核心思想结合了①传统的Teacher指导Student的标准蒸馏与 ②Students网络之间的DML互学习,可以让Students网络互学习的同时,Teacher网络予以指导。PP-OCRv3分别针对教师模型和学生模型进行进一步效果优化。其中,在对教师模型优化时,提出了大感受野的PAN结构LK-PAN和引入了DML(Deep Mutual Learning)蒸馏策略;在对学生模型优化时,提出了残差注意力机制的FPN结构RSE-FPN。 + +![img](./images/ppocrv3_det_cml.png) + +消融实验如下: + +|序号|策略|模型大小|hmean|速度(cpu + mkldnn)| +|-|-|-|-|-| +|baseline teacher|PP-OCR server|49.0M|83.20%|171ms| +|teacher1|DB-R50-LK-PAN|124.0M|85.00%|396ms| +|teacher2|DB-R50-LK-PAN-DML|124.0M|86.00%|396ms| +|baseline student|PP-OCRv2|3.0M|83.20%|117ms| +|student0|DB-MV3-RSE-FPN|3.6M|84.50%|124ms| +|student1|DB-MV3-CML(teacher2)|3.0M|84.30%|117ms| +|student2|DB-MV3-RSE-FPN-CML(teacher2)|3.60M|85.40%|124ms| + +测试环境: Intel Gold 6148 CPU,预测时开启MKLDNN加速。 + +### (1)LK-PAN:大感受野的PAN结构 + +LK-PAN (Large Kernel PAN) 是一个具有更大感受野的轻量级[PAN](https://arxiv.org/pdf/1803.01534.pdf)结构,核心是将PAN结构的path augmentation中卷积核从`3*3`改为`9*9`。通过增大卷积核,提升特征图每个位置覆盖的感受野,更容易检测大字体的文字以及极端长宽比的文字。使用LK-PAN结构,可以将教师模型的hmean从83.2%提升到85.0%。 + +![img](./images/LKPAN.png) + +### (2)DML:教师模型互学习策略 + +[DML](https://arxiv.org/abs/1706.00384) (Deep Mutual Learning)互学习蒸馏方法,如下图所示,通过两个结构相同的模型互相学习,可以有效提升文本检测模型的精度。教师模型采用DML策略,hmean从85%提升到86%。将PP-OCRv2中CML的教师模型更新为上述更高精度的教师模型,学生模型的hmean可以进一步从83.2%提升到84.3%。 + +![img](./images/teacher_dml.png) + +### (3)RSE-FPN:残差注意力机制的FPN结构 + +RSE-FPN(Residual Squeeze-and-Excitation FPN)如下图所示,引入残差结构和通道注意力结构,将FPN中的卷积层更换为通道注意力结构的RSEConv层,进一步提升特征图的表征能力。考虑到PP-OCRv2的检测模型中FPN通道数非常小,仅为96,如果直接用SEblock代替FPN中卷积会导致某些通道的特征被抑制,精度会下降。RSEConv引入残差结构会缓解上述问题,提升文本检测效果。进一步将PP-OCRv2中CML的学生模型的FPN结构更新为RSE-FPN,学生模型的hmean可以进一步从84.3%提升到85.4%。 + +![img](./images/RSEFPN.png) + +## 3. 识别优化 + +PP-OCRv3的识别模块是基于文本识别算法[SVTR](https://arxiv.org/abs/2205.00159)优化。SVTR不再采用RNN结构,通过引入Transformers结构更加有效地挖掘文本行图像的上下文信息,从而提升文本识别能力。直接将PP-OCRv2的识别模型,替换成SVTR_Tiny,识别准确率从74.8%提升到80.1%(+5.3%),但是预测速度慢了将近11倍,CPU上预测一条文本行,将近100ms。因此,如下图所示,PP-OCRv3采用如下6个优化策略进行识别模型加速。 + +![img](./images/v3_rec_pipeline.png) + +基于上述策略,PP-OCRv3识别模型相比PP-OCRv2,在速度可比的情况下,精度进一步提升4.6%。 具体消融实验如下所示: + +| ID | 策略 | 模型大小 | 精度 | 预测耗时(CPU + MKLDNN)| +|-----|-----|--------|----| --- | +| 01 | PP-OCRv2 | 8.0M | 74.80% | 8.54ms | +| 02 | SVTR_Tiny | 21.0M | 80.10% | 97.00ms | +| 03 | SVTR_LCNet(h32) | 12.0M | 71.90% | 6.60ms | +| 04 | SVTR_LCNet(h48) | 12.0M | 73.98% | 7.60ms | +| 05 | + GTC | 12.0M | 75.80% | 7.60ms | +| 06 | + TextConAug | 12.0M | 76.30% | 7.60ms | +| 07 | + TextRotNet | 12.0M | 76.90% | 7.60ms | +| 08 | + UDML | 12.0M | 78.40% | 7.60ms | +| 09 | + UIM | 12.0M | 79.40% | 7.60ms | + +注: 测试速度时,实验01-03输入图片尺寸均为(3,32,320),04-08输入图片尺寸均为(3,48,320)。在实际预测时,图像为变长输入,速度会有所变化。测试环境: Intel Gold 6148 CPU,预测时开启MKLDNN加速。 + +### (1)SVTR_LCNet:轻量级文本识别网络 + +SVTR_LCNet是针对文本识别任务,将基于Transformer的[SVTR](https://arxiv.org/abs/2205.00159)网络和轻量级CNN网络[PP-LCNet](https://arxiv.org/abs/2109.15099) 融合的一种轻量级文本识别网络。使用该网络,预测速度优于PP-OCRv2的识别模型20%,但是由于没有采用蒸馏策略,该识别模型效果略差。此外,进一步将输入图片规范化高度从32提升到48,预测速度稍微变慢,但是模型效果大幅提升,识别准确率达到73.98%(+2.08%),接近PP-OCRv2采用蒸馏策略的识别模型效果。 + +SVTR_Tiny 网络结构如下所示: + +![img](./images/svtr_tiny.png) + +由于 MKLDNN 加速库支持的模型结构有限,SVTR 在 CPU+MKLDNN 上相比 PP-OCRv2 慢了10倍。PP-OCRv3 期望在提升模型精度的同时,不带来额外的推理耗时。通过分析发现,SVTR_Tiny 结构的主要耗时模块为 Mixing Block,因此我们对 SVTR_Tiny 的结构进行了一系列优化(详细速度数据请参考下方消融实验表格): + +1. 将 SVTR 网络前半部分替换为 PP-LCNet 的前三个stage,保留4个 Global Mixing Block ,精度为76%,加速69%,网络结构如下所示: + + ![img](./images/svtr_g4.png) + +2. 将4个 Global Mixing Block 减小到2个,精度为72.9%,加速69%,网络结构如下所示: + + ![img](./images/svtr_g2.png) + +3. 
实验发现 Global Mixing Block 的预测速度与输入其特征的shape有关,因此后移 Global Mixing Block 的位置到池化层之后,精度下降为71.9%,速度超越基于CNN结构的PP-OCRv2-baseline 22%,网络结构如下所示: + + ![img](./images/LCNet_SVTR.png) + +具体消融实验如下所示: + +| ID | 策略 | 模型大小 | 精度 | 速度(CPU + MKLDNN)| +|-----|-----|--------|----| --- | +| 01 | PP-OCRv2-baseline | 8.0M | 69.30% | 8.54ms | +| 02 | SVTR_Tiny | 21.0M | 80.10% | 97.00ms | +| 03 | SVTR_LCNet(G4) | 9.2M | 76.00% | 30.00ms | +| 04 | SVTR_LCNet(G2) | 13.0M | 72.98% | 9.37ms | +| 05 | SVTR_LCNet(h32) | 12.0M | 71.90% | 6.60ms | +| 06 | SVTR_LCNet(h48) | 12.0M | 73.98% | 7.60ms | + +注: 测试速度时,01-05输入图片尺寸均为(3,32,320); PP-OCRv2-baseline 代表没有借助蒸馏方法训练得到的模型 + +### (2)GTC:Attention指导CTC训练策略 + +[GTC](https://arxiv.org/pdf/2002.01276.pdf)(Guided Training of CTC),利用Attention模块CTC训练,融合多种文本特征的表达,是一种有效的提升文本识别的策略。使用该策略,预测时完全去除 Attention 模块,在推理阶段不增加任何耗时,识别模型的准确率进一步提升到75.8%(+1.82%)。训练流程如下所示: + +![img](./images/GTC.png) + +### (3)TextConAug:挖掘文字上下文信息的数据增广策略 + +TextConAug是一种挖掘文字上下文信息的数据增广策略,主要思想来源于论文[ConCLR](https://www.cse.cuhk.edu.hk/~byu/papers/C139-AAAI2022-ConCLR.pdf),作者提出ConAug数据增广,在一个batch内对2张不同的图像进行联结,组成新的图像并进行自监督对比学习。PP-OCRv3将此方法应用到有监督的学习任务中,设计了TextConAug数据增强方法,可以丰富训练数据上下文信息,提升训练数据多样性。使用该策略,识别模型的准确率进一步提升到76.3%(+0.5%)。TextConAug示意图如下所示: + +![img](./images/recconaug.png) + +### (4)TextRotNet:自监督的预训练模型 + +TextRotNet是使用大量无标注的文本行数据,通过自监督方式训练的预训练模型,参考于论文[STR-Fewer-Labels](https://github.com/ku21fan/STR-Fewer-Labels)。该模型可以初始化SVTR_LCNet的初始权重,从而帮助文本识别模型收敛到更佳位置。使用该策略,识别模型的准确率进一步提升到76.9%(+0.6%)。TextRotNet训练流程如下图所示: + +img + +### (5)UDML:联合互学习策略 + +UDML(Unified-Deep Mutual Learning)联合互学习是PP-OCRv2中就采用的对于文本识别非常有效的提升模型效果的策略。在PP-OCRv3中,针对两个不同的SVTR_LCNet和Attention结构,对他们之间的PP-LCNet的特征图、SVTR模块的输出和Attention模块的输出同时进行监督训练。使用该策略,识别模型的准确率进一步提升到78.4%(+1.5%)。 + +### (6)UIM:无标注数据挖掘方案 + +UIM(Unlabeled Images Mining)是一种非常简单的无标注数据挖掘方案。核心思想是利用高精度的文本识别大模型对无标注数据进行预测,获取伪标签,并且选择预测置信度高的样本作为训练数据,用于训练小模型。使用该策略,识别模型的准确率进一步提升到79.4%(+1%)。实际操作中,我们使用全量数据集训练高精度SVTR-Tiny模型(acc=82.5%)进行数据挖掘,点击获取[模型下载地址和使用教程](../applications/高精度中文识别模型.md)。 + +img + +## 4. 端到端评估 + +经过以上优化,最终PP-OCRv3在速度可比情况下,中文场景端到端Hmean指标相比于PP-OCRv2提升5%,效果大幅提升。具体指标如下表所示: + +| Model | Hmean | Model Size (M) | Time Cost (CPU, ms) | Time Cost (T4 GPU, ms) | +|-----|-----|--------|----| --- | +| PP-OCR mobile | 50.30% | 8.1 | 356.00 | 116.00 | +| PP-OCR server | 57.00% | 155.1 | 1056.00 | 200.00 | +| PP-OCRv2 | 57.60% | 11.6 | 330.00 | 111.00 | +| PP-OCRv3 | 62.90% | 15.6 | 331.00 | 86.64 | + +测试环境:CPU型号为Intel Gold 6148,CPU预测时开启MKLDNN加速。 + +除了更新中文模型,本次升级也同步优化了英文数字模型,端到端效果提升11%,如下表所示: + +| Model | Recall | Precision | Hmean | +|-----|-----|--------|----| +| PP-OCR_en | 38.99% | 45.91% | 42.17% | +| PP-OCRv3_en | 50.95% | 55.53% | 53.14% | + +同时,也对已支持的80余种语言识别模型进行了升级更新,在有评估集的四种语系识别准确率平均提升5%以上,如下表所示: + +| Model | 拉丁语系 | 阿拉伯语系 | 日语 | 韩语 | +|-----|-----|--------|----| --- | +| PP-OCR_mul | 69.60% | 40.50% | 38.50% | 55.40% | +| PP-OCRv3_mul | 75.20%| 45.37% | 45.80% | 60.10% | diff --git a/docs/ppocr/blog/PP-OCRv4_introduction.md b/docs/ppocr/blog/PP-OCRv4_introduction.md new file mode 100644 index 0000000000..0579a607d9 --- /dev/null +++ b/docs/ppocr/blog/PP-OCRv4_introduction.md @@ -0,0 +1,153 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# PP-OCRv4 + +## 1. 
简介 + +PP-OCRv4在PP-OCRv3的基础上进一步升级。整体的框架图保持了与PP-OCRv3相同的pipeline,针对检测模型和识别模型进行了数据、网络结构、训练策略等多个模块的优化。 PP-OCRv4系统框图如下所示: + +![img](./images/ppocrv4_framework.png) + +从算法改进思路上看,分别针对检测和识别模型,进行了共10个方面的改进: + +* 检测模块: + * LCNetV3:精度更高的骨干网络 + * PFHead:并行head分支融合结构 + * DSR: 训练中动态增加shrink ratio + * CML:添加Student和Teacher网络输出的KL div loss + +* 识别模块: + * SVTR_LCNetV3:精度更高的骨干网络 + * Lite-Neck:精简的Neck结构 + * GTC-NRTR:稳定的Attention指导分支 + * Multi-Scale:多尺度训练策略 + * DF: 数据挖掘方案 + * DKD :DKD蒸馏策略 + +从效果上看,速度可比情况下,多种场景精度均有大幅提升: + +* 中文场景,相对于PP-OCRv3中文模型提升超4%; +* 英文数字场景,相比于PP-OCRv3英文模型提升6%; +* 多语言场景,优化80个语种识别效果,平均准确率提升超8%。 + +## 2. 检测优化 + +PP-OCRv4检测模型在PP-OCRv3检测模型的基础上,在网络结构,训练策略,蒸馏策略三个方面做了优化。首先,PP-OCRv4检测模型使用PP-LCNetV3替换MobileNetv3,并提出并行分支融合的PFhead结构;其次,训练时动态调整shrink ratio的比例;最后,PP-OCRv4对CML的蒸馏loss进行优化,进一步提升文字检测效果。 + +消融实验如下: + +|序号|策略|模型大小|hmean|速度(cpu + mkldnn)| +|-|-|-|-|-| +|baseline|PP-OCRv3|3.4M|78.84%|69ms| +|baseline student|PP-OCRv3 student|3.4M|76.22%|69ms| +|01|+PFHead|3.6M|76.97%|96ms| +|02|+Dynamic Shrink Ratio|3.6M|78.24%|96ms| +|03|+PP-LCNetv3|4.8M|79.08%|94ms| +|03|+CML|4.8M|79.87%|67ms| + +测试环境: Intel Gold 6148 CPU,预测引擎使用openvino。 + +### (1)PFhead:多分支融合Head结构 + +PFhead结构如下图所示,PFHead在经过第一个转置卷积后,分别进行上采样和转置卷积,上采样的输出通过3x3卷积得到输出结果,然后和转置卷积的分支的结果级联并经过1x1卷积层,最后1x1卷积的结果和转置卷积的结果相加得到最后输出的概率图。PP-OCRv4学生检测模型使用PFhead,hmean从76.22%增加到76.97%。 + + ![img](./images/PFHead.png) + +### (2)DSR: 收缩比例动态调整策略 + +动态shrink ratio(dynamic shrink ratio): 在训练中,shrink ratio由固定值调整为动态变化,随着训练epoch的增加,shrink ratio从0.4线性增加到0.6。该策略在PP-OCRv4学生检测模型上,hmean从76.97%提升到78.24%。 + +### (3) PP-LCNetV3:精度更高的骨干网络 + +PP-LCNetV3系列模型是PP-LCNet系列模型的延续,覆盖了更大的精度范围,能够适应不同下游任务的需要。PP-LCNetV3系列模型从多个方面进行了优化,提出了可学习仿射变换模块,对重参数化策略、激活函数进行了改进,同时调整了网络深度与宽度。最终,PP-LCNetV3系列模型能够在性能与效率之间达到最佳的平衡,在不同精度范围内取得极致的推理速度。使用PP-LCNetV3替换MobileNetv3 backbone,PP-OCRv4学生检测模型hmean从78.24%提升到79.08%。 + +### (4)CML: 融合KD的互学习策略 + +PP-OCRv4检测模型对PP-OCRv3中的CML(Collaborative Mutual Learning) 协同互学习文本检测蒸馏策略进行了优化。如下图所示,在计算Student Model和Teacher Model的distill Loss时,额外添加KL div loss,让两者输出的response maps分布接近,由此进一步提升Student网络的精度,检测Hmean从79.08%增加到79.56%,端到端指标从61.31%增加到61.87%。 + + ![img](./images/ppocrv4_det_cml.png) + +## 3. 
识别优化 + +PP-OCRv4识别模型在PP-OCRv3的基础上进一步升级。如下图所示,整体的框架图保持了与PP-OCRv3识别模型相同的pipeline,分别进行了数据、网络结构、训练策略等方面的优化。 + +![img](./images/v4_rec_pipeline.png) + +经过如图所示的策略优化,PP-OCRv4识别模型相比PP-OCRv3,在速度可比的情况下,精度进一步提升4%。 具体消融实验如下所示: + +| ID | 策略 | 模型大小 | 精度 | 预测耗时(CPU openvino)| +|-----|-----|--------|----| --- | +| 01 | PP-OCRv3 | 12M | 71.50% | 8.54ms | +| 02 | +DF | 12M | 72.70% | 8.54ms | +| 03 | + LiteNeck + GTC | 9.6M | 73.21% | 9.09ms | +| 04 | + PP-LCNetV3 | 11M | 74.18% | 9.8ms | +| 05 | + multi-scale | 11M | 74.20% | 9.8ms | +| 06 | + TextConAug | 11M | 74.72% | 9.8ms | +| 08 | + UDML | 11M | 75.45% | 9.8ms | + +注: 测试速度时,输入图片尺寸均为(3,48,320)。在实际预测时,图像为变长输入,速度会有所变化。测试环境: Intel Gold 6148 CPU,预测时使用Openvino预测引擎。 + +### (1)DF:数据挖掘方案 + +DF(Data Filter) 是一种简单有效的数据挖掘方案。核心思想是利用已有模型预测训练数据,通过置信度和预测结果等信息,对全量的训练数据进行筛选。具体的:首先使用少量数据快速训练得到一个低精度模型,使用该低精度模型对千万级的数据进行预测,去除置信度大于0.95的样本,该部分被认为是对提升模型精度无效的冗余样本。其次使用PP-OCRv3作为高精度模型,对剩余数据进行预测,去除置信度小于0.15的样本,该部分被认为是难以识别或质量很差的样本。 +使用该策略,千万级别训练数据被精简至百万级,模型训练时间从2周减少到5天,显著提升了训练效率,同时精度提升至72.7%(+1.2%)。 + +![img](./images/DF.png) + +### (2)PP-LCNetV3:精度更优的骨干网络 + +PP-LCNetV3系列模型是PP-LCNet系列模型的延续,覆盖了更大的精度范围,能够适应不同下游任务的需要。PP-LCNetV3系列模型从多个方面进行了优化,提出了可学习仿射变换模块,对重参数化策略、激活函数进行了改进,同时调整了网络深度与宽度。最终,PP-LCNetV3系列模型能够在性能与效率之间达到最佳的平衡,在不同精度范围内取得极致的推理速度。 + +### (3)Lite-Neck:精简参数的Neck结构 + +Lite-Neck整体结构沿用PP-OCRv3版本的结构,在参数上稍作精简,识别模型整体的模型大小可从12M降低到8.5M,而精度不变;在CTCHead中,将Neck输出特征的维度从64提升到120,此时模型大小从8.5M提升到9.6M。 + +### (4)GTC-NRTR:Attention指导CTC训练策略 + +GTC(Guided Training of CTC),是PP-OCRv3识别模型的最有效的策略之一,融合多种文本特征的表达,有效的提升文本识别精度。在PP-OCRv4中使用训练更稳定的Transformer模型NRTR作为指导分支,相比V3版本中的SAR基于循环神经网络的结构,NRTR基于Transformer实现解码过程泛化能力更强,能有效指导CTC分支学习,解决简单场景下快速过拟合的问题。使用Lite-Neck和GTC-NRTR两个策略,识别精度提升至73.21%(+0.5%)。 + +![img](./images/ppocrv4_gtc.png) + +### (5)Multi-Scale:多尺度训练策略 + +动态尺度训练策略,是在训练过程中随机resize输入图片的高度,以增强识别模型在端到端串联使用时的鲁棒性。在训练时,每个iter从(32,48,64)三种高度中随机选择一种高度进行resize。实验证明,使用该策略,尽管在识别测试集上准确率没有提升,但在端到端串联评估时,指标提升0.5%。 + +![img](./images/multi_scale.png) + +### (6)DKD:蒸馏策略 + +识别模型的蒸馏包含两个部分,NRTRhead蒸馏和CTCHead蒸馏; + +对于NRTR head,使用了DKD loss蒸馏,拉近学生模型和教师模型的NRTR head logits。最终NRTR head的loss是学生与教师间的DKD loss和与ground truth的cross entropy loss的加权和,用于监督学生模型的backbone训练。通过实验,我们发现加入DKD loss后,计算与ground truth的cross entropy loss时去除label smoothing可以进一步提高精度,因此我们在这里使用的是不带label smoothing的cross entropy loss。 + +对于CTCHead,由于CTC的输出中存在Blank位,即使教师模型和学生模型的预测结果一样,二者的输出的logits分布也会存在差异,影响教师模型向学生模型的知识传递。PP-OCRv4识别模型蒸馏策略中,将CTC输出logits沿着文本长度维度计算均值,将多字符识别问题转换为多字符分类问题,用于监督CTC Head的训练。使用该策略融合NRTRhead DKD蒸馏策略,指标从74.72%提升到75.45%。 + +## 4. 端到端评估 + +经过以上优化,最终PP-OCRv4在速度可比情况下,中文场景端到端Hmean指标相比于PP-OCRv3提升4.5%,效果大幅提升。具体指标如下表所示: + +| Model | Hmean | Model Size (M) | Time Cost (CPU, ms) | +|-----|-----|--------|----| +| PP-OCRv3 | 57.99% | 15.6 | 78 | +| PP-OCRv4 | 62.24% | 15.8 | 76 | + +测试环境:CPU型号为Intel Gold 6148,CPU预测时使用openvino。 + +除了更新中文模型,本次升级也优化了英文数字模型,在自有评估集上文本识别准确率提升6%,如下表所示: + +| Model | ACC | +|-----|-----| +| PP-OCR_en | 54.38% | +| PP-OCRv3_en | 64.04% | +| PP-OCRv4_en | 70.1% | + +同时,对已支持的80余种语言识别模型进行了升级更新,在有评估集的四种语系识别准确率平均提升8%以上,如下表所示: + +| Model | 拉丁语系 | 阿拉伯语系 | 日语 | 韩语 | +|-----|-----|--------|----| --- | +| PP-OCR_mul | 69.60% | 40.50% | 38.50% | 55.40% | +| PP-OCRv3_mul | 71.57%| 72.90% | 45.85% | 77.23% | +| PP-OCRv4_mul | 80.00%| 75.48% | 56.50% | 83.25% | diff --git a/docs/ppocr/blog/clone.en.md b/docs/ppocr/blog/clone.en.md new file mode 100644 index 0000000000..f72ce0431c --- /dev/null +++ b/docs/ppocr/blog/clone.en.md @@ -0,0 +1,31 @@ +--- +comments: true +--- + +# Project Clone + +## 1. 
Clone PaddleOCR + +```bash linenums="1" +# Recommend +git clone https://github.com/PaddlePaddle/PaddleOCR + +# If you cannot pull successfully due to network problems, you can switch to the mirror hosted on Gitee: + +git clone https://gitee.com/paddlepaddle/PaddleOCR + +# Note: The Gitee mirror may not stay in sync with the latest GitHub project; there can be a delay of 3-5 days. Please try GitHub first. +``` + +## 2. Install third-party libraries + +```bash linenums="1" +cd PaddleOCR +pip3 install -r requirements.txt +``` + +If you get the error `OSError: [WinError 126] The specified module could not be found` when installing Shapely on Windows: + +Please try downloading the Shapely wheel (.whl) file from [http://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely](http://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely). + +Reference: [Solve shapely installation on windows](https://stackoverflow.com/questions/44398265/install-shapely-oserror-winerror-126-the-specified-module-could-not-be-found) diff --git a/docs/ppocr/blog/clone.md b/docs/ppocr/blog/clone.md new file mode 100644 index 0000000000..efc4b658d7 --- /dev/null +++ b/docs/ppocr/blog/clone.md @@ -0,0 +1,26 @@ +--- +comments: true +--- + +# 项目克隆 + +## 1. 克隆PaddleOCR repo代码 + +```bash linenums="1" +git clone https://github.com/PaddlePaddle/PaddleOCR +``` + +如果因为网络问题无法pull成功,也可选择使用码云上的托管: + +```bash linenums="1" +git clone https://gitee.com/paddlepaddle/PaddleOCR +``` + +注:码云托管代码可能无法实时同步本github项目更新,存在3~5天延时,请优先使用推荐方式。 + +## 2. 安装第三方库 + +```bash linenums="1" +cd PaddleOCR +pip3 install -r requirements.txt +``` diff --git a/docs/ppocr/blog/config.en.md b/docs/ppocr/blog/config.en.md new file mode 100644 index 0000000000..21584b403a --- /dev/null +++ b/docs/ppocr/blog/config.en.md @@ -0,0 +1,245 @@ +--- +comments: true +--- + +# Configuration + +## 1. Optional Parameter List + +The following list can be viewed through `--help` + +| FLAG | Supported script | Use | Defaults | Note | +| :----------------------: | :------------: | :---------------: | :--------------: | :-----------------: | +| -c | ALL | Specify configuration file to use | None | **Please refer to the parameter introduction for configuration file usage** | +| -o | ALL | Set configuration options | None | Configuration using -o has higher priority than the configuration file selected with -c. E.g.: `-o Global.use_gpu=false` | + +## 2. Introduction to Global Parameters of Configuration File + +Take `rec_chinese_lite_train_v2.0.yml` as an example + +### Global + +| Parameter | Use | Defaults | Note | +| :----------------------: | :---------------------: | :--------------: | :--------------------: | +| use_gpu | Set using GPU or not | true | \ | +| epoch_num | Maximum training epoch number | 500 | \ | +| log_smooth_window | Log queue length, the median value in the queue each time will be printed | 20 | \ | +| print_batch_step | Set print log interval | 10 | \ | +| save_model_dir | Set model save path | output/{algorithm name} | \ | +| save_epoch_step | Set model save interval | 3 | \ | +| eval_batch_step | Set the model evaluation interval | 2000 or [1000, 2000] | run evaluation every 2000 iterations, or, with [1000, 2000], every 2000 iterations starting after the 1000th iteration | +| cal_metric_during_train | Set whether to evaluate the metric during the training process.
At this time, the metric of the model under the current batch is evaluated | true | \ | +| load_static_weights | Set whether the pre-training model is saved in static graph mode (currently only required by the detection algorithm) | true | \ | +| pretrained_model | Set the path of the pre-trained model | ./pretrain_models/CRNN/best_accuracy | \ | +| checkpoints | set model parameter path | None | Used to load parameters after interruption to continue training| +| use_visualdl | Set whether to enable visualdl for visual log display | False | [Tutorial](https://www.paddlepaddle.org.cn/paddle/visualdl) | +| use_wandb | Set whether to enable W&B for visual log display | False | [Documentation](https://docs.wandb.ai/) +| infer_img | Set inference image path or folder path | ./infer_img | \|| +| character_dict_path | Set dictionary path | ./ppocr/utils/ppocr_keys_v1.txt | If the character_dict_path is None, model can only recognize number and lower letters | +| max_text_length | Set the maximum length of text | 25 | \ | +| use_space_char | Set whether to recognize spaces | True | \| | +| label_list | Set the angle supported by the direction classifier | ['0','180'] | Only valid in angle classifier model | +| save_res_path | Set the save address of the test model results | ./output/det_db/predicts_db.txt | Only valid in the text detection model | + +### Optimizer ([ppocr/optimizer](../../ppocr/optimizer)) + +| Parameter | Use | Defaults | Note | +| :---------------------: | :---------------------: | :--------------: | :--------------------: | +| name | Optimizer class name | Adam | Currently supports`Momentum`,`Adam`,`RMSProp`, see [ppocr/optimizer/optimizer.py](../../ppocr/optimizer/optimizer.py) | +| beta1 | Set the exponential decay rate for the 1st moment estimates | 0.9 | \ | +| beta2 | Set the exponential decay rate for the 2nd moment estimates | 0.999 | \ | +| clip_norm | The maximum norm value | - | \ | +| **lr** | Set the learning rate decay method | - | \ | +| name | Learning rate decay class name | Cosine | Currently supports`Linear`,`Cosine`,`Step`,`Piecewise`, see[ppocr/optimizer/learning_rate.py](../../ppocr/optimizer/learning_rate.py) | +| learning_rate | Set the base learning rate | 0.001 | \ | +| **regularizer** | Set network regularization method | - | \ | +| name | Regularizer class name | L2 | Currently support`L1`,`L2`, see[ppocr/optimizer/regularizer.py](../../ppocr/optimizer/regularizer.py) | +| factor | Regularizer coefficient | 0.00001 | \ | + +### Architecture ([ppocr/modeling](../../ppocr/modeling)) + +In PaddleOCR, the network is divided into four stages: Transform, Backbone, Neck and Head + +| Parameter | Use | Defaults | Note | +| :---------------------: | :---------------------: | :--------------: | :--------------------: | +| model_type | Network Type | rec | Currently support`rec`,`det`,`cls` | +| algorithm | Model name | CRNN | See [algorithm_overview](./algorithm_overview_en.md) for the support list | +| **Transform** | Set the transformation method | - | Currently only recognition algorithms are supported, see [ppocr/modeling/transform](../../ppocr/modeling/transforms) for details | +| name | Transformation class name | TPS | Currently supports `TPS` | +| num_fiducial | Number of TPS control points | 20 | Ten on the top and bottom | +| loc_lr | Localization network learning rate | 0.1 | \ | +| model_name | Localization network size | small | Currently support`small`,`large` | +| **Backbone** | Set the network backbone class name | - | see 
[ppocr/modeling/backbones](../../ppocr/modeling/backbones) | +| name | backbone class name | ResNet | Currently support`MobileNetV3`,`ResNet` | +| layers | resnet layers | 34 | Currently support18,34,50,101,152,200 | +| model_name | MobileNetV3 network size | small | Currently support`small`,`large` | +| **Neck** | Set network neck | - | see[ppocr/modeling/necks](../../ppocr/modeling/necks) | +| name | neck class name | SequenceEncoder | Currently support`SequenceEncoder`,`DBFPN` | +| encoder_type | SequenceEncoder encoder type | rnn | Currently support`reshape`,`fc`,`rnn` | +| hidden_size | rnn number of internal units | 48 | \ | +| out_channels | Number of DBFPN output channels | 256 | \ | +| **Head** | Set the network head | - | see[ppocr/modeling/heads](../../ppocr/modeling/heads) | +| name | head class name | CTCHead | Currently support`CTCHead`,`DBHead`,`ClsHead` | +| fc_decay | CTCHead regularization coefficient | 0.0004 | \ | +| k | DBHead binarization coefficient | 50 | \ | +| class_dim | ClsHead output category number | 2 | \ | + +### Loss ([ppocr/losses](../../ppocr/losses)) + +| Parameter | Use | Defaults | Note | +| :---------------------: | :---------------------: | :--------------: | :--------------------: | +| name | loss class name | CTCLoss | Currently support`CTCLoss`,`DBLoss`,`ClsLoss` | +| balance_loss | Whether to balance the number of positive and negative samples in DBLossloss (using OHEM) | True | \ | +| ohem_ratio | The negative and positive sample ratio of OHEM in DBLossloss | 3 | \ | +| main_loss_type | The loss used by shrink_map in DBLossloss | DiceLoss | Currently support`DiceLoss`,`BCELoss` | +| alpha | The coefficient of shrink_map_loss in DBLossloss | 5 | \ | +| beta | The coefficient of threshold_map_loss in DBLossloss | 10 | \ | + +### PostProcess ([ppocr/postprocess](../../ppocr/postprocess)) + +| Parameter | Use | Defaults | Note | +| :---------------------: | :---------------------: | :--------------: | :--------------------: | +| name | Post-processing class name | CTCLabelDecode | Currently support`CTCLoss`,`AttnLabelDecode`,`DBPostProcess`,`ClsPostProcess` | +| thresh | The threshold for binarization of the segmentation map in DBPostProcess | 0.3 | \ | +| box_thresh | The threshold for filtering output boxes in DBPostProcess. 
Boxes below this threshold will not be output | 0.7 | \ | +| max_candidates | The maximum number of text boxes output in DBPostProcess | 1000 | | +| unclip_ratio | The unclip ratio of the text box in DBPostProcess | 2.0 | \ | + +### Metric ([ppocr/metrics](../../ppocr/metrics)) + +| Parameter | Use | Defaults | Note | +| :---------------------: | :---------------------: | :--------------: | :--------------------: | +| name | Metric method name | CTCLabelDecode | Currently support`DetMetric`,`RecMetric`,`ClsMetric` | +| main_indicator | Main indicators, used to select the best model | acc | For the detection method is hmean, the recognition and classification method is acc | + +### Dataset ([ppocr/data](../../ppocr/data)) + +| Parameter | Use | Defaults | Note | +| :---------------------: | :---------------------: | :--------------: | :--------------------: | +| **dataset** | Return one sample per iteration | - | - | +| name | dataset class name | SimpleDataSet | Currently support`SimpleDataSet`,`LMDBDataSet` | +| data_dir | Image folder path | ./train_data | \ | +| label_file_list | Groundtruth file path | ["./train_data/train_list.txt"] | This parameter is not required when dataset is LMDBDataSet | +| ratio_list | Ratio of data set | [1.0] | If there are two train_lists in label_file_list and ratio_list is [0.4,0.6], 40% will be sampled from train_list1, and 60% will be sampled from train_list2 to combine the entire dataset | +| transforms | List of methods to transform images and labels | [DecodeImage,CTCLabelEncode,RecResizeImg,KeepKeys] | see[ppocr/data/imaug](../../ppocr/data/imaug) | +| **loader** | dataloader related | - | | +| shuffle | Does each epoch disrupt the order of the data set | True | \ | +| batch_size_per_card | Single card batch size during training | 256 | \ | +| drop_last | Whether to discard the last incomplete mini-batch because the number of samples in the data set cannot be divisible by batch_size | True | \ | +| num_workers | The number of sub-processes used to load data, if it is 0, the sub-process is not started, and the data is loaded in the main process | 8 | \ | + +### Weights & Biases ([W&B](../../ppocr/utils/loggers/wandb_logger.py)) + +| Parameter | Use | Defaults | Note | +| :---------------------: | :---------------------: | :--------------: | :--------------------: | +| project | Project to which the run is to be logged | uncategorized | \ +| name | Alias/Name of the run | Randomly generated by wandb | \ +| id | ID of the run | Randomly generated by wandb | \ +| entity | User or team to which the run is being logged | The logged in user | \ +| save_dir | local directory in which all the models and other data is saved | wandb | \ +| config | model configuration | None | \ + +## 3. Multilingual Config File Generation + +PaddleOCR currently supports recognition for 80 languages (besides Chinese). A multi-language configuration file template is +provided under the path `configs/rec/multi_languages`: [rec_multi_language_lite_train.yml](../../configs/rec/multi_language/rec_multi_language_lite_train.yml)。 + +There are two ways to create the required configuration file: + +1. Automatically generated by script + +Script [generate_multi_language_configs.py](../../configs/rec/multi_language/generate_multi_language_configs.py) can help you generate configuration files for multi-language models. 
+ +- Take Italian as an example, if your data is prepared in the following format: + + ```text linenums="1" + |-train_data + |- it_train.txt # train_set label + |- it_val.txt # val_set label + |- data + |- word_001.jpg + |- word_002.jpg + |- word_003.jpg + | ... + ``` + + You can use the default parameters to generate a configuration file: + + ```bash linenums="1" + # The code needs to be run in the specified directory + cd PaddleOCR/configs/rec/multi_language/ + # Set the configuration file of the language to be generated through the -l or --language parameter. + # This command will write the default parameters into the configuration file + python3 generate_multi_language_configs.py -l it + ``` + +- If your data is placed in another location, or you want to use your own dictionary, you can generate the configuration file by specifying the relevant parameters: + + ```bash linenums="1" + # -l or --language field is required + # --train to modify the training set + # --val to modify the validation set + # --data_dir to modify the data set directory + # --dict to modify the dict path + # -o to modify the corresponding default parameters + cd PaddleOCR/configs/rec/multi_language/ + python3 generate_multi_language_configs.py -l it \ # language + --train {path/of/train_label.txt} \ # path of train_label + --val {path/of/val_label.txt} \ # path of val_label + --data_dir {train_data/path} \ # root directory of training data + --dict {path/of/dict} \ # path of dict + -o Global.use_gpu=False # whether to use gpu + ... + + ``` + +Italian is made up of Latin letters, so after executing the command, you will get the rec_latin_lite_train.yml. + +2. Manually modify the configuration file + + You can also manually modify the following fields in the template: + + ```yaml linenums="1" + Global: + use_gpu: True + epoch_num: 500 + ... + character_dict_path: {path/of/dict} # path of dict + + Train: + dataset: + name: SimpleDataSet + data_dir: train_data/ # root directory of training data + label_file_list: ["./train_data/train_list.txt"] # train label path + ... + + Eval: + dataset: + name: SimpleDataSet + data_dir: train_data/ # root directory of val data + label_file_list: ["./train_data/val_list.txt"] # val label path + ... 
+ + ``` + +Currently, the multi-language algorithms supported by PaddleOCR are: + +| Configuration file | Algorithm name | backbone | trans | seq | pred | language | +| :--------: | :-------: | :-------: | :-------: | :-----: | :-----: | :-----: | +| rec_chinese_cht_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | Chinese Traditional | +| rec_en_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | English (case sensitive) | +| rec_french_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | French | +| rec_ger_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | German | +| rec_japan_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | Japanese | +| rec_korean_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | Korean | +| rec_latin_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | Latin | +| rec_arabic_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | Arabic | +| rec_cyrillic_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | Cyrillic | +| rec_devanagari_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | Devanagari | + +For more supported languages, please refer to: [Multi-language model](./multi_languages.en.md) + +The multi-language models are trained in the same way as the Chinese model. The training set consists of 1 million synthesized samples. A small set of fonts and test data can be downloaded from either of the following links. + +- [Baidu Netdisk](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA), extraction code: frgi. +- [Google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view) diff --git a/docs/ppocr/blog/config.md b/docs/ppocr/blog/config.md new file mode 100644 index 0000000000..c68ebe75d2 --- /dev/null +++ b/docs/ppocr/blog/config.md @@ -0,0 +1,222 @@ +--- +comments: true +--- + +# 配置文件内容与生成 + +## 1. 可选参数列表 + +以下列表可以通过`--help`查看 + +| FLAG | 支持脚本 | 用途 | 默认值 | 备注 | +| :----------------------: | :------------: | :---------------: | :--------------: | :-----------------: | +| -c | ALL | 指定配置文件 | None | **配置模块说明请参考 参数介绍** | +| -o | ALL | 设置配置文件里的参数内容 | None | 使用-o配置相较于-c选择的配置文件具有更高的优先级。例如:`-o Global.use_gpu=false` | + +## 2.
配置文件参数介绍 + +以 `rec_chinese_lite_train_v2.0.yml` 为例 + +### Global + +| 字段 | 用途 | 默认值 | 备注 | +| :----------------------: | :---------------------: | :--------------: | :--------------------: | +| use_gpu | 设置代码是否在gpu运行 | true | \ | +| epoch_num | 最大训练epoch数 | 500 | \ | +| log_smooth_window | log队列长度,每次打印输出队列里的中间值 | 20 | \ | +| print_batch_step | 设置打印log间隔 | 10 | \ | +| save_model_dir | 设置模型保存路径 | output/{算法名称} | \ | +| save_epoch_step | 设置模型保存间隔 | 3 | \ | +| eval_batch_step | 设置模型评估间隔 | 2000 或 [1000, 2000] | 2000 表示每2000次迭代评估一次,[1000, 2000]表示从1000次迭代开始,每2000次评估一次 | +| cal_metric_during_train | 设置是否在训练过程中评估指标,此时评估的是模型在当前batch下的指标 | true | \ | +| load_static_weights | 设置预训练模型是否是静态图模式保存(目前仅检测算法需要) | true | \ | +| pretrained_model | 设置加载预训练模型路径 | ./pretrain_models/CRNN/best_accuracy | \ | +| checkpoints | 加载模型参数路径 | None | 用于中断后加载参数继续训练 | +| use_visualdl | 设置是否启用visualdl进行可视化log展示 | False | [教程地址](https://www.paddlepaddle.org.cn/paddle/visualdl) | +| infer_img | 设置预测图像路径或文件夹路径 | ./infer_img | \|| +| character_dict_path | 设置字典路径 | ./ppocr/utils/ppocr_keys_v1.txt | 如果为空,则默认使用小写字母+数字作为字典 | +| max_text_length | 设置文本最大长度 | 25 | \ | +| use_space_char | 设置是否识别空格 | True | \| | +| label_list | 设置方向分类器支持的角度 | ['0','180'] | 仅在方向分类器中生效 | +| save_res_path | 设置检测模型的结果保存地址 | ./output/det_db/predicts_db.txt | 仅在检测模型中生效 | + +### Optimizer ([ppocr/optimizer](../../ppocr/optimizer)) + +| 字段 | 用途 | 默认值 | 备注 | +| :---------------------: |:-------------:|:-------------:| :--------------------: | +| name | 优化器类名 | Adam | 目前支持`Momentum`,`Adam`,`RMSProp`, 见[ppocr/optimizer/optimizer.py](../../ppocr/optimizer/optimizer.py) | +| beta1 | 设置一阶矩估计的指数衰减率 | 0.9 | \ | +| beta2 | 设置二阶矩估计的指数衰减率 | 0.999 | \ | +| clip_norm | 所允许的二范数最大值 | | \ | +| **lr** | 设置学习率decay方式 | - | \ | +| name | 学习率decay类名 | Cosine | 目前支持`Linear`,`Cosine`,`Step`,`Piecewise`, 见[ppocr/optimizer/learning_rate.py](../../ppocr/optimizer/learning_rate.py) | +| learning_rate | 基础学习率 | 0.001 | \ | +| **regularizer** | 设置网络正则化方式 | - | \ | +| name | 正则化类名 | L2 | 目前支持`L1`,`L2`, 见[ppocr/optimizer/regularizer.py](../../ppocr/optimizer/regularizer.py) | +| factor | 正则化系数 | 0.00001 | \ | + +### Architecture ([ppocr/modeling](../../ppocr/modeling)) + +在PaddleOCR中,网络被划分为Transform,Backbone,Neck和Head四个阶段 + +| 字段 | 用途 | 默认值 | 备注 | +| :---------------------: | :---------------------: | :--------------: | :--------------------: | +| model_type | 网络类型 | rec | 目前支持`rec`,`det`,`cls` | +| algorithm | 模型名称 | CRNN | 支持列表见[algorithm_overview](./algorithm_overview.md) | +| **Transform** | 设置变换方式 | - | 目前仅rec类型的算法支持, 具体见[ppocr/modeling/transforms](../../ppocr/modeling/transforms) | +| name | 变换方式类名 | TPS | 目前支持`TPS` | +| num_fiducial | TPS控制点数 | 20 | 上下边各十个 | +| loc_lr | 定位网络学习率 | 0.1 | \ | +| model_name | 定位网络大小 | small | 目前支持`small`,`large` | +| **Backbone** | 设置网络backbone类名 | - | 具体见[ppocr/modeling/backbones](../../ppocr/modeling/backbones) | +| name | backbone类名 | ResNet | 目前支持`MobileNetV3`,`ResNet` | +| layers | resnet层数 | 34 | 支持18,34,50,101,152,200 | +| model_name | MobileNetV3 网络大小 | small | 支持`small`,`large` | +| **Neck** | 设置网络neck | - | 具体见[ppocr/modeling/necks](../../ppocr/modeling/necks) | +| name | neck类名 | SequenceEncoder | 目前支持`SequenceEncoder`,`DBFPN` | +| encoder_type | SequenceEncoder编码器类型 | rnn | 支持`reshape`,`fc`,`rnn` | +| hidden_size | rnn内部单元数 | 48 | \ | +| out_channels | DBFPN输出通道数 | 256 | \ | +| **Head** | 设置网络Head | - | 具体见[ppocr/modeling/heads](../../ppocr/modeling/heads) | +| name | head类名 | CTCHead | 目前支持`CTCHead`,`DBHead`,`ClsHead` | +| fc_decay | 
CTCHead正则化系数 | 0.0004 | \ | +| k | DBHead二值化系数 | 50 | \ | +| class_dim | ClsHead输出分类数 | 2 | \ | + +### Loss ([ppocr/losses](../../ppocr/losses)) + +| 字段 | 用途 | 默认值 | 备注 | +| :---------------------: | :---------------------: | :--------------: | :--------------------: | +| name | 网络loss类名 | CTCLoss | 目前支持`CTCLoss`,`DBLoss`,`ClsLoss` | +| balance_loss | DBLossloss中是否对正负样本数量进行均衡(使用OHEM) | True | \ | +| ohem_ratio | DBLossloss中的OHEM的负正样本比例 | 3 | \ | +| main_loss_type | DBLossloss中shrink_map所采用的loss | DiceLoss | 支持`DiceLoss`,`BCELoss` | +| alpha | DBLossloss中shrink_map_loss的系数 | 5 | \ | +| beta | DBLossloss中threshold_map_loss的系数 | 10 | \ | + +### PostProcess ([ppocr/postprocess](../../ppocr/postprocess)) + +| 字段 | 用途 | 默认值 | 备注 | +| :---------------------: | :---------------------: | :--------------: | :--------------------: | +| name | 后处理类名 | CTCLabelDecode | 目前支持`CTCLoss`,`AttnLabelDecode`,`DBPostProcess`,`ClsPostProcess` | +| thresh | DBPostProcess中分割图进行二值化的阈值 | 0.3 | \ | +| box_thresh | DBPostProcess中对输出框进行过滤的阈值,低于此阈值的框不会输出 | 0.7 | \ | +| max_candidates | DBPostProcess中输出的最大文本框数量 | 1000 | | +| unclip_ratio | DBPostProcess中对文本框进行放大的比例 | 2.0 | \ | + +### Metric ([ppocr/metrics](../../ppocr/metrics)) + +| 字段 | 用途 | 默认值 | 备注 | +| :---------------------: | :---------------------: | :--------------: | :--------------------: | +| name | 指标评估方法名称 | CTCLabelDecode | 目前支持`DetMetric`,`RecMetric`,`ClsMetric` | +| main_indicator | 主要指标,用于选取最优模型 | acc | 对于检测方法为hmean,识别和分类方法为acc | + +### Dataset ([ppocr/data](../../ppocr/data)) + +| 字段 | 用途 | 默认值 | 备注 | +| :---------------------: | :---------------------: | :--------------: | :--------------------: | +| **dataset** | 每次迭代返回一个样本 | - | - | +| name | dataset类名 | SimpleDataSet | 目前支持`SimpleDataSet`和`LMDBDataSet` | +| data_dir | 数据集图片存放路径 | ./train_data | \ | +| label_file_list | 数据标签路径 | ["./train_data/train_list.txt"] | dataset为LMDBDataSet时不需要此参数 | +| ratio_list | 数据集的比例 | [1.0] | 若label_file_list中有两个train_list,且ratio_list为[0.4,0.6],则从train_list1中采样40%,从train_list2中采样60%组合整个dataset | +| transforms | 对图片和标签进行变换的方法列表 | [DecodeImage,CTCLabelEncode,RecResizeImg,KeepKeys] | 见[ppocr/data/imaug](../../ppocr/data/imaug) | +| **loader** | dataloader相关 | - | | +| shuffle | 每个epoch是否将数据集顺序打乱 | True | \ | +| batch_size_per_card | 训练时单卡batch size | 256 | \ | +| drop_last | 是否丢弃因数据集样本数不能被 batch_size 整除而产生的最后一个不完整的mini-batch | True | \ | +| num_workers | 用于加载数据的子进程个数,若为0即为不开启子进程,在主进程中进行数据加载 | 8 | \ | + +## 3. 多语言配置文件生成 + +PaddleOCR目前已支持80种(除中文外)语种识别,`configs/rec/multi_languages` 路径下提供了一个多语言的配置文件模版: [rec_multi_language_lite_train.yml](../../configs/rec/multi_language/rec_multi_language_lite_train.yml)。 + +您有两种方式创建所需的配置文件: + +1. 通过脚本自动生成 + +[generate_multi_language_configs.py](../../configs/rec/multi_language/generate_multi_language_configs.py) 可以帮助您生成多语言模型的配置文件 + +* 以意大利语为例,如果您的数据是按如下格式准备的: + + ```text linenums="1" + |-train_data + |- it_train.txt # 训练集标签 + |- it_val.txt # 验证集标签 + |- data + |- word_001.jpg + |- word_002.jpg + |- word_003.jpg + | ... 
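+    # 标签文件格式示意(假设,实际以自己的标注文件为准):it_train.txt / it_val.txt 每行为 "图片相对路径\t文本标注",以制表符(\t)分隔
+    # 例如:data/word_001.jpg\tciao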
+ ``` + + 可以使用默认参数,生成配置文件: + + ```bash linenums="1" + # 该代码需要在指定目录运行 + cd PaddleOCR/configs/rec/multi_language/ + # 通过-l或者--language参数设置需要生成的语种的配置文件,该命令会将默认参数写入配置文件 + python3 generate_multi_language_configs.py -l it + ``` + +* 如果您的数据放置在其他位置,或希望使用自己的字典,可以通过指定相关参数来生成配置文件: + + ```bash linenums="1" + # -l或者--language字段是必须的 + # --train修改训练集,--val修改验证集,--data_dir修改数据集目录,--dict修改字典路径, -o修改对应默认参数 + cd PaddleOCR/configs/rec/multi_language/ + python3 generate_multi_language_configs.py -l it \ # 语种 + --train {path/of/train_label.txt} \ # 训练标签文件的路径 + --val {path/of/val_label.txt} \ # 验证集标签文件的路径 + --data_dir {train_data/path} \ # 训练数据的根目录 + --dict {path/of/dict} \ # 字典文件路径 + -o Global.use_gpu=False # 是否使用gpu + ... + + ``` + +意大利文由拉丁字母组成,因此执行完命令后会得到名为 rec_latin_lite_train.yml 的配置文件。 + +2. 手动修改配置文件 + + 您也可以手动修改模版中的以下几个字段得到配置文件: + + ```yaml linenums="1" + Global: + use_gpu: True + epoch_num: 500 + ... + character_dict_path: {path/of/dict} # 字典文件所在路径 + + Train: + dataset: + name: SimpleDataSet + data_dir: train_data/ # 数据存放根目录 + label_file_list: ["./train_data/train_list.txt"] # 训练集label路径 + ... + + Eval: + dataset: + name: SimpleDataSet + data_dir: train_data/ # 数据存放根目录 + label_file_list: ["./train_data/val_list.txt"] # 验证集label路径 + ... + + ``` + +目前PaddleOCR支持的多语言算法有: + +| 配置文件 | 算法名称 | backbone | trans | seq | pred | language | +| :--------: | :-------: | :-------: | :-------: | :-----: | :-----: | :-----: | +| rec_chinese_cht_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 中文繁体 | +| rec_en_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 英语(区分大小写) | +| rec_french_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 法语 | +| rec_ger_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 德语 | +| rec_japan_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 日语 | +| rec_korean_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 韩语 | +| rec_latin_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 拉丁字母 | +| rec_arabic_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 阿拉伯字母 | +| rec_cyrillic_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 斯拉夫字母 | +| rec_devanagari_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 梵文字母 | + +更多支持语种请参考: [多语言模型](./multi_languages.md) diff --git a/docs/ppocr/blog/customize.en.md b/docs/ppocr/blog/customize.en.md new file mode 100644 index 0000000000..9385af7115 --- /dev/null +++ b/docs/ppocr/blog/customize.en.md @@ -0,0 +1,39 @@ +--- +comments: true +--- + +# HOW TO MAKE YOUR OWN LIGHTWEIGHT OCR MODEL? + +The process of making a customized ultra-lightweight OCR models can be divided into three steps: training text detection model, training text recognition model, and concatenate the predictions from previous steps. + +## STEP1: TRAIN TEXT DETECTION MODEL + +PaddleOCR provides two text detection algorithms: EAST and DB. Both support MobileNetV3 and ResNet50_vd backbone networks, select the corresponding configuration file as needed and start training. 
For example, to train with MobileNetV3 as the backbone network for DB detection model : + +```bash linenums="1" +python3 tools/train.py -c configs/det/det_mv3_db.yml 2>&1 | tee det_db.log +``` + +For more details about data preparation and training tutorials, refer to the documentation [Text detection model training/evaluation/prediction](../model_train/detection.en.md) + +## STEP2: TRAIN TEXT RECOGNITION MODEL + +PaddleOCR provides four text recognition algorithms: CRNN, Rosetta, STAR-Net, and RARE. They all support two backbone networks: MobileNetV3 and ResNet34_vd, select the corresponding configuration files as needed to start training. For example, to train a CRNN recognition model that uses MobileNetV3 as the backbone network: + +```bash linenums="1" +python3 tools/train.py -c configs/rec/rec_chinese_lite_train.yml 2>&1 | tee rec_ch_lite.log +``` + +For more details about data preparation and training tutorials, refer to the documentation [Text recognition model training/evaluation/prediction](../model_train/recognition.en.md) + +## STEP3: CONCATENATE PREDICTIONS + +PaddleOCR provides a concatenation tool for detection and recognition models, which can connect any trained detection model and any recognition model into a two-stage text recognition system. The input image goes through four main stages: text detection, text rectification, text recognition, and score filtering to output the text position and recognition results, and at the same time, you can choose to visualize the results. + +When performing prediction, you need to specify the path of a single image or a image folder through the parameter `image_dir`, the parameter `det_model_dir` specifies the path of detection model, and the parameter `rec_model_dir` specifies the path of recognition model. The visualized results are saved to the `./inference_results` folder by default. + +```bash linenums="1" +python3 tools/infer/predict_system.py --image_dir="./doc/imgs/11.jpg" --det_model_dir="./inference/det/" --rec_model_dir="./inference/rec/" +``` + +For more details about text detection and recognition concatenation, please refer to the document [Inference](../infer_deploy/python_infer.en.md) diff --git a/docs/ppocr/blog/customize.md b/docs/ppocr/blog/customize.md new file mode 100644 index 0000000000..cf72b2d173 --- /dev/null +++ b/docs/ppocr/blog/customize.md @@ -0,0 +1,39 @@ +--- +comments: true +--- + +# 如何生产自定义超轻量模型? 
+ +生产自定义的超轻量模型可分为三步:训练文本检测模型、训练文本识别模型、模型串联预测。 + +## step1:训练文本检测模型 + +PaddleOCR提供了EAST、DB两种文本检测算法,均支持MobileNetV3、ResNet50_vd两种骨干网络,根据需要选择相应的配置文件,启动训练。例如,训练使用MobileNetV3作为骨干网络的DB检测模型(即超轻量模型使用的配置): + +```bash linenums="1" +python3 tools/train.py -c configs/det/det_mv3_db.yml 2>&1 | tee det_db.log +``` + +更详细的数据准备和训练教程参考文档教程中[文本检测模型训练/评估/预测](../model_train/detection.md)。 + +## step2:训练文本识别模型 + +PaddleOCR提供了CRNN、Rosetta、STAR-Net、RARE四种文本识别算法,均支持MobileNetV3、ResNet34_vd两种骨干网络,根据需要选择相应的配置文件,启动训练。例如,训练使用MobileNetV3作为骨干网络的CRNN识别模型(即超轻量模型使用的配置): + +```bash linenums="1" +python3 tools/train.py -c configs/rec/rec_chinese_lite_train.yml 2>&1 | tee rec_ch_lite.log +``` + +更详细的数据准备和训练教程参考文档教程中[文本识别模型训练/评估/预测](../model_train/recognition.md)。 + +## step3:模型串联预测 + +PaddleOCR提供了检测和识别模型的串联工具,可以将训练好的任一检测模型和任一识别模型串联成两阶段的文本识别系统。输入图像经过文本检测、检测框矫正、文本识别、得分过滤四个主要阶段输出文本位置和识别结果,同时可选择对结果进行可视化。 + +在执行预测时,需要通过参数image_dir指定单张图像或者图像集合的路径、参数det_model_dir指定检测inference模型的路径和参数rec_model_dir指定识别inference模型的路径。可视化识别结果默认保存到 ./inference_results 文件夹里面。 + +```bash linenums="1" +python3 tools/infer/predict_system.py --image_dir="./doc/imgs/11.jpg" --det_model_dir="./inference/det/" --rec_model_dir="./inference/rec/" +``` + +更多的文本检测、识别串联推理使用方式请参考文档教程中的[基于预测引擎推理](../infer_deploy/python_infer.md)。 diff --git a/docs/ppocr/blog/distributed_training.en.md b/docs/ppocr/blog/distributed_training.en.md new file mode 100644 index 0000000000..dd225758a4 --- /dev/null +++ b/docs/ppocr/blog/distributed_training.en.md @@ -0,0 +1,65 @@ +--- +comments: true +--- + +# Distributed training + +## Introduction + +The high performance of distributed training is one of the core advantages of PaddlePaddle. In the classification task, distributed training can achieve almost linear speedup ratio. Generally, OCR training task need massive training data. Such as recognition, PP-OCR v2.0 model is trained based on 1800W dataset, which is very time-consuming if using single machine. Therefore, the distributed training is used in PaddleOCR to speedup the training task. For more information about distributed training, please refer to [distributed training quick start tutorial](https://fleet-x.readthedocs.io/en/latest/paddle_fleet_rst/parameter_server/ps_quick_start.html). + +## Quick Start + +### Training with single machine + +Take recognition as an example. After the data is prepared locally, start the training task with the interface of `paddle.distributed.launch`. The start command as follows: + +```bash linenums="1" +python3 -m paddle.distributed.launch \ + --log_dir=./log/ \ + --gpus "0,1,2,3,4,5,6,7" \ + tools/train.py \ + -c configs/rec/rec_mv3_none_bilstm_ctc.yml +``` + +### Training with multi machine + +Compared with single machine, training with multi machine only needs to add the parameter `--ips` to start command, which represents the IP list of machines used for distributed training, and the IP of different machines are separated by commas. The start command as follows: + +```bash linenums="1" +ip_list="192.168.0.1,192.168.0.2" +python3 -m paddle.distributed.launch \ + --log_dir=./log/ \ + --ips="${ip_list}" \ + --gpus="0,1,2,3,4,5,6,7" \ + tools/train.py \ + -c configs/rec/rec_mv3_none_bilstm_ctc.yml +``` + +**Notice:** + +* The IP addresses of different machines need to be separated by commas, which can be queried through `ifconfig` or `ipconfig`. +* Different machines need to be set to be secret free and can `ping` success with others directly, otherwise communication cannot establish between them. 
+* The code, data, and start command must be identical across all machines, and the start command must then be run on every machine. The first machine in `ip_list` becomes `trainer0`, and so on.
+
+## Performance comparison
+
+We conducted model training on 2x8 P40 GPUs. Accuracy, training time, and multi-machine acceleration ratio of different models are shown below.
+
+| Model | Configuration | Dataset | 1x8 GPU training time / Accuracy | 2x8 GPU training time / Accuracy | Acceleration ratio |
+|:------:|:-----:|:--------:|:--------:|:--------:|:-----:|
+| CRNN | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | 260k Chinese dataset | 2.50d/66.70% | 1.67d/67.00% | **1.5** |
+
+We conducted model training on 3x8 V100 GPUs. Accuracy, training time, and multi-machine acceleration ratio of different models are shown below.
+
+| Model | Configuration | Dataset | 1x8 GPU training time / Accuracy | 3x8 GPU training time / Accuracy | Acceleration ratio |
+|:------:|:-----:|:--------:|:--------:|:--------:|:-----:|
+| SLANet | [SLANet.yml](../../configs/table/SLANet.yml) | PubTabNet | 49.80h/76.20% | 19.75h/74.77% | **2.52** |
+
+Note: when training with 3x8 GPUs, the per-card batch size is unchanged compared with 1x8 GPU training, and the learning rate is multiplied by 2 (with the default multiplier of 3, the accuracy is only 73.42%).
+
+We conducted model training on 4x8 V100 GPUs. Accuracy, training time, and multi-machine acceleration ratio of different models are shown below.
+
+| Model | Configuration | Dataset | 1x8 GPU training time / Accuracy | 4x8 GPU training time / Accuracy | Acceleration ratio |
+|:------:|:-----:|:--------:|:--------:|:--------:|:-----:|
+| SVTR | [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) | PP-OCRv3_rec data | 10d/- | 2.84d/74.00% | **3.5** |
diff --git a/docs/ppocr/blog/distributed_training.md b/docs/ppocr/blog/distributed_training.md new file mode 100644 index 0000000000..182c67d17c --- /dev/null +++ b/docs/ppocr/blog/distributed_training.md @@ -0,0 +1,67 @@
+---
+comments: true
+---
+
+# 分布式训练
+
+## 简介
+
+* 分布式训练的高性能,是飞桨的核心优势技术之一,在分类任务上,分布式训练可以达到几乎线性的加速比。OCR训练任务中往往包含大量训练数据,以识别为例,ppocrv2.0模型在训练时使用了1800W数据,如果使用单机训练,会非常耗时。因此,PaddleOCR中使用分布式训练接口完成训练任务,同时支持单机训练与多机训练。更多关于分布式训练的方法与文档可以参考:[分布式训练快速开始教程](https://fleet-x.readthedocs.io/en/latest/paddle_fleet_rst/parameter_server/ps_quick_start.html)。
+
+## 使用方法
+
+### 单机训练
+
+* 以识别为例,本地准备好数据之后,使用`paddle.distributed.launch`的接口启动训练任务即可。下面为运行代码示例。
+
+```bash linenums="1"
+python3 -m paddle.distributed.launch \
+    --log_dir=./log/ \
+    --gpus "0,1,2,3,4,5,6,7" \
+    tools/train.py \
+    -c configs/rec/rec_mv3_none_bilstm_ctc.yml
+```
+
+### 多机训练
+
+* 相比单机训练,多机训练时,只需要添加`--ips`的参数,该参数表示需要参与分布式训练的机器的ip列表,不同机器的ip用逗号隔开。下面为运行代码示例。
+
+```bash linenums="1"
+ip_list="192.168.0.1,192.168.0.2"
+python3 -m paddle.distributed.launch \
+    --log_dir=./log/ \
+    --ips="${ip_list}" \
+    --gpus="0,1,2,3,4,5,6,7" \
+    tools/train.py \
+    -c configs/rec/rec_mv3_none_bilstm_ctc.yml
+```
+
+**注:**
+
+* 不同机器的ip信息需要用逗号隔开,可以通过`ifconfig`或者`ipconfig`查看。
+* 不同机器之间需要做免密设置,且可以直接ping通,否则无法完成通信。
+* 不同机器之间的代码、数据与运行命令或脚本需要保持一致,且所有的机器上都需要运行设置好的训练命令或者脚本。最终`ip_list`中的第一台机器的第一块设备是trainer0,以此类推。
+
+## 性能效果测试
+
+在2机8卡P40的机器上进行模型训练,不同模型的精度、训练耗时、多机加速比情况如下所示。
+
+| 模型 | 配置 | 数据集 | 单机8卡耗时/精度 | 2机8卡耗时/精度 | 加速比 |
+|:------:|:-----:|:--------:|:--------:|:--------:|:-----:|
+| CRNN | 
[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | 26W中文数据集 | 2.50d/66.7% | 1.67d/67.0% | **1.5** | + +在3机8卡V100的机器上进行模型训练,不同模型的精度、训练耗时、多机加速比情况如下所示。 + +| 模型 | 配置 | 数据集 | 单机8卡耗时/精度 | 3机8卡耗时/精度 | 加速比 | +|:------:|:-----:|:--------:|:--------:|:--------:|:-----:| +| SLANet | [SLANet.yml](../../configs/table/SLANet.yml) | PubTabNet | 49.8h/76.2% | 19.75h/74.77% | **2.52** | + +注意:这里3机8卡训练时,单卡batch size相比于单机8卡不变,学习率乘以2 (默认乘以3的话,精度仅有73.42%) + +在4机8卡V100的机器上进行模型训练,不同模型的精度、训练耗时、多机加速比情况如下所示。 + +| 模型 | 配置 | 数据集 | 单机8卡耗时/精度 | 4机8卡耗时/精度 | 加速比 | +|:------:|:-----:|:--------:|:--------:|:--------:|:-----:| +| SVTR | [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) | PP-OCRv3_rec data | 10d/- | 2.84d/74.0% | **3.5** | + +**注意**: 在训练的GPU卡数过多时,精度会稍微有所损失(1%左右),此时可以尝试通过添加warmup或者适当增加迭代轮数来弥补精度损失。 diff --git a/docs/ppocr/blog/enhanced_ctc_loss.en.md b/docs/ppocr/blog/enhanced_ctc_loss.en.md new file mode 100644 index 0000000000..0bd4be56d5 --- /dev/null +++ b/docs/ppocr/blog/enhanced_ctc_loss.en.md @@ -0,0 +1,100 @@ +--- +comments: true +typora-copy-images-to: images +--- + +# Enhanced CTC Loss + +In OCR recognition, CRNN is a text recognition algorithm widely applied in the industry. In the training phase, it uses CTCLoss to calculate the network loss. In the inference phase, it uses CTCDecode to obtain the decoding result. Although the CRNN algorithm has been proven to achieve reliable recognition results in actual business, users have endless requirements for recognition accuracy. So how to improve the accuracy of text recognition? Taking CTCLoss as the starting point, this paper explores the improved fusion scheme of CTCLoss from three different perspectives: Hard Example Mining, Multi-task Learning, and Metric Learning. Based on the exploration, we propose EnhancedCTCLoss, which includes the following 3 components: Focal-CTC Loss, A-CTC Loss, C-CTC Loss. + +## 1. Focal-CTC Loss + +Focal Loss was proposed by the paper, "[Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002)". When the loss was first proposed, it was mainly to solve the problem of a serious imbalance in the ratio of positive and negative samples in one-stage target detection. This loss function reduces the weight of a large number of simple negative samples in training and also can be understood as a kind of difficult sample mining. +The form of the loss function is as follows: + +$$ +\begin{equation} +L_{fl}=\left\{ +\begin{array}{cl} +-\alpha(1 - y^{'})^{\gamma}logy^{'} ,& y = 1 \\ +-(1 - \alpha)y^{'\gamma}log(1 - y^{'}), & y = 0 \\ +\end{array} \right. +\end{equation} +$$ + +Among them, y' is the output of the activation function, and the value is between 0-1. It adds a modulation factor (1-y’)^γ and a balance factor α on the basis of the original cross-entropy loss. When α = 1, y = 1, the comparison between the loss function and the cross-entropy loss is shown in the following figure: + +![img](./images/focal_loss_image.png) + +As can be seen from the above figure, when γ > 0, the adjustment coefficient (1-y’)^γ gives smaller weight to the easy-to-classify sample loss, making the network pay more attention to the difficult and misclassified samples. The adjustment factor γ is used to adjust the rate at which the weight of simple samples decreases. When γ = 0, it is the cross-entropy loss function. When γ increases, the influence of the adjustment factor will also increase. 
Experiments revealed that 2 is the optimal value of γ. The balance factor α is used to balance the uneven proportion of positive and negative samples; in the paper, α is set to 0.25.
+
+For the classic CTC algorithm, suppose that for a feature sequence (f1, f2, ..., ft) the probability that the CTC decoding result equals the label is y'; then the probability that the decoding result does not equal the label is (1 - y'). It is easy to see that the CTCLoss value and y' have the following relationship:
+
+$$
+L_{CTC} = -log(y^{'})
+$$
+
+Combining the idea of Focal Loss, assigning larger weights to difficult samples and smaller weights to simple samples makes the network focus more on mining difficult samples and further improves recognition accuracy. Therefore, we propose Focal-CTC Loss, defined as follows:
+
+$$
+L_{Focal\_CTC} = \alpha * (1 - y^{'})^{\gamma} * L_{CTC}
+$$
+
+In the experiment, γ is set to 2 and α to 1; see [rec_ctc_loss.py](../../ppocr/losses/rec_ctc_loss.py) for the specific implementation.
+
+## 2. A-CTC Loss
+A-CTC Loss is short for CTC Loss + ACE Loss. ACE Loss was proposed in the paper "[Aggregation Cross-Entropy for Sequence Recognition](https://arxiv.org/abs/1904.08364)". Compared with CTCLoss, ACE Loss has the following two advantages:
++ ACE Loss can solve the recognition problem of 2-D text, while CTCLoss can only process 1-D text
++ ACE Loss is better than CTC loss in both time complexity and space complexity
+
+A summary from prior work of the advantages and disadvantages of OCR recognition algorithms is shown in the following figure:
+
+![img](./images/rec_algo_compare.png)
+
+Although ACE Loss does handle 2-D predictions, as shown in the figure above, and has advantages in memory usage and inference speed, in practice we found that using ACE Loss alone gives worse recognition results than CTCLoss. Consequently, we combined the two, with CTCLoss as the main loss and ACE Loss as an auxiliary supervision loss. This achieved better results: on our internal experimental dataset, recognition accuracy improved by about 1% compared with using CTCLoss alone.
+A-CTC Loss is defined as follows:
+
+$$
+L_{A-CTC} = L_{CTC} + \lambda * L_{ACE}
+$$
+
+In the experiment, λ = 0.1. See [ace_loss.py](../../ppocr/losses/ace_loss.py) for the ACE Loss implementation.
+
+## 3. C-CTC Loss
+C-CTC Loss is short for CTC Loss + Center Loss. Center Loss was proposed in the paper "[A Discriminative Feature Learning Approach for Deep Face Recognition](https://link.springer.com/chapter/10.1007/978-3-319-46478-7_31)". It was first used in face recognition tasks to increase the inter-class distance and reduce the intra-class distance, and it is an early and widely used Metric Learning algorithm.
+
+In the Chinese OCR recognition task, analysis of bad cases showed that a major difficulty of Chinese recognition is the large number of similar characters, which are easily misrecognized. This led us to ask whether we could borrow the idea of Metric Learning to increase the inter-class distance between similar characters and thus improve recognition accuracy. However, Metric Learning is mainly used in the field of image recognition, where the label of each training sample is a fixed value; OCR recognition is essentially a sequence recognition task, and there is no explicit alignment between features and labels.
Therefore, how to combine the two is still a direction worth exploring. + +By trying Arcmargin, Cosmargin and other methods, we finally found that Centerloss can help further improve the accuracy of recognition. C_CTC Loss is defined as follows: + +$$ +L_{C-CTC} = L_{CTC} + \lambda * L_{center} +$$ + +In the experiment, we set λ=0.25. See the center_loss implementation code: [center_loss.py](../../ppocr/losses/center_loss.py) + +It is worth mentioning that in C-CTC Loss, choosing to initialize the Center randomly does not bring significant improvement. Our Center initialization method is as follows: ++ Based on the original CTCLoss, a network N is obtained by training ++ Select the training set, identify the completely correct part, and form the set G ++ Send each sample in G to the network, perform forward calculation, and extract the correspondence between the input of the last FC layer (ie feature) and the result of argmax calculation (ie index) ++ Aggregate features with the same index, calculate the average, and get the initial center of each character. + +Taking the configuration file `configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml` as an example, the center extraction command is as follows: + +```bash linenums="1" +python tools/export_center.py -c configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml -o Global.pretrained_model="./output/rec_mobile_pp-OCRv2/best_accuracy" +``` + +After running, `train_center.pkl` will be generated in the main directory of PaddleOCR. + +## 4. Experiment + +For the above three solutions, we conducted training and evaluation based on Baidu's internal data set. The experimental conditions are shown in the following table: + +| algorithm | Focal_CTC | A_CTC | C-CTC | +| :-------- | :-------- | ----: | :---: | +| gain | +0.3% | +0.7% | +1.7% | + +Based on the above experimental conclusions, we adopted the C-CTC strategy in PP-OCRv2. It is worth mentioning that, because PP-OCRv2 deals with the recognition task of 6625 Chinese characters, the character set is relatively large and there are many similar characters, so the C-CTC solution brings a significant improvement on this task. But if you switch to other OCR recognition tasks, the conclusion may be different. You can try Focal-CTC, A-CTC, C-CTC, and the combined solution EnhancedCTC. We believe it will bring different degrees of improvement. + +The unified combined plan is shown in the following file: [rec_enhanced_ctc_loss.py](../../ppocr/losses/rec_enhanced_ctc_loss.py) diff --git a/docs/ppocr/blog/enhanced_ctc_loss.md b/docs/ppocr/blog/enhanced_ctc_loss.md new file mode 100644 index 0000000000..5c28c306c8 --- /dev/null +++ b/docs/ppocr/blog/enhanced_ctc_loss.md @@ -0,0 +1,101 @@ +--- +comments: true +typora-copy-images-to: images +--- + +# Enhanced CTC Loss + +在OCR识别中, CRNN是一种在工业界广泛使用的文字识别算法。 在训练阶段,其采用CTCLoss来计算网络损失; 在推理阶段,其采用CTCDecode来获得解码结果。虽然CRNN算法在实际业务中被证明能够获得很好的识别效果, 然而用户对识别准确率的要求却是无止境的,如何进一步提升文字识别的准确率呢? 本文以CTCLoss为切人点,分别从难例挖掘、 多任务学习、 Metric Learning 3个不同的角度探索了CTCLoss的改进融合方案,提出了EnhancedCTCLoss,其包括如下3个组成部分: Focal-CTC Loss,A-CTC Loss, C-CTC Loss。 + +## 1. Focal-CTC Loss + +Focal Loss 出自论文《Focal Loss for Dense Object Detection》, 该loss最先提出的时候主要是为了解决one-stage目标检测中正负样本比例严重失衡的问题。该损失函数降低了大量简单负样本在训练中所占的权重,也可理解为一种困难样本挖掘。 +其损失函数形式如下: + +$$ +\begin{equation} +L_{fl}=\left\{ +\begin{array}{cl} +-\alpha(1 - y^{'})^{\gamma}logy^{'} ,& y = 1 \\ +-(1 - \alpha)y^{'\gamma}log(1 - y^{'}), & y = 0 \\ +\end{array} \right. 
+\end{equation} +$$ + +其中, y' 是经过激活函数的输出,取值在0-1之间。其在原始的交叉熵损失的基础上加了一个调制系数(1 – y’)^ γ和平衡因子α。 当α = 1,y=1时,其损失函数与交叉熵损失的对比如下图所示: + +![img](./images/focal_loss_image.png) + +从上图可以看到, 当γ> 0时,调整系数(1-y’)^γ 赋予易分类样本损失一个更小的权重,使得网络更关注于困难的、错分的样本。 调整因子γ用于调节简单样本权重降低的速率,当γ为0时即为交叉熵损失函数,当γ增加时,调整因子的影响也会随之增大。实验发现γ为2是最优。平衡因子α用来平衡正负样本本身的比例不均,文中α取0.25。 + +对于经典的CTC算法,假设某个特征序列(f1, f2, ......ft), 经过CTC解码之后结果等于label的概率为y’, 则CTC解码结果不为label的概率即为(1-y’);不难发现, CTCLoss值和y’有如下关系: + +$$ +L_{CTC} = -log(y^{'}) +$$ + +结合Focal Loss的思想,赋予困难样本较大的权重,简单样本较小的权重,可以使网络更加聚焦于对困难样本的挖掘,进一步提升识别的准确率,由此我们提出了Focal-CTC Loss; 其定义如下所示: + +$$ +L_{Focal\_CTC} = \alpha * (1 - y^{'})^{\gamma} * L_{CTC} +$$ + +实验中,γ取值为2, α= 1, 具体实现见: [rec_ctc_loss.py](../../ppocr/losses/rec_ctc_loss.py) + +## 2. A-CTC Loss + +A-CTC Loss是CTC Loss + ACE Loss的简称。 其中ACE Loss出自论文< Aggregation Cross-Entropy for Sequence Recognition>. ACE Loss相比于CTCLoss,主要有如下两点优势: + ++ ACE Loss能够解决2-D文本的识别问题; CTCLoss只能够处理1-D文本 ++ ACE Loss 在时间复杂度和空间复杂度上优于CTC loss + +前人总结的OCR识别算法的优劣如下图所示: + +![img](./images/rec_algo_compare.png) + +虽然ACELoss确实如上图所说,可以处理2D预测,在内存占用及推理速度方面具备优势,但在实践过程中,我们发现单独使用ACE Loss, 识别效果并不如CTCLoss. 因此,我们尝试将CTCLoss和ACELoss进行结合,同时以CTCLoss为主,将ACELoss 定位为一个辅助监督loss。 这一尝试收到了效果,在我们内部的实验数据集上,相比单独使用CTCLoss,识别准确率可以提升1%左右。 +A_CTC Loss定义如下: + +$$ +L_{A-CTC} = L_{CTC} + \lambda * L_{ACE} +$$ + +实验中,λ = 0.1. ACE loss实现代码见: [ace_loss.py](../../ppocr/losses/ace_loss.py) + +## 3. C-CTC Loss + +C-CTC Loss是CTC Loss + Center Loss的简称。 其中Center Loss出自论文 < A Discriminative Feature Learning Approach for Deep Face Recognition>. 最早用于人脸识别任务,用于增大类间距离,减小类内距离, 是Metric Learning领域一种较早的、也比较常用的一种算法。 +在中文OCR识别任务中,通过对badcase分析, 我们发现中文识别的一大难点是相似字符多,容易误识。 由此我们想到是否可以借鉴Metric Learing的想法, 增大相似字符的类间距,从而提高识别准确率。然而,MetricLearning主要用于图像识别领域,训练数据的标签为一个固定的值;而对于OCR识别来说,其本质上是一个序列识别任务,特征和label之间并不具有显式的对齐关系,因此两者如何结合依然是一个值得探索的方向。 +通过尝试Arcmargin, Cosmargin等方法, 我们最终发现Centerloss 有助于进一步提升识别的准确率。C_CTC Loss定义如下: + +$$ +L_{C-CTC} = L_{CTC} + \lambda * L_{center} +$$ + +实验中,我们设置λ=0.25. center_loss实现代码见: [center_loss.py](../../ppocr/losses/center_loss.py) + +值得一提的是, 在C-CTC Loss中,选择随机初始化Center并不能够带来明显的提升. 我们的Center初始化方法如下: + ++ 基于原始的CTCLoss, 训练得到一个网络N ++ 挑选出训练集中,识别完全正确的部分, 组成集合G ++ 将G中的每个样本送入网络,进行前向计算, 提取最后一个FC层的输入(即feature)及其经过argmax计算的结果(即index)之间的对应关系 ++ 将相同index的feature进行聚合,计算平均值,得到各自字符的初始center. + +以配置文件`configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml`为例, center提取命令如下所示: + +```bash linenums="1" +python tools/export_center.py -c configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml -o Global.pretrained_model="./output/rec_mobile_pp-OCRv2/best_accuracy" +``` + +运行完后,会在PaddleOCR主目录下生成`train_center.pkl`. + +## 4. 
实验 + +对于上述的三种方案,我们基于百度内部数据集进行了训练、评测,实验情况如下表所示: + +|algorithm| Focal_CTC | A_CTC | C-CTC | +|:------| :------| ------: | :------: | +|gain| +0.3% | +0.7% | +1.7% | + +基于上述实验结论,我们在PP-OCRv2中,采用了C-CTC的策略。 值得一提的是,由于PP-OCRv2 处理的是6625个中文字符的识别任务,字符集比较大,形似字较多,所以在该任务上C-CTC 方案带来的提升较大。 但如果换做其他OCR识别任务,结论可能会有所不同。大家可以尝试Focal-CTC,A-CTC, C-CTC以及组合方案EnhancedCTC,相信会带来不同程度的提升效果。 +统一的融合方案见如下文件: [rec_enhanced_ctc_loss.py](../../ppocr/losses/rec_enhanced_ctc_loss.py) diff --git a/docs/ppocr/blog/images/11_det.jpg b/docs/ppocr/blog/images/11_det.jpg new file mode 100644 index 0000000000..fe0cd23cc2 Binary files /dev/null and b/docs/ppocr/blog/images/11_det.jpg differ diff --git a/docs/ppocr/blog/images/11_det_rec.jpg b/docs/ppocr/blog/images/11_det_rec.jpg new file mode 100644 index 0000000000..31c566478f Binary files /dev/null and b/docs/ppocr/blog/images/11_det_rec.jpg differ diff --git a/docs/ppocr/blog/images/12_det.jpg b/docs/ppocr/blog/images/12_det.jpg new file mode 100644 index 0000000000..71627f0b8d Binary files /dev/null and b/docs/ppocr/blog/images/12_det.jpg differ diff --git a/docs/ppocr/blog/images/12_det_rec.jpg b/docs/ppocr/blog/images/12_det_rec.jpg new file mode 100644 index 0000000000..9db8b57e12 Binary files /dev/null and b/docs/ppocr/blog/images/12_det_rec.jpg differ diff --git a/docs/ppocr/blog/images/187578511-9f3c351e-b68c-4359-a6e5-475810993c61.png b/docs/ppocr/blog/images/187578511-9f3c351e-b68c-4359-a6e5-475810993c61.png new file mode 100644 index 0000000000..86c062684a Binary files /dev/null and b/docs/ppocr/blog/images/187578511-9f3c351e-b68c-4359-a6e5-475810993c61.png differ diff --git a/docs/ppocr/blog/images/254-20240709081442260.jpg b/docs/ppocr/blog/images/254-20240709081442260.jpg new file mode 100644 index 0000000000..c871fb042c Binary files /dev/null and b/docs/ppocr/blog/images/254-20240709081442260.jpg differ diff --git a/docs/ppocr/blog/images/5e612.png b/docs/ppocr/blog/images/5e612.png new file mode 100644 index 0000000000..79e3d0e1e2 Binary files /dev/null and b/docs/ppocr/blog/images/5e612.png differ diff --git a/docs/ppocr/blog/images/DF.png b/docs/ppocr/blog/images/DF.png new file mode 100644 index 0000000000..f14953d481 Binary files /dev/null and b/docs/ppocr/blog/images/DF.png differ diff --git a/docs/ppocr/blog/images/GTC.png b/docs/ppocr/blog/images/GTC.png new file mode 100644 index 0000000000..30a9cdd146 Binary files /dev/null and b/docs/ppocr/blog/images/GTC.png differ diff --git a/docs/ppocr/blog/images/LCNet_SVTR.png b/docs/ppocr/blog/images/LCNet_SVTR.png new file mode 100644 index 0000000000..7f0d701d27 Binary files /dev/null and b/docs/ppocr/blog/images/LCNet_SVTR.png differ diff --git a/docs/ppocr/blog/images/LKPAN.png b/docs/ppocr/blog/images/LKPAN.png new file mode 100644 index 0000000000..6b16053623 Binary files /dev/null and b/docs/ppocr/blog/images/LKPAN.png differ diff --git a/docs/ppocr/blog/images/PFHead.png b/docs/ppocr/blog/images/PFHead.png new file mode 100644 index 0000000000..3728dc44e5 Binary files /dev/null and b/docs/ppocr/blog/images/PFHead.png differ diff --git a/docs/ppocr/blog/images/RSEFPN.png b/docs/ppocr/blog/images/RSEFPN.png new file mode 100644 index 0000000000..ddf7c52fb5 Binary files /dev/null and b/docs/ppocr/blog/images/RSEFPN.png differ diff --git a/docs/ppocr/blog/images/SSL.png b/docs/ppocr/blog/images/SSL.png new file mode 100644 index 0000000000..1344a2a77c Binary files /dev/null and b/docs/ppocr/blog/images/SSL.png differ diff --git a/docs/ppocr/blog/images/UIM.png b/docs/ppocr/blog/images/UIM.png new file mode 100644 
index 0000000000..7479bdf4a9 Binary files /dev/null and b/docs/ppocr/blog/images/UIM.png differ diff --git a/docs/ppocr/blog/images/arabic_0.jpg b/docs/ppocr/blog/images/arabic_0.jpg new file mode 100644 index 0000000000..9941b90642 Binary files /dev/null and b/docs/ppocr/blog/images/arabic_0.jpg differ diff --git a/docs/ppocr/blog/images/focal_loss_image.png b/docs/ppocr/blog/images/focal_loss_image.png new file mode 100644 index 0000000000..430550a732 Binary files /dev/null and b/docs/ppocr/blog/images/focal_loss_image.png differ diff --git a/docs/ppocr/blog/images/french_0.jpg b/docs/ppocr/blog/images/french_0.jpg new file mode 100644 index 0000000000..3c2abe6304 Binary files /dev/null and b/docs/ppocr/blog/images/french_0.jpg differ diff --git a/docs/ppocr/blog/images/img_02.jpg b/docs/ppocr/blog/images/img_02.jpg new file mode 100644 index 0000000000..3e139c76bc Binary files /dev/null and b/docs/ppocr/blog/images/img_02.jpg differ diff --git a/docs/ppocr/blog/images/img_12.jpg b/docs/ppocr/blog/images/img_12.jpg new file mode 100644 index 0000000000..822d562eda Binary files /dev/null and b/docs/ppocr/blog/images/img_12.jpg differ diff --git a/docs/ppocr/blog/images/japan_2-20240709081138234.jpg b/docs/ppocr/blog/images/japan_2-20240709081138234.jpg new file mode 100644 index 0000000000..7038ba2eff Binary files /dev/null and b/docs/ppocr/blog/images/japan_2-20240709081138234.jpg differ diff --git a/docs/ppocr/blog/images/korean.jpg b/docs/ppocr/blog/images/korean.jpg new file mode 100644 index 0000000000..e5d863cd86 Binary files /dev/null and b/docs/ppocr/blog/images/korean.jpg differ diff --git a/docs/ppocr/blog/images/korean_0.jpg b/docs/ppocr/blog/images/korean_0.jpg new file mode 100644 index 0000000000..3fe6305aa0 Binary files /dev/null and b/docs/ppocr/blog/images/korean_0.jpg differ diff --git a/docs/ppocr/blog/images/multi_scale.png b/docs/ppocr/blog/images/multi_scale.png new file mode 100644 index 0000000000..673d306399 Binary files /dev/null and b/docs/ppocr/blog/images/multi_scale.png differ diff --git a/docs/ppocr/blog/images/ppocrv3_det_cml.png b/docs/ppocr/blog/images/ppocrv3_det_cml.png new file mode 100644 index 0000000000..ccb5c8b21f Binary files /dev/null and b/docs/ppocr/blog/images/ppocrv3_det_cml.png differ diff --git a/docs/ppocr/blog/images/ppocrv3_framework-0052468.png b/docs/ppocr/blog/images/ppocrv3_framework-0052468.png new file mode 100644 index 0000000000..e05279f7f5 Binary files /dev/null and b/docs/ppocr/blog/images/ppocrv3_framework-0052468.png differ diff --git a/docs/ppocr/blog/images/ppocrv3_framework.png b/docs/ppocr/blog/images/ppocrv3_framework.png new file mode 100644 index 0000000000..e05279f7f5 Binary files /dev/null and b/docs/ppocr/blog/images/ppocrv3_framework.png differ diff --git a/docs/ppocr/blog/images/ppocrv4_det_cml.png b/docs/ppocr/blog/images/ppocrv4_det_cml.png new file mode 100644 index 0000000000..9132c0a67c Binary files /dev/null and b/docs/ppocr/blog/images/ppocrv4_det_cml.png differ diff --git a/docs/ppocr/blog/images/ppocrv4_framework.png b/docs/ppocr/blog/images/ppocrv4_framework.png new file mode 100644 index 0000000000..fa31f4c12e Binary files /dev/null and b/docs/ppocr/blog/images/ppocrv4_framework.png differ diff --git a/docs/ppocr/blog/images/ppocrv4_gtc.png b/docs/ppocr/blog/images/ppocrv4_gtc.png new file mode 100644 index 0000000000..7e6a3f5c13 Binary files /dev/null and b/docs/ppocr/blog/images/ppocrv4_gtc.png differ diff --git a/docs/ppocr/blog/images/rec_algo_compare.png 
b/docs/ppocr/blog/images/rec_algo_compare.png new file mode 100644 index 0000000000..2dde496c75 Binary files /dev/null and b/docs/ppocr/blog/images/rec_algo_compare.png differ diff --git a/docs/ppocr/blog/images/recconaug.png b/docs/ppocr/blog/images/recconaug.png new file mode 100644 index 0000000000..899bc430de Binary files /dev/null and b/docs/ppocr/blog/images/recconaug.png differ diff --git a/docs/ppocr/blog/images/svtr_g2.png b/docs/ppocr/blog/images/svtr_g2.png new file mode 100644 index 0000000000..2573afafbb Binary files /dev/null and b/docs/ppocr/blog/images/svtr_g2.png differ diff --git a/docs/ppocr/blog/images/svtr_g4.png b/docs/ppocr/blog/images/svtr_g4.png new file mode 100644 index 0000000000..f85d66d97f Binary files /dev/null and b/docs/ppocr/blog/images/svtr_g4.png differ diff --git a/docs/ppocr/blog/images/svtr_tiny.png b/docs/ppocr/blog/images/svtr_tiny.png new file mode 100644 index 0000000000..01e22e74b5 Binary files /dev/null and b/docs/ppocr/blog/images/svtr_tiny.png differ diff --git a/docs/ppocr/blog/images/teacher_dml.png b/docs/ppocr/blog/images/teacher_dml.png new file mode 100644 index 0000000000..ea09cacda8 Binary files /dev/null and b/docs/ppocr/blog/images/teacher_dml.png differ diff --git a/docs/ppocr/blog/images/v3_rec_pipeline.png b/docs/ppocr/blog/images/v3_rec_pipeline.png new file mode 100644 index 0000000000..aa61cc4f16 Binary files /dev/null and b/docs/ppocr/blog/images/v3_rec_pipeline.png differ diff --git a/docs/ppocr/blog/images/v4_rec_pipeline.png b/docs/ppocr/blog/images/v4_rec_pipeline.png new file mode 100644 index 0000000000..b1ec7a9689 Binary files /dev/null and b/docs/ppocr/blog/images/v4_rec_pipeline.png differ diff --git a/docs/ppocr/blog/images/word_308.png b/docs/ppocr/blog/images/word_308.png new file mode 100644 index 0000000000..a8d094faff Binary files /dev/null and b/docs/ppocr/blog/images/word_308.png differ diff --git a/docs/ppocr/blog/inference_args.en.md b/docs/ppocr/blog/inference_args.en.md new file mode 100644 index 0000000000..d03de70dd9 --- /dev/null +++ b/docs/ppocr/blog/inference_args.en.md @@ -0,0 +1,131 @@ +--- +comments: true +--- + + +# PaddleOCR Model Inference Parameter Explanation + +When using PaddleOCR for model inference, you can customize the modification parameters to modify the model, data, preprocessing, postprocessing, etc. 
(parameter file: [utility.py](../../tools/infer/utility.py)),The detailed parameter explanation is as follows: + +* Global parameters + +| parameters | type | default | implication | +| :--: | :--: | :--: | :--: | +| image_dir | str | None, must be specified explicitly | Image or folder path | +| page_num | int | 0 | Valid when the input type is pdf file, specify to predict the previous page_num pages, all pages are predicted by default | +| vis_font_path | str | "./doc/fonts/simfang.ttf" | font path for visualization | +| drop_score | float | 0.5 | Results with a recognition score less than this value will be discarded and will not be returned as results | +| use_pdserving | bool | False | Whether to use Paddle Serving for prediction | +| warmup | bool | False | Whether to enable warmup, this method can be used when statistical prediction time | +| draw_img_save_dir | str | "./inference_results" | The saving folder of the system's tandem prediction OCR results | +| save_crop_res | bool | False | Whether to save the recognized text image for OCR | +| crop_res_save_dir | str | "./output" | Save the text image path recognized by OCR | +| use_mp | bool | False | Whether to enable multi-process prediction | +| total_process_num | int | 6 | The number of processes, which takes effect when `use_mp` is `True` | +| process_id | int | 0 | The id number of the current process, no need to modify it yourself | +| benchmark | bool | False | Whether to enable benchmark, and make statistics on prediction speed, memory usage, etc. | +| save_log_path | str | "./log_output/" | Folder where log results are saved when `benchmark` is enabled | +| show_log | bool | True | Whether to show the log information in the inference | +| use_onnx | bool | False | Whether to enable onnx prediction | + +* Prediction engine related parameters + +| parameters | type | default | implication | +| :--: | :--: | :--: | :--: | +| use_gpu | bool | True | Whether to use GPU for prediction | +| ir_optim | bool | True | Whether to analyze and optimize the calculation graph. The prediction process can be accelerated when `ir_optim` is enabled | +| use_tensorrt | bool | False | Whether to enable tensorrt | +| min_subgraph_size | int | 15 | The minimum subgraph size in tensorrt. When the size of the subgraph is greater than this value, it will try to use the trt engine to calculate the subgraph. | +| precision | str | fp32 | The precision of prediction, supports `fp32`, `fp16`, `int8` | +| enable_mkldnn | bool | True | Whether to enable mkldnn | +| cpu_threads | int | 10 | When mkldnn is enabled, the number of threads predicted by the cpu | + +* Text detection model related parameters + +| parameters | type | default | implication | +| :--: | :--: | :--: | :--: | +| det_algorithm | str | "DB" | Text detection algorithm name, currently supports `DB`, `EAST`, `SAST`, `PSE`, `DB++`, `FCE` | +| det_model_dir | str | xx | Detection inference model paths | +| det_limit_side_len | int | 960 | image side length limit | +| det_limit_type | str | "max" | The side length limit type, currently supports `min`and `max`. 
`min` means to ensure that the shortest side of the image is not less than `det_limit_side_len`, `max` means to ensure that the longest side of the image is not greater than `det_limit_side_len` | + +The relevant parameters of the DB algorithm are as follows + +| parameters | type | default | implication | +| :--: | :--: | :--: | :--: | +| det_db_thresh | float | 0.3 | In the probability map output by DB, only pixels with a score greater than this threshold will be considered as text pixels | +| det_db_box_thresh | float | 0.6 | Within the detection box, when the average score of all pixels is greater than the threshold, the result will be considered as a text area | +| det_db_unclip_ratio | float | 1.5 | The expansion factor of the `Vatti clipping` algorithm, which is used to expand the text area | +| max_batch_size | int | 10 | max batch size | +| use_dilation | bool | False | Whether to inflate the segmentation results to obtain better detection results | +| det_db_score_mode | str | "fast" | DB detection result score calculation method, supports `fast` and `slow`, `fast` calculates the average score according to all pixels within the bounding rectangle of the polygon, `slow` calculates the average score according to all pixels within the original polygon, The calculation speed is relatively slower, but more accurate. | + +The relevant parameters of the EAST algorithm are as follows + +| parameters | type | default | implication | +| :--: | :--: | :--: | :--: | +| det_east_score_thresh | float | 0.8 | Threshold for score map in EAST postprocess | +| det_east_cover_thresh | float | 0.1 | Average score threshold for text boxes in EAST postprocess | +| det_east_nms_thresh | float | 0.2 | Threshold of nms in EAST postprocess | + +The relevant parameters of the SAST algorithm are as follows + +| parameters | type | default | implication | +| :--: | :--: | :--: | :--: | +| det_sast_score_thresh | float | 0.5 | Score thresholds in SAST postprocess | +| det_sast_nms_thresh | float | 0.5 | Thresholding of nms in SAST postprocess | +| det_box_type | str | 'quad' | Whether polygon detection, curved text scene (such as Total-Text) is set to 'poly' | + +The relevant parameters of the PSE algorithm are as follows + +| parameters | type | default | implication | +| :--: | :--: | :--: | :--: | +| det_pse_thresh | float | 0.0 | Threshold for binarizing the output image | +| det_pse_box_thresh | float | 0.85 | Threshold for filtering boxes, below this threshold is discarded | +| det_pse_min_area | float | 16 | The minimum area of the box, below this threshold is discarded | +| det_box_type | str | "quad" | The type of the returned box, quad: four point coordinates, poly: all point coordinates of the curved text | +| det_pse_scale | int | 1 | The ratio of the input image relative to the post-processed image, such as an image of `640*640`, the network output is `160*160`, and when the scale is 2, the shape of the post-processed image is `320*320`. 
Increasing this value can speed up post-processing, but it will reduce accuracy |
+
+* Text recognition model related parameters
+
+| parameters | type | default | implication |
+| :--: | :--: | :--: | :--: |
+| rec_algorithm | str | "CRNN" | Text recognition algorithm name, currently supports `CRNN`, `SRN`, `RARE`, `NRTR`, `SAR`, `ViTSTR`, `ABINet`, `VisionLAN`, `SPIN`, `RobustScanner`, `SVTR`, `SVTR_LCNet` |
+| rec_model_dir | str | None, it is required if using the recognition model | recognition inference model path |
+| rec_image_shape | str | "3,48,320" | Image size used at recognition time |
+| rec_batch_num | int | 6 | batch size |
+| max_text_length | int | 25 | The maximum length of the recognition result, valid for `SRN` |
+| rec_char_dict_path | str | "./ppocr/utils/ppocr_keys_v1.txt" | character dictionary file |
+| use_space_char | bool | True | Whether to include spaces; if `True`, the `space` character will be appended to the character dictionary |
+
+* End-to-end text detection and recognition model related parameters
+
+| parameters | type | default | implication |
+| :--: | :--: | :--: | :--: |
+| e2e_algorithm | str | "PGNet" | End-to-end algorithm name, currently supports `PGNet` |
+| e2e_model_dir | str | None, it is required if using the end-to-end model | end-to-end inference model path |
+| e2e_limit_side_len | int | 768 | End-to-end input image side length limit |
+| e2e_limit_type | str | "max" | End-to-end side length limit type, currently supports `min` and `max`. `min` means to ensure that the shortest side of the image is not less than `e2e_limit_side_len`, `max` means to ensure that the longest side of the image is not greater than `e2e_limit_side_len` |
+| e2e_pgnet_score_thresh | float | 0.5 | End-to-end score threshold, results below this threshold are discarded |
+| e2e_char_dict_path | str | "./ppocr/utils/ic15_dict.txt" | Recognition dictionary file path |
+| e2e_pgnet_valid_set | str | "totaltext" | The name of the validation set, currently supports `totaltext` and `partvgg`; different datasets use different post-processing methods, and it should be consistent with the training process |
+| e2e_pgnet_mode | str | "fast" | PGNet's detection result score calculation method, supports `fast` and `slow`; `fast` calculates the average score over all pixels within the bounding rectangle of the polygon, `slow` calculates the average score over all pixels within the original polygon; the calculation is slower but more accurate.
| + +* Angle classifier model related parameters + +| parameters | type | default | implication | +| :--: | :--: | :--: | :--: | +| use_angle_cls | bool | False | whether to use an angle classifier | +| cls_model_dir | str | None, if you need to use, you must specify the path explicitly | angle classifier inference model path | +| cls_image_shape | str | "3,48,192" | prediction shape | +| label_list | list | ['0', '180'] | The angle value corresponding to the class id | +| cls_batch_num | int | 6 | batch size | +| cls_thresh | float | 0.9 | Prediction threshold, when the model prediction result is 180 degrees, and the score is greater than the threshold, the final prediction result is considered to be 180 degrees and needs to be flipped | + +* OCR image preprocessing parameters + +| parameters | type | default | implication | +| :--: | :--: | :--: | :--: | +| invert | bool | False | whether to invert image before processing | +| binarize | bool | False | whether to threshold binarize image before processing | +| alphacolor | tuple | "255,255,255" | Replacement color for the alpha channel, if the latter is present; R,G,B integers | diff --git a/docs/ppocr/blog/inference_args.md b/docs/ppocr/blog/inference_args.md new file mode 100644 index 0000000000..1eec868995 --- /dev/null +++ b/docs/ppocr/blog/inference_args.md @@ -0,0 +1,118 @@ +# PaddleOCR模型推理参数解释 + +在使用PaddleOCR进行模型推理时,可以自定义修改参数,来修改模型、数据、预处理、后处理等内容(参数文件:[utility.py](../../tools/infer/utility.py)),详细的参数解释如下所示。 + +* 全局信息 + +| 参数名称 | 类型 | 默认值 | 含义 | +| :--: | :--: | :--: | :--: | +| image_dir | str | 无,必须显式指定 | 图像或者文件夹路径 | +| page_num | int | 0 | 当输入类型为pdf文件时有效,指定预测前面page_num页,默认预测所有页 | +| vis_font_path | str | "./doc/fonts/simfang.ttf" | 用于可视化的字体路径 | +| drop_score | float | 0.5 | 识别得分小于该值的结果会被丢弃,不会作为返回结果 | +| use_pdserving | bool | False | 是否使用Paddle Serving进行预测 | +| warmup | bool | False | 是否开启warmup,在统计预测耗时的时候,可以使用这种方法 | +| draw_img_save_dir | str | "./inference_results" | 系统串联预测OCR结果的保存文件夹 | +| save_crop_res | bool | False | 是否保存OCR的识别文本图像 | +| crop_res_save_dir | str | "./output" | 保存OCR识别出来的文本图像路径 | +| use_mp | bool | False | 是否开启多进程预测 | +| total_process_num | int | 6 | 开启的进程数,`use_mp`为`True`时生效 | +| process_id | int | 0 | 当前进程的id号,无需自己修改 | +| benchmark | bool | False | 是否开启benchmark,对预测速度、显存占用等进行统计 | +| save_log_path | str | "./log_output/" | 开启`benchmark`时,日志结果的保存文件夹 | +| show_log | bool | True | 是否显示预测中的日志信息 | +| use_onnx | bool | False | 是否开启onnx预测 | + +* 预测引擎相关 + +| 参数名称 | 类型 | 默认值 | 含义 | +| :--: | :--: | :--: | :--: | +| use_gpu | bool | True | 是否使用GPU进行预测 | +| ir_optim | bool | True | 是否对计算图进行分析与优化,开启后可以加速预测过程 | +| use_tensorrt | bool | False | 是否开启tensorrt | +| min_subgraph_size | int | 15 | tensorrt中最小子图size,当子图的size大于该值时,才会尝试对该子图使用trt engine计算 | +| precision | str | fp32 | 预测的精度,支持`fp32`, `fp16`, `int8` 3种输入 | +| enable_mkldnn | bool | True | 是否开启mkldnn | +| cpu_threads | int | 10 | 开启mkldnn时,cpu预测的线程数 | + +* 文本检测模型相关 + +| 参数名称 | 类型 | 默认值 | 含义 | +| :--: | :--: | :--: | :--: | +| det_algorithm | str | "DB" | 文本检测算法名称,目前支持`DB`, `EAST`, `SAST`, `PSE`, `DB++`, `FCE` | +| det_model_dir | str | xx | 检测inference模型路径 | +| det_limit_side_len | int | 960 | 检测的图像边长限制 | +| det_limit_type | str | "max" | 检测的边长限制类型,目前支持`min`和`max`,`min`表示保证图像最短边不小于`det_limit_side_len`,`max`表示保证图像最长边不大于`det_limit_side_len` | + +其中,DB算法相关参数如下 + +| 参数名称 | 类型 | 默认值 | 含义 | +| :--: | :--: | :--: | :--: | +| det_db_thresh | float | 0.3 | DB输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点 | +| det_db_box_thresh | float | 0.6 | 检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域 | +| 
det_db_unclip_ratio | float | 1.5 | `Vatti clipping`算法的扩张系数,使用该方法对文字区域进行扩张 | +| max_batch_size | int | 10 | 预测的batch size | +| use_dilation | bool | False | 是否对分割结果进行膨胀以获取更优检测效果 | +| det_db_score_mode | str | "fast" | DB的检测结果得分计算方法,支持`fast`和`slow`,`fast`是根据polygon的外接矩形边框内的所有像素计算平均得分,`slow`是根据原始polygon内的所有像素计算平均得分,计算速度相对较慢一些,但是更加准确一些。 | + +EAST算法相关参数如下 + +| 参数名称 | 类型 | 默认值 | 含义 | +| :--: | :--: | :--: | :--: | +| det_east_score_thresh | float | 0.8 | EAST后处理中score map的阈值 | +| det_east_cover_thresh | float | 0.1 | EAST后处理中文本框的平均得分阈值 | +| det_east_nms_thresh | float | 0.2 | EAST后处理中nms的阈值 | + +SAST算法相关参数如下 + +| 参数名称 | 类型 | 默认值 | 含义 | +| :--: | :--: | :--: | :--: | +| det_sast_score_thresh | float | 0.5 | SAST后处理中的得分阈值 | +| det_sast_nms_thresh | float | 0.5 | SAST后处理中nms的阈值 | +| det_box_type | str | quad | 是否多边形检测,弯曲文本场景(如Total-Text)设置为'poly' | + +PSE算法相关参数如下 + +| 参数名称 | 类型 | 默认值 | 含义 | +| :--: | :--: | :--: | :--: | +| det_pse_thresh | float | 0.0 | 对输出图做二值化的阈值 | +| det_pse_box_thresh | float | 0.85 | 对box进行过滤的阈值,低于此阈值的丢弃 | +| det_pse_min_area | float | 16 | box的最小面积,低于此阈值的丢弃 | +| det_box_type | str | "quad" | 返回框的类型,quad:四点坐标,poly: 弯曲文本的所有点坐标 | +| det_pse_scale | int | 1 | 输入图像相对于进后处理的图的比例,如`640*640`的图像,网络输出为`160*160`,scale为2的情况下,进后处理的图片shape为`320*320`。这个值调大可以加快后处理速度,但是会带来精度的下降 | + +* 文本识别模型相关 + +| 参数名称 | 类型 | 默认值 | 含义 | +| :--: | :--: | :--: | :--: | +| rec_algorithm | str | "CRNN" | 文本识别算法名称,目前支持`CRNN`, `SRN`, `RARE`, `NETR`, `SAR`, `ViTSTR`, `ABINet`, `VisionLAN`, `SPIN`, `RobustScanner`, `SVTR`, `SVTR_LCNet` | +| rec_model_dir | str | 无,如果使用识别模型,该项是必填项 | 识别inference模型路径 | +| rec_image_shape | str | "3,48,320" | 识别时的图像尺寸 | +| rec_batch_num | int | 6 | 识别的batch size | +| max_text_length | int | 25 | 识别结果最大长度,在`SRN`中有效 | +| rec_char_dict_path | str | "./ppocr/utils/ppocr_keys_v1.txt" | 识别的字符字典文件 | +| use_space_char | bool | True | 是否包含空格,如果为`True`,则会在最后字符字典中补充`空格`字符 | + +* 端到端文本检测与识别模型相关 + +| 参数名称 | 类型 | 默认值 | 含义 | +| :--: | :--: | :--: | :--: | +| e2e_algorithm | str | "PGNet" | 端到端算法名称,目前支持`PGNet` | +| e2e_model_dir | str | 无,如果使用端到端模型,该项是必填项 | 端到端模型inference模型路径 | +| e2e_limit_side_len | int | 768 | 端到端的输入图像边长限制 | +| e2e_limit_type | str | "max" | 端到端的边长限制类型,目前支持`min`, `max`,`min`表示保证图像最短边不小于`e2e_limit_side_len`,`max`表示保证图像最长边不大于`e2e_limit_side_len` | +| e2e_pgnet_score_thresh | float | 0.5 | 端到端得分阈值,小于该阈值的结果会被丢弃 | +| e2e_char_dict_path | str | "./ppocr/utils/ic15_dict.txt" | 识别的字典文件路径 | +| e2e_pgnet_valid_set | str | "totaltext" | 验证集名称,目前支持`totaltext`, `partvgg`,不同数据集对应的后处理方式不同,与训练过程保持一致即可 | +| e2e_pgnet_mode | str | "fast" | PGNet的检测结果得分计算方法,支持`fast`和`slow`,`fast`是根据polygon的外接矩形边框内的所有像素计算平均得分,`slow`是根据原始polygon内的所有像素计算平均得分,计算速度相对较慢一些,但是更加准确一些。 | + +* 方向分类器模型相关 + +| 参数名称 | 类型 | 默认值 | 含义 | +| :--: | :--: | :--: | :--: | +| use_angle_cls | bool | False | 是否使用方向分类器 | +| cls_model_dir | str | 无,如果需要使用,则必须显式指定路径 | 方向分类器inference模型路径 | +| cls_image_shape | str | "3,48,192" | 预测尺度 | +| label_list | list | ['0', '180'] | class id对应的角度值 | +| cls_batch_num | int | 6 | 方向分类器预测的batch size | +| cls_thresh | float | 0.9 | 预测阈值,模型预测结果为180度,且得分大于该阈值时,认为最终预测结果为180度,需要翻转 | diff --git a/docs/ppocr/blog/multi_languages.en.md b/docs/ppocr/blog/multi_languages.en.md new file mode 100644 index 0000000000..ed71a2d439 --- /dev/null +++ b/docs/ppocr/blog/multi_languages.en.md @@ -0,0 +1,223 @@ +--- +comments: true +typora-copy-images-to: images +--- + +# Multi-language model + +**Recent Update** + +- 2022.5.8 update the `PP-OCRv3` version of the multi-language detection and recognition model, and the average 
recognition accuracy has increased by more than 5%. +- 2021.4.9 supports the detection and recognition of 80 languages +- 2021.4.9 supports **lightweight high-precision** English model detection and recognition + +PaddleOCR aims to create a rich, leading, and practical OCR tool library, which not only provides +Chinese and English models in general scenarios, but also provides models specifically trained +in English scenarios. And multilingual models covering [80 languages](#language_abbreviations). + +Among them, the English model supports the detection and recognition of uppercase and lowercase +letters and common punctuation, and the recognition of space characters is optimized: + +![img](./images/img_12.jpg) + +The multilingual models cover Latin, Arabic, Traditional Chinese, Korean, Japanese, etc.: + +![img](./images/japan_2-20240709081138234.jpg) + +![img](./images/french_0.jpg) + +![img](./images/korean_0.jpg) + +![img](./images/arabic_0.jpg) + +This document will briefly introduce how to use the multilingual model. + +## 1 Installation + +### 1.1 Paddle installation + +```bash linenums="1" +# cpu +pip install paddlepaddle + +# gpu +pip install paddlepaddle-gpu +``` + +### 1.2 PaddleOCR package installation + +```bash linenums="1" +pip install paddleocr +``` + +Build and install locally + +```bash linenums="1" +python3 -m build +pip3 install dist/paddleocr-x.x.x-py3-none-any.whl # x.x.x is the version number of paddleocr +``` + +## 2 Quick use + +### 2.1 Command line operation + +View help information + +```bash linenums="1" +paddleocr -h +``` + +- Whole image prediction (detection + recognition) + +PaddleOCR currently supports 80 languages, which can be specified by the --lang parameter. +The supported languages are listed in the [table](#language_abbreviations). + +``` bash +paddleocr --image_dir doc/imgs_en/254.jpg --lang=en +``` + +![](./images/254-20240709081442260.jpg) + +![img](./images/img_02.jpg) + +The result is a list. Each item contains a text box, text and recognition confidence + +```text linenums="1" +[('PHO CAPITAL', 0.95723116), [[66.0, 50.0], [327.0, 44.0], [327.0, 76.0], [67.0, 82.0]]] +[('107 State Street', 0.96311164), [[72.0, 90.0], [451.0, 84.0], [452.0, 116.0], [73.0, 121.0]]] +[('Montpelier Vermont', 0.97389287), [[69.0, 132.0], [501.0, 126.0], [501.0, 158.0], [70.0, 164.0]]] +[('8022256183', 0.99810505), [[71.0, 175.0], [363.0, 170.0], [364.0, 202.0], [72.0, 207.0]]] +[('REG 07-24-201706:59 PM', 0.93537045), [[73.0, 299.0], [653.0, 281.0], [654.0, 318.0], [74.0, 336.0]]] +[('045555', 0.99346405), [[509.0, 331.0], [651.0, 325.0], [652.0, 356.0], [511.0, 362.0]]] +[('CT1', 0.9988654), [[535.0, 367.0], [654.0, 367.0], [654.0, 406.0], [535.0, 406.0]]] +...... +``` + +- Recognition + +```bash linenums="1" +paddleocr --image_dir doc/imgs_words_en/word_308.png --det false --lang=en +``` + +![img](./images/word_308.png) + +The result is a 2-tuple, which contains the recognition result and recognition confidence + +```text linenums="1" +(0.99879867, 'LITTLE') +``` + +- Detection + +```bash linenums="1" +paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --rec false +``` + +The result is a list. Each item represents the coordinates of a text box. + +```bash linenums="1" +[[26.0, 457.0], [137.0, 457.0], [137.0, 477.0], [26.0, 477.0]] +[[25.0, 425.0], [372.0, 425.0], [372.0, 448.0], [25.0, 448.0]] +[[128.0, 397.0], [273.0, 397.0], [273.0, 414.0], [128.0, 414.0]] +...... 
+``` + +### 2.2 Run with Python script + +PPOCR is able to run with Python scripts for easy integration with your own code: + +- Whole image prediction (detection + recognition) + +```python linenums="1" +from paddleocr import PaddleOCR, draw_ocr + +# Also switch the language by modifying the lang parameter +ocr = PaddleOCR(lang="korean") # The model file will be downloaded automatically when executed for the first time +img_path ='doc/imgs/korean_1.jpg' +result = ocr.ocr(img_path) +# Recognition and detection can be performed separately through parameter control +# result = ocr.ocr(img_path, det=False) Only perform recognition +# result = ocr.ocr(img_path, rec=False) Only perform detection +# Print detection frame and recognition result +for line in result: + print(line) + +# Visualization +from PIL import Image +image = Image.open(img_path).convert('RGB') +boxes = [line[0] for line in result] +txts = [line[1][0] for line in result] +scores = [line[1][1] for line in result] +im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc/fonts/korean.ttf') +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + +Visualization of results: + +![img](./images/korean.jpg) + +PPOCR also supports direction classification. For more detailed usage, please refer to: [whl package instructions](whl_en.md). + +## 3 Custom training + +PPOCR supports using your own data for custom training or fine-tune, where the recognition model can refer to [French configuration file](../../configs/rec/multi_language/rec_french_lite_train.yml) +Modify the training data path, dictionary and other parameters. + +For specific data preparation and training process, please refer to: [Text Detection](../doc_en/detection_en.md), [Text Recognition](../doc_en/recognition_en.md), more functions such as predictive deployment, +For functions such as data annotation, you can read the complete [Document Tutorial](../../README.md). + +## 4 Inference and Deployment + +In addition to installing the whl package for quick forecasting, +PPOCR also provides a variety of forecasting deployment methods. 
+If necessary, you can read related documents: + +- [Python Inference](./inference_ppocr_en.md) +- [C++ Inference](../../deploy/cpp_infer/readme.md) +- [Serving](../../deploy/hubserving/readme_en.md) +- [Mobile](../../deploy/lite/readme.md) +- [Benchmark](./benchmark_en.md) + +## 5 Support languages and abbreviations + +| Language | Abbreviation | | Language | Abbreviation | +| --- | --- | --- | --- | --- | +|Chinese & English|ch| |Arabic|ar| +|English|en| |Hindi|hi| +|French|fr| |Uyghur|ug| +|German|german| |Persian|fa| +|Japan|japan| |Urdu|ur| +|Korean|korean| | Serbian(latin) |rs_latin| +|Chinese Traditional |chinese_cht| |Occitan |oc| +| Italian |it| |Marathi|mr| +|Spanish |es| |Nepali|ne| +| Portuguese|pt| |Serbian(cyrillic)|rs_cyrillic| +|Russia|ru||Bulgarian |bg| +|Ukranian|uk| |Estonian |et| +|Belarusian|be| |Irish |ga| +|Telugu |te| |Croatian |hr| +|Saudi Arabia|sa| |Hungarian |hu| +|Tamil |ta| |Indonesian|id| +|Afrikaans |af| |Icelandic|is| +|Azerbaijani |az||Kurdish|ku| +|Bosnian|bs| |Lithuanian |lt| +|Czech|cs| |Latvian |lv| +|Welsh |cy| |Maori|mi| +|Danish|da| |Malay|ms| +|Maltese |mt| |Adyghe |ady| +|Dutch |nl| |Kabardian |kbd| +|Norwegian |no| |Avar |ava| +|Polish |pl| |Dargwa |dar| +|Romanian |ro| |Ingush |inh| +|Slovak |sk| |Lak |lbe| +|Slovenian |sl| |Lezghian |lez| +|Albanian |sq| |Tabassaran |tab| +|Swedish |sv| |Bihari |bh| +|Swahili |sw| |Maithili |mai| +|Tagalog |tl| |Angika |ang| +|Turkish |tr| |Bhojpuri |bho| +|Uzbek |uz| |Magahi |mah| +|Vietnamese |vi| |Nagpur |sck| +|Mongolian |mn| |Newari |new| +|Abaza |abq| |Goan Konkani|gom| diff --git a/docs/ppocr/blog/multi_languages.md b/docs/ppocr/blog/multi_languages.md new file mode 100644 index 0000000000..22c331121f --- /dev/null +++ b/docs/ppocr/blog/multi_languages.md @@ -0,0 +1,273 @@ +--- +comments: true +typora-copy-images-to: images +--- + +# 多语言模型 + +**近期更新** + +- 2022.5.8 更新`PP-OCRv3`版 多语言检测和识别模型,平均识别准确率提升5%以上。 +- 2021.4.9 支持**80种**语言的检测和识别 +- 2021.4.9 支持**轻量高精度**英文模型检测识别 + +PaddleOCR 旨在打造一套丰富、领先、且实用的OCR工具库,不仅提供了通用场景下的中英文模型,也提供了专门在英文场景下训练的模型, +和覆盖[80个语言](#语种缩写)的小语种模型。 + +其中英文模型支持,大小写字母和常见标点的检测识别,并优化了空格字符的识别: + +![img](./images/img_12.jpg) + +小语种模型覆盖了拉丁语系、阿拉伯语系、中文繁体、韩语、日语等等: + +![img](./images/japan_2-20240709081138234.jpg) + +![img](./images/french_0.jpg) + +![img](./images/korean_0.jpg) + +![img](./images/arabic_0.jpg) + +本文档将简要介绍小语种模型的使用方法。 + +## 1 安装 + +### 1.1 paddle 安装 + +```bash linenums="1" +# cpu +pip install paddlepaddle + +# gpu +pip install paddlepaddle-gpu +``` + +### 1.2 paddleocr package 安装 + +pip 安装 + +```bash linenums="1" +pip install paddleocr +``` + +本地构建并安装 + +```bash linenums="1" +python3 -m build +pip3 install dist/paddleocr-x.x.x-py3-none-any.whl # x.x.x是paddleocr的版本号 +``` + +## 2 快速使用 + +### 2.1 命令行运行 + +查看帮助信息 + +```bash linenums="1" +paddleocr -h +``` + +- 整图预测(检测+识别) + +Paddleocr目前支持80个语种,可以通过修改--lang参数进行切换,具体支持的[语种](#语种缩写)可查看表格。 + +``` bash +paddleocr --image_dir doc/imgs_en/254.jpg --lang=en +``` + +![](./images/254-20240709081442260.jpg) + +![img](./images/img_02.jpg) + +结果是一个list,每个item包含了文本框,文字和识别置信度 + +```text linenums="1" +[('PHO CAPITAL', 0.95723116), [[66.0, 50.0], [327.0, 44.0], [327.0, 76.0], [67.0, 82.0]]] +[('107 State Street', 0.96311164), [[72.0, 90.0], [451.0, 84.0], [452.0, 116.0], [73.0, 121.0]]] +[('Montpelier Vermont', 0.97389287), [[69.0, 132.0], [501.0, 126.0], [501.0, 158.0], [70.0, 164.0]]] +[('8022256183', 0.99810505), [[71.0, 175.0], [363.0, 170.0], [364.0, 202.0], [72.0, 207.0]]] +[('REG 07-24-201706:59 PM', 0.93537045), [[73.0, 299.0], [653.0, 281.0], 
[654.0, 318.0], [74.0, 336.0]]] +[('045555', 0.99346405), [[509.0, 331.0], [651.0, 325.0], [652.0, 356.0], [511.0, 362.0]]] +[('CT1', 0.9988654), [[535.0, 367.0], [654.0, 367.0], [654.0, 406.0], [535.0, 406.0]]] +...... +``` + +- 识别预测 + +```bash linenums="1" +paddleocr --image_dir doc/imgs_words_en/word_308.png --det false --lang=en +``` + +![img](./images/word_308.png) + +结果是一个tuple,返回识别结果和识别置信度 + +```text linenums="1" +(0.99879867, 'LITTLE') +``` + +- 检测预测 + +```bash linenums="1" +paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --rec false +``` + +结果是一个list,每个item只包含文本框 + +```bash linenums="1" +[[26.0, 457.0], [137.0, 457.0], [137.0, 477.0], [26.0, 477.0]] +[[25.0, 425.0], [372.0, 425.0], [372.0, 448.0], [25.0, 448.0]] +[[128.0, 397.0], [273.0, 397.0], [273.0, 414.0], [128.0, 414.0]] +...... +``` + +### 2.2 python 脚本运行 + +ppocr 也支持在python脚本中运行,便于嵌入到您自己的代码中 : + +- 整图预测(检测+识别) + +```python linenums="1" +from paddleocr import PaddleOCR, draw_ocr + +# 同样也是通过修改 lang 参数切换语种 +ocr = PaddleOCR(lang="korean") # 首次执行会自动下载模型文件 +img_path = 'doc/imgs/korean_1.jpg ' +result = ocr.ocr(img_path) +# 可通过参数控制单独执行识别、检测 +# result = ocr.ocr(img_path, det=False) 只执行识别 +# result = ocr.ocr(img_path, rec=False) 只执行检测 +# 打印检测框和识别结果 +for line in result: + print(line) + +# 可视化 +from PIL import Image +image = Image.open(img_path).convert('RGB') +boxes = [line[0] for line in result] +txts = [line[1][0] for line in result] +scores = [line[1][1] for line in result] +im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc/fonts/korean.ttf') +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + +结果可视化: + +![img](./images/korean.jpg) + +ppocr 还支持方向分类, 更多使用方式请参考:[whl包使用说明](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.0/doc/doc_ch/whl.md) + +## 3 自定义训练 + +ppocr 支持使用自己的数据进行自定义训练或finetune, 其中识别模型可以参考[法语配置文件](../../configs/rec/multi_language/rec_french_lite_train.yml) +修改训练数据路径、字典等参数。 + +详细数据准备、训练过程可参考:[文本识别](../doc_ch/recognition.md)、[文本检测](../doc_ch/detection.md)。 + +假设已经准备好了训练数据,可根据以下步骤快速启动训练: + +- 修改配置文件 + +以 `rec_french_lite_train.yml` 为例: + +```yaml linenums="1" +Global: + ... + # 添加自定义字典,如修改字典请将路径指向新字典 + character_dict_path: ./ppocr/utils/dict/french_dict.txt + ... + # 识别空格 + use_space_char: True + +... + +Train: + dataset: + # 数据集格式,支持LMDBDataSet以及SimpleDataSet + name: SimpleDataSet + # 数据集路径 + data_dir: ./train_data/ + # 训练集标签文件 + label_file_list: ["./train_data/french_train.txt"] + ... + +Eval: + dataset: + # 数据集格式,支持LMDBDataSet以及SimpleDataSet + name: SimpleDataSet + # 数据集路径 + data_dir: ./train_data + # 验证集标签文件 + label_file_list: ["./train_data/french_val.txt"] + ... 
+``` + +- 启动训练: + +```bash linenums="1" +# 下载预训练模型 +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/french_mobile_v2.0_rec_train.tar +tar -xf french_mobile_v2.0_rec_train.tar + +#加载预训练模型 单卡训练 +python3 tools/train.py -c configs/rec/rec_french_lite_train.yml -o Global.pretrained_model=french_mobile_v2.0_rec_train/best_accuracy + +#加载预训练模型 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_french_lite_train.yml -o Global.pretrained_model=french_mobile_v2.0_rec_train/best_accuracy +``` + +更多功能如预测部署、数据标注等功能可以阅读完整的[文档教程](../../README_ch.md)。 + +## 4 预测部署 + +除了安装whl包进行快速预测,ppocr 也提供了多种预测部署方式,如有需求可阅读相关文档: + +- [基于Python脚本预测引擎推理](./inference_ppocr.md) +- [基于C++预测引擎推理](../../deploy/cpp_infer/readme_ch.md) +- [服务化部署](../../deploy/hubserving/readme.md) +- [端侧部署](../../deploy/lite/readme_ch.md) +- [Benchmark](./benchmark.md) + +## 5 支持语种及缩写 + +| 语种 | 描述 | 缩写 | | 语种 | 描述 | 缩写 | +| --- | --- | --- | ---|--- | --- | --- | +|中文|chinese and english|ch| |保加利亚文|Bulgarian |bg| +|英文|english|en| |乌克兰文|Ukranian|uk| +|法文|french|fr| |白俄罗斯文|Belarusian|be| +|德文|german|german| |泰卢固文|Telugu |te| +|日文|japan|japan| | 阿巴扎文 |Abaza | abq | +|韩文|korean|korean| |泰米尔文|Tamil |ta| +|中文繁体|chinese traditional |chinese_cht| |南非荷兰文 |Afrikaans |af| +|意大利文| Italian |it| |阿塞拜疆文 |Azerbaijani |az| +|西班牙文|Spanish |es| |波斯尼亚文|Bosnian|bs| +|葡萄牙文| Portuguese|pt| |捷克文|Czech|cs| +|俄罗斯文|Russia|ru| |威尔士文 |Welsh |cy| +|阿拉伯文|Arabic|ar| |丹麦文 |Danish|da| +|印地文|Hindi|hi| |爱沙尼亚文 |Estonian |et| +|维吾尔|Uyghur|ug| |爱尔兰文 |Irish |ga| +|波斯文|Persian|fa| |克罗地亚文|Croatian |hr| +|乌尔都文|Urdu|ur| |匈牙利文|Hungarian |hu| +|塞尔维亚文(latin)| Serbian(latin) |rs_latin| |印尼文|Indonesian|id| +|欧西坦文|Occitan |oc| |冰岛文 |Icelandic|is| +|马拉地文|Marathi|mr| |库尔德文 |Kurdish|ku| +|尼泊尔文|Nepali|ne| |立陶宛文|Lithuanian |lt| +|塞尔维亚文(cyrillic)|Serbian(cyrillic)|rs_cyrillic| |拉脱维亚文 |Latvian |lv| +|毛利文|Maori|mi| | 达尔瓦文|Dargwa |dar| +|马来文 |Malay|ms| | 因古什文|Ingush |inh| +|马耳他文 |Maltese |mt| | 拉克文|Lak |lbe| +|荷兰文 |Dutch |nl| | 莱兹甘文|Lezghian |lez| +|挪威文 |Norwegian |no| |塔巴萨兰文 |Tabassaran |tab| +|波兰文|Polish |pl| | 比尔哈文|Bihari |bh| +| 罗马尼亚文|Romanian |ro| | 迈蒂利文|Maithili |mai| +| 斯洛伐克文|Slovak |sk| | 昂加文|Angika |ang| +| 斯洛文尼亚文|Slovenian |sl| | 孟加拉文|Bhojpuri |bho| +| 阿尔巴尼亚文|Albanian |sq| | 摩揭陀文 |Magahi |mah| +| 瑞典文|Swedish |sv| | 那格浦尔文|Nagpur |sck| +| 西瓦希里文|Swahili |sw| | 尼瓦尔文|Newari |new| +| 塔加洛文|Tagalog |tl| | 保加利亚文 |Goan Konkani|gom| +| 土耳其文|Turkish |tr| | 沙特阿拉伯文|Saudi Arabia|sa| +| 乌兹别克文|Uzbek |uz| | 阿瓦尔文|Avar |ava| +| 越南文|Vietnamese |vi| | 阿瓦尔文|Avar |ava| +| 蒙古文|Mongolian |mn| | 阿迪赫文|Adyghe |ady| diff --git a/docs/ppocr/blog/ocr_book.en.md b/docs/ppocr/blog/ocr_book.en.md new file mode 100644 index 0000000000..3228eb942e --- /dev/null +++ b/docs/ppocr/blog/ocr_book.en.md @@ -0,0 +1,28 @@ +--- +comments: true +typora-copy-images-to: images +--- + +# E-book: *Dive Into OCR* + +"Dive Into OCR" is a textbook that combines OCR theory and practice, written by the PaddleOCR community. 
The main features are as follows: + +- OCR full-stack technology covering text detection, recognition and document analysis +- Closely integrate theory and practice, cross the code implementation gap, and supporting instructional videos +- Jupyter Notebook textbook, flexibly modifying code for instant results + +## Structure + +![img](./images/187578511-9f3c351e-b68c-4359-a6e5-475810993c61.png) + +- The first part is the preliminary knowledge of the book, including the knowledge index and resource links needed in the process of positioning and using the book content of the book + +- The second part is chapters 4-8 of the book, which introduce the concepts, applications, and industry practices related to the detection and identification capabilities of the OCR engine. In the "Introduction to OCR Technology", the application scenarios and challenges of OCR, the basic concepts of technology, and the pain points in industrial applications are comprehensively explained. Then, in the two chapters of "Text Detection" and "Text Recognition", the two basic tasks of OCR are introduced. In each chapter, an algorithm is accompanied by a detailed explanation of the code and practical exercises. Chapters 6 and 7 are a detailed introduction to the PP-OCR series model, PP-OCR is a set of OCR systems for industrial applications, on the basis of the basic detection and identification model, after a series of optimization strategies to achieve the general field of industrial SOTA model, while opening up a variety of predictive deployment solutions, enabling enterprises to quickly land OCR applications. + +- The third part is chapter 9-12 of the book, which introduces applications other than the two-stage OCR engine, including data synthesis, preprocessing algorithm, and end-to-end model, focusing on OCR's layout analysis, table recognition, visual document question and answer capabilities in the document scene, and also through the combination of algorithm and code, so that readers can deeply understand and apply. 
+ +## Address + +- [E-book: *Dive Into OCR* (PDF)](https://paddleocr.bj.bcebos.com/ebook/Dive_into_OCR.pdf) +- [Notebook (.ipynb)](https://github.com/PaddleOCR-Community/Dive-into-OCR) +- [Videos (Chinese only)](https://aistudio.baidu.com/aistudio/education/group/info/25207) diff --git a/docs/ppocr/blog/ocr_book.md b/docs/ppocr/blog/ocr_book.md new file mode 100644 index 0000000000..e3e51a77f0 --- /dev/null +++ b/docs/ppocr/blog/ocr_book.md @@ -0,0 +1,29 @@ +--- +comments: true +typora-copy-images-to: images +--- + +# 《动手学OCR》电子书 + +《动手学OCR》是PaddleOCR团队携手华中科技大学博导/教授,IAPR Fellow 白翔、复旦大学青年研究员陈智能、中国移动研究院视觉领域资深专家黄文辉、中国工商银行大数据人工智能实验室研究员等产学研同仁,以及OCR开发者共同打造的结合OCR前沿理论与代码实践的教材。主要特色如下: + +- 覆盖从文本检测识别到文档分析的OCR全栈技术 +- 紧密结合理论实践,跨越代码实现鸿沟,并配套教学视频 +- Notebook交互式学习,灵活修改代码,即刻获得结果 + +## 本书结构 + +![](./images/5e612.png) + +- 第一部分是本书的推荐序、序言与预备知识,包含本书的定位与使用书籍内容的过程中需要用到的知识索引、资源链接等 +- 第二部分是本书的4-8章,介绍与OCR核心的检测、识别能力相关的概念、应用与产业实践。在“OCR技术导论”中总括性的解释OCR的应用场景和挑战、技术基本概念以及在产业应用中的痛点问题。然后在 +“文本检测”与“文本识别”两章中介绍OCR的两个基本任务,并在每章中配套一个算法展开代码详解与实战练习。第6、7章是关于PP-OCR系列模型的详细介绍,PP-OCR是一套面向产业应用的OCR系统,在 +基础检测和识别模型的基础之上经过一系列优化策略达到通用领域的产业级SOTA模型,同时打通多种预测部署方案,赋能企业快速落地OCR应用。 +- 第三部分是本书的9-12章,介绍两阶段OCR引擎之外的应用,包括数据合成、预处理算法、端到端模型,重点展开了OCR在文档场景下的版面分析、表格识别、视觉文档问答的能力,同样通过算法与代码结 +合的方式使得读者能够深入理解并应用。 + +## 资料地址 + +- 中文版电子书下载请扫描首页二维码入群后领取 +- [notebook教程](https://github.com/PaddleOCR-Community/Dive-into-OCR) +- [教学视频](https://aistudio.baidu.com/aistudio/education/group/info/25207) diff --git a/docs/ppocr/blog/slice.en.md b/docs/ppocr/blog/slice.en.md new file mode 100644 index 0000000000..0b8b95d135 --- /dev/null +++ b/docs/ppocr/blog/slice.en.md @@ -0,0 +1,22 @@ +--- +comments: true +--- + + +# Slice Operator + +If you have a very large image/document that you would like to run PaddleOCR (detection and recognition) on, you can use the slice operation as follows: + +`ocr_inst = PaddleOCR(**ocr_settings)` +`results = ocr_inst.ocr(img, det=True,rec=True, slice=slice, cls=False,bin=False,inv=False,alpha_color=False)` + +where +`slice = {'horizontal_stride': h_stride, 'vertical_stride':v_stride, 'merge_x_thres':x_thres, 'merge_y_thres': y_thres}` + +Here, `h_stride`, `v_stride`, `x_thres`, and `y_thres` are user-configurable values and need to be set manually. The way the `slice` operator works is that it runs a sliding window across the large input image, creating slices of it and runs the OCR algorithms on it. + +The fragmented slice-level results are then merged together to output image-level detection and recognition results. The horizontal and vertical strides cannot be lower than a certain limit (as too low values would create so many slices it would be very computationally expensive to get results for each of them). However, as an example the recommended values for an image with dimensions 6616x14886 would be as follows. + +`slice = {'horizontal_stride': 300, 'vertical_stride':500, 'merge_x_thres':50, 'merge_y_thres': 35}` + +All slice-level detections with bounding boxes as close as `merge_x_thres` and `merge_y_thres` will be merged together. 
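+
+Putting the pieces above together, a minimal runnable sketch could look like the following. The image path, the `lang` setting and the stride/threshold values are illustrative placeholders (the numbers are taken from the 6616x14886 example above), not universally recommended settings:
+
+```python linenums="1"
+from paddleocr import PaddleOCR
+
+# Placeholder input: a very large page scan (several thousand pixels per side).
+img = "very_large_page.jpg"
+
+# Sliding-window strides plus the thresholds used to merge slice-level boxes
+# back into image-level detection and recognition results.
+slice = {'horizontal_stride': 300, 'vertical_stride': 500,
+         'merge_x_thres': 50, 'merge_y_thres': 35}
+
+ocr_inst = PaddleOCR(lang="en")  # illustrative settings
+results = ocr_inst.ocr(img, det=True, rec=True, slice=slice,
+                       cls=False, bin=False, inv=False, alpha_color=False)
+
+# Each item contains a merged bounding box, the text and its recognition confidence.
+for line in results[0]:
+    print(line)
+```
+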
diff --git a/docs/ppocr/blog/slice.md b/docs/ppocr/blog/slice.md new file mode 100644 index 0000000000..eae91ad399 --- /dev/null +++ b/docs/ppocr/blog/slice.md @@ -0,0 +1,25 @@ +--- +comments: true +--- + +# 切片操作 + +如果希望运行 PaddleOCR 处理一张非常大的图像或文档,对其进行检测和识别,可以使用切片操作,如下所示: + +```python linenums="1" +ocr_inst = PaddleOCR(**ocr_settings) +results = ocr_inst.ocr(img, det=True, rec=True, slice=slice, cls=False, bin=False, inv=False, alpha_color=False) +``` + +其中, +`slice = {'horizontal_stride': h_stride, 'vertical_stride': v_stride, 'merge_x_thres': x_thres, 'merge_y_thres': y_thres}` + +这里的 `h_stride`、`v_stride`、`x_thres` 和 `y_thres` 是用户可配置的参数,需要手动设置。切片操作符的工作原理是,在大图像上运行一个滑动窗口,创建图像的切片,并在这些切片上运行 OCR 算法。 + +然后将这些切片级别的零散结果合并,生成图像级别的检测和识别结果。水平和垂直步幅不能低于一定限度,因为过低的值会产生太多切片,导致计算结果非常耗时。例如,对于尺寸为 6616x14886 的图像,推荐使用以下参数: + +```python linenums="1" +slice = {'horizontal_stride': 300, 'vertical_stride': 500, 'merge_x_thres': 50, 'merge_y_thres': 35} +``` + +所有边界框接近 `merge_x_thres` 和 `merge_y_thres` 的切片级检测结果将被合并在一起。 diff --git a/docs/ppocr/blog/whl.en.md b/docs/ppocr/blog/whl.en.md new file mode 100644 index 0000000000..190a9372ca --- /dev/null +++ b/docs/ppocr/blog/whl.en.md @@ -0,0 +1,488 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# Paddleocr Package + +## 1 Get started quickly + +### 1.1 install package + +install by pypi + +```bash linenums="1" +pip install "paddleocr>=2.0.1" # Recommend to use version 2.0.1+ +``` + +build own whl package and install + +```bash linenums="1" +python3 -m build +pip3 install dist/paddleocr-x.x.x-py3-none-any.whl # x.x.x is the version of paddleocr +``` + +## 2 Use + +### 2.1 Use by code + +The paddleocr whl package will automatically download the ppocr lightweight model as the default model, which can be customized and replaced according to the section 3 **Custom Model**. + +* detection angle classification and recognition + +```python linenums="1" +from paddleocr import PaddleOCR,draw_ocr +# Paddleocr supports Chinese, English, French, German, Korean and Japanese. +# You can set the parameter `lang` as `ch`, `en`, `french`, `german`, `korean`, `japan` +# to switch the language model in order. +ocr = PaddleOCR(use_angle_cls=True, lang='en') # need to run only once to download and load model into memory +img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg' +result = ocr.ocr(img_path, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + +# draw result +from PIL import Image +result = result[0] +image = Image.open(img_path).convert('RGB') +boxes = [line[0] for line in result] +txts = [line[1][0] for line in result] +scores = [line[1][1] for line in result] +im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc/fonts/simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + +Output will be a list, each item contains bounding box, text and recognition confidence + +```bash linenums="1" +[[[442.0, 173.0], [1169.0, 173.0], [1169.0, 225.0], [442.0, 225.0]], ['ACKNOWLEDGEMENTS', 0.99283075]] +[[[393.0, 340.0], [1207.0, 342.0], [1207.0, 389.0], [393.0, 387.0]], ['We would like to thank all the designers and', 0.9357758]] +[[[399.0, 398.0], [1204.0, 398.0], [1204.0, 433.0], [399.0, 433.0]], ['contributors whohave been involved in the', 0.9592447]] +...... 
+``` + +Visualization of results + +![img](./images/12_det_rec.jpg) + +* detection and recognition + +```python linenums="1" +from paddleocr import PaddleOCR,draw_ocr +ocr = PaddleOCR(lang='en') # need to run only once to download and load model into memory +img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg' +result = ocr.ocr(img_path, cls=False) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + +# draw result +from PIL import Image +result = result[0] +image = Image.open(img_path).convert('RGB') +boxes = [line[0] for line in result] +txts = [line[1][0] for line in result] +scores = [line[1][1] for line in result] +im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc/fonts/simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + +Output will be a list, each item contains bounding box, text and recognition confidence + +```bash linenums="1" +[[[442.0, 173.0], [1169.0, 173.0], [1169.0, 225.0], [442.0, 225.0]], ['ACKNOWLEDGEMENTS', 0.99283075]] +[[[393.0, 340.0], [1207.0, 342.0], [1207.0, 389.0], [393.0, 387.0]], ['We would like to thank all the designers and', 0.9357758]] +[[[399.0, 398.0], [1204.0, 398.0], [1204.0, 433.0], [399.0, 433.0]], ['contributors whohave been involved in the', 0.9592447]] +...... +``` + +Visualization of results + +![img](./images/12_det_rec.jpg) + +* classification and recognition + +```python linenums="1" +from paddleocr import PaddleOCR +ocr = PaddleOCR(use_angle_cls=True, lang='en') # need to run only once to load model into memory +img_path = 'PaddleOCR/doc/imgs_words_en/word_10.png' +result = ocr.ocr(img_path, det=False, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) +``` + +Output will be a list, each item contains recognition text and confidence + +```bash linenums="1" +['PAIN', 0.990372] +``` + +* only detection + +```python linenums="1" +from paddleocr import PaddleOCR,draw_ocr +ocr = PaddleOCR() # need to run only once to download and load model into memory +img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg' +result = ocr.ocr(img_path,rec=False) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + +# draw result +from PIL import Image +result = result[0] +image = Image.open(img_path).convert('RGB') +im_show = draw_ocr(image, result, txts=None, scores=None, font_path='/path/to/PaddleOCR/doc/fonts/simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + +Output will be a list, each item only contains bounding box + +```bash linenums="1" +[[756.0, 812.0], [805.0, 812.0], [805.0, 830.0], [756.0, 830.0]] +[[820.0, 803.0], [1085.0, 801.0], [1085.0, 836.0], [820.0, 838.0]] +[[393.0, 801.0], [715.0, 805.0], [715.0, 839.0], [393.0, 836.0]] +...... 
+``` + +Visualization of results + +![img](./images/12_det.jpg) + +* only recognition + +```python linenums="1" +from paddleocr import PaddleOCR +ocr = PaddleOCR(lang='en') # need to run only once to load model into memory +img_path = 'PaddleOCR/doc/imgs_words_en/word_10.png' +result = ocr.ocr(img_path, det=False, cls=False) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) +``` + +Output will be a list, each item contains recognition text and confidence + +```bash linenums="1" +['PAIN', 0.990372] +``` + +* only classification + +```python linenums="1" +from paddleocr import PaddleOCR +ocr = PaddleOCR(use_angle_cls=True) # need to run only once to load model into memory +img_path = 'PaddleOCR/doc/imgs_words_en/word_10.png' +result = ocr.ocr(img_path, det=False, rec=False, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) +``` + +Output will be a list, each item contains classification result and confidence + +```bash linenums="1" +['0', 0.99999964] +``` + +### 2.2 Use by command line + +show help information + +```bash linenums="1" +paddleocr -h +``` + +* detection classification and recognition + +```bash linenums="1" +paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --use_angle_cls true --lang en +``` + +Output will be a list, each item contains bounding box, text and recognition confidence + +```bash linenums="1" +[[[441.0, 174.0], [1166.0, 176.0], [1165.0, 222.0], [441.0, 221.0]], ('ACKNOWLEDGEMENTS', 0.9971134662628174)] +[[[403.0, 346.0], [1204.0, 348.0], [1204.0, 384.0], [402.0, 383.0]], ('We would like to thank all the designers and', 0.9761400818824768)] +[[[403.0, 396.0], [1204.0, 398.0], [1204.0, 434.0], [402.0, 433.0]], ('contributors who have been involved in the', 0.9791957139968872)] +...... +``` + +pdf file is also supported, you can infer the first few pages by using the `page_num` parameter, the default is 0, which means infer all pages + +```bash linenums="1" +paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2 +``` + +* detection and recognition + +```bash linenums="1" +paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --lang en +``` + +Output will be a list, each item contains bounding box, text and recognition confidence + +```bash linenums="1" +[[[441.0, 174.0], [1166.0, 176.0], [1165.0, 222.0], [441.0, 221.0]], ('ACKNOWLEDGEMENTS', 0.9971134662628174)] +[[[403.0, 346.0], [1204.0, 348.0], [1204.0, 384.0], [402.0, 383.0]], ('We would like to thank all the designers and', 0.9761400818824768)] +[[[403.0, 396.0], [1204.0, 398.0], [1204.0, 434.0], [402.0, 433.0]], ('contributors who have been involved in the', 0.9791957139968872)] +...... +``` + +* classification and recognition + +```bash linenums="1" +paddleocr --image_dir PaddleOCR/doc/imgs_words_en/word_10.png --use_angle_cls true --det false --lang en +``` + +Output will be a list, each item contains text and recognition confidence + +```bash linenums="1" +['PAIN', 0.9934559464454651] +``` + +* only detection + +```bash linenums="1" +paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --rec false +``` + +Output will be a list, each item only contains bounding box + +```bash linenums="1" +[[397.0, 802.0], [1092.0, 802.0], [1092.0, 841.0], [397.0, 841.0]] +[[397.0, 750.0], [1211.0, 750.0], [1211.0, 789.0], [397.0, 789.0]] +[[397.0, 702.0], [1209.0, 698.0], [1209.0, 734.0], [397.0, 738.0]] +...... 
+``` + +* only recognition + +```bash linenums="1" +paddleocr --image_dir PaddleOCR/doc/imgs_words_en/word_10.png --det false --lang en +``` + +Output will be a list, each item contains text and recognition confidence + +```bash linenums="1" +['PAIN', 0.9934559464454651] +``` + +* only classification + +```bash linenums="1" +paddleocr --image_dir PaddleOCR/doc/imgs_words_en/word_10.png --use_angle_cls true --det false --rec false +``` + +Output will be a list, each item contains classification result and confidence + +```bash linenums="1" +['0', 0.99999964] +``` + +## 3 Use custom model + +When the built-in model cannot meet the needs, you need to use your own trained model. +First, refer to [export](../model_train/detection.en.md#4-inference) doc to convert your det and rec model to inference model, and then use it as follows + +### 3.1 Use by code + +```python linenums="1" +from paddleocr import PaddleOCR,draw_ocr +# The path of detection and recognition model must contain model and params files +ocr = PaddleOCR(det_model_dir='{your_det_model_dir}', rec_model_dir='{your_rec_model_dir}', rec_char_dict_path='{your_rec_char_dict_path}', cls_model_dir='{your_cls_model_dir}', use_angle_cls=True) +img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg' +result = ocr.ocr(img_path, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + +# draw result +from PIL import Image +result = result[0] +image = Image.open(img_path).convert('RGB') +boxes = [line[0] for line in result] +txts = [line[1][0] for line in result] +scores = [line[1][1] for line in result] +im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc/fonts/simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + +### 3.2 Use by command line + +```bash linenums="1" +paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_dir} --rec_model_dir {your_rec_model_dir} --rec_char_dict_path {your_rec_char_dict_path} --cls_model_dir {your_cls_model_dir} --use_angle_cls true +``` + +## 4 Use web images or numpy array as input + +### 4.1 Web image + +* Use by code + +```python linenums="1" +from paddleocr import PaddleOCR, draw_ocr +ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory +img_path = 'http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-fypvuqf1838418.jpg' +result = ocr.ocr(img_path, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + +# show result +from PIL import Image +result = result[0] +image = Image.open(img_path).convert('RGB') +boxes = [line[0] for line in result] +txts = [line[1][0] for line in result] +scores = [line[1][1] for line in result] +im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc/fonts/simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + +* Use by command line + +```bash linenums="1" +paddleocr --image_dir http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-fypvuqf1838418.jpg --use_angle_cls=true +``` + +### 4.2 Numpy array + +Support numpy array as input only when used by code + +```python linenums="1" +import cv2 +from paddleocr import PaddleOCR, draw_ocr, download_with_progressbar +ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory +img_path = 'PaddleOCR/doc/imgs/11.jpg' +img = cv2.imread(img_path) +# img = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY), If your own training model 
supports grayscale images, you can uncomment this line +result = ocr.ocr(img, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + +# show result +from PIL import Image +result = result[0] +download_with_progressbar(img_path, 'tmp.jpg') +image = Image.open('tmp.jpg').convert('RGB') +boxes = [line[0] for line in result] +txts = [line[1][0] for line in result] +scores = [line[1][1] for line in result] +im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc/fonts/simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + +## 5 PDF file + +* Use by command line + +you can infer the first few pages by using the `page_num` parameter, the default is 0, which means infer all pages + +```bash linenums="1" +paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2 +``` + +* Use by code + +```python linenums="1" +from paddleocr import PaddleOCR, draw_ocr + +# Paddleocr supports Chinese, English, French, German, Korean and Japanese. +# You can set the parameter `lang` as `ch`, `en`, `fr`, `german`, `korean`, `japan` +# to switch the language model in order. +ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2) # need to run only once to download and load model into memory +img_path = './xxx.pdf' +result = ocr.ocr(img_path, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + +# draw result +import fitz +from PIL import Image +import cv2 +import numpy as np +imgs = [] +with fitz.open(img_path) as pdf: + for pg in range(0, pdf.pageCount): + page = pdf[pg] + mat = fitz.Matrix(2, 2) + pm = page.getPixmap(matrix=mat, alpha=False) + # if width or height > 2000 pixels, don't enlarge the image + if pm.width > 2000 or pm.height > 2000: + pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False) + + img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) + img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) + imgs.append(img) +for idx in range(len(result)): + res = result[idx] + image = imgs[idx] + boxes = [line[0] for line in res] + txts = [line[1][0] for line in res] + scores = [line[1][1] for line in res] + im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf') + im_show = Image.fromarray(im_show) + im_show.save('result_page_{}.jpg'.format(idx)) +``` + +## 6 Parameter Description + +| Parameter | Description | Default value | +|-------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------| +| use_gpu | use GPU or not | TRUE | +| gpu_mem | GPU memory size used for initialization | 8000M | +| image_dir | The images path or folder path for predicting when used by the command line | | +| page_num | Valid when the input type is pdf file, specify to predict the previous page_num pages, all pages are predicted by default | 0 | +| det_algorithm | Type of detection algorithm selected | DB | +| det_model_dir | the text detection inference model folder. There are two ways to transfer parameters, 1. None: Automatically download the built-in model to `~/.paddleocr/det`; 2. The path of the inference model converted by yourself, the model and params files must be included in the model path | None | +| det_max_side_len | The maximum size of the long side of the image. 
When the long side exceeds this value, the long side will be resized to this size, and the short side will be scaled proportionally | 960 | +| det_db_thresh | Binarization threshold value of DB output map | 0.3 | +| det_db_box_thresh | The threshold value of the DB output box. Boxes score lower than this value will be discarded | 0.5 | +| det_db_unclip_ratio | The expanded ratio of DB output box | 2 | +| det_db_score_mode | The parameter that control how the score of the detection frame is calculated. There are 'fast' and 'slow' options. If the text to be detected is curved, it is recommended to use 'slow' | 'fast' | +| det_east_score_thresh | Binarization threshold value of EAST output map | 0.8 | +| det_east_cover_thresh | The threshold value of the EAST output box. Boxes score lower than this value will be discarded | 0.1 | +| det_east_nms_thresh | The NMS threshold value of EAST model output box | 0.2 | +| rec_algorithm | Type of recognition algorithm selected | CRNN | +| rec_model_dir | the text recognition inference model folder. There are two ways to transfer parameters, 1. None: Automatically download the built-in model to `~/.paddleocr/rec`; 2. The path of the inference model converted by yourself, the model and params files must be included in the model path | None | +| rec_image_shape | image shape of recognition algorithm | "3,32,320" | +| rec_batch_num | When performing recognition, the batchsize of forward images | 30 | +| max_text_length | The maximum text length that the recognition algorithm can recognize | 25 | +| rec_char_dict_path | the alphabet path which needs to be modified to your own path when `rec_model_Name` use mode 2 | ./ppocr/utils/ppocr_keys_v1.txt | +| use_space_char | Whether to recognize spaces | TRUE | +| drop_score | Filter the output by score (from the recognition model), and those below this score will not be returned | 0.5 | +| use_angle_cls | Whether to load classification model | FALSE | +| cls_model_dir | the classification inference model folder. There are two ways to transfer parameters, 1. None: Automatically download the built-in model to `~/.paddleocr/cls`; 2. 
The path of the inference model converted by yourself, the model and params files must be included in the model path | None | +| cls_image_shape | image shape of classification algorithm | "3,48,192" | +| label_list | label list of classification algorithm | ['0','180'] | +| cls_batch_num | When performing classification, the batchsize of forward images | 30 | +| enable_mkldnn | Whether to enable mkldnn | FALSE | +| use_zero_copy_run | Whether to forward by zero_copy_run | FALSE | +| lang | The support language, now only Chinese(ch)、English(en)、French(french)、German(german)、Korean(korean)、Japanese(japan) are supported | ch | +| det | Enable detction when `ppocr.ocr` func exec | TRUE | +| rec | Enable recognition when `ppocr.ocr` func exec | TRUE | +| cls | Enable classification when `ppocr.ocr` func exec((Use use_angle_cls in command line mode to control whether to start classification in the forward direction) | FALSE | +| show_log | Whether to print log| FALSE | +| type | Perform ocr or table structuring, the value is selected in ['ocr','structure'] | ocr | +| ocr_version | OCR Model version number, the current model support list is as follows: PP-OCRv3 supports Chinese and English detection, recognition, multilingual recognition, direction classifier models, PP-OCRv2 support Chinese detection and recognition model, PP-OCR support Chinese detection, recognition and direction classifier, multilingual recognition model | PP-OCRv3 | diff --git a/docs/ppocr/blog/whl.md b/docs/ppocr/blog/whl.md new file mode 100644 index 0000000000..722c0462ed --- /dev/null +++ b/docs/ppocr/blog/whl.md @@ -0,0 +1,501 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# paddleocr package使用说明 + +## 1 快速上手 + +### 1.1 安装whl包 + +pip安装 + +```bash linenums="1" +pip install paddleocr +``` + +本地构建并安装 + +```bash linenums="1" +python3 -m build +pip3 install dist/paddleocr-x.x.x-py3-none-any.whl # x.x.x是paddleocr的版本号 +``` + +## 2 使用 + +### 2.1 代码使用 + +paddleocr whl包会自动下载ppocr轻量级模型作为默认模型,可以根据第3节**自定义模型**进行自定义更换。 + +* 检测+方向分类器+识别全流程 + +```python linenums="1" +from paddleocr import PaddleOCR, draw_ocr + +# Paddleocr目前支持中英文、英文、法语、德语、韩语、日语,可以通过修改lang参数进行切换 +# 参数依次为`ch`, `en`, `french`, `german`, `korean`, `japan`。 +ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory +img_path = 'PaddleOCR/doc/imgs/11.jpg' +result = ocr.ocr(img_path, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + +# 显示结果 +from PIL import Image +result = result[0] +image = Image.open(img_path).convert('RGB') +boxes = [line[0] for line in result] +txts = [line[1][0] for line in result] +scores = [line[1][1] for line in result] +im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc/fonts/simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + +结果是一个list,每个item包含了文本框,文字和识别置信度 + +```bash linenums="1" +[[[24.0, 36.0], [304.0, 34.0], [304.0, 72.0], [24.0, 74.0]], ['纯臻营养护发素', 0.964739]] +[[[24.0, 80.0], [172.0, 80.0], [172.0, 104.0], [24.0, 104.0]], ['产品信息/参数', 0.98069626]] +[[[24.0, 109.0], [333.0, 109.0], [333.0, 136.0], [24.0, 136.0]], ['(45元/每公斤,100公斤起订)', 0.9676722]] +...... +``` + +结果可视化 + +
+ +
+ +* 检测+识别 + +```python linenums="1" +from paddleocr import PaddleOCR, draw_ocr + +ocr = PaddleOCR() # need to run only once to download and load model into memory +img_path = 'PaddleOCR/doc/imgs/11.jpg' +result = ocr.ocr(img_path, cls=False) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + +# 显示结果 +from PIL import Image +result = result[0] +image = Image.open(img_path).convert('RGB') +boxes = [line[0] for line in result] +txts = [line[1][0] for line in result] +scores = [line[1][1] for line in result] +im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc/fonts/simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + +结果是一个list,每个item包含了文本框,文字和识别置信度 + +```bash linenums="1" +[[[24.0, 36.0], [304.0, 34.0], [304.0, 72.0], [24.0, 74.0]], ['纯臻营养护发素', 0.964739]] +[[[24.0, 80.0], [172.0, 80.0], [172.0, 104.0], [24.0, 104.0]], ['产品信息/参数', 0.98069626]] +[[[24.0, 109.0], [333.0, 109.0], [333.0, 136.0], [24.0, 136.0]], ['(45元/每公斤,100公斤起订)', 0.9676722]] +...... +``` + +结果可视化 + +
+ +
+ +* 方向分类器+识别 + +```python linenums="1" +from paddleocr import PaddleOCR + +ocr = PaddleOCR(use_angle_cls=True) # need to run only once to download and load model into memory +img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg' +result = ocr.ocr(img_path, det=False, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) +``` + +结果是一个list,每个item只包含识别结果和识别置信度 + +```bash linenums="1" +['韩国小馆', 0.9907421] +``` + +* 单独执行检测 + +```python linenums="1" +from paddleocr import PaddleOCR, draw_ocr + +ocr = PaddleOCR() # need to run only once to download and load model into memory +img_path = 'PaddleOCR/doc/imgs/11.jpg' +result = ocr.ocr(img_path, rec=False) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + +# 显示结果 +from PIL import Image +result = result[0] +image = Image.open(img_path).convert('RGB') +im_show = draw_ocr(image, result, txts=None, scores=None, font_path='/path/to/PaddleOCR/doc/fonts/simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + +结果是一个list,每个item只包含文本框 + +```bash linenums="1" +[[26.0, 457.0], [137.0, 457.0], [137.0, 477.0], [26.0, 477.0]] +[[25.0, 425.0], [372.0, 425.0], [372.0, 448.0], [25.0, 448.0]] +[[128.0, 397.0], [273.0, 397.0], [273.0, 414.0], [128.0, 414.0]] +...... +``` + +结果可视化 + +
+ +
+ +* 单独执行识别 + +```python linenums="1" +from paddleocr import PaddleOCR + +ocr = PaddleOCR() # need to run only once to download and load model into memory +img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg' +result = ocr.ocr(img_path, det=False) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) +``` + +结果是一个list,每个item只包含识别结果和识别置信度 + +```bash linenums="1" +['韩国小馆', 0.9907421] +``` + +* 单独执行方向分类器 + +```python linenums="1" +from paddleocr import PaddleOCR + +ocr = PaddleOCR(use_angle_cls=True) # need to run only once to download and load model into memory +img_path = 'PaddleOCR/doc/imgs_words/ch/word_1.jpg' +result = ocr.ocr(img_path, det=False, rec=False, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) +``` + +结果是一个list,每个item只包含分类结果和分类置信度 + +```bash linenums="1" +['0', 0.9999924] +``` + +### 2.2 通过命令行使用 + +查看帮助信息 + +```bash linenums="1" +paddleocr -h +``` + +* 检测+方向分类器+识别全流程 + +```bash linenums="1" +paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --use_angle_cls true +``` + +结果是一个list,每个item包含了文本框,文字和识别置信度 + +```bash linenums="1" +[[[28.0, 37.0], [302.0, 39.0], [302.0, 72.0], [27.0, 70.0]], ('纯臻营养护发素', 0.9658738374710083)] +...... +``` + +此外,paddleocr也支持输入pdf文件,并且可以通过指定参数`page_num`来控制推理前面几页,默认为0,表示推理所有页。 + +```bash linenums="1" +paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2 +``` + +* 检测+识别 + +```bash linenums="1" +paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg +``` + +结果是一个list,每个item包含了文本框,文字和识别置信度 + +```bash linenums="1" +[[[28.0, 37.0], [302.0, 39.0], [302.0, 72.0], [27.0, 70.0]], ('纯臻营养护发素', 0.9658738374710083)] +...... +``` + +* 方向分类器+识别 + +```bash linenums="1" +paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --use_angle_cls true --det false +``` + +结果是一个list,每个item只包含识别结果和识别置信度 + +```bash linenums="1" +['韩国小馆', 0.994467] +``` + +* 单独执行检测 + +```bash linenums="1" +paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --rec false +``` + +结果是一个list,每个item只包含文本框 + +```bash linenums="1" +[[27.0, 459.0], [136.0, 459.0], [136.0, 479.0], [27.0, 479.0]] +[[28.0, 429.0], [372.0, 429.0], [372.0, 445.0], [28.0, 445.0]] +...... 
+``` + +* 单独执行识别 + +```bash linenums="1" +paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --det false +``` + +结果是一个list,每个item只包含识别结果和识别置信度 + +```bash linenums="1" +['韩国小馆', 0.994467] +``` + +* 单独执行方向分类器 + +```bash linenums="1" +paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --use_angle_cls true --det false --rec false +``` + +结果是一个list,每个item只包含分类结果和分类置信度 + +```bash linenums="1" +['0', 0.9999924] +``` + +## 3 自定义模型 + +当内置模型无法满足需求时,需要使用到自己训练的模型。 首先,参照[模型导出](../model_train/detection.md#4-模型导出与预测)将检测、分类和识别模型转换为inference模型,然后按照如下方式使用 + +### 3.1 代码使用 + +```python linenums="1" +from paddleocr import PaddleOCR, draw_ocr + +# 模型路径下必须含有model和params文件 +ocr = PaddleOCR(det_model_dir='{your_det_model_dir}', rec_model_dir='{your_rec_model_dir}', + rec_char_dict_path='{your_rec_char_dict_path}', cls_model_dir='{your_cls_model_dir}', + use_angle_cls=True) +img_path = 'PaddleOCR/doc/imgs/11.jpg' +result = ocr.ocr(img_path, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + +# 显示结果 +from PIL import Image +result = result[0] +image = Image.open(img_path).convert('RGB') +boxes = [line[0] for line in result] +txts = [line[1][0] for line in result] +scores = [line[1][1] for line in result] +im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc/fonts/simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + +### 3.2 通过命令行使用 + +```bash linenums="1" +paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_dir} --rec_model_dir {your_rec_model_dir} --rec_char_dict_path {your_rec_char_dict_path} --cls_model_dir {your_cls_model_dir} --use_angle_cls true +``` + +## 4 使用网络图片或者numpy数组作为输入 + +### 4.1 网络图片 + +* 代码使用 + +```python linenums="1" +from paddleocr import PaddleOCR, draw_ocr, download_with_progressbar + +# Paddleocr目前支持中英文、英文、法语、德语、韩语、日语,可以通过修改lang参数进行切换 +# 参数依次为`ch`, `en`, `french`, `german`, `korean`, `japan`。 +ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory +img_path = 'http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-fypvuqf1838418.jpg' +result = ocr.ocr(img_path, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + +# 显示结果 +from PIL import Image +result = result[0] +download_with_progressbar(img_path, 'tmp.jpg') +image = Image.open('tmp.jpg').convert('RGB') +boxes = [line[0] for line in result] +txts = [line[1][0] for line in result] +scores = [line[1][1] for line in result] +im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc/fonts/simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + +* 命令行模式 + +```bash linenums="1" +paddleocr --image_dir http://n.sinaimg.cn/ent/transform/w630h933/20171222/o111-fypvuqf1838418.jpg --use_angle_cls=true +``` + +### 4.2 numpy数组 + +仅通过代码使用时支持numpy数组作为输入 + +```python linenums="1" +import cv2 +from paddleocr import PaddleOCR, draw_ocr + +# Paddleocr目前支持中英文、英文、法语、德语、韩语、日语,可以通过修改lang参数进行切换 +# 参数依次为`ch`, `en`, `french`, `german`, `korean`, `japan`。 +ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory +img_path = 'PaddleOCR/doc/imgs/11.jpg' +img = cv2.imread(img_path) +# img = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY), 如果你自己训练的模型支持灰度图,可以将这句话的注释取消 +result = ocr.ocr(img, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + +# 显示结果 +from PIL import Image +result = 
result[0] +image = Image.open(img_path).convert('RGB') +boxes = [line[0] for line in result] +txts = [line[1][0] for line in result] +scores = [line[1][1] for line in result] +im_show = draw_ocr(image, boxes, txts, scores, font_path='/path/to/PaddleOCR/doc/fonts/simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + +## 5 PDF文件作为输入 + +* 命令行模式 + +可以通过指定参数`page_num`来控制推理前面几页,默认为0,表示推理所有页。 + +```bash linenums="1" +paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2 +``` + +* 代码使用 + +```python linenums="1" +from paddleocr import PaddleOCR, draw_ocr + +# Paddleocr目前支持的多语言语种可以通过修改lang参数进行切换 +# 例如`ch`, `en`, `fr`, `german`, `korean`, `japan` +ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=2) # need to run only once to download and load model into memory +img_path = './xxx.pdf' +result = ocr.ocr(img_path, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + +# 显示结果 +import fitz +from PIL import Image +import cv2 +import numpy as np +imgs = [] +with fitz.open(img_path) as pdf: + for pg in range(0, pdf.pageCount): + page = pdf[pg] + mat = fitz.Matrix(2, 2) + pm = page.getPixmap(matrix=mat, alpha=False) + # if width or height > 2000 pixels, don't enlarge the image + if pm.width > 2000 or pm.height > 2000: + pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False) + + img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) + img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) + imgs.append(img) +for idx in range(len(result)): + res = result[idx] + image = imgs[idx] + boxes = [line[0] for line in res] + txts = [line[1][0] for line in res] + scores = [line[1][1] for line in res] + im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf') + im_show = Image.fromarray(im_show) + im_show.save('result_page_{}.jpg'.format(idx)) +``` + +## 6 参数说明 + +| 字段 | 说明| 默认值 | +|-------------------------|----------|-------------------------| +| use_gpu| 是否使用GPU | TRUE | +| gpu_mem| 初始化占用的GPU内存大小 | 8000M | +| image_dir | 通过命令行调用时执行预测的图片或文件夹路径 | +| page_num | 当输入类型为pdf文件时有效,指定预测前面page_num页,默认预测所有页 | 0| +| det_algorithm | 使用的检测算法类型 | DB | +| det_model_dir | 检测模型所在文件夹。传参方式有两种,1. None: 自动下载内置模型到 `~/.paddleocr/det`;2.自己转换好的inference模型路径,模型路径下必须包含model和params文件 | None | +| det_max_side_len | 检测算法前向时图片长边的最大尺寸,当长边超出这个值时会将长边resize到这个大小,短边等比例缩放 | 960 | +| det_db_thresh | DB模型输出预测图的二值化阈值 | 0.3 | +| det_db_box_thresh | DB模型输出框的阈值,低于此值的预测框会被丢弃 | 0.5 | +| det_db_unclip_ratio | DB模型输出框扩大的比例 | 2 | +| det_db_score_mode | 计算检测框score的方式,有'fast'和'slow',如果要检测的文字有弯曲,建议用'slow','slow'模式计算的box的score偏大,box不容易被过滤掉 | 'fast' | +| det_east_score_thresh | EAST模型输出预测图的二值化阈值 | 0.8 | +| det_east_cover_thresh | EAST模型输出框的阈值,低于此值的预测框会被丢弃 | 0.1 | +| det_east_nms_thresh | EAST模型输出框NMS的阈值 | 0.2 | +| rec_algorithm | 使用的识别算法类型 | CRNN | +| rec_model_dir | 识别模型所在文件夹。传参方式有两种,1. None: 自动下载内置模型到 `~/.paddleocr/rec`;2.自己转换好的inference模型路径,模型路径下必须包含model和params文件 | None | +| rec_image_shape | 识别算法的输入图片尺寸 | "3,32,320" | +| rec_batch_num | 进行识别时,同时前向的图片数 | 30 | +| max_text_length | 识别算法能识别的最大文字长度 | 25 | +| rec_char_dict_path | 识别模型字典路径,当rec_model_dir使用方式2传参时需要修改为自己的字典路径 | ./ppocr/utils/ppocr_keys_v1.txt | +| use_space_char | 是否识别空格 | TRUE | +| drop_score | 对输出按照分数(来自于识别模型)进行过滤,低于此分数的不返回 | 0.5 | +| use_angle_cls | 是否加载分类模型 | FALSE | +| cls_model_dir | 分类模型所在文件夹。传参方式有两种,1. 
None: 自动下载内置模型到 `~/.paddleocr/cls`;2.自己转换好的inference模型路径,模型路径下必须包含model和params文件 | None | +| cls_image_shape | 分类算法的输入图片尺寸 | "3, 48, 192" | +| label_list | 分类算法的标签列表 | ['0', '180'] | +| cls_batch_num | 进行分类时,同时前向的图片数 |30| +| enable_mkldnn | 是否启用mkldnn | FALSE | +| use_zero_copy_run | 是否通过zero_copy_run的方式进行前向| FALSE | +| lang | 模型语言类型,目前支持 目前支持中英文(ch)、英文(en)、法语(french)、德语(german)、韩语(korean)、日语(japan) | ch | +| det | 前向时使用启动检测 | TRUE | +| rec | 前向时是否启动识别 | TRUE | +| cls | 前向时是否启动分类 (命令行模式下使用use_angle_cls控制前向是否启动分类)| FALSE | +| show_log | 是否打印logger信息| FALSE | +| type | 执行ocr或者表格结构化, 值可选['ocr','structure'] | ocr | +| ocr_version | OCR模型版本,可选PP-OCRv3, PP-OCRv2, PP-OCR。PP-OCRv3 支持中、英文的检测、识别、多语种识别,方向分类器等模型;PP-OCRv2 目前仅支持中文的检测和识别模型;PP-OCR支持中文的检测,识别,多语种识别,方向分类器等模型 | PP-OCRv3 | diff --git a/docs/ppocr/environment.en.md b/docs/ppocr/environment.en.md new file mode 100644 index 0000000000..141f342015 --- /dev/null +++ b/docs/ppocr/environment.en.md @@ -0,0 +1,313 @@ +--- +comments: true +--- + +# Environment Preparation + +Windows and Mac users are recommended to use Anaconda to build a Python environment, and Linux users are recommended to use docker to build a Python environment. + +Recommended working environment: + +- PaddlePaddle >= 2.1.2 +- Python 3.7 +- CUDA 10.1 / CUDA 10.2 +- cuDNN 7.6 + +> If you already have a Python environment installed, you can skip to [PaddleOCR Quick Start](./quickstart_en.md). + +## 1. Python Environment Setup + +### 1.1 Windows + +#### 1.1.1 Install Anaconda + +- Note: To use PaddlePaddle you need to install python environment first, here we choose python integrated environment Anaconda toolkit + + - Anaconda is a common python package manager + - After installing Anaconda, you can install the python environment, as well as numpy and other required toolkit environment. + +- Anaconda download. + + - Address: + - Most Win10 computers are 64-bit operating systems, choose x86_64 version; if the computer is a 32-bit operating system, choose x86.exe + + ![anaconda download](./images/Anaconda_download.png) + + - After the download is complete, double-click the installer to enter the graphical interface + + - The default installation location is C drive, it is recommended to change the installation location to D drive. + + ![](./images/anaconda_install_folder.png) + + - Check Conda to add environment variables and ignore the warning that + ![](./images/anaconda_install_env.png) + +#### 1.1.2 Opening the terminal and creating the Conda environment + +- Open Anaconda Prompt terminal: bottom left Windows Start Menu -> Anaconda3 -> Anaconda Prompt start console + + ![anaconda download](./images/anaconda_prompt.png) + +- Create a new Conda environment + + ```bash linenums="1" + # Enter the following command at the command line to create an environment named paddle_env + # Here to speed up the download, use the Tsinghua source + conda create --name paddle_env python=3.8 --channel https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ # This is a one line command + ``` + + This command will create an executable environment named paddle_env with python version 3.8, which will take a while depending on the network status + + The command line will then output a prompt, type y and enter to continue the installation + + ![conda create](./images/conda_new_env.png) + +- To activate the Conda environment you just created, enter the following command at the command line. 
+ + ```bash linenums="1" + # Activate the paddle_env environment + conda activate paddle_env + # View the current location of python + where python + ``` + + ![create environment](./images/conda_list_env.png) + +The above anaconda environment and python environment are installed + +### 1.2 Mac + +#### 1.2.1 Installing Anaconda + +- Note: To use PaddlePaddle you need to install the python environment first, here we choose the python integrated environment Anaconda toolkit + + - Anaconda is a common python package manager + - After installing Anaconda, you can install the python environment, as well as numpy and other required toolkit environment + +- Anaconda download:. + + - Address: + + ![anaconda download](./images/anaconda_start.png) + + - Select `Anaconda3-2021.05-MacOSX-x86_64.pkg` at the bottom to download + +- After downloading, double click on the .pkg file to enter the graphical interface + + - Just follow the default settings, it will take a while to install + +- It is recommended to install a code editor such as VSCode or PyCharm + +#### 1.2.2 Open a terminal and create a Conda environment + +- Open the terminal + + - Press command and spacebar at the same time, type "terminal" in the focus search, double click to enter terminal + +- **Add Conda to the environment variables** + + - Environment variables are added so that the system can recognize the Conda command + + - Open `~/.bash_profile` in the terminal by typing the following command. + + ```bash linenums="1" + vim ~/.bash_profile + ``` + + - Add Conda as an environment variable in `~/.bash_profile`. + + ```bash linenums="1" + # Press i first to enter edit mode + # In the first line type. + export PATH="~/opt/anaconda3/bin:$PATH" + # If you customized the installation location during installation, change ~/opt/anaconda3/bin to the bin folder in the customized installation directory + + # The modified ~/.bash_profile file should look like this (where xxx is the username) + export PATH="~/opt/anaconda3/bin:$PATH" + # >>> conda initialize >>> + # !!! Contents within this block are managed by 'conda init' !!! + __conda_setup="$('/Users/xxx/opt/anaconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" + if [ $? -eq 0 ]; then + eval "$__conda_setup" + else + if [ -f "/Users/xxx/opt/anaconda3/etc/profile.d/conda.sh" ]; then + . "/Users/xxx/opt/anaconda3/etc/profile.d/conda.sh" + else + export PATH="/Users/xxx/opt/anaconda3/bin:$PATH" + fi + fi + unset __conda_setup + # <<< conda initialize <<< + ``` + + - When you are done, press `esc` to exit edit mode, then type `:wq!` and enter to save and exit + + - Verify that the Conda command is recognized. 
+ + - Enter `source ~/.bash_profile` in the terminal to update the environment variables + - Enter `conda info --envs` in the terminal again, if it shows that there is a base environment, then Conda has been added to the environment variables + +- Create a new Conda environment + + ```bash linenums="1" + # Enter the following command at the command line to create an environment called paddle_env + # Here to speed up the download, use Tsinghua source + conda create --name paddle_env python=3.8 --channel https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ + ``` + + - This command will create an executable environment named paddle_env with python version 3.8, which will take a while depending on the network status + + - The command line will then output a prompt, type y and enter to continue the installation + + ![conda_create](./images/conda_create.png) + +- To activate the Conda environment you just created, enter the following command at the command line. + + ```bash linenums="1" + # Activate the paddle_env environment + conda activate paddle_env + # View the current location of python + where python + ``` + + ![conda_actviate](./images/conda_activate.png) + +The above anaconda environment and python environment are installed + +### 1.3 Linux + +Linux users can choose to run either Anaconda or Docker. If you are familiar with Docker and need to train the PaddleOCR model, it is recommended to use the Docker environment, where the development process of PaddleOCR is run. If you are not familiar with Docker, you can also use Anaconda to run the project. + +#### 1.3.1 Anaconda environment configuration + +- Note: To use PaddlePaddle you need to install the python environment first, here we choose the python integrated environment Anaconda toolkit + + - Anaconda is a common python package manager + - After installing Anaconda, you can install the python environment, as well as numpy and other required toolkit environment + +- **Download Anaconda**. + + - Download at: + + ![img](./images/anaconda_download-20240704081644684.png) + + - Select the appropriate version for your operating system + - Type `uname -m` in the terminal to check the command set used by your system + + - Download method 1: Download locally, then transfer the installation package to the Linux server + + - Download method 2: Directly use Linux command line to download + + ```bash linenums="1" + # First install wget + sudo apt-get install wget # Ubuntu + sudo yum install wget # CentOS + ``` + + ```bash linenums="1" + # Then use wget to download from Tsinghua source + # If you want to download Anaconda3-2021.05-Linux-x86_64.sh, the download command is as follows + wget https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/Anaconda3-2021.05-Linux-x86_64.sh + # If you want to download another version, you need to change the file name after the last 1 / to the version you want to download + ``` + +- To install Anaconda. + + - Type `sh Anaconda3-2021.05-Linux-x86_64.sh` at the command line + - If you downloaded a different version, replace the file name of the command with the name of the file you downloaded + - Just follow the installation instructions + - You can exit by typing q when viewing the license + +- **Add conda to the environment variables** + + - If you have already added conda to the environment variable path during the installation, you can skip this step + + - Open `~/.bashrc` in a terminal. + + ```bash linenums="1" + # Enter the following command in the terminal. 
+ vim ~/.bashrc + ``` + + - Add conda as an environment variable in `~/.bashrc`. + + ```bash linenums="1" + # Press i first to enter edit mode # In the first line enter. + export PATH="~/anaconda3/bin:$PATH" + # If you customized the installation location during installation, change ~/anaconda3/bin to the bin folder in the customized installation directory + ``` + + ```bash linenums="1" + # The modified ~/.bash_profile file should look like this (where xxx is the username) + export PATH="~/opt/anaconda3/bin:$PATH" + # >>> conda initialize >>> + # !!! Contents within this block are managed by 'conda init' !!! + __conda_setup="$('/Users/xxx/opt/anaconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" + if [ $? -eq 0 ]; then + eval "$__conda_setup" + else + if [ -f "/Users/xxx/opt/anaconda3/etc/profile.d/conda.sh" ]; then + . "/Users/xxx/opt/anaconda3/etc/profile.d/conda.sh" + else + export PATH="/Users/xxx/opt/anaconda3/bin:$PATH" + fi + fi + unset __conda_setup + # <<< conda initialize <<< + ``` + + - When you are done, press `esc` to exit edit mode, then type `:wq!` and enter to save and exit + + - Verify that the Conda command is recognized. + + - Enter `source ~/.bash_profile` in the terminal to update the environment variables + - Enter `conda info --envs` in the terminal again, if it shows that there is a base environment, then Conda has been added to the environment variables + +- Create a new Conda environment + + ```bash linenums="1" + # Enter the following command at the command line to create an environment called paddle_env + # Here to speed up the download, use Tsinghua source + conda create --name paddle_env python=3.8 --channel https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ + ``` + + - This command will create an executable environment named paddle_env with python version 3.8, which will take a while depending on the network status + + - The command line will then output a prompt, type y and enter to continue the installation + + ![conda_create](./images/conda_create-20240704081656378.png) + +- To activate the Conda environment you just created, enter the following command at the command line. + + ```bash linenums="1" + # Activate the paddle_env environment + conda activate paddle_env + ``` + +The above anaconda environment and python environment are installed + +#### 1.3.2 Docker environment preparation + +**The first time you use this docker image, it will be downloaded automatically. Please be patient.** + +```bash linenums="1" +# Switch to the working directory +cd /home/Projects +# You need to create a docker container for the first run, and do not need to run the current command when you run it again +# Create a docker container named ppocr and map the current directory to the /paddle directory of the container + +# If using CPU, use docker instead of nvidia-docker to create docker +sudo docker run --name ppocr -v $PWD:/paddle --network=host -it registry.baidubce.com/paddlepaddle/paddle:2.1.3-gpu-cuda10.2-cudnn7 /bin/bash + +# If using GPU, use nvidia-docker to create docker +# docker image registry.baidubce.com/paddlepaddle/paddle:2.1.3-gpu-cuda11.2-cudnn8 is recommended for CUDA11.2 + CUDNN8. +sudo nvidia-docker run --name ppocr -v $PWD:/paddle --shm-size=64G --network=host -it registry.baidubce.com/paddlepaddle/paddle:2.1.3-gpu-cuda10.2-cudnn7 /bin/bash + +``` + +You can also visit [DockerHub](https://hub.docker.com/r/paddlepaddle/paddle/tags/) to get the image that fits your machine. 
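If you prefer to fetch the image ahead of time (or pick a different tag from DockerHub), you can pull it explicitly before creating the container. The tag below is simply the same image referenced in the commands above and is only an example; choose the tag that matches your CUDA/cuDNN setup.

```bash linenums="1"
# Pull the image explicitly (replace the tag with one that matches your CUDA/cuDNN setup)
sudo docker pull registry.baidubce.com/paddlepaddle/paddle:2.1.3-gpu-cuda10.2-cudnn7
# Check that the image is now available locally
sudo docker images | grep paddle
```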
+ +```bash linenums="1" +# ctrl+P+Q to exit docker, to re-enter docker using the following command: +sudo docker container exec -it ppocr /bin/bash +``` diff --git a/docs/ppocr/environment.md b/docs/ppocr/environment.md new file mode 100644 index 0000000000..8cd4bf7e4d --- /dev/null +++ b/docs/ppocr/environment.md @@ -0,0 +1,302 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# 运行环境准备 + +Windows和Mac用户推荐使用Anaconda搭建Python环境,Linux用户建议使用docker搭建Python环境。 + +推荐环境: + +- PaddlePaddle >= 2.1.2 +- Python 3.7 +- CUDA10.1 / CUDA10.2 +- CUDNN 7.6 + +> 如果您已经安装Python环境,可以直接参考[PaddleOCR快速开始](./quick_start.md) + +## 1. Python环境搭建 + +### 1.1 Windows + +#### 1.1.1 安装Anaconda + +- 说明:使用paddlepaddle需要先安装python环境,这里我们选择python集成环境Anaconda工具包 + - Anaconda是1个常用的python包管理程序 + - 安装完Anaconda后,可以安装python环境,以及numpy等所需的工具包环境。 + +- Anaconda下载: + - 地址: + - 大部分win10电脑均为64位操作系统,选择x86_64版本;若电脑为32位操作系统,则选择x86.exe + + ![anaconda download](./images/Anaconda_download.png) + - 下载完成后,双击安装程序进入图形界面 + - 默认安装位置为C盘,建议将安装位置更改到D盘: + + install config + + - 勾选conda加入环境变量,忽略警告: + + add conda to path + +#### 1.1.2 打开终端并创建conda环境 + +- 打开Anaconda Prompt终端:左下角Windows Start Menu -> Anaconda3 -> Anaconda Prompt启动控制台 + + ![anaconda download](./images/anaconda_prompt.png) + +- 创建新的conda环境 + + ```bash linenums="1" + # 在命令行输入以下命令,创建名为paddle_env的环境 + # 此处为加速下载,使用清华源 + conda create --name paddle_env python=3.8 --channel https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ # 这是一行命令 + ``` + + 该命令会创建1个名为paddle_env、python版本为3.8的可执行环境,根据网络状态,需要花费一段时间 + + 之后命令行中会输出提示信息,输入y并回车继续安装 + + ![conda create](./images/conda_new_env.png) + +- 激活刚创建的conda环境,在命令行中输入以下命令: + + ```bash linenums="1" + # 激活paddle_env环境 + conda activate paddle_env + # 查看当前python的位置 + where python + ``` + + ![create environment](./images/conda_list_env.png) + +以上anaconda环境和python环境安装完毕 + +### 1.2 Mac + +#### 1.2.1 安装Anaconda + +- 说明:使用paddlepaddle需要先安装python环境,这里我们选择python集成环境Anaconda工具包 + - Anaconda是1个常用的python包管理程序 + - 安装完Anaconda后,可以安装python环境,以及numpy等所需的工具包环境 +- Anaconda下载: + - 地址: + + ![anaconda download](./images/anaconda_start.png) + + - 选择最下方的`Anaconda3-2021.05-MacOSX-x86_64.pkg`下载 +- 下载完成后,双击.pkg文件进入图形界面 + - 按默认设置即可,安装需要花费一段时间 +- 建议安装vscode或pycharm等代码编辑器 + +#### 1.2.2 打开终端并创建conda环境 + +- 打开终端 + - 同时按下command键和空格键,在聚焦搜索中输入"终端",双击进入终端 + +- **将conda加入环境变量** + + - 加入环境变量是为了让系统能识别conda命令 + + - 输入以下命令,在终端中打开`~/.bash_profile`: + + ```bash linenums="1" + vim ~/.bash_profile + ``` + + - 在`~/.bash_profile`中将conda添加为环境变量: + + ```bash linenums="1" + # 先按i进入编辑模式 + # 在第一行输入: + export PATH="~/opt/anaconda3/bin:$PATH" + # 若安装时自定义了安装位置,则将~/opt/anaconda3/bin改为自定义的安装目录下的bin文件夹 + ``` + + ```bash linenums="1" + # 修改后的~/.bash_profile文件应如下(其中xxx为用户名): + export PATH="~/opt/anaconda3/bin:$PATH" + # >>> conda initialize >>> + # !! Contents within this block are managed by 'conda init' !! + __conda_setup="$('/Users/xxx/opt/anaconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" + if [ $? -eq 0 ]; then + eval "$__conda_setup" + else + if [ -f "/Users/xxx/opt/anaconda3/etc/profile.d/conda.sh" ]; then + . 
"/Users/xxx/opt/anaconda3/etc/profile.d/conda.sh" + else + export PATH="/Users/xxx/opt/anaconda3/bin:$PATH" + fi + fi + unset __conda_setup + # <<< conda initialize <<< + ``` + + - 修改完成后,先按`esc`键退出编辑模式,再输入`:wq!`并回车,以保存退出 + + - 验证是否能识别conda命令: + + - 在终端中输入`source ~/.bash_profile`以更新环境变量 + - 再在终端输入`conda info --envs`,若能显示当前有base环境,则conda已加入环境变量 + +- 创建新的conda环境 + + ```bash linenums="1" + # 在命令行输入以下命令,创建名为paddle_env的环境 + # 此处为加速下载,使用清华源 + conda create --name paddle_env python=3.8 --channel https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ + ``` + + - 该命令会创建1个名为paddle_env、python版本为3.8的可执行环境,根据网络状态,需要花费一段时间 + + - 之后命令行中会输出提示信息,输入y并回车继续安装 + + ![conda_create](./images/conda_create.png) + +- 激活刚创建的conda环境,在命令行中输入以下命令: + + ```bash linenums="1" + # 激活paddle_env环境 + conda activate paddle_env + # 查看当前python的位置 + where python + ``` + + ![conda_actviate](./images/conda_activate.png) + +以上anaconda环境和python环境安装完毕 + +### 1.3 Linux + +Linux用户可选择Anaconda或Docker两种方式运行。如果你熟悉Docker且需要训练PaddleOCR模型,推荐使用Docker环境,PaddleOCR的开发流程均在Docker环境下运行。如果你不熟悉Docker,也可以使用Anaconda来运行项目。 + +#### 1.3.1 Anaconda环境配置 + +- 说明:使用paddlepaddle需要先安装python环境,这里我们选择python集成环境Anaconda工具包 + - Anaconda是1个常用的python包管理程序 + - 安装完Anaconda后,可以安装python环境,以及numpy等所需的工具包环境 + +- **下载Anaconda**: + + - 下载地址: + + ![img](./images/anaconda_download-20240704081644684.png) + + - 选择适合您操作系统的版本 + - 可在终端输入`uname -m`查询系统所用的指令集 + +- 下载法1:本地下载,再将安装包传到linux服务器上 + +- 下载法2:直接使用linux命令行下载 + + ```bash linenums="1" + # 首先安装wget + sudo apt-get install wget # Ubuntu + sudo yum install wget # CentOS + ``` + + ```bash linenums="1" + # 然后使用wget从清华源上下载 + # 如要下载Anaconda3-2021.05-Linux-x86_64.sh,则下载命令如下: + wget https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/Anaconda3-2021.05-Linux-x86_64.sh + + # 若您要下载其他版本,需要将最后1个/后的文件名改成您希望下载的版本 + ``` + +- 安装Anaconda: + + - 在命令行输入`sh Anaconda3-2021.05-Linux-x86_64.sh` + - 若您下载的是其它版本,则将该命令的文件名替换为您下载的文件名 + - 按照安装提示安装即可 + - 查看许可时可输入q来退出 + +- **将conda加入环境变量** + + - 加入环境变量是为了让系统能识别conda命令,若您在安装时已将conda加入环境变量path,则可跳过本步 + + - 在终端中打开`~/.bashrc`: + + ```bash linenums="1" + # 在终端中输入以下命令: + vim ~/.bashrc + ``` + + - 在`~/.bashrc`中将conda添加为环境变量: + + ```bash linenums="1" + # 先按i进入编辑模式 + # 在第一行输入: + export PATH="~/anaconda3/bin:$PATH" + # 若安装时自定义了安装位置,则将~/anaconda3/bin改为自定义的安装目录下的bin文件夹 + ``` + + ```bash linenums="1" + # 修改后的~/.bash_profile文件应如下(其中xxx为用户名): + export PATH="~/opt/anaconda3/bin:$PATH" + # >>> conda initialize >>> + # !! Contents within this block are managed by 'conda init' !! + __conda_setup="$('/Users/xxx/opt/anaconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" + if [ $? -eq 0 ]; then + eval "$__conda_setup" + else + if [ -f "/Users/xxx/opt/anaconda3/etc/profile.d/conda.sh" ]; then + . 
"/Users/xxx/opt/anaconda3/etc/profile.d/conda.sh" + else + export PATH="/Users/xxx/opt/anaconda3/bin:$PATH" + fi + fi + unset __conda_setup + # <<< conda initialize <<< + ``` + + - 修改完成后,先按`esc`键退出编辑模式,再输入`:wq!`并回车,以保存退出 + + - 验证是否能识别conda命令: + + - 在终端中输入`source ~/.bash_profile`以更新环境变量 + - 再在终端输入`conda info --envs`,若能显示当前有base环境,则conda已加入环境变量 + +- 创建新的conda环境 + + ```bash linenums="1" + # 在命令行输入以下命令,创建名为paddle_env的环境 + # 此处为加速下载,使用清华源 + conda create --name paddle_env python=3.8 --channel https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ + ``` + + - 该命令会创建1个名为paddle_env、python版本为3.8的可执行环境,根据网络状态,需要花费一段时间 + + - 之后命令行中会输出提示信息,输入y并回车继续安装 + + ![conda_create](./images/conda_create-20240704081656378.png) + +- 激活刚创建的conda环境,在命令行中输入以下命令: + + ```bash linenums="1" + # 激活paddle_env环境 + conda activate paddle_env + ``` + +以上anaconda环境和python环境安装完毕 + +#### 1.3.2 Docker环境配置 + +**注意:第一次使用这个镜像,会自动下载该镜像,请耐心等待。您也可以访问[DockerHub](https://hub.docker.com/r/paddlepaddle/paddle/tags/)获取与您机器适配的镜像。** + +```bash linenums="1" +# 切换到工作目录下 +cd /home/Projects +# 首次运行需创建一个docker容器,再次运行时不需要运行当前命令 +# 创建一个名字为ppocr的docker容器,并将当前目录映射到容器的/paddle目录下 + +#如果您希望在CPU环境下使用docker,使用docker而不是nvidia-docker创建docker +sudo docker run --name ppocr -v $PWD:/paddle --network=host -it registry.baidubce.com/paddlepaddle/paddle:2.1.3-gpu-cuda10.2-cudnn7 /bin/bash + +#如果使用CUDA10,请运行以下命令创建容器,设置docker容器共享内存shm-size为64G,建议设置32G以上 +# 如果是CUDA11+CUDNN8,推荐使用镜像registry.baidubce.com/paddlepaddle/paddle:2.1.3-gpu-cuda11.2-cudnn8 +sudo nvidia-docker run --name ppocr -v $PWD:/paddle --shm-size=64G --network=host -it registry.baidubce.com/paddlepaddle/paddle:2.1.3-gpu-cuda10.2-cudnn7 /bin/bash + +# ctrl+P+Q可退出docker 容器,重新进入docker 容器使用如下命令 +sudo docker container exec -it ppocr /bin/bash +``` diff --git a/docs/ppocr/images/254.jpg b/docs/ppocr/images/254.jpg new file mode 100644 index 0000000000..c871fb042c Binary files /dev/null and b/docs/ppocr/images/254.jpg differ diff --git a/docs/ppocr/images/Anaconda_download.png b/docs/ppocr/images/Anaconda_download.png new file mode 100644 index 0000000000..83a0341493 Binary files /dev/null and b/docs/ppocr/images/Anaconda_download.png differ diff --git a/docs/ppocr/images/PP-OCRv2/PP-OCRv2-pic001.jpg b/docs/ppocr/images/PP-OCRv2/PP-OCRv2-pic001.jpg new file mode 100644 index 0000000000..45ffdb53aa Binary files /dev/null and b/docs/ppocr/images/PP-OCRv2/PP-OCRv2-pic001.jpg differ diff --git a/docs/ppocr/images/PP-OCRv2/PP-OCRv2-pic002.jpg b/docs/ppocr/images/PP-OCRv2/PP-OCRv2-pic002.jpg new file mode 100644 index 0000000000..7ac153aee0 Binary files /dev/null and b/docs/ppocr/images/PP-OCRv2/PP-OCRv2-pic002.jpg differ diff --git a/docs/ppocr/images/PP-OCRv2/PP-OCRv2-pic003.jpg b/docs/ppocr/images/PP-OCRv2/PP-OCRv2-pic003.jpg new file mode 100644 index 0000000000..781aade629 Binary files /dev/null and b/docs/ppocr/images/PP-OCRv2/PP-OCRv2-pic003.jpg differ diff --git a/docs/ppocr/images/PP-OCRv3/ch/PP-OCRv3-pic001.jpg b/docs/ppocr/images/PP-OCRv3/ch/PP-OCRv3-pic001.jpg new file mode 100644 index 0000000000..c35936cc1a Binary files /dev/null and b/docs/ppocr/images/PP-OCRv3/ch/PP-OCRv3-pic001.jpg differ diff --git a/docs/ppocr/images/PP-OCRv3/ch/PP-OCRv3-pic002.jpg b/docs/ppocr/images/PP-OCRv3/ch/PP-OCRv3-pic002.jpg new file mode 100644 index 0000000000..e5ad6a4b2a Binary files /dev/null and b/docs/ppocr/images/PP-OCRv3/ch/PP-OCRv3-pic002.jpg differ diff --git a/docs/ppocr/images/PP-OCRv3/ch/PP-OCRv3-pic003.jpg b/docs/ppocr/images/PP-OCRv3/ch/PP-OCRv3-pic003.jpg new file mode 100644 index 0000000000..dc024296bd 
Binary files /dev/null and b/docs/ppocr/images/PP-OCRv3/ch/PP-OCRv3-pic003.jpg differ diff --git a/docs/ppocr/images/PP-OCRv3/en/en_1.png b/docs/ppocr/images/PP-OCRv3/en/en_1.png new file mode 100644 index 0000000000..36245613e3 Binary files /dev/null and b/docs/ppocr/images/PP-OCRv3/en/en_1.png differ diff --git a/docs/ppocr/images/PP-OCRv3/en/en_2.png b/docs/ppocr/images/PP-OCRv3/en/en_2.png new file mode 100644 index 0000000000..d2df8556ad Binary files /dev/null and b/docs/ppocr/images/PP-OCRv3/en/en_2.png differ diff --git a/docs/ppocr/images/PP-OCRv3/en/en_3.png b/docs/ppocr/images/PP-OCRv3/en/en_3.png new file mode 100644 index 0000000000..baf146c010 Binary files /dev/null and b/docs/ppocr/images/PP-OCRv3/en/en_3.png differ diff --git a/docs/ppocr/images/PP-OCRv3/en/en_4.png b/docs/ppocr/images/PP-OCRv3/en/en_4.png new file mode 100644 index 0000000000..f0f19db95b Binary files /dev/null and b/docs/ppocr/images/PP-OCRv3/en/en_4.png differ diff --git a/docs/ppocr/images/PP-OCRv3/multi_lang/japan_2.jpg b/docs/ppocr/images/PP-OCRv3/multi_lang/japan_2.jpg new file mode 100644 index 0000000000..076ced92ad Binary files /dev/null and b/docs/ppocr/images/PP-OCRv3/multi_lang/japan_2.jpg differ diff --git a/docs/ppocr/images/PP-OCRv3/multi_lang/korean_1.jpg b/docs/ppocr/images/PP-OCRv3/multi_lang/korean_1.jpg new file mode 100644 index 0000000000..f93de40e18 Binary files /dev/null and b/docs/ppocr/images/PP-OCRv3/multi_lang/korean_1.jpg differ diff --git a/docs/ppocr/images/anaconda_download-20240704081644684.png b/docs/ppocr/images/anaconda_download-20240704081644684.png new file mode 100644 index 0000000000..6ab6db3089 Binary files /dev/null and b/docs/ppocr/images/anaconda_download-20240704081644684.png differ diff --git a/docs/ppocr/images/anaconda_install_env.png b/docs/ppocr/images/anaconda_install_env.png new file mode 100644 index 0000000000..7a22542712 Binary files /dev/null and b/docs/ppocr/images/anaconda_install_env.png differ diff --git a/docs/ppocr/images/anaconda_install_folder.png b/docs/ppocr/images/anaconda_install_folder.png new file mode 100644 index 0000000000..e9fac29eaa Binary files /dev/null and b/docs/ppocr/images/anaconda_install_folder.png differ diff --git a/docs/ppocr/images/anaconda_prompt.png b/docs/ppocr/images/anaconda_prompt.png new file mode 100644 index 0000000000..1087610ae0 Binary files /dev/null and b/docs/ppocr/images/anaconda_prompt.png differ diff --git a/docs/ppocr/images/anaconda_start.png b/docs/ppocr/images/anaconda_start.png new file mode 100644 index 0000000000..a860f5e56a Binary files /dev/null and b/docs/ppocr/images/anaconda_start.png differ diff --git a/docs/ppocr/images/ch_ppocr_mobile_v2.0/00006737.jpg b/docs/ppocr/images/ch_ppocr_mobile_v2.0/00006737.jpg new file mode 100644 index 0000000000..d7762d2e2c Binary files /dev/null and b/docs/ppocr/images/ch_ppocr_mobile_v2.0/00006737.jpg differ diff --git a/docs/ppocr/images/ch_ppocr_mobile_v2.0/00009282.jpg b/docs/ppocr/images/ch_ppocr_mobile_v2.0/00009282.jpg new file mode 100644 index 0000000000..0383d445bd Binary files /dev/null and b/docs/ppocr/images/ch_ppocr_mobile_v2.0/00009282.jpg differ diff --git a/docs/ppocr/images/ch_ppocr_mobile_v2.0/00015504.jpg b/docs/ppocr/images/ch_ppocr_mobile_v2.0/00015504.jpg new file mode 100644 index 0000000000..9162cf1479 Binary files /dev/null and b/docs/ppocr/images/ch_ppocr_mobile_v2.0/00015504.jpg differ diff --git a/docs/ppocr/images/ch_ppocr_mobile_v2.0/00059985.jpg b/docs/ppocr/images/ch_ppocr_mobile_v2.0/00059985.jpg new file mode 100644 index 
0000000000..03fd19784a Binary files /dev/null and b/docs/ppocr/images/ch_ppocr_mobile_v2.0/00059985.jpg differ diff --git a/docs/ppocr/images/ch_ppocr_mobile_v2.0/00111002.jpg b/docs/ppocr/images/ch_ppocr_mobile_v2.0/00111002.jpg new file mode 100644 index 0000000000..7dae24a92d Binary files /dev/null and b/docs/ppocr/images/ch_ppocr_mobile_v2.0/00111002.jpg differ diff --git a/docs/ppocr/images/ch_ppocr_mobile_v2.0/img_12.jpg b/docs/ppocr/images/ch_ppocr_mobile_v2.0/img_12.jpg new file mode 100644 index 0000000000..11ac4ed6ce Binary files /dev/null and b/docs/ppocr/images/ch_ppocr_mobile_v2.0/img_12.jpg differ diff --git a/docs/ppocr/images/ch_ppocr_mobile_v2.0/rotate_00052204.jpg b/docs/ppocr/images/ch_ppocr_mobile_v2.0/rotate_00052204.jpg new file mode 100644 index 0000000000..643b850da8 Binary files /dev/null and b/docs/ppocr/images/ch_ppocr_mobile_v2.0/rotate_00052204.jpg differ diff --git a/docs/ppocr/images/ch_ppocr_mobile_v2.0/test_add_91.jpg b/docs/ppocr/images/ch_ppocr_mobile_v2.0/test_add_91.jpg new file mode 100644 index 0000000000..b5ded6e1de Binary files /dev/null and b/docs/ppocr/images/ch_ppocr_mobile_v2.0/test_add_91.jpg differ diff --git a/docs/ppocr/images/conda_activate.png b/docs/ppocr/images/conda_activate.png new file mode 100644 index 0000000000..a2e6074e91 Binary files /dev/null and b/docs/ppocr/images/conda_activate.png differ diff --git a/docs/ppocr/images/conda_create-20240704081656378.png b/docs/ppocr/images/conda_create-20240704081656378.png new file mode 100644 index 0000000000..533f592b7c Binary files /dev/null and b/docs/ppocr/images/conda_create-20240704081656378.png differ diff --git a/docs/ppocr/images/conda_create.png b/docs/ppocr/images/conda_create.png new file mode 100644 index 0000000000..9ff10c241b Binary files /dev/null and b/docs/ppocr/images/conda_create.png differ diff --git a/docs/ppocr/images/conda_list_env.png b/docs/ppocr/images/conda_list_env.png new file mode 100644 index 0000000000..5ffa0037c5 Binary files /dev/null and b/docs/ppocr/images/conda_list_env.png differ diff --git a/docs/ppocr/images/conda_new_env.png b/docs/ppocr/images/conda_new_env.png new file mode 100644 index 0000000000..eed667ec3d Binary files /dev/null and b/docs/ppocr/images/conda_new_env.png differ diff --git a/docs/ppocr/images/model_prod_flow_ch.png b/docs/ppocr/images/model_prod_flow_ch.png new file mode 100644 index 0000000000..4906b2716e Binary files /dev/null and b/docs/ppocr/images/model_prod_flow_ch.png differ diff --git a/docs/ppocr/images/multi_lang/arabic_0.jpg b/docs/ppocr/images/multi_lang/arabic_0.jpg new file mode 100644 index 0000000000..9941b90642 Binary files /dev/null and b/docs/ppocr/images/multi_lang/arabic_0.jpg differ diff --git a/docs/ppocr/images/multi_lang/en_1.jpg b/docs/ppocr/images/multi_lang/en_1.jpg new file mode 100644 index 0000000000..2dc84d3f04 Binary files /dev/null and b/docs/ppocr/images/multi_lang/en_1.jpg differ diff --git a/docs/ppocr/images/multi_lang/en_2.jpg b/docs/ppocr/images/multi_lang/en_2.jpg new file mode 100644 index 0000000000..455ec98ed3 Binary files /dev/null and b/docs/ppocr/images/multi_lang/en_2.jpg differ diff --git a/docs/ppocr/images/multi_lang/en_3.jpg b/docs/ppocr/images/multi_lang/en_3.jpg new file mode 100644 index 0000000000..36eb063d78 Binary files /dev/null and b/docs/ppocr/images/multi_lang/en_3.jpg differ diff --git a/docs/ppocr/images/multi_lang/french_0.jpg b/docs/ppocr/images/multi_lang/french_0.jpg new file mode 100644 index 0000000000..3c2abe6304 Binary files /dev/null and 
b/docs/ppocr/images/multi_lang/french_0.jpg differ diff --git a/docs/ppocr/images/multi_lang/img_01.jpg b/docs/ppocr/images/multi_lang/img_01.jpg new file mode 100644 index 0000000000..ee6ca69207 Binary files /dev/null and b/docs/ppocr/images/multi_lang/img_01.jpg differ diff --git a/docs/ppocr/images/multi_lang/img_02.jpg b/docs/ppocr/images/multi_lang/img_02.jpg new file mode 100644 index 0000000000..3e139c76bc Binary files /dev/null and b/docs/ppocr/images/multi_lang/img_02.jpg differ diff --git a/docs/ppocr/images/multi_lang/img_12.jpg b/docs/ppocr/images/multi_lang/img_12.jpg new file mode 100644 index 0000000000..822d562eda Binary files /dev/null and b/docs/ppocr/images/multi_lang/img_12.jpg differ diff --git a/docs/ppocr/images/multi_lang/japan_2.jpg b/docs/ppocr/images/multi_lang/japan_2.jpg new file mode 100644 index 0000000000..7038ba2eff Binary files /dev/null and b/docs/ppocr/images/multi_lang/japan_2.jpg differ diff --git a/docs/ppocr/images/multi_lang/korean_0.jpg b/docs/ppocr/images/multi_lang/korean_0.jpg new file mode 100644 index 0000000000..3fe6305aa0 Binary files /dev/null and b/docs/ppocr/images/multi_lang/korean_0.jpg differ diff --git a/docs/ppocr/images/ppocr_framework.png b/docs/ppocr/images/ppocr_framework.png new file mode 100644 index 0000000000..ab51c88fe6 Binary files /dev/null and b/docs/ppocr/images/ppocr_framework.png differ diff --git a/docs/ppocr/images/ppocrv2_framework.jpg b/docs/ppocr/images/ppocrv2_framework.jpg new file mode 100644 index 0000000000..e5f1a2ef47 Binary files /dev/null and b/docs/ppocr/images/ppocrv2_framework.jpg differ diff --git a/docs/ppocr/infer_deploy/Jetson_infer.en.md b/docs/ppocr/infer_deploy/Jetson_infer.en.md new file mode 100644 index 0000000000..6b429d9358 --- /dev/null +++ b/docs/ppocr/infer_deploy/Jetson_infer.en.md @@ -0,0 +1,94 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# Jetson Deployment for PaddleOCR + +This section introduces the deployment of PaddleOCR on Jetson NX, TX2, nano, AGX and other series of hardware. + +## 1. Prepare Environment + +You need to prepare a Jetson development hardware. If you need TensorRT, you need to prepare the TensorRT environment. It is recommended to use TensorRT version 7.1.3; + +### 1. Install PaddlePaddle in Jetson + +The PaddlePaddle download [link](https://www.paddlepaddle.org.cn/inference/user_guides/download_lib.html#python) +Please select the appropriate installation package for your Jetpack version, cuda version, and trt version. Here, we download paddlepaddle_gpu-2.3.0rc0-cp36-cp36m-linux_aarch64.whl. + +Install PaddlePaddle: + +```bash linenums="1" +pip3 install -U paddlepaddle_gpu-2.3.0rc0-cp36-cp36m-linux_aarch64.whl +``` + +### 2. Download PaddleOCR code and install dependencies + +Clone the PaddleOCR code: + +```bash linenums="1" +git clone https://github.com/PaddlePaddle/PaddleOCR +``` + +and install dependencies: + +```bash linenums="1" +cd PaddleOCR +pip3 install -r requirements.txt +``` + +- Note: Jetson hardware CPU is poor, dependency installation is slow, please wait patiently + +## 2. Perform prediction + +Obtain the PPOCR model from the [document](../model_list.en.md) model library. The following takes the PP-OCRv3 model as an example to introduce the use of the PPOCR model on Jetson: + +Download and unzip the PP-OCRv3 models. 
+ +```bash linenums="1" +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar +tar xf ch_PP-OCRv3_det_infer.tar +tar xf ch_PP-OCRv3_rec_infer.tar +``` + +The text detection inference: + +```bash linenums="1" +cd PaddleOCR +python3 tools/infer/predict_det.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --image_dir=./doc/imgs/french_0.jpg --use_gpu=True +``` + +After executing the command, the predicted information will be printed out in the terminal, and the visualization results will be saved in the `./inference_results/` directory. + +![](./images/det_res_french_0.jpg) + +The text recognition inference: + +```bash linenums="1" +python3 tools/infer/predict_det.py --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs_words/en/word_2.png --use_gpu=True --rec_image_shape="3,48,320" +``` + +After executing the command, the predicted information will be printed on the terminal, and the output is as follows: + +```bash linenums="1" +[2022/04/28 15:41:45] root INFO: Predicts of ./doc/imgs_words/en/word_2.png:('yourself', 0.98084533) +``` + +The text detection and text recognition inference: + +```bash linenums="1" +python3 tools/infer/predict_system.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs/00057937.jpg --use_gpu=True --rec_image_shape="3,48,320" +``` + +After executing the command, the predicted information will be printed out in the terminal, and the visualization results will be saved in the `./inference_results/` directory. + +![](./images/00057937.jpg) + +To enable TRT prediction, you only need to set `--use_tensorrt=True` on the basis of the above command: + +```bash linenums="1" +python3 tools/infer/predict_system.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs/ --rec_image_shape="3,48,320" --use_gpu=True --use_tensorrt=True +``` + +For more ppocr model predictions, please refer to[document](../model_list.en.md) diff --git a/docs/ppocr/infer_deploy/Jetson_infer.md b/docs/ppocr/infer_deploy/Jetson_infer.md new file mode 100644 index 0000000000..a8f6d74a24 --- /dev/null +++ b/docs/ppocr/infer_deploy/Jetson_infer.md @@ -0,0 +1,95 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# Jetson部署PaddleOCR模型 + +本节介绍PaddleOCR在Jetson NX、TX2、nano、AGX等系列硬件的部署。 + +## 1. 环境准备 + +需要准备一台Jetson开发板,如果需要TensorRT预测,需准备好TensorRT环境,建议使用7.1.3版本的TensorRT; + +### 1. Jetson安装PaddlePaddle + +PaddlePaddle下载[链接](https://www.paddlepaddle.org.cn/inference/user_guides/download_lib.html#python) +请选择适合的您Jetpack版本、cuda版本、trt版本的安装包。 + +安装命令: + +```bash linenums="1" +# 安装paddle,以paddlepaddle_gpu-2.3.0rc0-cp36-cp36m-linux_aarch64.whl 为例 +pip3 install -U paddlepaddle_gpu-2.3.0rc0-cp36-cp36m-linux_aarch64.whl +``` + +### 2. 下载PaddleOCR代码并安装依赖 + +首先 clone PaddleOCR 代码: + +```bash linenums="1" +git clone https://github.com/PaddlePaddle/PaddleOCR +``` + +然后,安装依赖: + +```bash linenums="1" +cd PaddleOCR +pip3 install -r requirements.txt +``` + +- 注:jetson硬件CPU较差,依赖安装较慢,请耐心等待 + +## 2. 
执行预测 + +从[文档](../model_list.md) 模型库中获取PPOCR模型,下面以PP-OCRv3模型为例,介绍在PPOCR模型在jetson上的使用方式: + +下载并解压PP-OCRv3模型 + +```bash linenums="1" +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar +tar xf ch_PP-OCRv3_det_infer.tar +tar xf ch_PP-OCRv3_rec_infer.tar +``` + +执行文本检测预测: + +```bash linenums="1" +cd PaddleOCR +python3 tools/infer/predict_det.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --image_dir=./doc/imgs/french_0.jpg --use_gpu=True +``` + +执行命令后在终端会打印出预测的信息,并在 `./inference_results/` 下保存可视化结果。 + +![img](./images/det_res_french_0.jpg) + +执行文本识别预测: + +```bash linenums="1" +python3 tools/infer/predict_det.py --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs_words/en/word_2.png --use_gpu=True --rec_image_shape="3,48,320" +``` + +执行命令后在终端会打印出预测的信息,输出如下: + +```bash linenums="1" +[2022/04/28 15:41:45] root INFO: Predicts of ./doc/imgs_words/en/word_2.png:('yourself', 0.98084533) +``` + +执行文本检测+文本识别串联预测: + +```bash linenums="1" +python3 tools/infer/predict_system.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs/ --use_gpu=True --rec_image_shape="3,48,320" +``` + +执行命令后在终端会打印出预测的信息,并在 `./inference_results/` 下保存可视化结果 + +![img](./images/00057937.jpg) + +开启TRT预测只需要在以上命令基础上设置`--use_tensorrt=True`即可: + +```bash linenums="1" +python3 tools/infer/predict_system.py --det_model_dir=./inference/ch_PP-OCRv2_det_infer/ --rec_model_dir=./inference/ch_PP-OCRv2_rec_infer/ --image_dir=./doc/imgs/00057937.jpg --use_gpu=True --use_tensorrt=True --rec_image_shape="3,48,320" +``` + +更多ppocr模型预测请参考[文档](../model_list.md) diff --git a/docs/ppocr/infer_deploy/benchmark.en.md b/docs/ppocr/infer_deploy/benchmark.en.md new file mode 100755 index 0000000000..2d1a0ecc7a --- /dev/null +++ b/docs/ppocr/infer_deploy/benchmark.en.md @@ -0,0 +1,41 @@ +--- +comments: true +--- + +# Benchmark + +This document gives the performance of the series models for Chinese and English recognition. + +## Test Data + +We collected 300 images for different real application scenarios to evaluate the overall OCR system, including contract samples, license plates, nameplates, train tickets, test sheets, forms, certificates, street view images, business cards, digital meter, etc. The following figure shows some images of the test set. + +![img](./images/doc.jpg) + +## Measurement + +Explanation: + +- The long size of the input for the text detector is 960. + +- The evaluation time-consuming stage is the complete stage from image input to result output, including image pre-processing and post-processing. + +- `Intel Xeon 6148` is the server-side CPU model. Intel MKL-DNN is used in the test to accelerate the CPU prediction speed. + +- `Snapdragon 855` is a mobile processing platform model. + +Compares the model size and F-score: + +| Model Name | Model Size
of the<br>Whole System\(M\) | Model Size<br>of the Text<br>Detector\(M\) | Model Size<br>of the Direction<br>Classifier\(M\) | Model Size<br>of the Text
Recognizer \(M\) | F\-score | +|:-:|:-:|:-:|:-:|:-:|:-:| +| PP-OCRv2 | 11\.6 | 3\.0 | 0\.9 | 8\.6 | 0\.5224 | +| PP-OCR mobile | 8\.1 | 2\.6 | 0\.9 | 4\.6 | 0\.503 | +| PP-OCR server | 155\.1 | 47\.2 | 0\.9 | 107 | 0\.570 | + +Compares the time-consuming on CPU and T4 GPU (ms): + +| Model Name | CPU | T4 GPU | +|:-:|:-:|:-:| +| PP-OCRv2 | 330 | 111 | +| PP-OCR mobile | 356 | 116| +| PP-OCR server | 1056 | 200 | diff --git a/docs/ppocr/infer_deploy/benchmark.md b/docs/ppocr/infer_deploy/benchmark.md new file mode 100644 index 0000000000..5de86e3647 --- /dev/null +++ b/docs/ppocr/infer_deploy/benchmark.md @@ -0,0 +1,39 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# Benchmark + +本文给出了中英文OCR系列模型精度指标和在各平台预测耗时的benchmark。 + +## 测试数据 + +针对OCR实际应用场景,包括合同,车牌,铭牌,火车票,化验单,表格,证书,街景文字,名片,数码显示屏等,收集的300张图像,每张图平均有17个文本框,下图给出了一些图像示例。 + +![img](./images/doc.jpg) + +## 评估指标 + +说明: + +- 检测输入图像的长边尺寸是960。 +- 评估耗时阶段为图像预测耗时,不包括图像的预处理和后处理。 +- `Intel至强6148`为服务器端CPU型号,测试中使用Intel MKL-DNN 加速。 +- `骁龙855`为移动端处理平台型号。 + +预测模型大小和整体识别精度对比 + +| 模型名称 | 整体模型
大小\(M\) | 检测模型<br>大小\(M\) | 方向分类器<br>模型大小\(M\) | 识别模型<br>大小\(M\) | 整体识别
F\-score | +| :-----------: | :-------------------: | :-------------------: | :-------------------------: | :-------------------: | :------------------: | +| PP-OCRv2 | 11\.6 | 3\.0 | 0\.9 | 8\.6 | 0\.5224 | +| PP-OCR mobile | 8\.1 | 2\.6 | 0\.9 | 4\.6 | 0\.503 | +| PP-OCR server | 155\.1 | 47\.2 | 0\.9 | 107 | 0\.570 | + +预测模型在CPU和GPU上的速度对比,单位ms + +| 模型名称 | CPU | T4 GPU | +| :-----------: | :---: | :----: | +| PP-OCRv2 | 330 | 111 | +| PP-OCR mobile | 356 | 11 6 | +| PP-OCR server | 1056 | 200 | diff --git a/docs/ppocr/infer_deploy/cpp_infer.en.md b/docs/ppocr/infer_deploy/cpp_infer.en.md new file mode 100644 index 0000000000..f18c8c8aae --- /dev/null +++ b/docs/ppocr/infer_deploy/cpp_infer.en.md @@ -0,0 +1,433 @@ +--- +comments: true +--- + +# Server-side C++ Inference + +This chapter introduces the C++ deployment steps of the PaddleOCR model. C++ is better than Python in terms of performance. Therefore, in CPU and GPU deployment scenarios, C++ deployment is mostly used. +This section will introduce how to configure the C++ environment and deploy PaddleOCR in Linux (CPU\GPU) environment. For Windows deployment please refer to [Windows](./windows_vs2019_build.en.md) compilation guidelines. + +## 1. Prepare the Environment + +### 1.1 Environment + +- Linux, docker is recommended. +- Windows. + +### 1.2 Compile OpenCV + +- First of all, you need to download the source code compiled package in the Linux environment from the OpenCV official website. Taking OpenCV 3.4.7 as an example, the download command is as follows. + +```bash linenums="1" +cd deploy/cpp_infer +wget https://paddleocr.bj.bcebos.com/libs/opencv/opencv-3.4.7.tar.gz +tar -xf opencv-3.4.7.tar.gz +``` + +Finally, you will see the folder of `opencv-3.4.7/` in the current directory. + +- Compile OpenCV, the OpenCV source path (`root_path`) and installation path (`install_path`) should be set by yourself. Enter the OpenCV source code path and compile it in the following way. + +```bash linenums="1" +root_path=your_opencv_root_path +install_path=${root_path}/opencv3 + +rm -rf build +mkdir build +cd build + +cmake .. \ + -DCMAKE_INSTALL_PREFIX=${install_path} \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_SHARED_LIBS=OFF \ + -DWITH_IPP=OFF \ + -DBUILD_IPP_IW=OFF \ + -DWITH_LAPACK=OFF \ + -DWITH_EIGEN=OFF \ + -DCMAKE_INSTALL_LIBDIR=lib64 \ + -DWITH_ZLIB=ON \ + -DBUILD_ZLIB=ON \ + -DWITH_JPEG=ON \ + -DBUILD_JPEG=ON \ + -DWITH_PNG=ON \ + -DBUILD_PNG=ON \ + -DWITH_TIFF=ON \ + -DBUILD_TIFF=ON + +make -j +make install +``` + +In the above commands, `root_path` is the downloaded OpenCV source code path, and `install_path` is the installation path of OpenCV. After `make install` is completed, the OpenCV header file and library file will be generated in this folder for later OCR source code compilation. + +The final file structure under the OpenCV installation path is as follows. + +```text linenums="1" +opencv3/ +|-- bin +|-- include +|-- lib +|-- lib64 +|-- share +``` + +### 1.3 Compile or Download or the Paddle Inference Library + +- There are 2 ways to obtain the Paddle inference library, described in detail below. + +#### 1.3.1 Direct download and installation + +[Paddle inference library official website](https://www.paddlepaddle.org.cn/inference/master/guides/install/download_lib.html#linux). You can review and select the appropriate version of the inference library on the official website. + +- After downloading, use the following command to extract files. 
+ +```bash linenums="1" +tar -xf paddle_inference.tgz +``` + +Finally you will see the folder of `paddle_inference/` in the current path. + +#### 1.3.2 Compile the inference source code + +- If you want to get the latest Paddle inference library features, you can download the latest code from Paddle GitHub repository and compile the inference library from the source code. It is recommended to download the inference library with paddle version greater than or equal to 2.0.1. + +- You can refer to [Paddle inference library](https://www.paddlepaddle.org.cn/documentation/docs/en/advanced_guide/inference_deployment/inference/build_and_install_lib_en.html) to get the Paddle source code from GitHub, and then compile To generate the latest inference library. The method of using git to access the code is as follows. + +```bash linenums="1" +git clone https://github.com/PaddlePaddle/Paddle.git +git checkout develop +``` + +- Enter the Paddle directory and run the following commands to compile the paddle inference library. + +```bash linenums="1" +rm -rf build +mkdir build +cd build + +cmake .. \ + -DWITH_CONTRIB=OFF \ + -DWITH_MKL=ON \ + -DWITH_MKLDNN=ON \ + -DWITH_TESTING=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_INFERENCE_API_TEST=OFF \ + -DON_INFER=ON \ + -DWITH_PYTHON=ON +make -j +make inference_lib_dist +``` + +For more compilation parameter options, please refer to the [document](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.0/guides/05_inference_deployment/inference/build_and_install_lib_cn.html#congyuanmabianyi). + +- After the compilation process, you can see the following files in the folder of `build/paddle_inference_install_dir/`. + +```text linenums="1" +build/paddle_inference_install_dir/ +|-- CMakeCache.txt +|-- paddle +|-- third_party +|-- version.txt +``` + +`paddle` is the Paddle library required for C++ prediction later, and `version.txt` contains the version information of the current inference library. + +## 2. Compile and Run the Demo + +### 2.1 Export the inference model + +- You can refer to [Model inference](./python_infer.en.md) and export the inference model. After the model is exported, assuming it is placed in the `inference` directory, the directory structure is as follows. + +```text linenums="1" +inference/ +|-- det_db +| |--inference.pdiparams +| |--inference.pdmodel +|-- rec_rcnn +| |--inference.pdiparams +| |--inference.pdmodel +|-- cls +| |--inference.pdiparams +| |--inference.pdmodel +|-- table +| |--inference.pdiparams +| |--inference.pdmodel +|-- layout +| |--inference.pdiparams +| |--inference.pdmodel +``` + +### 2.2 Compile PaddleOCR C++ inference demo + +- The compilation commands are as follows. The addresses of Paddle C++ inference library, opencv and other Dependencies need to be replaced with the actual addresses on your own machines. + +```bash linenums="1" +sh tools/build.sh +``` + +Specifically, you should modify the paths in `tools/build.sh`. The related content is as follows. + +```bash linenums="1" +OPENCV_DIR=your_opencv_dir +LIB_DIR=your_paddle_inference_dir +CUDA_LIB_DIR=your_cuda_lib_dir +CUDNN_LIB_DIR=your_cudnn_lib_dir +``` + +`OPENCV_DIR` is the OpenCV installation path; `LIB_DIR` is the download (`paddle_inference` folder) +or the generated Paddle inference library path (`build/paddle_inference_install_dir` folder); +`CUDA_LIB_DIR` is the CUDA library file path, in docker; it is `/usr/local/cuda/lib64`; `CUDNN_LIB_DIR` is the cuDNN library file path, in docker it is `/usr/lib/x86_64-linux-gnu/`. 
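For reference, a filled-in configuration inside `tools/build.sh` might look like the sketch below. The OpenCV and Paddle inference paths are placeholders for wherever you installed or extracted them; the CUDA and cuDNN paths are the docker defaults mentioned above.

```bash linenums="1"
# Example values only; the first two paths depend on your own installation locations
OPENCV_DIR=/work/opencv-3.4.7/opencv3
LIB_DIR=/work/paddle_inference
CUDA_LIB_DIR=/usr/local/cuda/lib64
CUDNN_LIB_DIR=/usr/lib/x86_64-linux-gnu/
```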
+ +- After the compilation is completed, an executable file named `ppocr` will be generated in the `build` folder. + +### 2.3 Run the demo + +Execute the built executable file: + +```bash linenums="1" +./build/ppocr [--param1] [--param2] [...] +``` + +**Note**:ppocr uses the `PP-OCRv3` model by default, and the input shape used by the recognition model is `3, 48, 320`, if you want to use the old version model, you should add the parameter `--rec_img_h=32`. + +Specifically, + +#### 1. det+cls+rec + +```bash linenums="1" +./build/ppocr --det_model_dir=inference/det_db \ + --rec_model_dir=inference/rec_rcnn \ + --cls_model_dir=inference/cls \ + --image_dir=../../doc/imgs/12.jpg \ + --use_angle_cls=true \ + --det=true \ + --rec=true \ + --cls=true \ +``` + +#### 2. det+rec + +```bash linenums="1" +./build/ppocr --det_model_dir=inference/det_db \ + --rec_model_dir=inference/rec_rcnn \ + --image_dir=../../doc/imgs/12.jpg \ + --use_angle_cls=false \ + --det=true \ + --rec=true \ + --cls=false \ +``` + +#### 3. det + +```bash linenums="1" +./build/ppocr --det_model_dir=inference/det_db \ + --image_dir=../../doc/imgs/12.jpg \ + --det=true \ + --rec=false +``` + +#### 4. cls+rec + +```bash linenums="1" +./build/ppocr --rec_model_dir=inference/rec_rcnn \ + --cls_model_dir=inference/cls \ + --image_dir=../../doc/imgs_words/ch/word_1.jpg \ + --use_angle_cls=true \ + --det=false \ + --rec=true \ + --cls=true \ +``` + +#### 5. rec + +```bash linenums="1" +./build/ppocr --rec_model_dir=inference/rec_rcnn \ + --image_dir=../../doc/imgs_words/ch/word_1.jpg \ + --use_angle_cls=false \ + --det=false \ + --rec=true \ + --cls=false \ +``` + +#### 6. cls + +```bash linenums="1" +./build/ppocr --cls_model_dir=inference/cls \ + --cls_model_dir=inference/cls \ + --image_dir=../../doc/imgs_words/ch/word_1.jpg \ + --use_angle_cls=true \ + --det=false \ + --rec=false \ + --cls=true \ +``` + +#### 7. layout+table + +```bash linenums="1" +./build/ppocr --det_model_dir=inference/det_db \ + --rec_model_dir=inference/rec_rcnn \ + --table_model_dir=inference/table \ + --image_dir=../../ppstructure/docs/table/table.jpg \ + --layout_model_dir=inference/layout \ + --type=structure \ + --table=true \ + --layout=true +``` + +#### 8. layout + +```bash linenums="1" +./build/ppocr --layout_model_dir=inference/layout \ + --image_dir=../../ppstructure/docs/table/1.png \ + --type=structure \ + --table=false \ + --layout=true \ + --det=false \ + --rec=false +``` + +#### 9. table + +```bash linenums="1" +./build/ppocr --det_model_dir=inference/det_db \ + --rec_model_dir=inference/rec_rcnn \ + --table_model_dir=inference/table \ + --image_dir=../../ppstructure/docs/table/table.jpg \ + --type=structure \ + --table=true +``` + +More parameters are as follows, + +Common parameters + +|parameter|data type|default|meaning| +| --- | --- | --- | --- | +|use_gpu|bool|false|Whether to use GPU| +|gpu_id|int|0|GPU id when use_gpu is true| +|gpu_mem|int|4000|GPU memory requested| +|cpu_math_library_num_threads|int|10|Number of threads when using CPU inference. 
When machine cores is enough, the large the value, the faster the inference speed| +|enable_mkldnn|bool|true|Whether to use mkdlnn library| +|output|str|./output|Path where visualization results are saved| + +forward + +|parameter|data type|default|meaning| +| :---: | :---: | :---: | :---: | +|det|bool|true|Whether to perform text detection in the forward direction| +|rec|bool|true|Whether to perform text recognition in the forward direction| +|cls|bool|false|Whether to perform text direction classification in the forward direction| + +Detection related parameters + +|parameter|data type|default|meaning| +| --- | --- | --- | --- | +|det_model_dir|string|-|Address of detection inference model| +|max_side_len|int|960|Limit the maximum image height and width to 960| +|det_db_thresh|float|0.3|Used to filter the binarized image of DB prediction, setting 0.-0.3 has no obvious effect on the result| +|det_db_box_thresh|float|0.5|DB post-processing filter box threshold, if there is a missing box detected, it can be reduced as appropriate| +|det_db_unclip_ratio|float|1.6|Indicates the compactness of the text box, the smaller the value, the closer the text box to the text| +|det_db_score_mode|string|slow| slow: use polygon box to calculate bbox score, fast: use rectangle box to calculate. Use rectangular box to calculate faster, and polygonal box more accurate for curved text area.| +|visualize|bool|true|Whether to visualize the results,when it is set as true, the prediction results will be saved in the folder specified by the `output` field on an image with the same name as the input image.| + +Classifier related parameters + +|parameter|data type|default|meaning| +| --- | --- | --- | --- | +|use_angle_cls|bool|false|Whether to use the direction classifier| +|cls_model_dir|string|-|Address of direction classifier inference model| +|cls_thresh|float|0.9|Score threshold of the direction classifier| +|cls_batch_num|int|1|batch size of classifier| + +Recognition related parameters + +|parameter|data type|default|meaning| +| --- | --- | --- | --- | +|rec_model_dir|string|-|Address of recognition inference model| +|rec_char_dict_path|string|../../ppocr/utils/ppocr_keys_v1.txt|dictionary file| +|rec_batch_num|int|6|batch size of recognition| +|rec_img_h|int|48|image height of recognition| +|rec_img_w|int|320|image width of recognition| + +Layout related parameters + +|parameter|data type|default|meaning| +| :---: | :---: | :---: | :---: | +|layout_model_dir|string|-| Address of layout inference model| +|layout_dict_path|string|../../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt|dictionary file| +|layout_score_threshold|float|0.5|Threshold of score.| +|layout_nms_threshold|float|0.5|Threshold of nms.| + +Table recognition related parameters + +|parameter|data type|default|meaning| +| :---: | :---: | :---: | :---: | +|table_model_dir|string|-|Address of table recognition inference model| +|table_char_dict_path|string|../../ppocr/utils/dict/table_structure_dict.txt|dictionary file| +|table_max_len|int|488|The size of the long side of the input image of the table recognition model, the final input image size of the network is(table_max_len,table_max_len)| +|merge_no_span_structure|bool|true|Whether to merge and to
[recognized table structure returned as HTML; columns: Methods, Ext, R, P, F, FPS, with one row per compared method]
+The table visualized image saved in ./output//6_1.png +7 type: table, region: [462,359,820,657], score: 0.953917, res:
[recognized table structure returned as HTML; columns: Methods, R, P, F, FPS, with one row per compared method]
+The table visualized image saved in ./output//7_1.png +8 type: figure, region: [14,3,836,310], score: 0.969443, res: count of ocr result is : 26 +********** print ocr result ********** +0 det boxes: [[506,14],[539,15],[539,22],[506,21]] rec text: E rec score: 0.318073 +... +25 det boxes: [[680,290],[759,288],[759,303],[680,305]] rec text: (d) CTW1500 rec score: 0.95911 +********** end print ocr result ********** +``` + +## 3. FAQ + + 1. Encountered the error `unable to access 'https://github.com/LDOUBLEV/AutoLog.git/': gnutls_handshake() failed: The TLS connection was non-properly terminated.`, change the github address in `deploy/cpp_infer/external-cmake/auto-log.cmake` to the address. diff --git a/docs/ppocr/infer_deploy/cpp_infer.md b/docs/ppocr/infer_deploy/cpp_infer.md new file mode 100644 index 0000000000..ea11d57bbc --- /dev/null +++ b/docs/ppocr/infer_deploy/cpp_infer.md @@ -0,0 +1,443 @@ +--- +comments: true +--- + +# 服务器端C++预测 + +本章节介绍PaddleOCR 模型的C++部署方法。C++在性能计算上优于Python,因此,在大多数CPU、GPU部署场景,多采用C++的部署方式,本节将介绍如何在Linux\Windows (CPU\GPU)环境下配置C++环境并完成PaddleOCR模型部署。 + +## 1. 准备环境 + +### 1.1 运行准备 + +- Linux环境,推荐使用docker。 +- Windows环境。 + +- 该文档主要介绍基于Linux环境的PaddleOCR C++预测流程,如果需要在Windows下基于预测库进行C++预测,具体编译方法请参考[Windows下编译教程](./windows_vs2019_build.md) + +### 1.2 编译opencv库 + +- 首先需要从opencv官网上下载在Linux环境下源码编译的包,以opencv3.4.7为例,下载命令如下: + +```bash linenums="1" +cd deploy/cpp_infer +wget https://paddleocr.bj.bcebos.com/libs/opencv/opencv-3.4.7.tar.gz +tar -xf opencv-3.4.7.tar.gz +``` + +最终可以在当前目录下看到`opencv-3.4.7/`的文件夹。 + +- 编译opencv,设置opencv源码路径(`root_path`)以及安装路径(`install_path`)。进入opencv源码路径下,按照下面的方式进行编译。 + +```bash linenums="1" +root_path="your_opencv_root_path" +install_path=${root_path}/opencv3 +build_dir=${root_path}/build + +rm -rf ${build_dir} +mkdir ${build_dir} +cd ${build_dir} + +cmake .. \ + -DCMAKE_INSTALL_PREFIX=${install_path} \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_SHARED_LIBS=OFF \ + -DWITH_IPP=OFF \ + -DBUILD_IPP_IW=OFF \ + -DWITH_LAPACK=OFF \ + -DWITH_EIGEN=OFF \ + -DCMAKE_INSTALL_LIBDIR=lib64 \ + -DWITH_ZLIB=ON \ + -DBUILD_ZLIB=ON \ + -DWITH_JPEG=ON \ + -DBUILD_JPEG=ON \ + -DWITH_PNG=ON \ + -DBUILD_PNG=ON \ + -DWITH_TIFF=ON \ + -DBUILD_TIFF=ON + +make -j +make install +``` + +也可以直接修改`tools/build_opencv.sh`的内容,然后直接运行下面的命令进行编译。 + +```bash linenums="1" +sh tools/build_opencv.sh +``` + +其中`root_path`为下载的opencv源码路径,`install_path`为opencv的安装路径,`make install`完成之后,会在该文件夹下生成opencv头文件和库文件,用于后面的OCR代码编译。 + +最终在安装路径下的文件结构如下所示。 + +``` +opencv3/ +|-- bin +|-- include +|-- lib +|-- lib64 +|-- share +``` + +### 1.3 下载或者编译Paddle预测库 + +可以选择直接下载安装或者从源码编译,下文分别进行具体说明。 + +#### 1.3.1 直接下载安装 + +[Paddle预测库官网](https://www.paddlepaddle.org.cn/inference/master/guides/install/download_lib.html#linux) 上提供了不同cuda版本的Linux预测库,可以在官网查看并选择合适的预测库版本(*建议选择paddle版本>=2.0.1版本的预测库* )。 + +下载之后解压: + +```bash linenums="1" +tar -xf paddle_inference.tgz +``` + +最终会在当前的文件夹中生成`paddle_inference/`的子文件夹。 + +#### 1.3.2 预测库源码编译 + +如果希望获取最新预测库特性,可以从github上克隆最新Paddle代码进行编译,生成最新的预测库。 + +使用git获取代码: + +```bash linenums="1" +git clone https://github.com/PaddlePaddle/Paddle.git +git checkout develop +``` + +进入Paddle目录,进行编译: + +```bash linenums="1" +rm -rf build +mkdir build +cd build + +cmake .. 
\ + -DWITH_CONTRIB=OFF \ + -DWITH_MKL=ON \ + -DWITH_MKLDNN=ON \ + -DWITH_TESTING=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_INFERENCE_API_TEST=OFF \ + -DON_INFER=ON \ + -DWITH_PYTHON=ON +make -j +make inference_lib_dist +``` + +更多编译参数选项介绍可以参考[Paddle预测库编译文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.0/guides/05_inference_deployment/inference/build_and_install_lib_cn.html#congyuanmabianyi)。 + +编译完成之后,可以在`build/paddle_inference_install_dir/`文件下看到生成了以下文件及文件夹。 + +``` +build/paddle_inference_install_dir/ +|-- CMakeCache.txt +|-- paddle +|-- third_party +|-- version.txt +``` + +其中`paddle`就是C++预测所需的Paddle库,`version.txt`中包含当前预测库的版本信息。 + +## 2. 开始运行 + +### 2.1 准备模型 + +直接下载PaddleOCR提供的推理模型,或者参考[模型预测章节](./python_infer.md),将训练好的模型导出为推理模型。模型导出之后,假设放在`inference`目录下,则目录结构如下: + +```text linenums="1" +inference/ +|-- det_db +| |--inference.pdiparams +| |--inference.pdmodel +|-- rec_rcnn +| |--inference.pdiparams +| |--inference.pdmodel +|-- cls +| |--inference.pdiparams +| |--inference.pdmodel +|-- table +| |--inference.pdiparams +| |--inference.pdmodel +|-- layout +| |--inference.pdiparams +| |--inference.pdmodel +``` + +### 2.2 编译PaddleOCR C++预测demo + +编译命令如下,其中Paddle C++预测库、opencv等其他依赖库的地址需要换成自己机器上的实际地址。 + +```bash linenums="1" +sh tools/build.sh +``` + +具体的,需要修改`tools/build.sh`中环境路径,相关内容如下: + +```bash linenums="1" +OPENCV_DIR=your_opencv_dir +LIB_DIR=your_paddle_inference_dir +CUDA_LIB_DIR=your_cuda_lib_dir +CUDNN_LIB_DIR=/your_cudnn_lib_dir +``` + +其中,`OPENCV_DIR`为opencv编译安装的地址;`LIB_DIR`为下载(`paddle_inference`文件夹)或者编译生成的Paddle预测库地址(`build/paddle_inference_install_dir`文件夹);`CUDA_LIB_DIR`为cuda库文件地址,在docker中为`/usr/local/cuda/lib64`;`CUDNN_LIB_DIR`为cudnn库文件地址,在docker中为`/usr/lib/x86_64-linux-gnu/`。**注意:以上路径都写绝对路径,不要写相对路径。** + +编译完成之后,会在`build`文件夹下生成一个名为`ppocr`的可执行文件。 + +### 2.3 运行demo + +本demo支持系统串联调用,也支持单个功能的调用,如,只使用检测或识别功能。 + +**注意** ppocr默认使用`PP-OCRv3`模型,识别模型使用的输入shape为`3,48,320`, 如需使用旧版本的PP-OCR模型,则需要设置参数`--rec_img_h=32`。 + +运行方式: + +```bash linenums="1" +./build/ppocr [--param1] [--param2] [...] +``` + +具体命令如下: + +##### 1. 检测+分类+识别 + +```bash linenums="1" +./build/ppocr --det_model_dir=inference/det_db \ + --rec_model_dir=inference/rec_rcnn \ + --cls_model_dir=inference/cls \ + --image_dir=../../doc/imgs/12.jpg \ + --use_angle_cls=true \ + --det=true \ + --rec=true \ + --cls=true \ +``` + +##### 2. 检测+识别 + +```bash linenums="1" +./build/ppocr --det_model_dir=inference/det_db \ + --rec_model_dir=inference/rec_rcnn \ + --image_dir=../../doc/imgs/12.jpg \ + --use_angle_cls=false \ + --det=true \ + --rec=true \ + --cls=false \ +``` + +##### 3. 检测 + +```bash linenums="1" +./build/ppocr --det_model_dir=inference/det_db \ + --image_dir=../../doc/imgs/12.jpg \ + --det=true \ + --rec=false +``` + +##### 4. 分类+识别 + +```bash linenums="1" +./build/ppocr --rec_model_dir=inference/rec_rcnn \ + --cls_model_dir=inference/cls \ + --image_dir=../../doc/imgs_words/ch/word_1.jpg \ + --use_angle_cls=true \ + --det=false \ + --rec=true \ + --cls=true \ +``` + +##### 5. 识别 + +```bash linenums="1" +./build/ppocr --rec_model_dir=inference/rec_rcnn \ + --image_dir=../../doc/imgs_words/ch/word_1.jpg \ + --use_angle_cls=false \ + --det=false \ + --rec=true \ + --cls=false \ +``` + +##### 6. 分类 + +```bash linenums="1" +./build/ppocr --cls_model_dir=inference/cls \ + --cls_model_dir=inference/cls \ + --image_dir=../../doc/imgs_words/ch/word_1.jpg \ + --use_angle_cls=true \ + --det=false \ + --rec=false \ + --cls=true \ +``` + +##### 7. 
版面分析+表格识别 + +```bash linenums="1" +./build/ppocr --det_model_dir=inference/det_db \ + --rec_model_dir=inference/rec_rcnn \ + --table_model_dir=inference/table \ + --image_dir=../../ppstructure/docs/table/table.jpg \ + --layout_model_dir=inference/layout \ + --type=structure \ + --table=true \ + --layout=true +``` + +##### 8. 版面分析 + +```bash linenums="1" +./build/ppocr --layout_model_dir=inference/layout \ + --image_dir=../../ppstructure/docs/table/1.png \ + --type=structure \ + --table=false \ + --layout=true \ + --det=false \ + --rec=false +``` + +##### 9. 表格识别 + +```bash linenums="1" +./build/ppocr --det_model_dir=inference/det_db \ + --rec_model_dir=inference/rec_rcnn \ + --table_model_dir=inference/table \ + --image_dir=../../ppstructure/docs/table/table.jpg \ + --type=structure \ + --table=true +``` + +更多支持的可调节参数解释如下: + +通用参数 + +| 参数名称 | 类型 | 默认参数 | 意义 | +| :--------------------------: | :---: | :------: | :---------------------------------------------------------------: | +| use_gpu | bool | false | 是否使用GPU | +| gpu_id | int | 0 | GPU id,使用GPU时有效 | +| gpu_mem | int | 4000 | 申请的GPU内存 | +| cpu_math_library_num_threads | int | 10 | CPU预测时的线程数,在机器核数充足的情况下,该值越大,预测速度越快 | +| enable_mkldnn | bool | true | 是否使用mkldnn库 | +| output | str | ./output | 可视化结果保存的路径 | + +前向相关 + +| 参数名称 | 类型 | 默认参数 | 意义 | +| :------: | :---: | :------: | :----------------------: | +| det | bool | true | 前向是否执行文字检测 | +| rec | bool | true | 前向是否执行文字识别 | +| cls | bool | false | 前向是否执行文字方向分类 | + +检测模型相关 + +| 参数名称 | 类型 | 默认参数 | 意义 | +| :-----------------: | :----: | :------: | :----------------------------------------------------------------------------------------------------------: | +| det_model_dir | string | - | 检测模型inference model地址 | +| max_side_len | int | 960 | 输入图像长宽大于960时,等比例缩放图像,使得图像最长边为960 | +| det_db_thresh | float | 0.3 | 用于过滤DB预测的二值化图像,设置为0.-0.3对结果影响不明显 | +| det_db_box_thresh | float | 0.5 | DB后处理过滤box的阈值,如果检测存在漏框情况,可酌情减小 | +| det_db_unclip_ratio | float | 1.6 | 表示文本框的紧致程度,越小则文本框更靠近文本 | +| det_db_score_mode | string | slow | slow:使用多边形框计算bbox score,fast:使用矩形框计算。矩形框计算速度更快,多边形框对弯曲文本区域计算更准确。 | +| visualize | bool | true | 是否对结果进行可视化,为1时,预测结果会保存在`output`字段指定的文件夹下和输入图像同名的图像上。 | + +方向分类器相关 + +| 参数名称 | 类型 | 默认参数 | 意义 | +| :-----------: | :----: | :------: | :---------------------------: | +| use_angle_cls | bool | false | 是否使用方向分类器 | +| cls_model_dir | string | - | 方向分类器inference model地址 | +| cls_thresh | float | 0.9 | 方向分类器的得分阈值 | +| cls_batch_num | int | 1 | 方向分类器batchsize | + +文字识别模型相关 + +| 参数名称 | 类型 | 默认参数 | 意义 | +| :----------------: | :----: | :---------------------------------: | :-----------------------------: | +| rec_model_dir | string | - | 文字识别模型inference model地址 | +| rec_char_dict_path | string | ../../ppocr/utils/ppocr_keys_v1.txt | 字典文件 | +| rec_batch_num | int | 6 | 文字识别模型batchsize | +| rec_img_h | int | 48 | 文字识别模型输入图像高度 | +| rec_img_w | int | 320 | 文字识别模型输入图像宽度 | + +版面分析模型相关 + +| 参数名称 | 类型 | 默认参数 | 意义 | +| :--------------------: | :----: | :----------------------------------------------------------: | :-----------------------------: | +| layout_model_dir | string | - | 版面分析模型inference model地址 | +| layout_dict_path | string | ../../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt | 字典文件 | +| layout_score_threshold | float | 0.5 | 检测框的分数阈值 | +| layout_nms_threshold | float | 0.5 | nms的阈值 | + +表格识别模型相关 + +| 参数名称 | 类型 | 默认参数 | 意义 | +| :---------------------: | :----: | :------------------------------------------------: | 
:----------------------------------------------------------------------------------: | +| table_model_dir | string | - | 表格识别模型inference model地址 | +| table_char_dict_path | string | ../../ppocr/utils/dict/table_structure_dict_ch.txt | 字典文件 | +| table_max_len | int | 488 | 表格识别模型输入图像长边大小,最终网络输入图像大小为(table_max_len,table_max_len) | +| merge_no_span_structure | bool | true | 是否合并 和 为 | + +- PaddleOCR也支持多语言的预测,更多支持的语言和模型可以参考[识别文档](../model_train/recognition.md)中的多语言字典与模型部分,如果希望进行多语言预测,只需将修改`rec_char_dict_path`(字典文件路径)以及`rec_model_dir`(inference模型路径)字段即可。 + +最终屏幕上会输出检测结果如下: + +- ocr + +```bash linenums="1" +predict img: ../../doc/imgs/12.jpg +../../doc/imgs/12.jpg +0 det boxes: [[74,553],[427,542],[428,571],[75,582]] rec text: 打浦路252935号 rec score: 0.947724 +1 det boxes: [[23,507],[513,488],[515,529],[24,548]] rec text: 绿洲仕格维花园公寓 rec score: 0.993728 +2 det boxes: [[187,456],[399,448],[400,480],[188,488]] rec text: 打浦路15号 rec score: 0.964994 +3 det boxes: [[42,413],[483,391],[484,428],[43,450]] rec text: 上海斯格威铂尔大酒店 rec score: 0.980086 +The detection visualized image saved in ./output//12.jpg +``` + +- layout+table + +```bash linenums="1" +predict img: ../../ppstructure/docs/table/1.png +0 type: text, region: [12,729,410,848], score: 0.781044, res: count of ocr result is : 7 +********** print ocr result ********** +0 det boxes: [[4,1],[79,1],[79,12],[4,12]] rec text: CTW1500. rec score: 0.769472 +... +6 det boxes: [[4,99],[391,99],[391,112],[4,112]] rec text: sate-of-the-artmethods[12.34.36l.ourapproachachieves rec score: 0.90414 +********** end print ocr result ********** +1 type: text, region: [69,342,342,359], score: 0.703666, res: count of ocr result is : 1 +********** print ocr result ********** +0 det boxes: [[8,2],[269,2],[269,13],[8,13]] rec text: Table6.Experimentalresults on CTW-1500 rec score: 0.890454 +********** end print ocr result ********** +2 type: text, region: [70,316,706,332], score: 0.659738, res: count of ocr result is : 2 +********** print ocr result ********** +0 det boxes: [[373,2],[630,2],[630,11],[373,11]] rec text: oroposals.andthegreencontoursarefinal rec score: 0.919729 +1 det boxes: [[8,3],[357,3],[357,11],[8,11]] rec text: Visualexperimentalresultshebluecontoursareboundar rec score: 0.915963 +********** end print ocr result ********** +3 type: text, region: [489,342,789,359], score: 0.630538, res: count of ocr result is : 1 +********** print ocr result ********** +0 det boxes: [[8,2],[294,2],[294,14],[8,14]] rec text: Table7.Experimentalresults onMSRA-TD500 rec score: 0.942251 +********** end print ocr result ********** +4 type: text, region: [444,751,841,848], score: 0.607345, res: count of ocr result is : 5 +********** print ocr result ********** +0 det boxes: [[19,3],[389,3],[389,17],[19,17]] rec text: Inthispaper,weproposeanovel adaptivebound rec score: 0.941031 +1 det boxes: [[4,22],[390,22],[390,36],[4,36]] rec text: aryproposalnetworkforarbitraryshapetextdetection rec score: 0.960172 +2 det boxes: [[4,42],[392,42],[392,56],[4,56]] rec text: whichadoptanboundaryproposalmodeltogeneratecoarse rec score: 0.934647 +3 det boxes: [[4,61],[389,61],[389,75],[4,75]] rec text: ooundaryproposals,andthenadoptanadaptiveboundary rec score: 0.946296 +4 det boxes: [[5,80],[387,80],[387,93],[5,93]] rec text: leformationmodelcombinedwithGCNandRNNtoper rec score: 0.952401 +********** end print ocr result ********** +5 type: title, region: [444,705,564,724], score: 0.785429, res: count of ocr result is : 1 +********** print ocr result ********** +0 det boxes: 
[[6,2],[113,2],[113,14],[6,14]] rec text: 5.Conclusion rec score: 0.856903 +********** end print ocr result ********** +6 type: table, region: [14,360,402,711], score: 0.963643, res:
<html><body><table><thead><tr><td>Methods</td><td>Ext</td><td>R</td><td>P</td><td>F</td><td>FPS</td></tr></thead><tbody><tr><td>TextSnake [18]</td><td>Syn</td><td>85.3</td><td>67.9</td><td>75.6</td><td></td></tr><tr><td>CSE [17]</td><td>MiLT</td><td>76.1</td><td>78.7</td><td>77.4</td><td>0.38</td></tr><tr><td>LOMO[40]</td><td>Syn</td><td>76.5</td><td>85.7</td><td>80.8</td><td>4.4</td></tr><tr><td>ATRR[35]</td><td>Sy-</td><td>80.2</td><td>80.1</td><td>80.1</td><td>-</td></tr><tr><td>SegLink++ [28]</td><td>Syn</td><td>79.8</td><td>82.8</td><td>81.3</td><td>-</td></tr><tr><td>TextField [37]</td><td>Syn</td><td>79.8</td><td>83.0</td><td>81.4</td><td>6.0</td></tr><tr><td>MSR[38]</td><td>Syn</td><td>79.0</td><td>84.1</td><td>81.5</td><td>4.3</td></tr><tr><td>PSENet-1s [33]</td><td>MLT</td><td>79.7</td><td>84.8</td><td>82.2</td><td>3.9</td></tr><tr><td>DB [12]</td><td>Syn</td><td>80.2</td><td>86.9</td><td>83.4</td><td>22.0</td></tr><tr><td>CRAFT [2]</td><td>Syn</td><td>81.1</td><td>86.0</td><td>83.5</td><td>-</td></tr><tr><td>TextDragon [5]</td><td>MLT+</td><td>82.8</td><td>84.5</td><td>83.6</td><td></td></tr><tr><td>PAN [34]</td><td>Syn</td><td>81.2</td><td>86.4</td><td>83.7</td><td>39.8</td></tr><tr><td>ContourNet [36]</td><td></td><td>84.1</td><td>83.7</td><td>83.9</td><td>4.5</td></tr><tr><td>DRRG [41]</td><td>MLT</td><td>83.02</td><td>85.93</td><td>84.45</td><td>-</td></tr><tr><td>TextPerception[23]</td><td>Syn</td><td>81.9</td><td>87.5</td><td>84.6</td><td></td></tr><tr><td>Ours</td><td>Syn</td><td>80.57</td><td>87.66</td><td>83.97</td><td>12.08</td></tr><tr><td>Ours</td><td></td><td>81.45</td><td>87.81</td><td>84.51</td><td>12.15</td></tr><tr><td>Ours</td><td>MLT</td><td>83.60</td><td>86.45</td><td>85.00</td><td>12.21</td></tr></tbody></table></body></html>
+The table visualized image saved in ./output//6_1.png +7 type: table, region: [462,359,820,657], score: 0.953917, res:
<html><body><table><thead><tr><td>Methods</td><td>R</td><td>P</td><td>F</td><td>FPS</td></tr></thead><tbody><tr><td>SegLink [26]</td><td>70.0</td><td>86.0</td><td>77.0</td><td>8.9</td></tr><tr><td>PixelLink [4]</td><td>73.2</td><td>83.0</td><td>77.8</td><td>-</td></tr><tr><td>TextSnake [18]</td><td>73.9</td><td>83.2</td><td>78.3</td><td>1.1</td></tr><tr><td>TextField [37]</td><td>75.9</td><td>87.4</td><td>81.3</td><td>5.2</td></tr><tr><td>MSR[38]</td><td>76.7</td><td>87.4</td><td>81.7</td><td>-</td></tr><tr><td>FTSN[3]</td><td>77.1</td><td>87.6</td><td>82.0</td><td>:</td></tr><tr><td>LSE[30]</td><td>81.7</td><td>84.2</td><td>82.9</td><td></td></tr><tr><td>CRAFT [2]</td><td>78.2</td><td>88.2</td><td>82.9</td><td>8.6</td></tr><tr><td>MCN [16]</td><td>79</td><td>88</td><td>83</td><td>-</td></tr><tr><td>ATRR[35]</td><td>82.1</td><td>85.2</td><td>83.6</td><td>-</td></tr><tr><td>PAN [34]</td><td>83.8</td><td>84.4</td><td>84.1</td><td>30.2</td></tr><tr><td>DB[12]</td><td>79.2</td><td>91.5</td><td>84.9</td><td>32.0</td></tr><tr><td>DRRG [41]</td><td>82.30</td><td>88.05</td><td>85.08</td><td>-</td></tr><tr><td>Ours (SynText)</td><td>80.68</td><td>85.40</td><td>82.97</td><td>12.68</td></tr><tr><td>Ours (MLT-17)</td><td>84.54</td><td>86.62</td><td>85.57</td><td>12.31</td></tr></tbody></table></body></html>
+The table visualized image saved in ./output//7_1.png +8 type: figure, region: [14,3,836,310], score: 0.969443, res: count of ocr result is : 26 +********** print ocr result ********** +0 det boxes: [[506,14],[539,15],[539,22],[506,21]] rec text: E rec score: 0.318073 +... +25 det boxes: [[680,290],[759,288],[759,303],[680,305]] rec text: (d) CTW1500 rec score: 0.95911 +********** end print ocr result ********** +``` + +## 3. FAQ + + 1. 遇到报错 `unable to access 'https://github.com/LDOUBLEV/AutoLog.git/': gnutls_handshake() failed: The TLS connection was non-properly terminated.`, 将 `deploy/cpp_infer/external-cmake/auto-log.cmake` 中的github地址改为 地址即可。 diff --git a/docs/ppocr/infer_deploy/images/00057937.jpg b/docs/ppocr/infer_deploy/images/00057937.jpg new file mode 100644 index 0000000000..a35896e78d Binary files /dev/null and b/docs/ppocr/infer_deploy/images/00057937.jpg differ diff --git a/docs/ppocr/infer_deploy/images/1.jpg b/docs/ppocr/infer_deploy/images/1.jpg new file mode 100644 index 0000000000..48a89389ae Binary files /dev/null and b/docs/ppocr/infer_deploy/images/1.jpg differ diff --git a/docs/ppocr/infer_deploy/images/196874354-1b5eecb0-f273-403c-aa6c-4463bf6d78db.png b/docs/ppocr/infer_deploy/images/196874354-1b5eecb0-f273-403c-aa6c-4463bf6d78db.png new file mode 100644 index 0000000000..708871f28c Binary files /dev/null and b/docs/ppocr/infer_deploy/images/196874354-1b5eecb0-f273-403c-aa6c-4463bf6d78db.png differ diff --git a/docs/ppocr/infer_deploy/images/197918203-c7d46f8a-75d4-47f9-9687-405ee0d6727e.gif b/docs/ppocr/infer_deploy/images/197918203-c7d46f8a-75d4-47f9-9687-405ee0d6727e.gif new file mode 100644 index 0000000000..fdd910aa10 Binary files /dev/null and b/docs/ppocr/infer_deploy/images/197918203-c7d46f8a-75d4-47f9-9687-405ee0d6727e.gif differ diff --git a/docs/ppocr/infer_deploy/images/cmake_step2.jpg b/docs/ppocr/infer_deploy/images/cmake_step2.jpg new file mode 100644 index 0000000000..3438157b62 Binary files /dev/null and b/docs/ppocr/infer_deploy/images/cmake_step2.jpg differ diff --git a/docs/ppocr/infer_deploy/images/cmake_step3.jpg b/docs/ppocr/infer_deploy/images/cmake_step3.jpg new file mode 100644 index 0000000000..a9483d231f Binary files /dev/null and b/docs/ppocr/infer_deploy/images/cmake_step3.jpg differ diff --git a/docs/ppocr/infer_deploy/images/cmake_step4.jpg b/docs/ppocr/infer_deploy/images/cmake_step4.jpg new file mode 100644 index 0000000000..d6eb71ca8f Binary files /dev/null and b/docs/ppocr/infer_deploy/images/cmake_step4.jpg differ diff --git a/docs/ppocr/infer_deploy/images/deployment.jpg b/docs/ppocr/infer_deploy/images/deployment.jpg new file mode 100644 index 0000000000..c20897e5d3 Binary files /dev/null and b/docs/ppocr/infer_deploy/images/deployment.jpg differ diff --git a/docs/ppocr/infer_deploy/images/deployment_en.jpg b/docs/ppocr/infer_deploy/images/deployment_en.jpg new file mode 100644 index 0000000000..9920bcc386 Binary files /dev/null and b/docs/ppocr/infer_deploy/images/deployment_en.jpg differ diff --git a/docs/ppocr/infer_deploy/images/det_res_00018069.jpg b/docs/ppocr/infer_deploy/images/det_res_00018069.jpg new file mode 100644 index 0000000000..02f35de332 Binary files /dev/null and b/docs/ppocr/infer_deploy/images/det_res_00018069.jpg differ diff --git a/docs/ppocr/infer_deploy/images/det_res_french_0.jpg b/docs/ppocr/infer_deploy/images/det_res_french_0.jpg new file mode 100644 index 0000000000..5f0e4886df Binary files /dev/null and b/docs/ppocr/infer_deploy/images/det_res_french_0.jpg differ diff --git 
a/docs/ppocr/infer_deploy/images/det_res_img623_sast.jpg b/docs/ppocr/infer_deploy/images/det_res_img623_sast.jpg new file mode 100644 index 0000000000..af5e2d6e2c Binary files /dev/null and b/docs/ppocr/infer_deploy/images/det_res_img623_sast.jpg differ diff --git a/docs/ppocr/infer_deploy/images/det_res_img_10_db.jpg b/docs/ppocr/infer_deploy/images/det_res_img_10_db.jpg new file mode 100644 index 0000000000..6af89f6bb3 Binary files /dev/null and b/docs/ppocr/infer_deploy/images/det_res_img_10_db.jpg differ diff --git a/docs/ppocr/infer_deploy/images/det_res_img_10_east.jpg b/docs/ppocr/infer_deploy/images/det_res_img_10_east.jpg new file mode 100644 index 0000000000..908d077c3e Binary files /dev/null and b/docs/ppocr/infer_deploy/images/det_res_img_10_east.jpg differ diff --git a/docs/ppocr/infer_deploy/images/det_res_img_10_sast.jpg b/docs/ppocr/infer_deploy/images/det_res_img_10_sast.jpg new file mode 100644 index 0000000000..702f773e68 Binary files /dev/null and b/docs/ppocr/infer_deploy/images/det_res_img_10_sast.jpg differ diff --git a/docs/ppocr/infer_deploy/images/doc.jpg b/docs/ppocr/infer_deploy/images/doc.jpg new file mode 100644 index 0000000000..f57e62abe1 Binary files /dev/null and b/docs/ppocr/infer_deploy/images/doc.jpg differ diff --git a/docs/ppocr/infer_deploy/images/img_10_east_starnet.jpg b/docs/ppocr/infer_deploy/images/img_10_east_starnet.jpg new file mode 100644 index 0000000000..fd8c039230 Binary files /dev/null and b/docs/ppocr/infer_deploy/images/img_10_east_starnet.jpg differ diff --git a/docs/ppocr/infer_deploy/images/lite_demo.png b/docs/ppocr/infer_deploy/images/lite_demo.png new file mode 100644 index 0000000000..c9daf1b2e6 Binary files /dev/null and b/docs/ppocr/infer_deploy/images/lite_demo.png differ diff --git a/docs/ppocr/infer_deploy/images/lite_demo_onnx.png b/docs/ppocr/infer_deploy/images/lite_demo_onnx.png new file mode 100644 index 0000000000..b096f6eef5 Binary files /dev/null and b/docs/ppocr/infer_deploy/images/lite_demo_onnx.png differ diff --git a/docs/ppocr/infer_deploy/images/lite_demo_paddle.png b/docs/ppocr/infer_deploy/images/lite_demo_paddle.png new file mode 100644 index 0000000000..b096f6eef5 Binary files /dev/null and b/docs/ppocr/infer_deploy/images/lite_demo_paddle.png differ diff --git a/docs/ppocr/infer_deploy/images/pipeline_result.png b/docs/ppocr/infer_deploy/images/pipeline_result.png new file mode 100644 index 0000000000..ba7f24a2cc Binary files /dev/null and b/docs/ppocr/infer_deploy/images/pipeline_result.png differ diff --git a/docs/ppocr/infer_deploy/images/result.jpg b/docs/ppocr/infer_deploy/images/result.jpg new file mode 100644 index 0000000000..f15955bac2 Binary files /dev/null and b/docs/ppocr/infer_deploy/images/result.jpg differ diff --git a/docs/ppocr/infer_deploy/images/results.png b/docs/ppocr/infer_deploy/images/results.png new file mode 100644 index 0000000000..35322bf946 Binary files /dev/null and b/docs/ppocr/infer_deploy/images/results.png differ diff --git a/docs/ppocr/infer_deploy/images/start_server.png b/docs/ppocr/infer_deploy/images/start_server.png new file mode 100644 index 0000000000..60e19ccaed Binary files /dev/null and b/docs/ppocr/infer_deploy/images/start_server.png differ diff --git a/docs/ppocr/infer_deploy/images/system_res_00018069_v3.jpg b/docs/ppocr/infer_deploy/images/system_res_00018069_v3.jpg new file mode 100644 index 0000000000..51808ca556 Binary files /dev/null and b/docs/ppocr/infer_deploy/images/system_res_00018069_v3.jpg differ diff --git 
a/docs/ppocr/infer_deploy/images/vs_step1.jpg b/docs/ppocr/infer_deploy/images/vs_step1.jpg new file mode 100644 index 0000000000..69ab06c5ae Binary files /dev/null and b/docs/ppocr/infer_deploy/images/vs_step1.jpg differ diff --git a/docs/ppocr/infer_deploy/images/word_1.jpg b/docs/ppocr/infer_deploy/images/word_1.jpg new file mode 100644 index 0000000000..cb5451e15a Binary files /dev/null and b/docs/ppocr/infer_deploy/images/word_1.jpg differ diff --git a/docs/ppocr/infer_deploy/images/word_1.png b/docs/ppocr/infer_deploy/images/word_1.png new file mode 100644 index 0000000000..7b915fd6da Binary files /dev/null and b/docs/ppocr/infer_deploy/images/word_1.png differ diff --git a/docs/ppocr/infer_deploy/images/word_10.png b/docs/ppocr/infer_deploy/images/word_10.png new file mode 100644 index 0000000000..07370f757e Binary files /dev/null and b/docs/ppocr/infer_deploy/images/word_10.png differ diff --git a/docs/ppocr/infer_deploy/images/word_336.png b/docs/ppocr/infer_deploy/images/word_336.png new file mode 100644 index 0000000000..3bddd294ed Binary files /dev/null and b/docs/ppocr/infer_deploy/images/word_336.png differ diff --git a/docs/ppocr/infer_deploy/images/word_4.jpg b/docs/ppocr/infer_deploy/images/word_4.jpg new file mode 100644 index 0000000000..2c34cd33ea Binary files /dev/null and b/docs/ppocr/infer_deploy/images/word_4.jpg differ diff --git a/docs/ppocr/infer_deploy/index.en.md b/docs/ppocr/infer_deploy/index.en.md new file mode 100644 index 0000000000..75ddbc6f4f --- /dev/null +++ b/docs/ppocr/infer_deploy/index.en.md @@ -0,0 +1,24 @@ +--- +comments: true +typora-copy-images-to: images +--- + +# PP-OCR Deployment + +## Paddle Deployment Introduction + +Paddle provides a variety of deployment schemes to meet the deployment requirements of different scenarios. Please choose according to the actual situation: + +![img](./images/deployment_en.jpg) + +PP-OCR has supported muti deployment schemes. Click the link to get the specific tutorial. 
+ +- [Python Inference](./python_infer.en.md) +- [C++ Inference](./cpp_infer.en.md) +- [Serving (Python/C++)](./paddle_server.en.md) +- [Paddle-Lite (ARM CPU/OpenCL ARM GPU)](./lite.en.md) +- [Paddle.js](./paddle_js.en.md) +- [Jetson Inference](./Jetson_infer.en.md) +- [Paddle2ONNX](./paddle2onnx.en.md) + +If you need the deployment tutorial of academic algorithm models other than PP-OCR, please directly enter the main page of corresponding algorithms, [entrance](../../algorithm/overview.en.md)。 diff --git a/docs/ppocr/infer_deploy/index.md b/docs/ppocr/infer_deploy/index.md new file mode 100644 index 0000000000..0aeb1708c5 --- /dev/null +++ b/docs/ppocr/infer_deploy/index.md @@ -0,0 +1,26 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# PP-OCR 模型推理部署 + +## Paddle 推理部署方式简介 + +飞桨提供多种部署方案,以满足不同场景的部署需求,请根据实际情况进行选择: + +![img](./images/deployment.jpg) + +## PP-OCR 推理部署 + +PP-OCR模型已打通多种场景部署方案,点击链接获取具体的使用教程。 + +- [Python 推理](./python_infer.md) +- [C++ 推理](./cpp_infer.md) +- [Serving 服务化部署(Python/C++)](./paddle_server.md) +- [Paddle-Lite 端侧部署(ARM CPU/OpenCL ARM GPU)](./lite.md) +- [Paddle.js 部署](./paddle_js.md) +- [Jetson 推理](./Jetson_infer.md) +- [Paddle2ONNX 推理](./paddle2onnx.md) + +需要PP-OCR以外的学术算法模型的推理部署,请直接进入相应算法主页面,[入口](../../algorithm/overview.md)。 diff --git a/docs/ppocr/infer_deploy/lite.en.md b/docs/ppocr/infer_deploy/lite.en.md new file mode 100644 index 0000000000..289d2ad07e --- /dev/null +++ b/docs/ppocr/infer_deploy/lite.en.md @@ -0,0 +1,309 @@ +--- +comments: true +--- + +# Mobile deployment based on Paddle-Lite + +This tutorial will introduce how to use [Paddle-Lite](https://github.com/PaddlePaddle/Paddle-Lite) to deploy PaddleOCR ultra-lightweight Chinese and English detection models on mobile phones. + +Paddle-Lite is a lightweight inference engine for PaddlePaddle. It provides efficient inference capabilities for mobile phones and IoT, and extensively integrates cross-platform hardware to provide lightweight deployment solutions for end-side deployment issues. + +## 1. Preparation + +### Preparation environment + +- Computer (for Compiling Paddle Lite) +- Mobile phone (arm7 or arm8) + +### 1.1 Prepare the cross-compilation environment + +The cross-compilation environment is used to compile C++ demos of Paddle Lite and PaddleOCR. +Supports multiple development environments. + +For the compilation process of different development environments, please refer to the corresponding documents. + +1. [Docker](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_env.html#docker) +2. [Linux](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_env.html#linux) +3. [MAC OS](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_env.html#mac-os) + +### 1.2 Prepare Paddle-Lite library + +There are two ways to obtain the Paddle-Lite library: + +1. 
[Recommended] Download directly, the download link of the Paddle-Lite library is as follows: + + | Platform | Paddle-Lite library download link | + | --- | --- | + |Android|[arm7](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.10/inference_lite_lib.android.armv7.gcc.c++_shared.with_extra.with_cv.tar.gz) / [arm8](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.10/inference_lite_lib.android.armv8.gcc.c++_shared.with_extra.with_cv.tar.gz)| + |IOS|[arm7](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.10/inference_lite_lib.ios.armv7.with_cv.with_extra.with_log.tiny_publish.tar.gz) / [arm8](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.10/inference_lite_lib.ios.armv8.with_cv.with_extra.with_log.tiny_publish.tar.gz)| + + Note: 1. The above Paddle-Lite library is compiled from the Paddle-Lite 2.10 branch. For more information about Paddle-Lite 2.10, please refer to [link](https://github.com/PaddlePaddle/Paddle-Lite/releases/tag/v2.10). + + **Note: It is recommended to use paddlelite>=2.10 version of the prediction library, other prediction library versions [download link](https://github.com/PaddlePaddle/Paddle-Lite/tags)** + +2. Compile Paddle-Lite to get the prediction library. The compilation method of Paddle-Lite is as follows: + + ```bash linenums="1" + git clone https://github.com/PaddlePaddle/Paddle-Lite.git + cd Paddle-Lite + # Switch to Paddle-Lite release/v2.10 stable branch + git checkout release/v2.10 + ./lite/tools/build_android.sh --arch=armv8 --with_cv=ON --with_extra=ON + ``` + +Note: When compiling Paddle-Lite to obtain the Paddle-Lite library, you need to turn on the two options `--with_cv=ON --with_extra=ON`, `--arch` means the `arm` version, here is designated as armv8, + +More compilation commands refer to the introduction [link](https://paddle-lite.readthedocs.io/zh/release-v2.10_a/source_compile/linux_x86_compile_android.html) 。 + +After directly downloading the Paddle-Lite library and decompressing it, you can get the `inference_lite_lib.android.armv8/` folder, and the Paddle-Lite library obtained by compiling Paddle-Lite is located +`Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/` folder. + +The structure of the prediction library is as follows: + +```text linenums="1" +inference_lite_lib.android.armv8/ +|-- cxx C++ prebuild library +| |-- include C++ +| | |-- paddle_api.h +| | |-- paddle_image_preprocess.h +| | |-- paddle_lite_factory_helper.h +| | |-- paddle_place.h +| | |-- paddle_use_kernels.h +| | |-- paddle_use_ops.h +| | `-- paddle_use_passes.h +| `-- lib C++ library +| |-- libpaddle_api_light_bundled.a C++ static library +| `-- libpaddle_light_api_shared.so C++ dynamic library +|-- java Java library +| |-- jar +| | `-- PaddlePredictor.jar +| |-- so +| | `-- libpaddle_lite_jni.so +| `-- src +|-- demo C++ and Java demo +| |-- cxx C++ demo +| `-- java Java demo +``` + +## 2 Run + +### 2.1 Inference Model Optimization + +Paddle Lite provides a variety of strategies to automatically optimize the original training model, including quantization, sub-graph fusion, hybrid scheduling, Kernel optimization and so on. In order to make the optimization process more convenient and easy to use, Paddle Lite provide opt tools to automatically complete the optimization steps and output a lightweight, optimal executable model. + +If you have prepared the model file ending in .nb, you can skip this step. 
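+
+For example, a minimal sketch of the direct-download route (the `.nb` links are taken from the table that follows), so that the conversion step can be skipped:
+
+```bash linenums="1"
+# Sketch only: fetch pre-optimized PP-OCRv3 .nb models (detection / recognition / direction classifier)
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.nb
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.nb
+wget https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb
+```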
+ +The following table also provides a series of models that can be deployed on mobile phones to recognize Chinese. You can directly download the optimized model. + +|Version|Introduction|Model size|Detection model|Text Direction model|Recognition model|Paddle-Lite branch| +|---|---|---|---|---|---|---| +|PP-OCRv3|extra-lightweight chinese OCR optimized model|16.2M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.nb)|v2.10| +|PP-OCRv3(slim)|extra-lightweight chinese OCR optimized model|5.9M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb)|v2.10| +|PP-OCRv2|extra-lightweight chinese OCR optimized model|11M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_infer_opt.nb)|v2.10| +|PP-OCRv2(slim)|extra-lightweight chinese OCR optimized model|4.6M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_slim_opt.nb)|v2.10| + +If you directly use the model in the above table for deployment, you can skip the following steps and directly read [Section 2.2](#2.2-Run-optimized-model-on-Phone). + +If the model to be deployed is not in the above table, you need to follow the steps below to obtain the optimized model. + +Step 1: Refer to [document](https://www.paddlepaddle.org.cn/lite/v2.10/user_guides/opt/opt_python.html) to install paddlelite, which is used to convert paddle inference model to paddlelite required for running nb model + +```bash linenums="1" +pip install paddlelite==2.10 # The paddlelite version should be the same as the prediction library version +``` + +After installation, the following commands can view the help information + +```bash linenums="1" +paddle_lite_opt +``` + +Introduction to paddle_lite_opt parameters: + +|Options|Description| +|---|---| +|--model_dir|The path of the PaddlePaddle model to be optimized (non-combined form)| +|--model_file|The network structure file path of the PaddlePaddle model (combined form) to be optimized| +|--param_file|The weight file path of the PaddlePaddle model (combined form) to be optimized| +|--optimize_out_type|Output model type, currently supports two types: protobuf and naive_buffer, among which naive_buffer is a more lightweight serialization/deserialization implementation. If you need to perform model prediction on the mobile side, please set this option to naive_buffer. The default is protobuf| +|--optimize_out|The output path of the optimized model| +|--valid_targets|The executable backend of the model, the default is arm. Currently it supports x86, arm, opencl, npu, xpu, multiple backends can be specified at the same time (separated by spaces), and Model Optimize Tool will automatically select the best method. 
If you need to support Huawei NPU (DaVinci architecture NPU equipped with Kirin 810/990 Soc), it should be set to npu, arm| +|--record_tailoring_info|When using the function of cutting library files according to the model, set this option to true to record the kernel and OP information contained in the optimized model. The default is false| + +`--model_dir` is suitable for the non-combined mode of the model to be optimized, and the inference model of PaddleOCR is the combined mode, that is, the model structure and model parameters are stored in a single file. + +Step 2: Use paddle_lite_opt to convert the inference model to the mobile model format. + +The following takes the ultra-lightweight Chinese model of PaddleOCR as an example to introduce the use of the compiled opt file to complete the conversion of the inference model to the Paddle-Lite optimized model + +```bash linenums="1" +# 【[Recommendation] Download the Chinese and English inference model of PP-OCRv3 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar && tar xf ch_PP-OCRv3_det_slim_infer.tar +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar && tar xf ch_PP-OCRv2_rec_slim_quant_infer.tar +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_cls_slim_infer.tar && tar xf ch_ppocr_mobile_v2.0_cls_slim_infer.tar +# Convert detection model +paddle_lite_opt --model_file=./ch_PP-OCRv3_det_slim_infer/inference.pdmodel --param_file=./ch_PP-OCRv3_det_slim_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv3_det_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer +# Convert recognition model +paddle_lite_opt --model_file=./ch_PP-OCRv3_rec_slim_infer/inference.pdmodel --param_file=./ch_PP-OCRv3_rec_slim_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv3_rec_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer +# Convert angle classifier model +paddle_lite_opt --model_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdmodel --param_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdiparams --optimize_out=./ch_ppocr_mobile_v2.0_cls_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer +``` + +After the conversion is successful, there will be more files ending with `.nb` in the inference model directory, which is the successfully converted model file. + +### 2.2 Run optimized model on Phone + +Some preparatory work is required first. + +1. Prepare an Android phone with arm8. If the compiled prediction library and opt file are armv7, you need an arm7 phone and modify ARM_ABI = arm7 in the Makefile. + +2. Make sure the phone is connected to the computer, open the USB debugging option of the phone, and select the file transfer mode. +3. Install the adb tool on the computer. + + 3.1. Install ADB for MAC: + + ```bash linenums="1" + brew cask install android-platform-tools + ``` + + 3.2. Install ADB for Linux + + ```bash linenums="1" + sudo apt update + sudo apt install -y wget adb + ``` + + 3.3. Install ADB for windows + + To install on win, you need to go to Google's Android platform to download the adb package for installation:[link](https://developer.android.com/studio) + + Verify whether adb is installed successfully + + ```bash linenums="1" + adb devices + ``` + + If there is device output, it means the installation is successful。 + + ```bash linenums="1" + List of devices attached + 744be294 device + ``` + +4. Prepare optimized models, prediction library files, test images and dictionary files used. 
+ + ```bash linenums="1" + git clone https://github.com/PaddlePaddle/PaddleOCR.git + cd PaddleOCR/deploy/lite/ + # run prepare.sh + sh prepare.sh /{lite prediction library path}/inference_lite_lib.android.armv8 + + # + cd /{lite prediction library path}/inference_lite_lib.android.armv8/ + cd demo/cxx/ocr/ + # copy paddle-lite C++ .so file to debug/ directory + cp ../../../cxx/lib/libpaddle_light_api_shared.so ./debug/ + + cd inference_lite_lib.android.armv8/demo/cxx/ocr/ + cp ../../../cxx/lib/libpaddle_light_api_shared.so ./debug/ + ``` + +Prepare the test image, taking PaddleOCR/doc/imgs/11.jpg as an example, copy the image file to the demo/cxx/ocr/debug/ folder. Prepare the model files optimized by the lite opt tool, ch_PP-OCRv3_det_slim_opt.nb , ch_PP-OCRv3_rec_slim_opt.nb , and place them under the demo/cxx/ocr/debug/ folder. + +The structure of the OCR demo is as follows after the above command is executed: + +```text linenums="1" +demo/cxx/ocr/ +|-- debug/ +| |--ch_PP-OCRv3_det_slim_opt.nb Detection model +| |--ch_PP-OCRv3_rec_slim_opt.nb Recognition model +| |--ch_ppocr_mobile_v2.0_cls_slim_opt.nb Text direction classification model +| |--11.jpg Image for OCR +| |--ppocr_keys_v1.txt Dictionary file +| |--libpaddle_light_api_shared.so C++ .so file +| |--config.txt Config file +|-- config.txt Config file +|-- cls_process.cc Pre-processing and post-processing files for the angle classifier +|-- cls_process.h +|-- crnn_process.cc Pre-processing and post-processing files for the CRNN model +|-- crnn_process.h +|-- db_post_process.cc Pre-processing and post-processing files for the DB model +|-- db_post_process.h +|-- Makefile +|-- ocr_db_crnn.cc C++ main code +``` + +**Note**: + +1. `ppocr_keys_v1.txt` is a Chinese dictionary file. If the nb model is used for English recognition or other language recognition, dictionary file should be replaced with a dictionary of the corresponding language. PaddleOCR provides a variety of dictionaries under ppocr/utils/, including: + + ```python linenums="1" + dict/french_dict.txt # french + dict/german_dict.txt # german + ic15_dict.txt # english + dict/japan_dict.txt # japan + dict/korean_dict.txt # korean + ppocr_keys_v1.txt # chinese + ``` + +2. `config.txt` of the detector and classifier, as shown below: + + ```python linenums="1" + max_side_len 960 # Limit the maximum image height and width to 960 + det_db_thresh 0.3 # Used to filter the binarized image of DB prediction, setting 0.-0.3 has no obvious effect on the result + det_db_box_thresh 0.5 # DDB post-processing filter box threshold, if there is a missing box detected, it can be reduced as appropriate + det_db_unclip_ratio 1.6 # Indicates the compactness of the text box, the smaller the value, the closer the text box to the text + use_direction_classify 0 # Whether to use the direction classifier, 0 means not to use, 1 means to use + rec_image_height 48 # The height of the input image of the recognition model, the PP-OCRv3 model needs to be set to 48, and the PP-OCRv2 model needs to be set to 32 + ``` + +3. Run Model on phone + + After the above steps are completed, you can use adb to push the file to the phone to run, the steps are as follows: + + ```bash linenums="1" + # Execute the compilation and get the executable file ocr_db_crnn + # The first execution of this command will download dependent libraries such as opencv. 
After the download is complete, you need to execute it again + make -j + # Move the compiled executable file to the debug folder + mv ocr_db_crnn ./debug/ + # Push the debug folder to the phone + adb push debug /data/local/tmp/ + adb shell + cd /data/local/tmp/debug + export LD_LIBRARY_PATH=${PWD}:$LD_LIBRARY_PATH + # The use of ocr_db_crnn is: + # ./ocr_db_crnn Mode Detection model file Orientation classifier model file Recognition model file Hardware Precision Threads Batchsize Test image path Dictionary file path + ./ocr_db_crnn system ch_PP-OCRv3_det_slim_opt.nb ch_PP-OCRv3_rec_slim_opt.nb ch_ppocr_mobile_v2.0_cls_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt ppocr_keys_v1.txt True + # precision can be INT8 for quantitative model or FP32 for normal model. + + # Only using detection model + ./ocr_db_crnn det ch_PP-OCRv3_det_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt + + # Only using recognition model + ./ocr_db_crnn rec ch_PP-OCRv3_rec_slim_opt.nb arm8 INT8 10 1 word_1.jpg ppocr_keys_v1.txt config.txt + ``` + +If you modify the code, you need to recompile and push to the phone. + +The outputs are as follows: + +![img](./images/lite_demo.png) + +## FAQ + +Q1: What if I want to change the model, do I need to run it again according to the process? + +A1: If you have performed the above steps, you only need to replace the .nb model file to complete the model replacement. + +Q2: How to test with another picture? + +A2: Replace the .jpg test image under ./debug with the image you want to test, and run adb push to push new image to the phone. + +Q3: How to package it into the mobile APP? + +A3: This demo aims to provide the core algorithm part that can run OCR on mobile phones. Further, PaddleOCR/deploy/android_demo is an example of encapsulating this demo into a mobile app for reference. + +Q4: When running the demo, an error is reported `Error: This model is not supported, because kernel for 'io_copy' is not supported by Paddle-Lite.` + +A4: The problem is that the installed paddlelite version does not match the downloaded prediction library version. Make sure that the paddleliteopt tool matches your prediction library version, and try to switch to the nb model again. diff --git a/docs/ppocr/infer_deploy/lite.md b/docs/ppocr/infer_deploy/lite.md new file mode 100644 index 0000000000..38fc927f28 --- /dev/null +++ b/docs/ppocr/infer_deploy/lite.md @@ -0,0 +1,308 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# 端侧部署 + +本教程将介绍基于[Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite) 在移动端部署PaddleOCR超轻量中文检测、识别模型的详细步骤。 + +Paddle Lite是飞桨轻量化推理引擎,为手机、IOT端提供高效推理能力,并广泛整合跨平台硬件,为端侧部署及应用落地问题提供轻量化的部署方案。 + +## 1. 准备环境 + +### 运行准备 + +- 电脑(编译Paddle Lite) +- 安卓手机(armv7或armv8) + +### 1.1 准备交叉编译环境 + +交叉编译环境用于编译 Paddle Lite 和 PaddleOCR 的C++ demo。 +支持多种开发环境,不同开发环境的编译流程请参考对应文档。 + +1. [Docker](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_env.html#docker) +2. [Linux](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_env.html#linux) +3. [MAC OS](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_env.html#mac-os) + +### 1.2 准备预测库 + +预测库有两种获取方式: + +1. 
[推荐]直接下载,预测库下载链接如下: + + | 平台 | 预测库下载链接 | + | ---- | ---- | + | Android | [arm7](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.10/inference_lite_lib.android.armv7.gcc.c++_shared.with_extra.with_cv.tar.gz) / [arm8](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.10/inference_lite_lib.android.armv8.gcc.c++_shared.with_extra.with_cv.tar.gz) | + | IOS | [arm7](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.10/inference_lite_lib.ios.armv7.with_cv.with_extra.with_log.tiny_publish.tar.gz) / [arm8](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.10/inference_lite_lib.ios.armv8.with_cv.with_extra.with_log.tiny_publish.tar.gz) | + + 注:1. 上述预测库为PaddleLite 2.10分支编译得到,有关PaddleLite 2.10 详细信息可参考 [链接](https://github.com/PaddlePaddle/Paddle-Lite/releases/tag/v2.10) 。 + + **注:建议使用paddlelite>=2.10版本的预测库,其他预测库版本[下载链接](https://github.com/PaddlePaddle/Paddle-Lite/tags)** + +2. 编译Paddle-Lite得到预测库,Paddle-Lite的编译方式如下: + + ```bash linenums="1" + git clone https://github.com/PaddlePaddle/Paddle-Lite.git + cd Paddle-Lite + # 切换到Paddle-Lite release/v2.10 稳定分支 + git checkout release/v2.10 + ./lite/tools/build_android.sh --arch=armv8 --with_cv=ON --with_extra=ON + ``` + +注意:编译Paddle-Lite获得预测库时,需要打开`--with_cv=ON --with_extra=ON`两个选项,`--arch`表示`arm`版本,这里指定为armv8, +更多编译命令 +介绍请参考 [链接](https://paddle-lite.readthedocs.io/zh/release-v2.10_a/source_compile/linux_x86_compile_android.html) 。 + +直接下载预测库并解压后,可以得到`inference_lite_lib.android.armv8/`文件夹,通过编译Paddle-Lite得到的预测库位于 +`Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/`文件夹下。 +预测库的文件目录如下: + +```text linenums="1" +inference_lite_lib.android.armv8/ +|-- cxx C++ 预测库和头文件 +| |-- include C++ 头文件 +| | |-- paddle_api.h +| | |-- paddle_image_preprocess.h +| | |-- paddle_lite_factory_helper.h +| | |-- paddle_place.h +| | |-- paddle_use_kernels.h +| | |-- paddle_use_ops.h +| | `-- paddle_use_passes.h +| `-- lib C++预测库 +| |-- libpaddle_api_light_bundled.a C++静态库 +| `-- libpaddle_light_api_shared.so C++动态库 +|-- java Java预测库 +| |-- jar +| | `-- PaddlePredictor.jar +| |-- so +| | `-- libpaddle_lite_jni.so +| `-- src +|-- demo C++和Java示例代码 +| |-- cxx C++ 预测库demo +| `-- java Java 预测库demo +``` + +## 2 开始运行 + +### 2.1 模型优化 + +Paddle-Lite 提供了多种策略来自动优化原始的模型,其中包括量化、子图融合、混合调度、Kernel优选等方法,使用Paddle-lite的opt工具可以自动 +对inference模型进行优化,优化后的模型更轻量,模型运行速度更快。 + +如果已经准备好了 `.nb` 结尾的模型文件,可以跳过此步骤。 + +下述表格中也提供了一系列中文移动端模型: + +| 模型版本 | 模型简介 | 模型大小 | 检测模型 | 文本方向分类模型 | 识别模型 | Paddle-Lite版本 | +| -------------- | ----------------------------- | -------- | ------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------ | --------------- | +| PP-OCRv3 | 蒸馏版超轻量中文OCR移动端模型 | 16.2M | [下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.nb) | [下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb) | [下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.nb) | v2.10 | +| PP-OCRv3(slim) | 蒸馏版超轻量中文OCR移动端模型 | 5.9M | [下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb) | [下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb) | [下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb) | v2.10 | +| PP-OCRv2 | 蒸馏版超轻量中文OCR移动端模型 | 11M | 
[下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_infer_opt.nb) | [下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb) | [下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_infer_opt.nb) | v2.10 | +| PP-OCRv2(slim) | 蒸馏版超轻量中文OCR移动端模型 | 4.6M | [下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_slim_opt.nb) | [下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb) | [下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_slim_opt.nb) | v2.10 | + +如果直接使用上述表格中的模型进行部署,可略过下述步骤,直接阅读 [2.2节](#2.2与手机联调)。 + +如果要部署的模型不在上述表格中,则需要按照如下步骤获得优化后的模型。 + +步骤1:参考[文档](https://www.paddlepaddle.org.cn/lite/v2.10/user_guides/opt/opt_python.html)安装paddlelite,用于转换paddle inference model为paddlelite运行所需的nb模型 + +```bash linenums="1" +pip install paddlelite==2.10 # paddlelite版本要与预测库版本一致 +``` + +安装完后,如下指令可以查看帮助信息 + +```bash linenums="1" +paddle_lite_opt +``` + +paddle_lite_opt 参数介绍: + +| 选项 | 说明 | +| ---| --- | +| --model_dir | 待优化的PaddlePaddle模型(非combined形式)的路径 | +| --model_file | 待优化的PaddlePaddle模型(combined形式)的网络结构文件路径 | +| --param_file | 待优化的PaddlePaddle模型(combined形式)的权重文件路径 | +| --optimize_out_type | 输出模型类型,目前支持两种类型:protobuf和naive_buffer,其中naive_buffer是一种更轻量级的序列化/反序列化实现。若您需要在mobile端执行模型预测,请将此选项设置为naive_buffer。默认为protobuf | +| --optimize_out | 优化模型的输出路径 | +| --valid_targets | 指定模型可执行的backend,默认为arm。目前可支持x86、arm、opencl、npu、xpu,可以同时指定多个backend(以空格分隔),Model Optimize Tool将会自动选择最佳方式。如果需要支持华为NPU(Kirin 810/990 Soc搭载的达芬奇架构NPU),应当设置为npu, arm | +| --record_tailoring_info | 当使用 根据模型裁剪库文件 功能时,则设置该选项为true,以记录优化后模型含有的kernel和OP信息,默认为false | + +`--model_dir`适用于待优化的模型是非combined方式,PaddleOCR的inference模型是combined方式,即模型结构和模型参数使用单独一个文件存储。 + +步骤2:使用paddle_lite_opt将inference模型转换成移动端模型格式。 + +下面以PaddleOCR的超轻量中文模型为例,介绍使用编译好的opt文件完成inference模型到Paddle-Lite优化模型的转换。 + +```bash linenums="1" +# 【推荐】 下载 PP-OCRv3版本的中英文 inference模型 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar && tar xf ch_PP-OCRv3_det_slim_infer.tar +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar && tar xf ch_PP-OCRv2_rec_slim_quant_infer.tar +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_cls_slim_infer.tar && tar xf ch_ppocr_mobile_v2.0_cls_slim_infer.tar +# 转换检测模型 +paddle_lite_opt --model_file=./ch_PP-OCRv3_det_slim_infer/inference.pdmodel --param_file=./ch_PP-OCRv3_det_slim_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv3_det_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer +# 转换识别模型 +paddle_lite_opt --model_file=./ch_PP-OCRv3_rec_slim_infer/inference.pdmodel --param_file=./ch_PP-OCRv3_rec_slim_infer/inference.pdiparams --optimize_out=./ch_PP-OCRv3_rec_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer +# 转换方向分类器模型 +paddle_lite_opt --model_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdmodel --param_file=./ch_ppocr_mobile_v2.0_cls_slim_infer/inference.pdiparams --optimize_out=./ch_ppocr_mobile_v2.0_cls_slim_opt --valid_targets=arm --optimize_out_type=naive_buffer +``` + +转换成功后,inference模型目录下会多出`.nb`结尾的文件,即是转换成功的模型文件。 + +注意:使用paddle-lite部署时,需要使用opt工具优化后的模型。 opt工具的输入模型是paddle保存的inference模型 + +### 2.2 与手机联调 + +首先需要进行一些准备工作: + +1. 准备一台arm8的安卓手机,如果编译的预测库和opt文件是armv7,则需要arm7的手机,并修改Makefile中`ARM_ABI = arm7`。 + +2. 打开手机的USB调试选项,选择文件传输模式,连接电脑。 + +3. 电脑上安装adb工具,用于调试。 adb安装方式如下: + + 3.1. MAC电脑安装ADB: + + ```bash linenums="1" + brew cask install android-platform-tools + ``` + + 3.2. 
Linux安装ADB + + ```bash linenums="1" + sudo apt update + sudo apt install -y wget adb + ``` + + 3.3. Window安装ADB + win上安装需要去谷歌的安卓平台下载adb软件包进行安装:[链接](https://developer.android.com/studio) + + 打开终端,手机连接电脑,在终端中输入 + + ```bash linenums="1" + adb devices + ``` + + 如果有device输出,则表示安装成功。 + + ```bash linenums="1" + List of devices attached + 744be294 device + ``` + +4. 准备优化后的模型、预测库文件、测试图像和使用的字典文件。 + + ```bash linenums="1" + git clone https://github.com/PaddlePaddle/PaddleOCR.git + cd PaddleOCR/deploy/lite/ + # 运行prepare.sh,准备预测库文件、测试图像和使用的字典文件,并放置在预测库中的demo/cxx/ocr文件夹下 + sh prepare.sh /{lite prediction library path}/inference_lite_lib.android.armv8 + + # 进入OCR demo的工作目录 + cd /{lite prediction library path}/inference_lite_lib.android.armv8/ + cd demo/cxx/ocr/ + # 将C++预测动态库so文件复制到debug文件夹中 + cp ../../../cxx/lib/libpaddle_light_api_shared.so ./debug/ + ``` + + 准备测试图像,以`PaddleOCR/doc/imgs/11.jpg`为例,将测试的图像复制到`demo/cxx/ocr/debug/`文件夹下。 + 准备lite opt工具优化后的模型文件,比如使用`ch_PP-OCRv3_det_slim_opt.ch_PP-OCRv3_rec_slim_rec.nb, ch_ppocr_mobile_v2.0_cls_slim_opt.nb`,模型文件放置在`demo/cxx/ocr/debug/`文件夹下。 + + 执行完成后,ocr文件夹下将有如下文件格式: + + ```text linenums="1" + demo/cxx/ocr/ + |-- debug/ + | |--ch_PP-OCRv3_det_slim_opt.nb 优化后的检测模型文件 + | |--ch_PP-OCRv3_rec_slim_opt.nb 优化后的识别模型文件 + | |--ch_ppocr_mobile_v2.0_cls_slim_opt.nb 优化后的文字方向分类器模型文件 + | |--11.jpg 待测试图像 + | |--ppocr_keys_v1.txt 中文字典文件 + | |--libpaddle_light_api_shared.so C++预测库文件 + | |--config.txt 超参数配置 + |-- config.txt 超参数配置 + |-- cls_process.cc 方向分类器的预处理和后处理文件 + |-- cls_process.h + |-- crnn_process.cc 识别模型CRNN的预处理和后处理文件 + |-- crnn_process.h + |-- db_post_process.cc 检测模型DB的后处理文件 + |-- db_post_process.h + |-- Makefile 编译文件 + |-- ocr_db_crnn.cc C++预测源文件 + ``` + +#### 注意 + +1. ppocr_keys_v1.txt是中文字典文件,如果使用的 nb 模型是英文数字或其他语言的模型,需要更换为对应语言的字典。PaddleOCR 在ppocr/utils/下存放了多种字典,包括: + + ```text linenums="1" + dict/french_dict.txt # 法语字典 + dict/german_dict.txt # 德语字典 + ic15_dict.txt # 英文字典 + dict/japan_dict.txt # 日语字典 + dict/korean_dict.txt # 韩语字典 + ppocr_keys_v1.txt # 中文字典 + ... + ``` + +2. `config.txt` 包含了检测器、分类器、识别器的超参数,如下: + + ```python linenums="1" + max_side_len 960 # 输入图像长宽大于960时,等比例缩放图像,使得图像最长边 为960 + det_db_thresh 0.3 # 用于过滤DB预测的二值化图像,设置为0.-0.3对结果影响不 明显 + det_db_box_thresh 0.5 # 检测器后处理过滤box的阈值,如果检测存在漏框情况,可酌 情减小 + det_db_unclip_ratio 1.6 # 表示文本框的紧致程度,越小则文本框更靠近文本 + use_direction_classify 0 # 是否使用方向分类器,0表示不使用,1表示使用 + rec_image_height 48 # 识别模型输入图像的高度,PP-OCRv3模型设置为48, PP-OCRv2模型需要设置为32 + ``` + +3. 启动调试 + +上述步骤完成后就可以使用adb将文件push到手机上运行,步骤如下: + +```bash linenums="1" +# 执行编译,得到可执行文件ocr_db_crnn, 第一次执行此命令会下载opencv等依赖库,下载完成后,需要再执行一次 +make -j + +# 将编译的可执行文件移动到debug文件夹中 +mv ocr_db_crnn ./debug/ +# 将debug文件夹push到手机上 +adb push debug /data/local/tmp/ +adb shell +cd /data/local/tmp/debug +export LD_LIBRARY_PATH=${PWD}:$LD_LIBRARY_PATH +# 开始使用,ocr_db_crnn可执行文件的使用方式为: +# ./ocr_db_crnn 预测模式 检测模型文件 方向分类器模型文件 识别模型文件 运行硬件 运行精度 线程数 batchsize 测试图像路径 参数配置路径 字典文件路径 是否使用benchmark参数 +./ocr_db_crnn system ch_PP-OCRv3_det_slim_opt.nb ch_PP-OCRv3_rec_slim_opt.nb ch_ppocr_mobile_v2.0_cls_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt ppocr_keys_v1.txt True + +# 仅使用文本检测模型,使用方式如下: +./ocr_db_crnn det ch_PP-OCRv3_det_slim_opt.nb arm8 INT8 10 1 ./11.jpg config.txt + +# 仅使用文本识别模型,使用方式如下: +./ocr_db_crnn rec ch_PP-OCRv3_rec_slim_opt.nb arm8 INT8 10 1 word_1.jpg ppocr_keys_v1.txt config.txt +``` + + 如果对代码做了修改,则需要重新编译并push到手机上。 + + 运行效果如下: + +![img](./images/lite_demo.png) + +## FAQ + +Q1:如果想更换模型怎么办,需要重新按照流程走一遍吗? + +A1:如果已经走通了上述步骤,更换模型只需要替换 .nb 模型文件即可,同时要注意更新字典 + +Q2:换一个图测试怎么做? 
+ +A2:替换debug下的.jpg测试图像为你想要测试的图像,adb push 到手机上即可 + +Q3:如何封装到手机APP中? + +A3:此demo旨在提供能在手机上运行OCR的核心算法部分,PaddleOCR/deploy/android_demo是将这个demo封装到手机app的示例,供参考 + +Q4:运行demo时遇到报错`Error: This model is not supported, because kernel for 'io_copy' is not supported by Paddle-Lite.` + +A4:问题是安装的paddlelite版本和下载的预测库版本不匹配,确保paddleliteopt工具和你的预测库版本匹配,重新转nb模型试试。 diff --git a/docs/ppocr/infer_deploy/paddle2onnx.en.md b/docs/ppocr/infer_deploy/paddle2onnx.en.md new file mode 100644 index 0000000000..91d3dae509 --- /dev/null +++ b/docs/ppocr/infer_deploy/paddle2onnx.en.md @@ -0,0 +1,169 @@ +--- +comments: true +--- + +# Paddle2ONNX model transformation and prediction + +This chapter describes how the PaddleOCR model is converted into an ONNX model and predicted based on the ONNXRuntime engine. + +## 1. Environment preparation + +Need to prepare PaddleOCR, Paddle2ONNX model conversion environment, and ONNXRuntime prediction environment + +### PaddleOCR + +Clone the PaddleOCR repository, use the release/2.6 branch, and install it. + +```bash linenums="1" +git clone -b release/2.6 https://github.com/PaddlePaddle/PaddleOCR.git +cd PaddleOCR && python3.7 setup.py install +``` + +### Paddle2ONNX + +Paddle2ONNX supports converting the PaddlePaddle model format to the ONNX model format. The operator currently supports exporting ONNX Opset 9~11 stably, and some Paddle operators support lower ONNX Opset conversion. +For more details, please refer to [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX/blob/develop/README_en.md) + +- install Paddle2ONNX + + ```bash linenums="1" + python3.7 -m pip install paddle2onnx + ``` + +- install ONNXRuntime + + ```bash linenums="1" + # It is recommended to install version 1.9.0, and the version number can be changed according to the environment + python3.7 -m pip install onnxruntime==1.9.0 + ``` + +## 2. Model conversion + +### Paddle model download + +There are two ways to obtain the Paddle model: Download the prediction model provided by PaddleOCR in [model_list](../model_list.en.md); + +Take the PP-OCRv3 detection, recognition, and classification model as an example: + +```bash linenums="1" +wget -nc -P ./inference https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar +cd ./inference && tar xf en_PP-OCRv3_det_infer.tar && cd .. + +wget -nc -P ./inference https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar +cd ./inference && tar xf en_PP-OCRv3_rec_infer.tar && cd .. + +wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar +cd ./inference && tar xf ch_ppocr_mobile_v2.0_cls_infer.tar && cd .. 
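+
+# (Optional check, sketch only) each extracted folder should contain inference.pdmodel and inference.pdiparams,
+# which the conversion commands in the next step point at
+ls ./inference/en_PP-OCRv3_det_infer/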
+``` + +### Convert model + +Convert Paddle inference model to ONNX model format using Paddle2ONNX: + +```bash linenums="1" +paddle2onnx --model_dir ./inference/en_PP-OCRv3_det_infer \ +--model_filename inference.pdmodel \ +--params_filename inference.pdiparams \ +--save_file ./inference/det_onnx/model.onnx \ +--opset_version 10 \ +--input_shape_dict="{'x':[-1,3,-1,-1]}" \ +--enable_onnx_checker True + +paddle2onnx --model_dir ./inference/en_PP-OCRv3_rec_infer \ +--model_filename inference.pdmodel \ +--params_filename inference.pdiparams \ +--save_file ./inference/rec_onnx/model.onnx \ +--opset_version 10 \ +--input_shape_dict="{'x':[-1,3,-1,-1]}" \ +--enable_onnx_checker True + +paddle2onnx --model_dir ./inference/ch_ppocr_mobile_v2.0_cls_infer \ +--model_filename ch_ppocr_mobile_v2.0_cls_infer/inference.pdmodel \ +--params_filename ch_ppocr_mobile_v2.0_cls_infer/inference.pdiparams \ +--save_file ./inferencecls_onnx/model.onnx \ +--opset_version 10 \ +--input_shape_dict="{'x':[-1,3,-1,-1]}" \ +--enable_onnx_checker True +``` + +After execution, the ONNX model will be saved in `./inference/det_onnx/`, `./inference/rec_onnx/`, `./inference/cls_onnx/` paths respectively + +- Note: For the OCR model, the conversion process must be in the form of dynamic shape, that is, add the option --input_shape_dict="{'x': [-1, 3, -1, -1]}", otherwise the prediction result may be the same as Predicting directly with Paddle is slightly different. + +In addition, the following models do not currently support conversion to ONNX models: NRTR, SAR, RARE, SRN. + +## 3. prediction + +Take the English OCR model as an example, use **ONNXRuntime** to predict and execute the following commands: + +```bash linenums="1" +python3.7 tools/infer/predict_system.py --use_gpu=False --use_onnx=True \ +--det_model_dir=./inference/det_onnx/model.onnx \ +--rec_model_dir=./inference/rec_onnx/model.onnx \ +--cls_model_dir=./inference/cls_onnx/model.onnx \ +--image_dir=doc/imgs_en/img_12.jpg \ +--rec_char_dict_path=ppocr/utils/en_dict.txt +``` + +Taking the English OCR model as an example, use **Paddle Inference** to predict and execute the following commands: + +```bash linenums="1" +python3.7 tools/infer/predict_system.py --use_gpu=False \ +--cls_model_dir=./inference/ch_ppocr_mobile_v2.0_cls_infer \ +--rec_model_dir=./inference/en_PP-OCRv3_rec_infer \ +--det_model_dir=./inference/en_PP-OCRv3_det_infer \ +--image_dir=doc/imgs_en/img_12.jpg \ +--rec_char_dict_path=ppocr/utils/en_dict.txt +``` + +After executing the command, the predicted identification information will be printed out in the terminal, and the visualization results will be saved under `./inference_results/`. + +ONNXRuntime result: + +![](./images/lite_demo_onnx.png) + +Paddle Inference result: + +![](./images/lite_demo_paddle.png) + +Using ONNXRuntime to predict, terminal output: + +```bash linenums="1" +[2022/10/10 12:06:28] ppocr DEBUG: dt_boxes num : 11, elapse : 0.3568880558013916 +[2022/10/10 12:06:31] ppocr DEBUG: rec_res num : 11, elapse : 2.6445000171661377 +[2022/10/10 12:06:31] ppocr DEBUG: 0 Predict time of doc/imgs_en/img_12.jpg: 3.021s +[2022/10/10 12:06:31] ppocr DEBUG: ACKNOWLEDGEMENTS, 0.997 +[2022/10/10 12:06:31] ppocr DEBUG: We would like to thank all the designers and, 0.976 +[2022/10/10 12:06:31] ppocr DEBUG: contributors who have been involved in the, 0.979 +[2022/10/10 12:06:31] ppocr DEBUG: production of this book; their contributions, 0.989 +[2022/10/10 12:06:31] ppocr DEBUG: have been indispensable to its creation. 
We, 0.956 +[2022/10/10 12:06:31] ppocr DEBUG: would also like to express our gratitude to all, 0.991 +[2022/10/10 12:06:31] ppocr DEBUG: the producers for their invaluable opinions, 0.978 +[2022/10/10 12:06:31] ppocr DEBUG: and assistance throughout this project. And to, 0.988 +[2022/10/10 12:06:31] ppocr DEBUG: the many others whose names are not credited, 0.958 +[2022/10/10 12:06:31] ppocr DEBUG: but have made specific input in this book, we, 0.970 +[2022/10/10 12:06:31] ppocr DEBUG: thank you for your continuous support., 0.998 +[2022/10/10 12:06:31] ppocr DEBUG: The visualized image saved in ./inference_results/img_12.jpg +[2022/10/10 12:06:31] ppocr INFO: The predict total time is 3.2482550144195557 +``` + +Using Paddle Inference to predict, terminal output: + +```bash linenums="1" +[2022/10/10 12:06:28] ppocr DEBUG: dt_boxes num : 11, elapse : 0.3568880558013916 +[2022/10/10 12:06:31] ppocr DEBUG: rec_res num : 11, elapse : 2.6445000171661377 +[2022/10/10 12:06:31] ppocr DEBUG: 0 Predict time of doc/imgs_en/img_12.jpg: 3.021s +[2022/10/10 12:06:31] ppocr DEBUG: ACKNOWLEDGEMENTS, 0.997 +[2022/10/10 12:06:31] ppocr DEBUG: We would like to thank all the designers and, 0.976 +[2022/10/10 12:06:31] ppocr DEBUG: contributors who have been involved in the, 0.979 +[2022/10/10 12:06:31] ppocr DEBUG: production of this book; their contributions, 0.989 +[2022/10/10 12:06:31] ppocr DEBUG: have been indispensable to its creation. We, 0.956 +[2022/10/10 12:06:31] ppocr DEBUG: would also like to express our gratitude to all, 0.991 +[2022/10/10 12:06:31] ppocr DEBUG: the producers for their invaluable opinions, 0.978 +[2022/10/10 12:06:31] ppocr DEBUG: and assistance throughout this project. And to, 0.988 +[2022/10/10 12:06:31] ppocr DEBUG: the many others whose names are not credited, 0.958 +[2022/10/10 12:06:31] ppocr DEBUG: but have made specific input in this book, we, 0.970 +[2022/10/10 12:06:31] ppocr DEBUG: thank you for your continuous support., 0.998 +[2022/10/10 12:06:31] ppocr DEBUG: The visualized image saved in ./inference_results/img_12.jpg +[2022/10/10 12:06:31] ppocr INFO: The predict total time is 3.2482550144195557 +``` diff --git a/docs/ppocr/infer_deploy/paddle2onnx.md b/docs/ppocr/infer_deploy/paddle2onnx.md new file mode 100644 index 0000000000..b9c3c628d6 --- /dev/null +++ b/docs/ppocr/infer_deploy/paddle2onnx.md @@ -0,0 +1,220 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# Paddle2ONNX模型转化与预测 + +本章节介绍 PaddleOCR 模型如何转化为 ONNX 模型,并基于 ONNXRuntime 引擎预测。 + +## 1. 环境准备 + +需要准备 PaddleOCR、Paddle2ONNX 模型转化环境,和 ONNXRuntime 预测环境 + +### PaddleOCR + +克隆PaddleOCR的仓库,使用 main 分支,并进行安装,由于 PaddleOCR 仓库比较大,git clone 速度比较慢,所以本教程已下载 + +```bash linenums="1" +git clone -b main https://github.com/PaddlePaddle/PaddleOCR.git +cd PaddleOCR && python3 -m pip install -e . +``` + +### Paddle2ONNX + +Paddle2ONNX 支持将 PaddlePaddle 模型格式转化到 ONNX 模型格式,算子目前稳定支持导出 ONNX Opset 9~18,部分Paddle算子支持更低的ONNX Opset转换。 +更多细节可参考 [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX/blob/develop/README_zh.md) + +- 安装 Paddle2ONNX + + ```bash linenums="1" + python3 -m pip install paddle2onnx + ``` + +- 安装 ONNXRuntime + + ```bash linenums="1" + python3 -m pip install onnxruntime + ``` + +## 2. 模型转换 + +### Paddle 模型下载 + +有两种方式获取Paddle静态图模型:在 [model_list](../model_list.md) 中下载PaddleOCR提供的预测模型; + +以 PP-OCRv3 中文检测、识别、分类模型为例: + +```bash linenums="1" +wget -nc -P ./inference https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar +cd ./inference && tar xf ch_PP-OCRv3_det_infer.tar && cd .. 
+ +wget -nc -P ./inference https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar +cd ./inference && tar xf ch_PP-OCRv3_rec_infer.tar && cd .. + +wget -nc -P ./inference https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar +cd ./inference && tar xf ch_ppocr_mobile_v2.0_cls_infer.tar && cd .. +``` + +### 模型转换 + +使用 Paddle2ONNX 将Paddle静态图模型转换为ONNX模型格式: + +```bash linenums="1" +paddle2onnx --model_dir ./inference/ch_PP-OCRv3_det_infer \ +--model_filename inference.pdmodel \ +--params_filename inference.pdiparams \ +--save_file ./inference/det_onnx/model.onnx \ +--opset_version 11 \ +--enable_onnx_checker True + +paddle2onnx --model_dir ./inference/ch_PP-OCRv3_rec_infer \ +--model_filename inference.pdmodel \ +--params_filename inference.pdiparams \ +--save_file ./inference/rec_onnx/model.onnx \ +--opset_version 11 \ +--enable_onnx_checker True + +paddle2onnx --model_dir ./inference/ch_ppocr_mobile_v2.0_cls_infer \ +--model_filename inference.pdmodel \ +--params_filename inference.pdiparams \ +--save_file ./inference/cls_onnx/model.onnx \ +--opset_version 11 \ +--enable_onnx_checker True +``` + +执行完毕后,ONNX 模型会被分别保存在 `./inference/det_onnx/`,`./inference/rec_onnx/`,`./inference/cls_onnx/`路径下 + +- 注意:对于OCR模型,转化过程中必须采用动态shape的形式,否则预测结果可能与直接使用Paddle预测有细微不同。 + 另外,以下几个模型暂不支持转换为 ONNX 模型: + NRTR、SAR、RARE、SRN + +- 注意:[当前Paddle2ONNX版本(v1.2.3)](https://github.com/PaddlePaddle/Paddle2ONNX/releases/tag/v1.2.3)现已默认支持动态shape,即 `float32[p2o.DynamicDimension.0,3,p2o.DynamicDimension.1,p2o.DynamicDimension.2]`,选项 `--input_shape_dict` 已废弃。如果有shape调整需求可使用如下命令进行Paddle模型输入shape调整。 + + ```bash linenums="1" + python3 -m paddle2onnx.optimize --input_model inference/det_onnx/model.onnx \ + --output_model inference/det_onnx/model.onnx \ + --input_shape_dict "{'x': [-1,3,-1,-1]}" + ``` + +## 3. 
推理预测 + +以中文OCR模型为例,使用 ONNXRuntime 预测可执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_system.py --use_gpu=False --use_onnx=True \ +--det_model_dir=./inference/det_onnx/model.onnx \ +--rec_model_dir=./inference/rec_onnx/model.onnx \ +--cls_model_dir=./inference/cls_onnx/model.onnx \ +--image_dir=./deploy/lite/imgs/lite_demo.png +``` + +以中文OCR模型为例,使用 Paddle Inference 预测可执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_system.py --use_gpu=False \ +--cls_model_dir=./inference/ch_ppocr_mobile_v2.0_cls_infer \ +--rec_model_dir=./inference/ch_PP-OCRv3_rec_infer \ +--det_model_dir=./inference/ch_PP-OCRv3_det_infer \ +--image_dir=./deploy/lite/imgs/lite_demo.png +``` + +执行命令后在终端会打印出预测的识别信息,并在 `./inference_results/` 下保存可视化结果。 + +ONNXRuntime 执行效果: + +![](./images/lite_demo_onnx.png) + +Paddle Inference 执行效果: + +![](./images/lite_demo_paddle.png) + +使用 ONNXRuntime 预测,终端输出: + +```bash linenums="1" +[2022/02/22 17:48:27] root DEBUG: dt_boxes num : 38, elapse : 0.043187856674194336 +[2022/02/22 17:48:27] root DEBUG: rec_res num : 38, elapse : 0.592170000076294 +[2022/02/22 17:48:27] root DEBUG: 0 Predict time of ./deploy/lite/imgs/lite_demo.png: 0.642s +[2022/02/22 17:48:27] root DEBUG: The, 0.984 +[2022/02/22 17:48:27] root DEBUG: visualized, 0.882 +[2022/02/22 17:48:27] root DEBUG: etect18片, 0.720 +[2022/02/22 17:48:27] root DEBUG: image saved in./vis.jpg, 0.947 +[2022/02/22 17:48:27] root DEBUG: 纯臻营养护发素0.993604, 0.996 +[2022/02/22 17:48:27] root DEBUG: 产品信息/参数, 0.922 +[2022/02/22 17:48:27] root DEBUG: 0.992728, 0.914 +[2022/02/22 17:48:27] root DEBUG: (45元/每公斤,100公斤起订), 0.926 +[2022/02/22 17:48:27] root DEBUG: 0.97417, 0.977 +[2022/02/22 17:48:27] root DEBUG: 每瓶22元,1000瓶起订)0.993976, 0.962 +[2022/02/22 17:48:27] root DEBUG: 【品牌】:代加工方式/0EMODM, 0.945 +[2022/02/22 17:48:27] root DEBUG: 0.985133, 0.980 +[2022/02/22 17:48:27] root DEBUG: 【品名】:纯臻营养护发素, 0.921 +[2022/02/22 17:48:27] root DEBUG: 0.995007, 0.883 +[2022/02/22 17:48:27] root DEBUG: 【产品编号】:YM-X-30110.96899, 0.955 +[2022/02/22 17:48:27] root DEBUG: 【净含量】:220ml, 0.943 +[2022/02/22 17:48:27] root DEBUG: Q.996577, 0.932 +[2022/02/22 17:48:27] root DEBUG: 【适用人群】:适合所有肤质, 0.913 +[2022/02/22 17:48:27] root DEBUG: 0.995842, 0.969 +[2022/02/22 17:48:27] root DEBUG: 【主要成分】:鲸蜡硬脂醇、燕麦B-葡聚, 0.883 +[2022/02/22 17:48:27] root DEBUG: 0.961928, 0.964 +[2022/02/22 17:48:27] root DEBUG: 10, 0.812 +[2022/02/22 17:48:27] root DEBUG: 糖、椰油酰胺丙基甜菜碱、泛醒, 0.866 +[2022/02/22 17:48:27] root DEBUG: 0.925898, 0.943 +[2022/02/22 17:48:27] root DEBUG: (成品包材), 0.974 +[2022/02/22 17:48:27] root DEBUG: 0.972573, 0.961 +[2022/02/22 17:48:27] root DEBUG: 【主要功能】:可紧致头发磷层,从而达到, 0.936 +[2022/02/22 17:48:27] root DEBUG: 0.994448, 0.952 +[2022/02/22 17:48:27] root DEBUG: 13, 0.998 +[2022/02/22 17:48:27] root DEBUG: 即时持久改善头发光泽的效果,给干燥的头, 0.994 +[2022/02/22 17:48:27] root DEBUG: 0.990198, 0.975 +[2022/02/22 17:48:27] root DEBUG: 14, 0.977 +[2022/02/22 17:48:27] root DEBUG: 发足够的滋养, 0.991 +[2022/02/22 17:48:27] root DEBUG: 0.997668, 0.918 +[2022/02/22 17:48:27] root DEBUG: 花费了0.457335秒, 0.901 +[2022/02/22 17:48:27] root DEBUG: The visualized image saved in ./inference_results/lite_demo.png +[2022/02/22 17:48:27] root INFO: The predict total time is 0.7003889083862305 +``` + +使用 Paddle Inference 预测,终端输出: + +```bash linenums="1" +[2022/02/22 17:47:25] root DEBUG: dt_boxes num : 38, elapse : 0.11791276931762695 +[2022/02/22 17:47:27] root DEBUG: rec_res num : 38, elapse : 2.6206860542297363 +[2022/02/22 17:47:27] root DEBUG: 0 Predict time of ./deploy/lite/imgs/lite_demo.png: 
2.746s +[2022/02/22 17:47:27] root DEBUG: The, 0.984 +[2022/02/22 17:47:27] root DEBUG: visualized, 0.882 +[2022/02/22 17:47:27] root DEBUG: etect18片, 0.720 +[2022/02/22 17:47:27] root DEBUG: image saved in./vis.jpg, 0.947 +[2022/02/22 17:47:27] root DEBUG: 纯臻营养护发素0.993604, 0.996 +[2022/02/22 17:47:27] root DEBUG: 产品信息/参数, 0.922 +[2022/02/22 17:47:27] root DEBUG: 0.992728, 0.914 +[2022/02/22 17:47:27] root DEBUG: (45元/每公斤,100公斤起订), 0.926 +[2022/02/22 17:47:27] root DEBUG: 0.97417, 0.977 +[2022/02/22 17:47:27] root DEBUG: 每瓶22元,1000瓶起订)0.993976, 0.962 +[2022/02/22 17:47:27] root DEBUG: 【品牌】:代加工方式/0EMODM, 0.945 +[2022/02/22 17:47:27] root DEBUG: 0.985133, 0.980 +[2022/02/22 17:47:27] root DEBUG: 【品名】:纯臻营养护发素, 0.921 +[2022/02/22 17:47:27] root DEBUG: 0.995007, 0.883 +[2022/02/22 17:47:27] root DEBUG: 【产品编号】:YM-X-30110.96899, 0.955 +[2022/02/22 17:47:27] root DEBUG: 【净含量】:220ml, 0.943 +[2022/02/22 17:47:27] root DEBUG: Q.996577, 0.932 +[2022/02/22 17:47:27] root DEBUG: 【适用人群】:适合所有肤质, 0.913 +[2022/02/22 17:47:27] root DEBUG: 0.995842, 0.969 +[2022/02/22 17:47:27] root DEBUG: 【主要成分】:鲸蜡硬脂醇、燕麦B-葡聚, 0.883 +[2022/02/22 17:47:27] root DEBUG: 0.961928, 0.964 +[2022/02/22 17:47:27] root DEBUG: 10, 0.812 +[2022/02/22 17:47:27] root DEBUG: 糖、椰油酰胺丙基甜菜碱、泛醒, 0.866 +[2022/02/22 17:47:27] root DEBUG: 0.925898, 0.943 +[2022/02/22 17:47:27] root DEBUG: (成品包材), 0.974 +[2022/02/22 17:47:27] root DEBUG: 0.972573, 0.961 +[2022/02/22 17:47:27] root DEBUG: 【主要功能】:可紧致头发磷层,从而达到, 0.936 +[2022/02/22 17:47:27] root DEBUG: 0.994448, 0.952 +[2022/02/22 17:47:27] root DEBUG: 13, 0.998 +[2022/02/22 17:47:27] root DEBUG: 即时持久改善头发光泽的效果,给干燥的头, 0.994 +[2022/02/22 17:47:27] root DEBUG: 0.990198, 0.975 +[2022/02/22 17:47:27] root DEBUG: 14, 0.977 +[2022/02/22 17:47:27] root DEBUG: 发足够的滋养, 0.991 +[2022/02/22 17:47:27] root DEBUG: 0.997668, 0.918 +[2022/02/22 17:47:27] root DEBUG: 花费了0.457335秒, 0.901 +[2022/02/22 17:47:27] root DEBUG: The visualized image saved in ./inference_results/lite_demo.png +[2022/02/22 17:47:27] root INFO: The predict total time is 2.8338775634765625 +``` diff --git a/docs/ppocr/infer_deploy/paddle_cloud.md b/docs/ppocr/infer_deploy/paddle_cloud.md new file mode 100644 index 0000000000..dada8bb722 --- /dev/null +++ b/docs/ppocr/infer_deploy/paddle_cloud.md @@ -0,0 +1,342 @@ +--- +comments: true +--- + +# 云上飞桨部署工具 + +[云上飞桨(PaddleCloud)](https://github.com/PaddlePaddle/PaddleCloud) 是面向飞桨框架及其模型套件的部署工具, +为用户提供了模型套件Docker化部署和Kubernetes集群部署两种方式,可以满足不同场景与环境的部署需求。 +本章节我们将使用PaddleCloud提供的OCR标准镜像以及云原生组件来训练和部署PP-OCRv3识别模型。 + +## 云上飞桨部署工具的优势 + +
+ +- **模型套件Docker镜像大礼包。** + + PaddleCloud为用户提供了飞桨模型套件Docker镜像大礼包,这些镜像中包含运行模型套件案例的所有依赖并能持续更新,支持异构硬件环境和常见CUDA版本、开箱即用。 + +- **具有丰富的云上飞桨组件。** + + 云上飞桨具有丰富的云原生功能组件,包括样本数据缓存组件、分布式训练组件、推理服务组件等,使用这些组件用户可以快速地在Kubernetes集群上进行训练和部署工作。 + +- **功能强大的自运维能力。** + + 云上飞桨组件基于Kubernetes的Operator机制提供了功能强大的自运维能力,如训练组件支持多种架构模式并具有分布式容错与弹性训练的能力,推理服务组件支持自动扩缩容与蓝绿发版等。 + +- **针对飞桨框架的定制优化。** + + 除了部署便捷与自运维的优势,PaddleCloud还针对飞桨框架进行了正对性优化,如通过缓存样本数据来加速云上飞桨分布式训练作业、基于飞桨框架和调度器的协同设计来优化集群GPU利用率等。 + +## 1. PP-OCRv3 Docker化部署 + +PaddleCloud基于 [Tekton](https://github.com/tektoncd/pipeline) 为OCR模型套件提供了镜像持续构建的能力,并支持CPU、GPU以及常见CUDA版本的镜像。 +您可以查看 [PaddleOCR 镜像仓库](https://hub.docker.com/repository/docker/paddlecloud/paddleocr) 来获取所有的镜像列表。 +同时我们也将PP-OCRv3识别模型的训练与推理实战案例放置到了AI Studio平台上,您可以点击 [PP-OCRv3识别训推一体项目实战](https://aistudio.baidu.com/aistudio/projectdetail/3916206?channelType=0&channel=0) 在平台上快速体验。 + +> **适用场景**:本地测试开发环境、单机部署环境。 + +### 1.1 安装Docker + +如果您所使用的机器上还没有安装 Docker,您可以参考 [Docker 官方文档](https://docs.docker.com/get-docker/) 来进行安装。 +如果您需要使用支持 GPU 版本的镜像,则还需安装好NVIDIA相关驱动和 [nvidia-docker](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#docker) 。 + +**注意**:如果您使用的是Windows系统,需要开启 [WSL2(Linux子系统功能)功能](https://docs.microsoft.com/en-us/windows/wsl/install)。 + +### 1.2 启动容器 + +**使用CPU版本的Docker镜像** + +```bash linenums="1" +# 这是加上参数 --shm-size=32g 是为了防止容器里内存不足 +docker run --name ppocr -v $PWD:/mnt -p 8888:8888 -it --shm-size=32g paddlecloud/paddleocr:2.5-cpu-efbb0a /bin/bash +``` + +**使用GPU版本的Docker镜像** + +```bash linenums="1" +docker run --name ppocr --runtime=nvidia -v $PWD:/mnt -p 8888:8888 -it --shm-size=32g paddlecloud/paddleocr:2.5-gpu-cuda10.2-cudnn7-efbb0a /bin/bash +``` + +进入容器内,则可进行 PP-OCRv3 模型的训练和部署工作。 + +### 1.3 准备训练数据 + +本教程以HierText数据集为例,HierText是第一个具有自然场景和文档中文本分层注释的数据集。 +该数据集包含从 Open Images 数据集中选择的 11639 张图像,提供高质量的单词 (~1.2M)、行和段落级别的注释。 +我们已经将数据集上传到百度云对象存储(BOS),您可以通过运行如下指令,完成数据集的下载和解压操作: + +```bash linenums="1" +# 下载数据集 +$ wget -P /mnt https://paddleflow-public.hkg.bcebos.com/ppocr/hiertext1.tar + +# 解压数据集 +$ tar xf /mnt/hiertext1.tar -C /mnt && mv /mnt/hiertext1 /mnt/hiertext +``` + +运行上述命令后,在 `/mnt` 目录下包含以下文件: + +``` +/mnt/hiertext + └─ train/ HierText训练集数据 + └─ validation/ HierText验证集数据 + └─ label_hiertext_train.txt HierText训练集的行标注 + └─ label_hiertext_val.txt HierText验证集的行标注 +``` + +### 1.4 修改配置文件 + +PP-OCRv3模型配置文件位于`/home/PaddleOCR/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml`,需要修改的配置如下: + +- 修改训练数据配置: + +```yaml linenums="1" +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt +``` + +修改为: + +```yaml linenums="1" +Train: + dataset: + name: SimpleDataSet + data_dir: /mnt/ + label_file_list: + - /mnt/hiertext/label_hiertext_train.txt +``` + +- 修改验证数据配置: + +```yaml linenums="1" +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt +``` + +修改为: + +```yaml linenums="1" +Eval: + dataset: + name: SimpleDataSet + data_dir: /mnt/ + label_file_list: + - /mnt/hiertext/label_hiertext_val.txt +``` + +### 1.5 启动训练 + +下载PP-OCRv3的蒸馏预训练模型并进行训练的方式如下 + +```bash linenums="1" +# 下载预训练模型到/home/PaddleOCR/pre_train文件夹下 +$ mkdir /home/PaddleOCR/pre_train + +$ wget -P /home/PaddleOCR/pre_train https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar + +$ tar xf /home/PaddleOCR/pre_train/ch_PP-OCRv3_det_distill_train.tar -C 
/home/PaddleOCR/pre_train/ +``` + +启动训练,训练模型默认保存在`output`目录下,加载PP-OCRv3检测预训练模型。 + +```bash linenums="1" +# 这里以 GPU 训练为例,使用 CPU 进行训练的话,需要指定参数 Global.use_gpu=false +python3 tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.save_model_dir=./output/ Global.pretrained_model=./pre_train/ch_PP-OCRv3_det_distill_train/best_accuracy +``` + +如果要使用多GPU分布式训练,请使用如下命令: + +```bash linenums="1" +# 启动训练,训练模型默认保存在output目录下,--gpus '0,1,2,3'表示使用0,1,2,3号GPU训练 +python3 -m paddle.distributed.launch --log_dir=./debug/ --gpus '0,1,2,3' tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.save_model_dir=./output/ Global.pretrained_model=./pre_train/ch_PP-OCRv3_det_distill_train/best_accuracy +``` + +### 1.6 模型评估 + +训练过程中保存的模型在output目录下,包含以下文件: + +``` +best_accuracy.states +best_accuracy.pdparams # 默认保存最优精度的模型参数 +best_accuracy.pdopt # 默认保存最优精度的优化器相关参数 +latest.states +latest.pdparams # 默认保存的最新模型参数 +latest.pdopt # 默认保存的最新模型的优化器相关参数 +``` + +其中,best_accuracy是保存的最优模型,可以直接使用该模型评估 + +```bash linenums="1" +# 进行模型评估 +cd /home/PaddleOCR/ + +python3 tools/eval.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.checkpoints=./output/best_accuracy +``` + +## 2. PP-OCRv3云端部署 + +PaddleCloud基于Kubernetes的Operator机制为您提供了多个功能强大的云原生组件,如样本数据缓存组件、分布式训练组件、 以及模型推理服务组件, +使用这些组件您可以快速地在云上进行分布式训练和模型服务化部署。更多关于PaddleCloud云原生组件的内容,请参考文档 [PaddleCloud架构概览](https://github.com/PaddlePaddle/PaddleCloud/blob/main/docs/zh_CN/paddlecloud-overview.md) 。 + +> **适用场景**:基于Kubernetes的多机部署环境。 + +### 2.1 安装云上飞桨组件 + +#### 环境要求 + +- [Kubernetes v1.16+](https://kubernetes.io/zh/) +- [kubectl](https://kubernetes.io/docs/tasks/tools/) +- [Helm](https://helm.sh/zh/docs/intro/install/) + +如果您没有Kubernetes环境,可以使用MicroK8S在本地搭建环境,更多详情请参考 [MicroK8S官方文档](https://microk8s.io/docs/getting-started)。 + +使用Helm一键安装所有组件和所有依赖 + +```bash linenums="1" +# 添加PaddleCloud Chart仓库 +$ helm repo add paddlecloud https://paddleflow-public.hkg.bcebos.com/charts +$ helm repo update + +# 安装云上飞桨组件 +$ helm install pdc paddlecloud/paddlecloud --set tags.all-dep=true --namespace paddlecloud --create-namespace + +# 检查所有云上飞桨组件是否成功启动,命名空间下的所有Pod都为Runing状态则安装成功。 +$ kubectl get pods -n paddlecloud +NAME READY STATUS RESTARTS AGE +pdc-hostpath-5b6bd6787d-bxvxg 1/1 Running 0 10h +juicefs-csi-node-pkldt 3/3 Running 0 10h +juicefs-csi-controller-0 3/3 Running 0 10h +pdc-paddlecloud-sampleset-767bdf6947-pb6zm 1/1 Running 0 10h +pdc-paddlecloud-paddlejob-7cc8b7bfc6-7gqnh 1/1 Running 0 10h +pdc-minio-7cc967669d-824q5 1/1 Running 0 10h +pdc-redis-master-0 1/1 Running 0 10h +``` + +更多安装参数请参考[PaddleCloud安装指南](https://github.com/PaddlePaddle/PaddleCloud/blob/main/docs/zh_CN/installation.md) + +### 2.2 云原生组件介绍 + +
+ +- **数据缓存组件。** 数据缓存组件使用JuiceFS作为缓存引擎,能够将远程样本数据缓存到训练集群本地,大幅加速云上飞桨分布式训练作业。 +- **分布式训练组件。** 分布式训练组件支持参数服务器(PS)与集合通信(Collective)两种架构模式,方便用户在云上快速运行飞桨分布式训练作业。 + +以下内容我们将使用这两个云原生组件来在Kubernetes集群中部署PP-OCRv3识别模型的训练作业。 + +### 2.3 准备hiertext数据集 + +使用数据缓存组件来准备数据集,编写SampleSet Yaml文件如下: + +```yaml linenums="1" +# hiertext.yaml +apiVersion: batch.paddlepaddle.org/v1alpha1 +kind: SampleSet +metadata: + name: hiertext + namespace: paddlecloud +spec: + partitions: 1 + source: + uri: bos://paddleflow-public.hkg.bcebos.com/ppocr/hiertext + secretRef: + name: none + secretRef: + name: data-center +``` + +然后在命令行中,使用kubectl执行如下命令。 + +```bash linenums="1" +# 创建hiertext数据集 +$ kubectl apply -f hiertext.yaml +sampleset.batch.paddlepaddle.org/hiertext created + +# 查看数据集的状态 +$ kubectl get sampleset hiertext -n paddlecloud +NAME TOTAL SIZE CACHED SIZE AVAIL SPACE RUNTIME PHASE AGE +hiertext 3.3 GiB 3.2 GiB 12 GiB 1/1 Ready 11m +``` + +### 2.4 训练PP-OCRv3模型 + +使用训练组件在Kubernetes集群上训练PP-OCRv3模型,编写PaddleJob Yaml文件如下: + +```yaml linenums="1" +# ppocrv3.yaml +apiVersion: batch.paddlepaddle.org/v1 +kind: PaddleJob +metadata: + name: ppocrv3 + namespace: paddlecloud +spec: + cleanPodPolicy: OnCompletion + sampleSetRef: + name: hiertext + namespace: paddlecloud + mountPath: /mnt/hiertext + worker: + replicas: 1 + template: + spec: + containers: + - name: ppocrv3 + image: paddlecloud/paddleocr:2.5-gpu-cuda10.2-cudnn7-efbb0a + command: + - /bin/bash + args: + - "-c" + - > + mkdir /home/PaddleOCR/pre_train && + wget -P ./pre_train https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar && + tar xf ./pre_train/ch_PP-OCRv3_det_distill_train.tar -C ./pre_train/ && + python tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o + Train.dataset.data_dir=/mnt/ + Train.dataset.label_file_list=[\"/mnt/hiertext/label_hiertext_train.txt\"] + Eval.dataset.data_dir=/mnt/ + Eval.dataset.label_file_list=[\"/mnt/hiertext/label_hiertext_val.txt\"] + Global.save_model_dir=./output/ + Global.pretrained_model=./pre_train/ch_PP-OCRv3_det_distill_train/best_accuracy + resources: + limits: + nvidia.com/gpu: 1 + volumeMounts: # 添加 shared memory 挂载以防止缓存出错 + - mountPath: /dev/shm + name: dshm + volumes: + - name: dshm + emptyDir: + medium: Memory +``` + +本案例采用GPU进行训练,如果您只有CPU机器,则可以将镜像替换成CPU版本 `paddlecloud/paddleocr:2.5-cpu-efbb0a`,并在args中加上参数`Global.use_gpu=false`。 + +```bash linenums="1" +# 创建PaddleJob训练模型 +$ kubectl apply -f ppocrv3.yaml +paddlejob.batch.paddlepaddle.org/ppocrv3 created + +# 查看PaddleJob状态 +$ kubectl get pods -n paddlecloud -l paddle-res-name=ppocrv3-worker-0 +NAME READY STATUS RESTARTS AGE +ppocrv3-worker-0 1/1 Running 0 4s + +# 查看训练日志 +$ kubectl logs -f ppocrv3-worker-0 -n paddlecloud +``` + +## 更多资源 + +欢迎关注[云上飞桨项目PaddleCloud](https://github.com/PaddlePaddle/PaddleCloud),我们为您提供了飞桨模型套件标准镜像以及全栈的云原生模型套件部署组件,如您有任何关于飞桨模型套件的部署问题,请联系我们。 +如果你发现任何PaddleCloud存在的问题或者是建议, 欢迎通过[GitHub Issues](https://github.com/PaddlePaddle/PaddleCloud/issues)给我们提issues。 diff --git a/docs/ppocr/infer_deploy/paddle_js.en.md b/docs/ppocr/infer_deploy/paddle_js.en.md new file mode 100644 index 0000000000..363f753af3 --- /dev/null +++ b/docs/ppocr/infer_deploy/paddle_js.en.md @@ -0,0 +1,28 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# Paddle.js Introduction + +[Paddle.js](https://github.com/PaddlePaddle/Paddle.js) is a web project for Baidu PaddlePaddle, which is an open source deep learning framework running in the browser. 
Paddle.js can either load a pre-trained model, or transforming a model from paddle-hub with model transforming tools provided by Paddle.js. It could run in every browser with WebGL/WebGPU/WebAssembly supported. It could also run in Baidu Smartprogram and wechat miniprogram. + +## Web Demo + +Run OCR demo in browser refer to [tutorial](https://github.com/PaddlePaddle/FastDeploy/blob/cd0ee79c91d4ed1103abdc65ff12ccadd23d0827/examples/application/js/WebDemo.md). + +|demo|web demo dicrctory|visualization| +|-|-|-| +|PP-OCRv3|[TextDetection、TextRecognition](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/web_demo/src/pages/cv/ocr/)|![](./images/196874354-1b5eecb0-f273-403c-aa6c-4463bf6d78db.png)| + +## Mini Program Demo + +The Mini Program demo running tutorial eference +Run OCR demo in wechat miniprogram refer to [tutorial](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/mini_program). + +|demo|directory| +|-|-| +|Text Detection| [ocrdetecXcx](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/mini_program/ocrdetectXcx/) | +|Text Recognition| [ocrXcx](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/mini_program/ocrXcx/) | + +![](./images/197918203-c7d46f8a-75d4-47f9-9687-405ee0d6727e.gif) diff --git a/docs/ppocr/infer_deploy/paddle_js.md b/docs/ppocr/infer_deploy/paddle_js.md new file mode 100644 index 0000000000..921951a341 --- /dev/null +++ b/docs/ppocr/infer_deploy/paddle_js.md @@ -0,0 +1,29 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# Paddle.js 网页前端部署 + +[Paddle.js](https://github.com/PaddlePaddle/Paddle.js) 是百度 PaddlePaddle 的 web 方向子项目,是一个运行在浏览器中的开源深度学习框架。Paddle.js 可以加载提前训练好的 paddle 模型,通过 Paddle.js 的模型转换工具 paddlejs-converter 变成浏览器友好的模型进行在线推理预测使用。目前,Paddle.js 可以在支持 WebGL/WebGPU/WebAssembly 的浏览器中运行,也可以在百度小程序和微信小程序环境下运行。 + +## Web Demo使用 + +在浏览器中直接运行官方OCR demo参考[教程](https://github.com/PaddlePaddle/FastDeploy/blob/develop/examples/application/js/WebDemo.md) + +|demo名称|web demo目录|可视化| +|-|-|-| +|PP-OCRv3|[TextDetection、TextRecognition](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/web_demo/src/pages/cv/ocr/)|![](./images/196874354-1b5eecb0-f273-403c-aa6c-4463bf6d78db.png)| + +## 微信小程序Demo使用 + +在微信小程序运行官方demo参考[教程](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/mini_program) + +|名称|目录| +|-|-| +|OCR文本检测| [ocrdetecXcx](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/mini_program/ocrdetectXcx/) | +|OCR文本识别| [ocrXcx](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/application/js/mini_program/ocrXcx/) | + +- 效果: + +![](./images/197918203-c7d46f8a-75d4-47f9-9687-405ee0d6727e.gif) diff --git a/docs/ppocr/infer_deploy/paddle_server.en.md b/docs/ppocr/infer_deploy/paddle_server.en.md new file mode 100644 index 0000000000..a95d73c2ef --- /dev/null +++ b/docs/ppocr/infer_deploy/paddle_server.en.md @@ -0,0 +1,286 @@ +--- +comments: true +--- + +## OCR Pipeline WebService + +PaddleOCR provides two service deployment methods: + +- Based on **PaddleHub Serving**: Code path is "`./deploy/hubserving`". Please refer to the [tutorial](https://github.com/PaddlePaddle/PaddleOCR/blob/db0ad17cf631fafc01650c177e00ce76413af97f/deploy/hubserving/readme_en.md) +- Based on **PaddleServing**: Code path is "`./deploy/pdserving`". Please follow this tutorial. 
+ +### Service deployment based on PaddleServing + +This document will introduce how to use the [PaddleServing](https://github.com/PaddlePaddle/Serving/blob/develop/README.md) to deploy the PPOCR dynamic graph model as a pipeline online service. + +Some Key Features of Paddle Serving: + +- Integrate with Paddle training pipeline seamlessly, most paddle models can be deployed with one line command. +- Industrial serving features supported, such as models management, online loading, online A/B testing etc. +- Highly concurrent and efficient communication between clients and servers supported. + +PaddleServing supports deployment in multiple languages. In this example, two deployment methods, python pipeline and C++, are provided. The comparison between the two is as follows: + +| Language | Speed | Secondary development | Do you need to compile | +|-----|-----|---------|------------| +| C++ | fast | Slightly difficult | Single model prediction does not need to be compiled, multi-model concatenation needs to be compiled | +| python | general | easy | single-model/multi-model no compilation required | + +The introduction and tutorial of Paddle Serving service deployment framework reference [document](https://github.com/PaddlePaddle/Serving/blob/develop/README.md). + +### Environmental preparation + +PaddleOCR operating environment and Paddle Serving operating environment are needed. + +1. Please prepare PaddleOCR operating environment reference [link](../environment.en.md). + Download the corresponding paddlepaddle whl package according to the environment, it is recommended to install version 2.2.2. + +2. The steps of PaddleServing operating environment prepare are as follows: + + ```bash linenums="1" + # Install serving which used to start the service + wget https://paddle-serving.bj.bcebos.com/test-dev/whl/paddle_serving_server_gpu-0.8.3.post102-py3-none-any.whl + pip3 install paddle_serving_server_gpu-0.8.3.post102-py3-none-any.whl + + # Install paddle-serving-server for cuda10.1 + # wget https://paddle-serving.bj.bcebos.com/test-dev/whl/paddle_serving_server_gpu-0.8.3.post101-py3-none-any.whl + # pip3 install paddle_serving_server_gpu-0.8.3.post101-py3-none-any.whl + + # Install serving which used to start the service + wget https://paddle-serving.bj.bcebos.com/test-dev/whl/paddle_serving_client-0.8.3-cp37-none-any.whl + pip3 install paddle_serving_client-0.8.3-cp37-none-any.whl + + # Install serving-app + wget https://paddle-serving.bj.bcebos.com/test-dev/whl/paddle_serving_app-0.8.3-py3-none-any.whl + pip3 install paddle_serving_app-0.8.3-py3-none-any.whl + ``` + + **note:** If you want to install the latest version of PaddleServing, refer to [link](https://github.com/PaddlePaddle/Serving/blob/v0.8.3/doc/Latest_Packages_CN.md). + +### Model conversion + +When using PaddleServing for service deployment, you need to convert the saved inference model into a serving model that is easy to deploy. + +Firstly, download the [inference model](../model_list.en.md) of PPOCR + +```bash linenums="1" +# Download and unzip the OCR text detection model +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar -O ch_PP-OCRv3_det_infer.tar && tar -xf ch_PP-OCRv3_det_infer.tar +# Download and unzip the OCR text recognition model +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar -O ch_PP-OCRv3_rec_infer.tar && tar -xf ch_PP-OCRv3_rec_infer.tar +``` + +Then, you can use installed paddle_serving_client tool to convert inference model to mobile model. 
+ +```bash linenums="1" +# Detection model conversion +python3 -m paddle_serving_client.convert --dirname ./ch_PP-OCRv3_det_infer/ \ + --model_filename inference.pdmodel \ + --params_filename inference.pdiparams \ + --serving_server ./ppocr_det_v3_serving/ \ + --serving_client ./ppocr_det_v3_client/ + +# Recognition model conversion +python3 -m paddle_serving_client.convert --dirname ./ch_PP-OCRv3_rec_infer/ \ + --model_filename inference.pdmodel \ + --params_filename inference.pdiparams \ + --serving_server ./ppocr_rec_v3_serving/ \ + --serving_client ./ppocr_rec_v3_client/ + +``` + +After the detection model is converted, there will be additional folders of `ppocr_det_v3_serving` and `ppocr_det_v3_client` in the current folder, with the following format: + +```text linenums="1" +|- ppocr_det_v3_serving/ + |- __model__ + |- __params__ + |- serving_server_conf.prototxt + |- serving_server_conf.stream.prototxt + +|- ppocr_det_v3_client + |- serving_client_conf.prototxt + |- serving_client_conf.stream.prototxt + +``` + +The recognition model is the same. + +### Paddle Serving pipeline deployment + +1. Download the PaddleOCR code, if you have already downloaded it, you can skip this step. + + ```bash linenums="1" + git clone https://github.com/PaddlePaddle/PaddleOCR + + # Enter the working directory + cd PaddleOCR/deploy/pdserving/ + ``` + + The pdserver directory contains the code to start the pipeline service and send prediction requests, including: + + ```bash linenums="1" + __init__.py + config.yml # Start the service configuration file + ocr_reader.py # OCR model pre-processing and post-processing code implementation + pipeline_http_client.py # Script to send pipeline prediction request + web_service.py # Start the script of the pipeline server + ``` + +2. Run the following command to start the service. + + ```bash linenums="1" + # Start the service and save the running log in log.txt + python3 web_service.py --config=config.yml &>log.txt & + ``` + + After the service is successfully started, a log similar to the following will be printed in log.txt + + ![](./images/start_server.png) + +3. Send service request + + ```bash linenums="1" + python3 pipeline_http_client.py + ``` + + After successfully running, the predicted result of the model will be printed in the cmd window. An example of the result is: + + ![](./images/results.png) + + Adjust the number of concurrency in config.yml to get the largest QPS. Generally, the number of concurrent detection and recognition is 2:1 + + ```yaml linenums="1" + det: + concurrency: 8 + ... + rec: + concurrency: 4 + ... + ``` + + Multiple service requests can be sent at the same time if necessary. + + The predicted performance data will be automatically written into the `PipelineServingLogs/pipeline.tracer` file. + + Tested on 200 real pictures, and limited the detection long side to 960. 
The average QPS on T4 GPU can reach around 23: + + ```bash linenums="1" + 2021-05-13 03:42:36,895 ==================== TRACER ====================== + 2021-05-13 03:42:36,975 Op(rec): + 2021-05-13 03:42:36,976 in[14.472382882882883 ms] + 2021-05-13 03:42:36,976 prep[9.556855855855856 ms] + 2021-05-13 03:42:36,976 midp[59.921905405405404 ms] + 2021-05-13 03:42:36,976 postp[15.345945945945946 ms] + 2021-05-13 03:42:36,976 out[1.9921216216216215 ms] + 2021-05-13 03:42:36,976 idle[0.16254943864471572] + 2021-05-13 03:42:36,976 Op(det): + 2021-05-13 03:42:36,976 in[315.4468035714286 ms] + 2021-05-13 03:42:36,976 prep[69.5980625 ms] + 2021-05-13 03:42:36,976 midp[18.989535714285715 ms] + 2021-05-13 03:42:36,976 postp[18.857803571428573 ms] + 2021-05-13 03:42:36,977 out[3.1337544642857145 ms] + 2021-05-13 03:42:36,977 idle[0.7477961159203756] + 2021-05-13 03:42:36,977 DAGExecutor: + 2021-05-13 03:42:36,977 Query count[224] + 2021-05-13 03:42:36,977 QPS[22.4 q/s] + 2021-05-13 03:42:36,977 Succ[0.9910714285714286] + 2021-05-13 03:42:36,977 Error req[169, 170] + 2021-05-13 03:42:36,977 Latency: + 2021-05-13 03:42:36,977 ave[535.1678348214285 ms] + 2021-05-13 03:42:36,977 .50[172.651 ms] + 2021-05-13 03:42:36,977 .60[187.904 ms] + 2021-05-13 03:42:36,977 .70[245.675 ms] + 2021-05-13 03:42:36,977 .80[526.684 ms] + 2021-05-13 03:42:36,977 .90[854.596 ms] + 2021-05-13 03:42:36,977 .95[1722.728 ms] + 2021-05-13 03:42:36,977 .99[3990.292 ms] + 2021-05-13 03:42:36,978 Channel (server worker num[10]): + 2021-05-13 03:42:36,978 chl0(In: ['@DAGExecutor'], Out: ['det']) size[0/0] + 2021-05-13 03:42:36,979 chl1(In: ['det'], Out: ['rec']) size[6/0] + 2021-05-13 03:42:36,979 chl2(In: ['rec'], Out: ['@DAGExecutor']) size[0/0] + ``` + +### C++ Serving + +Service deployment based on python obviously has the advantage of convenient secondary development. However, the real application often needs to pursue better performance. PaddleServing also provides a more performant C++ deployment version. + +The C++ service deployment is the same as python in the environment setup and data preparation stages, the difference is when the service is started and the client sends requests. + +1. Compile Serving + + To improve predictive performance, C++ services also provide multiple model concatenation services. Unlike Python Pipeline services, multiple model concatenation requires the pre - and post-model processing code to be written on the server side, so local recompilation is required to generate serving. Specific may refer to the official document: [how to compile Serving](https://github.com/PaddlePaddle/Serving/blob/v0.8.3/doc/Compile_EN.md) + +2. Run the following command to start the service. + + ```bash linenums="1" + # Start the service and save the running log in log.txt + python3 -m paddle_serving_server.serve --model ppocr_det_v3_serving ppocr_rec_v3_serving --op GeneralDetectionOp GeneralInferOp --port 8181 &>log.txt & + ``` + + After the service is successfully started, a log similar to the following will be printed in log.txt + ![](./images/start_server.png) + +3. 
Send service request + + Due to the need for pre and post-processing in the C++Server part, in order to speed up the input to the C++Server is only the base64 encoded string of the picture, it needs to be manually modified + Change the feed_type field and shape field in ppocr_det_v3_client/serving_client_conf.prototxt to the following: + + ```bash linenums="1" + feed_var { + name: "x" + alias_name: "x" + is_lod_tensor: false + feed_type: 20 + shape: 1 + } + ``` + + start the client: + + ```bash linenums="1" + python3 ocr_cpp_client.py ppocr_det_v3_client ppocr_rec_v3_client + ``` + + After successfully running, the predicted result of the model will be printed in the cmd window. An example of the result is: + + ![](./images/results.png) + +### WINDOWS Users + +Windows does not support Pipeline Serving, if we want to lauch paddle serving on Windows, we should use Web Service, for more infomation please refer to [Paddle Serving for Windows Users](https://github.com/PaddlePaddle/Serving/blob/develop/doc/Windows_Tutorial_EN.md) + +**WINDOWS user can only use version 0.5.0 CPU Mode** + +**Prepare Stage:** + +```bash linenums="1" +pip3 install paddle-serving-server==0.5.0 +pip3 install paddle-serving-app==0.3.1 +``` + +1. Start Server + + ```bash linenums="1" + cd win + python3 ocr_web_server.py gpu(for gpu user) + or + python3 ocr_web_server.py cpu(for cpu user) + ``` + +2. Client Send Requests + + ```bash linenums="1" + python3 ocr_web_client.py + ``` + +### FAQ + +**Q1**: No result return after sending the request. + +**A1**: Do not set the proxy when starting the service and sending the request. You can close the proxy before starting the service and before sending the request. The command to close the proxy is: + +```bash linenums="1" +unset https_proxy +unset http_proxy +``` diff --git a/docs/ppocr/infer_deploy/paddle_server.md b/docs/ppocr/infer_deploy/paddle_server.md new file mode 100644 index 0000000000..628be3f646 --- /dev/null +++ b/docs/ppocr/infer_deploy/paddle_server.md @@ -0,0 +1,305 @@ +--- +typora-copy-images-to: images +comments: true +--- + +## PPOCR 服务化部署 + +PaddleOCR提供2种服务部署方式: + +- 基于PaddleHub Serving的部署:代码路径为"`./deploy/hubserving`",使用方法参考[文档](https://github.com/PaddlePaddle/PaddleOCR/blob/db0ad17cf631fafc01650c177e00ce76413af97f/deploy/hubserving/readme.md); +- 基于PaddleServing的部署:代码路径为"`./deploy/pdserving`",按照本教程使用。 + +### 基于PaddleServing的服务部署 + +本文档将介绍如何使用[PaddleServing](https://github.com/PaddlePaddle/Serving/blob/develop/README_CN.md) 工具部署PP-OCR动态图模型的pipeline在线服务。 + +相比较于hubserving部署,PaddleServing具备以下优点: + +- 支持客户端和服务端之间高并发和高效通信 +- 支持 工业级的服务能力 例如模型管理,在线加载,在线A/B测试等 +- 支持 多种编程语言 开发客户端,例如C++, Python和Java + +PaddleServing 支持多种语言部署,本例中提供了python pipeline 和 C++ 两种部署方式,两者的对比如下: + +| 语言 | 速度 | 二次开发 | 是否需要编译 | +| ------ | ---- | -------- | -------------------------------------- | +| C++ | 很快 | 略有难度 | 单模型预测无需编译,多模型串联需要编译 | +| python | 一般 | 容易 | 单模型/多模型 均无需编译 | + +更多有关PaddleServing服务化部署框架介绍和使用教程参考[文档](https://github.com/PaddlePaddle/Serving/blob/develop/README_CN.md)。 + +AIStudio演示案例可参考 [基于PaddleServing的OCR服务化部署实战](https://aistudio.baidu.com/aistudio/projectdetail/3630726)。 + +#### 环境准备 + +需要准备PaddleOCR的运行环境和Paddle Serving的运行环境。 + +- 准备PaddleOCR的运行环境[链接](../environment.md) + + ```bash linenums="1" + git clone https://github.com/PaddlePaddle/PaddleOCR + + # 进入到工作目录 + cd PaddleOCR/deploy/pdserving/ + ``` + +- 准备PaddleServing的运行环境,步骤如下 + +```bash linenums="1" +# 安装serving,用于启动服务 +wget 
https://paddle-serving.bj.bcebos.com/test-dev/whl/paddle_serving_server_gpu-0.8.3.post102-py3-none-any.whl +pip3 install paddle_serving_server_gpu-0.8.3.post102-py3-none-any.whl +# 如果是cuda10.1环境,可以使用下面的命令安装paddle-serving-server +# wget https://paddle-serving.bj.bcebos.com/test-dev/whl/paddle_serving_server_gpu-0.8.3.post101-py3-none-any.whl +# pip3 install paddle_serving_server_gpu-0.8.3.post101-py3-none-any.whl + +# 安装client,用于向服务发送请求 +wget https://paddle-serving.bj.bcebos.com/test-dev/whl/paddle_serving_client-0.8.3-cp37-none-any.whl +pip3 install paddle_serving_client-0.8.3-cp37-none-any.whl + +# 安装serving-app +wget https://paddle-serving.bj.bcebos.com/test-dev/whl/paddle_serving_app-0.8.3-py3-none-any.whl +pip3 install paddle_serving_app-0.8.3-py3-none-any.whl +``` + +**Note:** 如果要安装最新版本的PaddleServing参考[链接](https://github.com/PaddlePaddle/Serving/blob/v0.8.3/doc/Latest_Packages_CN.md)。 + +### 模型转换 + +使用PaddleServing做服务化部署时,需要将保存的inference模型转换为serving易于部署的模型。 + +首先,下载PP-OCR的[inference模型](https://github.com/PaddlePaddle/PaddleOCR#pp-ocr-series-model-listupdate-on-september-8th) + +```bash linenums="1" +# 下载并解压 OCR 文本检测模型 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar -O ch_PP-OCRv3_det_infer.tar && tar -xf ch_PP-OCRv3_det_infer.tar +# 下载并解压 OCR 文本识别模型 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar -O ch_PP-OCRv3_rec_infer.tar && tar -xf ch_PP-OCRv3_rec_infer.tar +``` + +接下来,用安装的paddle_serving_client把下载的inference模型转换成易于server部署的模型格式。 + +```bash linenums="1" +# 转换检测模型 +python3 -m paddle_serving_client.convert --dirname ./ch_PP-OCRv3_det_infer/ \ + --model_filename inference.pdmodel \ + --params_filename inference.pdiparams \ + --serving_server ./ppocr_det_v3_serving/ \ + --serving_client ./ppocr_det_v3_client/ + +# 转换识别模型 +python3 -m paddle_serving_client.convert --dirname ./ch_PP-OCRv3_rec_infer/ \ + --model_filename inference.pdmodel \ + --params_filename inference.pdiparams \ + --serving_server ./ppocr_rec_v3_serving/ \ + --serving_client ./ppocr_rec_v3_client/ +``` + +检测模型转换完成后,会在当前文件夹多出`ppocr_det_v3_serving` 和`ppocr_det_v3_client`的文件夹,具备如下格式: + +```text linenums="1" +|- ppocr_det_v3_serving/ + |- __model__ + |- __params__ + |- serving_server_conf.prototxt + |- serving_server_conf.stream.prototxt + +|- ppocr_det_v3_client + |- serving_client_conf.prototxt + |- serving_client_conf.stream.prototxt + +``` + +识别模型同理。 + +### Paddle Serving pipeline部署 + +#### 1. 确认工作目录下文件结构 + +pdserver目录包含启动pipeline服务和发送预测请求的代码,包括: + +```bash linenums="1" +__init__.py +config.yml # 启动服务的配置文件 +ocr_reader.py # OCR模型预处理和后处理的代码实现 +pipeline_http_client.py # 发送pipeline预测请求的脚本 +web_service.py # 启动pipeline服务端的脚本 +``` + +#### 2. 启动服务 + +```bash linenums="1" +# 启动服务,运行日志保存在log.txt +python3 web_service.py --config=config.yml &>log.txt & +``` + +成功启动服务后,log.txt中会打印类似如下日志 + +![img](./images/start_server.png) + +#### 3. 发送服务请求 + +```bash linenums="1" +python3 pipeline_http_client.py +``` + +成功运行后,模型预测的结果会打印在cmd窗口中,结果示例为: + +![img](./images/pipeline_result.png) + +调整 config.yml 中的并发个数获得最大的QPS, 一般检测和识别的并发数为2:1 + +```yaml linenums="1" +det: + #并发数,is_thread_op=True时,为线程并发;否则为进程并发 + concurrency: 8 + ... +rec: + #并发数,is_thread_op=True时,为线程并发;否则为进程并发 + concurrency: 4 + ... 
+``` + +有需要的话可以同时发送多个服务请求 + +预测性能数据会被自动写入 `PipelineServingLogs/pipeline.tracer` 文件中。 + +在200张真实图片上测试,把检测长边限制为960。T4 GPU 上 QPS 均值可达到23左右: + +```bash linenums="1" +2021-05-13 03:42:36,895 ==================== TRACER ====================== +2021-05-13 03:42:36,975 Op(rec): +2021-05-13 03:42:36,976 in[14.472382882882883 ms] +2021-05-13 03:42:36,976 prep[9.556855855855856 ms] +2021-05-13 03:42:36,976 midp[59.921905405405404 ms] +2021-05-13 03:42:36,976 postp[15.345945945945946 ms] +2021-05-13 03:42:36,976 out[1.9921216216216215 ms] +2021-05-13 03:42:36,976 idle[0.16254943864471572] +2021-05-13 03:42:36,976 Op(det): +2021-05-13 03:42:36,976 in[315.4468035714286 ms] +2021-05-13 03:42:36,976 prep[69.5980625 ms] +2021-05-13 03:42:36,976 midp[18.989535714285715 ms] +2021-05-13 03:42:36,976 postp[18.857803571428573 ms] +2021-05-13 03:42:36,977 out[3.1337544642857145 ms] +2021-05-13 03:42:36,977 idle[0.7477961159203756] +2021-05-13 03:42:36,977 DAGExecutor: +2021-05-13 03:42:36,977 Query count[224] +2021-05-13 03:42:36,977 QPS[22.4 q/s] +2021-05-13 03:42:36,977 Succ[0.9910714285714286] +2021-05-13 03:42:36,977 Error req[169, 170] +2021-05-13 03:42:36,977 Latency: +2021-05-13 03:42:36,977 ave[535.1678348214285 ms] +2021-05-13 03:42:36,977 .50[172.651 ms] +2021-05-13 03:42:36,977 .60[187.904 ms] +2021-05-13 03:42:36,977 .70[245.675 ms] +2021-05-13 03:42:36,977 .80[526.684 ms] +2021-05-13 03:42:36,977 .90[854.596 ms] +2021-05-13 03:42:36,977 .95[1722.728 ms] +2021-05-13 03:42:36,977 .99[3990.292 ms] +2021-05-13 03:42:36,978 Channel (server worker num[10]): +2021-05-13 03:42:36,978 chl0(In: ['@DAGExecutor'], Out: ['det']) size[0/0] +2021-05-13 03:42:36,979 chl1(In: ['det'], Out: ['rec']) size[6/0] +2021-05-13 03:42:36,979 chl2(In: ['rec'], Out: ['@DAGExecutor']) size[0/0] +``` + +### Paddle Serving C++ 部署 + +基于python的服务部署,显然具有二次开发便捷的优势,然而真正落地应用,往往需要追求更优的性能。PaddleServing 也提供了性能更优的C++部署版本。 + +C++ 服务部署在环境搭建和数据准备阶段与 python 相同,区别在于启动服务和客户端发送请求时不同。 + +#### 1. 准备 Serving 环境 + +为了提高预测性能,C++ 服务同样提供了多模型串联服务。与python pipeline服务不同,多模型串联的过程中需要将模型前后处理代码写在服务端,因此需要在本地重新编译生成serving。 + +首先需要下载Serving代码库, 把OCR文本检测预处理相关代码替换到Serving库中 + +```bash linenums="1" +git clone https://github.com/PaddlePaddle/Serving +cp -rf general_detection_op.cpp Serving/core/general-server/op +``` + +具体可参考官方文档:[如何编译Serving](https://github.com/PaddlePaddle/Serving/blob/v0.8.3/doc/Compile_CN.md),注意需要开启 WITH_OPENCV 选项。 + +完成编译后,注意要安装编译出的三个whl包,并设置SERVING_BIN环境变量。 + +#### 2. 启动服务 + +一个服务启动两个模型串联,只需要在--model后依次按顺序传入模型文件夹的相对路径,且需要在--op后依次传入自定义C++OP类名称: + +```bash linenums="1" +# 启动服务,运行日志保存在log.txt +python3 -m paddle_serving_server.serve --model ppocr_det_v3_serving ppocr_rec_v3_serving --op GeneralDetectionOp GeneralInferOp --port 8181 &>log.txt & +``` + +成功启动服务后,log.txt中会打印类似如下日志 +![](./imgs/start_server.png) + +#### 3. 
发送服务请求 + +由于需要在C++Server部分进行前后处理,为了加速传入C++Server的仅仅是图片的base64编码的字符串,故需要手动修改 +ppocr_det_v3_client/serving_client_conf.prototxt 中 feed_type 字段 和 shape 字段,修改成如下内容: + +```bash linenums="1" +feed_var { +name: "x" +alias_name: "x" +is_lod_tensor: false +feed_type: 20 +shape: 1 +} +``` + +启动客户端 + +```bash linenums="1" +python3 ocr_cpp_client.py ppocr_det_v3_client ppocr_rec_v3_client +``` + +成功运行后,模型预测的结果会打印在cmd窗口中,结果示例为: + +![img](./images/results.png) + +在浏览器中输入服务器 ip:端口号,可以看到当前服务的实时QPS。(端口号范围需要是8000-9000) + +在200张真实图片上测试,把检测长边限制为960。T4 GPU 上 QPS 峰值可达到51左右,约为pipeline的 2.12 倍。 + +### Windows用户 + +Windows用户不能使用上述的启动方式,需要使用Web Service,详情参见[Windows平台使用Paddle Serving指导](https://github.com/PaddlePaddle/Serving/blob/develop/doc/Windows_Tutorial_CN.md) + +**WINDOWS只能使用0.5.0版本的CPU模式** + +准备阶段: + +```bash linenums="1" +pip3 install paddle-serving-server==0.5.0 +pip3 install paddle-serving-app==0.3.1 +``` + +#### 1. 启动服务端程序 + +```bash linenums="1" +cd win +python3 ocr_web_server.py gpu(使用gpu方式) +或者 +python3 ocr_web_server.py cpu(使用cpu方式) +``` + +#### 2. 发送服务请求 + +```bash linenums="1" +python3 ocr_web_client.py +``` + +### FAQ + +**Q1**: 发送请求后没有结果返回或者提示输出解码报错 + +**A1**: 启动服务和发送请求时不要设置代理,可以在启动服务前和发送请求前关闭代理,关闭代理的命令是: + +```bash linenums="1" +unset https_proxy +unset http_proxy +``` diff --git a/docs/ppocr/infer_deploy/python_infer.en.md b/docs/ppocr/infer_deploy/python_infer.en.md new file mode 100755 index 0000000000..d4e7b54a5d --- /dev/null +++ b/docs/ppocr/infer_deploy/python_infer.en.md @@ -0,0 +1,176 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# Python Inference for PP-OCR Model Zoo + +This article introduces the use of the Python inference engine for the PP-OCR model library. The content is in order of text detection, text recognition, direction classifier and the prediction method of the three in series on the CPU and GPU. + +## Text Detection Model Inference + +The default configuration is based on the inference setting of the DB text detection model. For lightweight Chinese detection model inference, you can execute the following commands: + +```bash linenums="1" +# download DB text detection inference model +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar +tar xf ch_PP-OCRv3_det_infer.tar +# run inference +python3 tools/infer/predict_det.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" +``` + +The visual text detection results are saved to the ./inference_results folder by default, and the name of the result file is prefixed with 'det_res'. Examples of results are as follows: + +![img](./images/det_res_00018069.jpg) + +You can use the parameters `limit_type` and `det_limit_side_len` to limit the size of the input image, +The optional parameters of `limit_type` are [`max`, `min`], and +`det_limit_size_len` is a positive integer, generally set to a multiple of 32, such as 960. + +The default setting of the parameters is `limit_type='max', det_limit_side_len=960`. Indicates that the longest side of the network input image cannot exceed 960, +If this value is exceeded, the image will be resized with the same width ratio to ensure that the longest side is `det_limit_side_len`. +Set as `limit_type='min', det_limit_side_len=960`, it means that the shortest side of the image is limited to 960. 
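+
+A minimal sketch of the resize rule these two parameters describe is shown below. It is an illustration only: the snapping of the output size to a multiple of 32 is an assumption about the preprocessing, and the real logic in PaddleOCR's preprocessing operators may differ in detail.
+
+```python linenums="1"
+# Illustrative only: how limit_type / det_limit_side_len constrain the input size.
+def limited_size(h, w, limit_type="max", limit_side_len=960):
+    if limit_type == "max":
+        ratio = limit_side_len / max(h, w) if max(h, w) > limit_side_len else 1.0
+    else:  # limit_type == "min"
+        ratio = limit_side_len / min(h, w) if min(h, w) < limit_side_len else 1.0
+    # keep the aspect ratio, then snap to a multiple of 32 (assumed rounding)
+    resize_h = max(int(round(h * ratio / 32)) * 32, 32)
+    resize_w = max(int(round(w * ratio / 32)) * 32, 32)
+    return resize_h, resize_w
+
+print(limited_size(1080, 1920))         # longest side capped at 960 -> (544, 960)
+print(limited_size(600, 800, "min"))    # shortest side raised to 960 -> (960, 1280)
+```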
+ +If the resolution of the input picture is relatively large and you want to use a larger resolution prediction, you can set det_limit_side_len to the desired value, such as 1216: + +```bash linenums="1" +python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --det_limit_type=max --det_limit_side_len=1216 +``` + +If you want to use the CPU for prediction, execute the command as follows + +```bash linenums="1" +python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --use_gpu=False +``` + +## Text Recognition Model Inference + +### 1. Lightweight Chinese Recognition Model Inference + +**Note**: The input shape used by the recognition model of `PP-OCRv3` is `3, 48, 320`. If you use other recognition models, you need to set the parameter `--rec_image_shape` according to the model. In addition, the `rec_algorithm` used by the recognition model of `PP-OCRv3` is `SVTR_LCNet` by default. Note the difference from the original `SVTR`. + +For lightweight Chinese recognition model inference, you can execute the following commands: + +```bash linenums="1" +# download CRNN text recognition inference model +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar +tar xf ch_PP-OCRv3_rec_infer.tar +# run inference +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_10.png" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --rec_image_shape=3,48,320 +``` + +![img](./images/word_10.png) + +After executing the command, the prediction results (recognized text and score) of the above image will be printed on the screen. + +```bash linenums="1" +Predicts of ./doc/imgs_words_en/word_10.png:('PAIN', 0.988671) +``` + +### 2. English Recognition Model Inference + +For English recognition model inference, you can execute the following commands,you need to specify the dictionary path used by `--rec_char_dict_path`: + +```bash linenums="1" +# download en model: +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar +tar xf en_PP-OCRv3_rec_infer.tar +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png" --rec_model_dir="./en_PP-OCRv3_rec_infer/" --rec_char_dict_path="ppocr/utils/en_dict.txt" +``` + +![img](./images/word_1.png) + +After executing the command, the prediction result of the above figure is: + +```bash linenums="1" +Predicts of ./doc/imgs_words/en/word_1.png: ('JOINT', 0.998160719871521) +``` + +### 3. Multilingual Model Inference + +If you need to predict [other language models](../model_list.en.md), when using inference model prediction, you need to specify the dictionary path used by `--rec_char_dict_path`. At the same time, in order to get the correct visualization results, +You need to specify the visual font path through `--vis_font_path`. 
There are small language fonts provided by default under the `doc/fonts` path, such as Korean recognition: + +```bash linenums="1" +wget wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/korean_mobile_v2.0_rec_infer.tar + +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/korean/1.jpg" --rec_model_dir="./your inference model" --rec_char_dict_path="ppocr/utils/dict/korean_dict.txt" --vis_font_path="doc/fonts/korean.ttf" +``` + +![img](./images/1.jpg) + +After executing the command, the prediction result of the above figure is: + +```text linenums="1" +Predicts of ./doc/imgs_words/korean/1.jpg:('바탕으로', 0.9948904) +``` + +## Angle Classification Model Inference + +For angle classification model inference, you can execute the following commands: + +```bash linenums="1" +# download text angle class inference model: +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar +tar xf ch_ppocr_mobile_v2.0_cls_infer.tar +python3 tools/infer/predict_cls.py --image_dir="./doc/imgs_words_en/word_10.png" --cls_model_dir="ch_ppocr_mobile_v2.0_cls_infer" +``` + +![img](./images/word_10.png) + +After executing the command, the prediction results (classification angle and score) of the above image will be printed on the screen. + +```text linenums="1" + Predicts of ./doc/imgs_words_en/word_10.png:['0', 0.9999995] +``` + +## Text Detection Angle Classification and Recognition Inference Concatenation + +**Note**: The input shape used by the recognition model of `PP-OCRv3` is `3, 48, 320`. If you use other recognition models, you need to set the parameter `--rec_image_shape` according to the model. In addition, the `rec_algorithm` used by the recognition model of `PP-OCRv3` is `SVTR_LCNet` by default. Note the difference from the original `SVTR`. + +When performing prediction, you need to specify the path of a single image or a folder of images through the parameter `image_dir`, pdf file is also supported, the parameter `det_model_dir` specifies the path to detect the inference model, the parameter `cls_model_dir` specifies the path to angle classification inference model and the parameter `rec_model_dir` specifies the path to identify the inference model. The parameter `use_angle_cls` is used to control whether to enable the angle classification model. The parameter `use_mp` specifies whether to use multi-process to infer `total_process_num` specifies process number when using multi-process. The parameter . The visualized recognition results are saved to the `./inference_results` folder by default. 
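+
+Conceptually, the script chains the three models in the order detection, optional angle classification, then recognition, and discards results whose recognition score is below `drop_score`. The sketch below is a rough, hypothetical outline of that chaining; every name in it is a placeholder rather than the real PaddleOCR API, and the commands that follow run the actual script.
+
+```python linenums="1"
+# Rough, hypothetical outline of the det -> cls -> rec chain; all names below are
+# placeholders, not the real classes in tools/infer/predict_system.py.
+def ocr_pipeline(image, detect, recognize, crop, classify=None, drop_score=0.5):
+    results = []
+    for box in detect(image):              # 1. text detection -> quadrilateral boxes
+        region = crop(image, box)          # 2. perspective-crop the text region
+        if classify is not None:           # 3. optional 0/180 degree correction
+            angle, _ = classify(region)
+            if angle == "180":
+                region = region[::-1]      # stand-in for a real 180-degree rotation
+        text, score = recognize(region)    # 4. text recognition
+        if score >= drop_score:            # results under drop_score are discarded
+            results.append((box, text, score))
+    return results
+
+# Call pattern with trivial stand-in models:
+print(ocr_pipeline(
+    image="fake-image",
+    detect=lambda img: [[(0, 0), (10, 0), (10, 5), (0, 5)]],
+    recognize=lambda region: ("PaddleOCR", 0.98),
+    crop=lambda img, box: img,
+    classify=lambda region: ("0", 0.99),
+))
+```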
+ +```bash linenums="1" +# use direction classifier +python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --cls_model_dir="./cls/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=true +# not use use direction classifier +python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false +# use multi-process +python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=false --use_mp=True --total_process_num=6 +# use PDF files, you can infer the first few pages by using the `page_num` parameter, the default is 0, which means infer all pages +python3 tools/infer/predict_system.py --image_dir="./xxx.pdf" --det_model_dir="./ch_PP-OCRv3_det_infer/" --cls_model_dir="./cls/" --rec_model_dir="./ch_PP-OCRv3_rec_infer/" --use_angle_cls=true --page_num=2 +``` + +After executing the command, the recognition result image is as follows: + +![](./images/system_res_00018069_v3.jpg) + +For more configuration and explanation of inference parameters, please refer to:[Model Inference Parameters Explained Tutorial](../blog/inference_args.en.md)。 + +## TensorRT Inference + +Paddle Inference ensembles TensorRT using subgraph mode. For GPU deployment scenarios, TensorRT can optimize some subgraphs, including horizontal and vertical integration of OPs, filter redundant OPs, and automatically select the optimal OP kernels for to speed up inference. + +You need to do the following 2 steps for inference using TRT. + +* (1) Collect the dynamic shape information of the model about a specific dataset and store it in a file. +* (2) Load the dynamic shape information file for TRT inference. + +Taking the text detection model as an example. Firstly, you can use the following command to generate a dynamic shape file, which will eventually be named as `det_trt_dynamic_shape.txt` and stored in the `ch_PP-OCRv3_det_infer` folder. + +```bash linenums="1" +python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --use_tensorrt=True +``` + +The above command is only used to collect dynamic shape information, and TRT is not used during inference. + +Then, you can use the following command to perform TRT inference. + +```bash linenums="1" +python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./ch_PP-OCRv3_det_infer/" --use_tensorrt=True +``` + +**Note:** + +* In the first step, if the dynamic shape information file already exists, it does not need to be collected again. If you want to regenerate the dynamic shape information file, you need to delete the dynamic shape information file in the model folder firstly, and then regenerate it. +* In general, dynamic shape information file only needs to be generated once. In the actual deployment process, it is recommended that the dynamic shape information file can be generated on offline validation set or test set, and then the file can be directly loaded for online TRT inference. 
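+
+If you drive Paddle Inference directly from Python instead of through `predict_det.py`, the same two steps map onto the Config API roughly as sketched below. This is an assumption-based outline: the method names (`collect_shape_range_info`, `enable_tensorrt_engine`, `enable_tuned_tensorrt_dynamic_shape`) are taken from the Paddle Inference 2.x Python API and should be verified against your installed version, and the model paths are placeholders.
+
+```python linenums="1"
+import paddle.inference as paddle_infer
+
+MODEL = "./ch_PP-OCRv3_det_infer/inference.pdmodel"
+PARAMS = "./ch_PP-OCRv3_det_infer/inference.pdiparams"
+SHAPE_FILE = "./ch_PP-OCRv3_det_infer/det_trt_dynamic_shape.txt"
+
+def build_predictor(collect_shapes: bool):
+    config = paddle_infer.Config(MODEL, PARAMS)
+    config.enable_use_gpu(500, 0)  # 500 MB initial GPU memory pool, GPU id 0
+    if collect_shapes:
+        # Step 1: plain GPU inference while recording observed tensor shapes.
+        config.collect_shape_range_info(SHAPE_FILE)
+    else:
+        # Step 2: enable TensorRT and load the recorded dynamic-shape ranges.
+        config.enable_tensorrt_engine(
+            workspace_size=1 << 30,
+            max_batch_size=1,
+            min_subgraph_size=15,
+            precision_mode=paddle_infer.PrecisionType.Float32,
+            use_calib_mode=False,
+        )
+        config.enable_tuned_tensorrt_dynamic_shape(SHAPE_FILE, True)
+    return paddle_infer.create_predictor(config)
+```
+
+Under this assumed workflow, you would run the predictor once over representative images with `collect_shapes=True` to produce the shape file, then rebuild it with `collect_shapes=False` for TensorRT inference.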
diff --git a/docs/ppocr/infer_deploy/python_infer.md b/docs/ppocr/infer_deploy/python_infer.md new file mode 100644 index 0000000000..2b1bf8b3be --- /dev/null +++ b/docs/ppocr/infer_deploy/python_infer.md @@ -0,0 +1,502 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# 基于Python预测引擎推理 + +inference 模型(`paddle.jit.save`保存的模型) +一般是模型训练,把模型结构和模型参数保存在文件中的固化模型,多用于预测部署场景。 +训练过程中保存的模型是checkpoints模型,保存的只有模型的参数,多用于恢复训练等。 +与checkpoints模型相比,inference 模型会额外保存模型的结构信息,在预测部署、加速推理上性能优越,灵活方便,适合于实际系统集成。 + +本文介绍针对PP-OCR模型库的Python推理引擎使用方法,内容依次为文本检测、文本识别、方向分类器以及三者串联在CPU、GPU上的预测方法。 + +## 一、训练模型转inference模型 + +### 检测模型转inference模型 + +下载超轻量级中文检测模型: + +```bash linenums="1" +wget -P ./ch_lite/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar && tar xf ./ch_lite/ch_ppocr_mobile_v2.0_det_train.tar -C ./ch_lite/ +``` + +上述模型是以MobileNetV3为backbone训练的DB算法,将训练好的模型转换成inference模型只需要运行如下命令: + +```bash linenums="1" +# -c 后面设置训练算法的yml配置文件 +# -o 配置可选参数 +# Global.pretrained_model 参数设置待转换的训练模型地址,不用添加文件后缀 .pdmodel,.pdopt或.pdparams。 +# Global.save_inference_dir参数设置转换的模型将保存的地址。 + +python3 tools/export_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model=./ch_lite/ch_ppocr_mobile_v2.0_det_train/best_accuracy Global.save_inference_dir=./inference/det_db/ +``` + +转inference模型时,使用的配置文件和训练时使用的配置文件相同。另外,还需要设置配置文件中的`Global.pretrained_model`参数,其指向训练中保存的模型参数文件。 +转换成功后,在模型保存目录下有三个文件: + +```text linenums="1" +inference/det_db/ + ├── inference.pdiparams # 检测inference模型的参数文件 + ├── inference.pdiparams.info # 检测inference模型的参数信息,可忽略 + └── inference.pdmodel # 检测inference模型的program文件 +``` + +### 识别模型转inference模型 + +下载超轻量中文识别模型: + +```bash linenums="1" +wget -P ./ch_lite/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar && tar xf ./ch_lite/ch_ppocr_mobile_v2.0_rec_train.tar -C ./ch_lite/ +``` + +识别模型转inference模型与检测的方式相同,如下: + +```bash linenums="1" +# -c 后面设置训练算法的yml配置文件 +# -o 配置可选参数 +# Global.pretrained_model 参数设置待转换的训练模型地址,不用添加文件后缀 .pdmodel,.pdopt或.pdparams。 +# Global.save_inference_dir参数设置转换的模型将保存的地址。 + +python3 tools/export_model.py -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml -o Global.pretrained_model=./ch_lite/ch_ppocr_mobile_v2.0_rec_train/best_accuracy Global.save_inference_dir=./inference/rec_crnn/ +``` + +**注意:** 如果您是在自己的数据集上训练的模型,并且调整了中文字符的字典文件,请注意修改配置文件中的`character_dict_path`是否是所需要的字典文件。 + +转换成功后,在目录下有三个文件: + +```bash linenums="1" +/inference/rec_crnn/ + ├── inference.pdiparams # 识别inference模型的参数文件 + ├── inference.pdiparams.info # 识别inference模型的参数信息,可忽略 + └── inference.pdmodel # 识别inference模型的program文件 +``` + +### 方向分类模型转inference模型 + +下载方向分类模型: + +```bash linenums="1" +wget -P ./ch_lite/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar && tar xf ./ch_lite/ch_ppocr_mobile_v2.0_cls_train.tar -C ./ch_lite/ +``` + +方向分类模型转inference模型与检测的方式相同,如下: + +```bash linenums="1" +# -c 后面设置训练算法的yml配置文件 +# -o 配置可选参数 +# Global.pretrained_model 参数设置待转换的训练模型地址,不用添加文件后缀 .pdmodel,.pdopt或.pdparams。 +# Global.save_inference_dir参数设置转换的模型将保存的地址。 + +python3 tools/export_model.py -c configs/cls/cls_mv3.yml -o Global.pretrained_model=./ch_lite/ch_ppocr_mobile_v2.0_cls_train/best_accuracy Global.save_inference_dir=./inference/cls/ +``` + +转换成功后,在目录下有三个文件: + +```text linenums="1" +/inference/cls/ + ├── inference.pdiparams # 分类inference模型的参数文件 + ├── inference.pdiparams.info # 分类inference模型的参数信息,可忽略 + └── inference.pdmodel # 分类inference模型的program文件 +``` + +## 二、文本检测模型推理 + 
+文本检测模型推理,默认使用DB模型的配置参数。当不使用DB模型时,在推理时,需要通过传入相应的参数进行算法适配,细节参考下文。 + +### 1. 超轻量中文检测模型推理 + +超轻量中文检测模型推理,可以执行如下命令: + +```bash linenums="1" +# 下载超轻量中文检测模型: +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar +tar xf ch_ppocr_mobile_v2.0_det_infer.tar +python3 tools/infer/predict_det.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./ch_ppocr_mobile_v2.0_det_infer/" +``` + +可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为`det_res`。结果示例如下: + +![](./images/det_res_00018069.jpg) + +通过参数`limit_type`和`det_limit_side_len`来对图片的尺寸进行限制, +`limit_type`可选参数为[`max`, `min`], +`det_limit_size_len` 为正整数,一般设置为32 的倍数,比如960。 + +参数默认设置为`limit_type='max', det_limit_side_len=960`。表示网络输入图像的最长边不能超过960, +如果超过这个值,会对图像做等宽比的resize操作,确保最长边为`det_limit_side_len`。 +设置为`limit_type='min', det_limit_side_len=960` 则表示限制图像的最短边为960。 + +如果输入图片的分辨率比较大,而且想使用更大的分辨率预测,可以设置det_limit_side_len 为想要的值,比如1216: + +```bash linenums="1" +python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./inference/det_db/" --det_limit_type=max --det_limit_side_len=1216 +``` + +如果想使用CPU进行预测,执行命令如下 + +```bash linenums="1" +python3 tools/infer/predict_det.py --image_dir="./doc/imgs/1.jpg" --det_model_dir="./inference/det_db/" --use_gpu=False +``` + +### 2. DB文本检测模型推理 + +首先将DB文本检测训练过程中保存的模型,转换成inference model。以基于Resnet50_vd骨干网络,在ICDAR2015英文数据集训练的模型为例( [模型下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_db_v2.0_train.tar) ),可以使用如下命令进行转换: + +```bash linenums="1" +python3 tools/export_model.py -c configs/det/det_r50_vd_db.yml -o Global.pretrained_model=./det_r50_vd_db_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_db +``` + +DB文本检测模型推理,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_det.py --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_db/" +``` + +可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为`det_res`。结果示例如下: + +![img](./images/det_res_img_10_db.jpg) + +**注意**:由于ICDAR2015数据集只有1000张训练图像,且主要针对英文场景,所以上述模型对中文文本图像检测效果会比较差。 + +### 3. EAST文本检测模型推理 + +首先将EAST文本检测训练过程中保存的模型,转换成inference model。以基于Resnet50_vd骨干网络,在ICDAR2015英文数据集训练的模型为例( [模型下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_east_v2.0_train.tar) ),可以使用如下命令进行转换: + +```bash linenums="1" +python3 tools/export_model.py -c configs/det/det_r50_vd_east.yml -o Global.pretrained_model=./det_r50_vd_east_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_east +``` + +**EAST文本检测模型推理,需要设置参数`--det_algorithm="EAST"`**,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_det.py --det_algorithm="EAST" --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_east/" +``` + +可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为`det_res`。结果示例如下: + +![img](./images/det_res_img_10_east.jpg) + +**注意**:本代码库中,EAST后处理Locality-Aware NMS有python和c++两种版本,c++版速度明显快于python版。由于c++版本nms编译版本问题,只有python3.5环境下会调用c++版nms,其他情况将调用python版nms。 + +### 4. SAST文本检测模型推理 + +#### (1). 
四边形文本检测模型(ICDAR2015) + +首先将SAST文本检测训练过程中保存的模型,转换成inference model。以基于Resnet50_vd骨干网络,在ICDAR2015英文数据集训练的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_icdar15_v2.0_train.tar)),可以使用如下命令进行转换: + +```bash linenums="1" +python3 tools/export_model.py -c configs/det/det_r50_vd_sast_icdar15.yml -o Global.pretrained_model=./det_r50_vd_sast_icdar15_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_sast_ic15 + +``` + +**SAST文本检测模型推理,需要设置参数`--det_algorithm="SAST"`**,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_sast_ic15/" +``` + +可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为`det_res`。结果示例如下: + +![](./images/det_res_img_10_sast.jpg) + +#### (2). 弯曲文本检测模型(Total-Text) + +首先将SAST文本检测训练过程中保存的模型,转换成inference model。以基于Resnet50_vd骨干网络,在Total-Text英文数据集训练的模型为例([模型下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_r50_vd_sast_totaltext_v2.0_train.tar)),可以使用如下命令进行转换: + +```bash linenums="1" +python3 tools/export_model.py -c configs/det/det_r50_vd_sast_totaltext.yml -o Global.pretrained_model=./det_r50_vd_sast_totaltext_v2.0_train/best_accuracy Global.save_inference_dir=./inference/det_sast_tt + +``` + +SAST文本检测模型推理,需要设置参数`--det_algorithm="SAST"`,同时,还需要增加参数`--det_sast_polygon=True`,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_det.py --det_algorithm="SAST" --image_dir="./doc/imgs_en/img623.jpg" --det_model_dir="./inference/det_sast_tt/" --det_sast_polygon=True +``` + +可视化文本检测结果默认保存到`./inference_results`文件夹里面,结果文件的名称前缀为`det_res`。结果示例如下: + +![img](./images/det_res_img623_sast.jpg) + +**注意**:本代码库中,SAST后处理Locality-Aware NMS有python和c++两种版本,c++版速度明显快于python版。由于c++版本nms编译版本问题,只有python3.5环境下会调用c++版nms,其他情况将调用python版nms。 + +## 三、文本识别模型推理 + +下面将介绍超轻量中文识别模型推理、基于CTC损失的识别模型推理和基于Attention损失的识别模型推理。对于中文文本识别,建议优先选择基于CTC损失的识别模型,实践中也发现基于Attention损失的效果不如基于CTC损失的识别模型。此外,如果训练时修改了文本的字典,请参考下面的自定义文本识别字典的推理。 + +### 1. 超轻量中文识别模型推理 + +超轻量中文识别模型推理,可以执行如下命令: + +```bash linenums="1" +# 下载超轻量中文识别模型: +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar +tar xf ch_ppocr_mobile_v2.0_rec_infer.tar +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/ch/word_4.jpg" --rec_model_dir="ch_ppocr_mobile_v2.0_rec_infer" +``` + +![img](./images/word_4.jpg) + +执行命令后,上面图像的预测结果(识别的文本和得分)会打印到屏幕上,示例如下: + +```bash linenums="1" +Predicts of ./doc/imgs_words/ch/word_4.jpg:('实力活力', 0.98458153) +``` + +### 2. 
基于CTC损失的识别模型推理 + +我们以 CRNN 为例,介绍基于CTC损失的识别模型推理。 Rosetta 使用方式类似,不用设置识别算法参数rec_algorithm。 + +首先将 CRNN 文本识别训练过程中保存的模型,转换成inference model。以基于Resnet34_vd骨干网络,使用MJSynth和SynthText两个英文文本识别合成数据集训练 +的模型为例( [模型下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_none_bilstm_ctc_v2.0_train.tar) ),可以使用如下命令进行转换: + +```bash linenums="1" +python3 tools/export_model.py -c configs/rec/rec_r34_vd_none_bilstm_ctc.yml -o Global.pretrained_model=./rec_r34_vd_none_bilstm_ctc_v2.0_train/best_accuracy Global.save_inference_dir=./inference/rec_crnn +``` + +CRNN 文本识别模型推理,可以执行如下命令: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./inference/rec_crnn/" --rec_image_shape="3, 32, 100" --rec_char_dict_path="./ppocr/utils/ic15_dict.txt" +``` + +![img](./images/word_336.png) + +执行命令后,上面图像的识别结果如下: + +```bash linenums="1" +Predicts of ./doc/imgs_words_en/word_336.png:('super', 0.9999073) +``` + +**注意**:由于上述模型是参考[DTRB](https://arxiv.org/abs/1904.01906)文本识别训练和评估流程,与超轻量级中文识别模型训练有两方面不同: + +- 训练时采用的图像分辨率不同,训练上述模型采用的图像分辨率是[3,32,100],而中文模型训练时,为了保证长文本的识别效果,训练时采用的图像分辨率是[3, 32, 320]。预测推理程序默认的形状参数是训练中文采用的图像分辨率,即[3, 32, 320]。因此,这里推理上述英文模型时,需要通过参数rec_image_shape设置识别图像的形状。 + +- 字符列表,DTRB论文中实验只是针对26个小写英文本母和10个数字进行实验,总共36个字符。所有大小字符都转成了小写字符,不在上面列表的字符都忽略,认为是空格。因此这里没有输入字符字典,而是通过如下命令生成字典.因此在推理时需要设置参数rec_char_dict_path,指定为英文字典"./ppocr/utils/ic15_dict.txt"。 + +```python linenums="1" +self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" +dict_character = list(self.character_str) +``` + +### 3. 基于SRN损失的识别模型推理 + +基于SRN损失的识别模型,需要额外设置识别算法参数 --rec_algorithm="SRN"。 +同时需要保证预测shape与训练时一致,如: --rec_image_shape="1, 64, 256" + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" \ + --rec_model_dir="./inference/srn/" \ + --rec_image_shape="1, 64, 256" \ + --rec_char_dict_path="./ppocr/utils/ic15_dict.txt" \ + --rec_algorithm="SRN" +``` + +### 4. 自定义文本识别字典的推理 + +如果训练时修改了文本的字典,在使用inference模型预测时,需要通过`--rec_char_dict_path`指定使用的字典路径 + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./your inference model" --rec_image_shape="3, 32, 100" --rec_char_dict_path="your text dict path" +``` + +### 5. 多语言模型的推理 + +如果您需要预测的是其他语言模型,在使用inference模型预测时,需要通过`--rec_char_dict_path`指定使用的字典路径, 同时为了得到正确的可视化结果, +需要通过 `--vis_font_path` 指定可视化的字体路径,`doc/fonts/` 路径下有默认提供的小语种字体,例如韩文识别: + +```bash linenums="1" +python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/korean/1.jpg" --rec_model_dir="./your inference model" --rec_char_dict_path="ppocr/utils/dict/korean_dict.txt" --vis_font_path="doc/fonts/korean.ttf" +``` + +![img](./images/1.jpg) + +执行命令后,上图的预测结果为: + +``` text +Predicts of ./doc/imgs_words/korean/1.jpg:('바탕으로', 0.9948904) +``` + +## 四、方向分类模型推理 + +下面将介绍方向分类模型推理。 + +### 1. 方向分类模型推理 + +方向分类模型推理,可以执行如下命令: + +```bash linenums="1" +# 下载超轻量中文方向分类器模型: +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar +tar xf ch_ppocr_mobile_v2.0_cls_infer.tar +python3 tools/infer/predict_cls.py --image_dir="./doc/imgs_words/ch/word_4.jpg" --cls_model_dir="ch_ppocr_mobile_v2.0_cls_infer" +``` + +![img](./images/word_1.jpg) + +执行命令后,上面图像的预测结果(分类的方向和得分)会打印到屏幕上,示例如下: + +```bash linenums="1" +Predicts of ./doc/imgs_words/ch/word_4.jpg:['0', 0.9999982] +``` + +## 五、文本检测、方向分类和文字识别串联推理 + +### 1. 
超轻量中文OCR模型推理 + +在执行预测时,需要通过参数`image_dir`指定单张图像或者图像集合的路径、参数`det_model_dir`,`cls_model_dir`和`rec_model_dir`分别指定检测,方向分类和识别的inference模型路径。参数`use_angle_cls`用于控制是否启用方向分类模型。`use_mp`表示是否使用多进程。`total_process_num`表示在使用多进程时的进程数。可视化识别结果默认保存到 ./inference_results 文件夹里面。 + +```bash linenums="1" +# 使用方向分类器 +python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./inference/det_db/" --cls_model_dir="./inference/cls/" --rec_model_dir="./inference/rec_crnn/" --use_angle_cls=true + +# 不使用方向分类器 +python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./inference/det_db/" --rec_model_dir="./inference/rec_crnn/" --use_angle_cls=false + +# 使用多进程 +python3 tools/infer/predict_system.py --image_dir="./doc/imgs/00018069.jpg" --det_model_dir="./inference/det_db/" --rec_model_dir="./inference/rec_crnn/" --use_angle_cls=false --use_mp=True --total_process_num=6 +``` + +执行命令后,识别结果图像如下: + +![](./images/system_res_00018069_v3.jpg) + +### 2. 其他模型推理 + +如果想尝试使用其他检测算法或者识别算法,请参考上述文本检测模型推理和文本识别模型推理,更新相应配置和模型。 + +**注意:由于检测框矫正逻辑的局限性,暂不支持使用SAST弯曲文本检测模型(即,使用参数`--det_sast_polygon=True`时)进行模型串联。** + +下面给出基于EAST文本检测和STAR-Net文本识别执行命令: + +```bash linenums="1" +python3 tools/infer/predict_system.py --image_dir="./doc/imgs_en/img_10.jpg" --det_model_dir="./inference/det_east/" --det_algorithm="EAST" --rec_model_dir="./inference/starnet/" --rec_image_shape="3, 32, 100" --rec_char_dict_path="./ppocr/utils/ic15_dict.txt" +``` + +执行命令后,识别结果图像如下: + +![img](./images/img_10_east_starnet.jpg) + +## 六、参数解释 + +更多关于预测过程的参数解释如下所示。 + +- 全局信息 + +| 参数名称 | 类型 | 默认值 | 含义 | +| :--: | :--: | :--: | :--: | +| image_dir | str | 无,必须显式指定 | 图像或者文件夹路径 | +| vis_font_path | str | "./doc/fonts/simfang.ttf" | 用于可视化的字体路径 | +| drop_score | float | 0.5 | 识别得分小于该值的结果会被丢弃,不会作为返回结果 | +| use_pdserving | bool | False | 是否使用Paddle Serving进行预测 | +| warmup | bool | False | 是否开启warmup,在统计预测耗时的时候,可以使用这种方法 | +| draw_img_save_dir | str | "./inference_results" | 系统串联预测OCR结果的保存文件夹 | +| save_crop_res | bool | False | 是否保存OCR的识别文本图像 | +| crop_res_save_dir | str | "./output" | 保存OCR识别出来的文本图像路径 | +| use_mp | bool | False | 是否开启多进程预测 | +| total_process_num | int | 6 | 开启的进城数,`use_mp`为`True`时生效 | +| process_id | int | 0 | 当前进程的id号,无需自己修改 | +| benchmark | bool | False | 是否开启benchmark,对预测速度、显存占用等进行统计 | +| save_log_path | str | "./log_output/" | 开启`benchmark`时,日志结果的保存文件夹 | +| show_log | bool | True | 是否显示预测中的日志信息 | +| use_onnx | bool | False | 是否开启onnx预测 | + +- 预测引擎相关 + +| 参数名称 | 类型 | 默认值 | 含义 | +| :--: | :--: | :--: | :--: | +| use_gpu | bool | True | 是否使用GPU进行预测 | +| ir_optim | bool | True | 是否对计算图进行分析与优化,开启后可以加速预测过程 | +| use_tensorrt | bool | False | 是否开启tensorrt | +| min_subgraph_size | int | 15 | tensorrt中最小子图size,当子图的size大于该值时,才会尝试对该子图使用trt engine计算 | +| precision | str | fp32 | 预测的精度,支持`fp32`, `fp16`, `int8` 3种输入 | +| enable_mkldnn | bool | True | 是否开启mkldnn | +| cpu_threads | int | 10 | 开启mkldnn时,cpu预测的线程数 | + +- 文本检测模型相关 + +| 参数名称 | 类型 | 默认值 | 含义 | +| :--: | :--: | :--: | :--: | +| det_algorithm | str | "DB" | 文本检测算法名称,目前支持`DB`, `EAST`, `SAST`, `PSE` | +| det_model_dir | str | xx | 检测inference模型路径 | +| det_limit_side_len | int | 960 | 检测的图像边长限制 | +| det_limit_type | str | "max" | 检测的变成限制类型,目前支持`min`, `max`,`min`表示保证图像最短边不小于`det_limit_side_len`,`max`表示保证图像最长边不大于`det_limit_side_len` | + +其中,DB算法相关参数如下 + +| 参数名称 | 类型 | 默认值 | 含义 | +| :--: | :--: | :--: | :--: | +| det_db_thresh | float | 0.3 | DB输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点 | +| det_db_box_thresh | float | 0.6 | 检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域 | +| 
det_db_unclip_ratio | float | 1.5 | `Vatti clipping`算法的扩张系数,使用该方法对文字区域进行扩张 | +| max_batch_size | int | 10 | 预测的batch size | +| use_dilation | bool | False | 是否对分割结果进行膨胀以获取更优检测效果 | +| det_db_score_mode | str | "fast" | DB的检测结果得分计算方法,支持`fast`和`slow`,`fast`是根据polygon的外接矩形边框内的所有像素计算平均得分,`slow`是根据原始polygon内的所有像素计算平均得分,计算速度相对较慢一些,但是更加准确一些。 | + +EAST算法相关参数如下 + +| 参数名称 | 类型 | 默认值 | 含义 | +| :--: | :--: | :--: | :--: | +| det_east_score_thresh | float | 0.8 | EAST后处理中score map的阈值 | +| det_east_cover_thresh | float | 0.1 | EAST后处理中文本框的平均得分阈值 | +| det_east_nms_thresh | float | 0.2 | EAST后处理中nms的阈值 | + +SAST算法相关参数如下 + +| 参数名称 | 类型 | 默认值 | 含义 | +| :--: | :--: | :--: | :--: | +| det_sast_score_thresh | float | 0.5 | SAST后处理中的得分阈值 | +| det_sast_nms_thresh | float | 0.5 | SAST后处理中nms的阈值 | +| det_sast_polygon | bool | False | 是否多边形检测,弯曲文本场景(如Total-Text)设置为True | + +PSE算法相关参数如下 + +| 参数名称 | 类型 | 默认值 | 含义 | +| :--: | :--: | :--: | :--: | +| det_pse_thresh | float | 0.0 | 对输出图做二值化的阈值 | +| det_pse_box_thresh | float | 0.85 | 对box进行过滤的阈值,低于此阈值的丢弃 | +| det_pse_min_area | float | 16 | box的最小面积,低于此阈值的丢弃 | +| det_pse_box_type | str | "box" | 返回框的类型,box:四点坐标,poly: 弯曲文本的所有点坐标 | +| det_pse_scale | int | 1 | 输入图像相对于进后处理的图的比例,如`640*640`的图像,网络输出为`160*160`,scale为2的情况下,进后处理的图片shape为`320*320`。这个值调大可以加快后处理速度,但是会带来精度的下降 | + +- 文本识别模型相关 + +| 参数名称 | 类型 | 默认值 | 含义 | +| :--: | :--: | :--: | :--: | +| rec_algorithm | str | "CRNN" | 文本识别算法名称,目前支持`CRNN`, `SRN`, `RARE`, `NETR`, `SAR` | +| rec_model_dir | str | 无,如果使用识别模型,该项是必填项 | 识别inference模型路径 | +| rec_image_shape | list | [3, 32, 320] | 识别时的图像尺寸, | +| rec_batch_num | int | 6 | 识别的batch size | +| max_text_length | int | 25 | 识别结果最大长度,在`SRN`中有效 | +| rec_char_dict_path | str | "./ppocr/utils/ppocr_keys_v1.txt" | 识别的字符字典文件 | +| use_space_char | bool | True | 是否包含空格,如果为`True`,则会在最后字符字典中补充`空格`字符 | + +- 端到端文本检测与识别模型相关 + +| 参数名称 | 类型 | 默认值 | 含义 | +| :--: | :--: | :--: | :--: | +| e2e_algorithm | str | "PGNet" | 端到端算法名称,目前支持`PGNet` | +| e2e_model_dir | str | 无,如果使用端到端模型,该项是必填项 | 端到端模型inference模型路径 | +| e2e_limit_side_len | int | 768 | 端到端的输入图像边长限制 | +| e2e_limit_type | str | "max" | 端到端的边长限制类型,目前支持`min`, `max`,`min`表示保证图像最短边不小于`e2e_limit_side_len`,`max`表示保证图像最长边不大于`e2e_limit_side_len` | +| e2e_pgnet_score_thresh | float | 0.5 | 端到端得分阈值,小于该阈值的结果会被丢弃 | +| e2e_char_dict_path | str | "./ppocr/utils/ic15_dict.txt" | 识别的字典文件路径 | +| e2e_pgnet_valid_set | str | "totaltext" | 验证集名称,目前支持`totaltext`, `partvgg`,不同数据集对应的后处理方式不同,与训练过程保持一致即可 | +| e2e_pgnet_mode | str | "fast" | PGNet的检测结果得分计算方法,支持`fast`和`slow`,`fast`是根据polygon的外接矩形边框内的所有像素计算平均得分,`slow`是根据原始polygon内的所有像素计算平均得分,计算速度相对较慢一些,但是更加准确一些。 | + +- 方向分类器模型相关 + +| 参数名称 | 类型 | 默认值 | 含义 | +| :--: | :--: | :--: | :--: | +| use_angle_cls | bool | False | 是否使用方向分类器 | +| cls_model_dir | str | 无,如果需要使用,则必须显式指定路径 | 方向分类器inference模型路径 | +| cls_image_shape | list | [3, 48, 192] | 预测尺度 | +| label_list | list | ['0', '180'] | class id对应的角度值 | +| cls_batch_num | int | 6 | 方向分类器预测的batch size | +| cls_thresh | float | 0.9 | 预测阈值,模型预测结果为180度,且得分大于该阈值时,认为最终预测结果为180度,需要翻转 | + +## 七、FAQ + +- 如果是使用paddle2.0之前版本的代码导出的`inference模型`,则其文件名为`model`与`params`,分别对应paddle2.0或者之后版本导出的`inference.pdmodel`与`inference.pdiparams`;不过目前PaddleOCR的release分支已经不支持paddle2.0之前版本导出的inference 模型,如果希望使用,需要使用develop分支(静态图分支)的代码与文档。 diff --git a/docs/ppocr/infer_deploy/windows_vs2019_build.en.md b/docs/ppocr/infer_deploy/windows_vs2019_build.en.md new file mode 100644 index 0000000000..0f895ada32 --- /dev/null +++ b/docs/ppocr/infer_deploy/windows_vs2019_build.en.md @@ -0,0 +1,144 @@ +--- +comments: 
true +--- + +# Visual Studio 2019 Community CMake Compilation Guide + +PaddleOCR is tested on Windows based on `Visual Studio 2019 Community`. Microsoft has supported direct management of `CMake` cross-platform compilation projects since `Visual Studio 2017`, but it was not until `2019` that stable and complete support was provided, so if you want to use CMake to manage project compilation and build, we recommend that you use the `Visual Studio 2019` environment to build. + +**All the examples below are demonstrated with the working directory as `D:\projects\cpp`**. + +## 1. Environment Preparation + +### 1.1 Install the required environment + +- Visual Studio 2019 +- CUDA 10.2, cudnn 7+ (only required when using the GPU version of the prediction library) +- CMake 3.22+ + +Please make sure the system has the above basic software installed. We use the community version of `VS2019`. + +### 1.2 Download PaddlePaddle C++ prediction library and Opencv + +#### 1.2.1 Download PaddlePaddle C++ prediction library + +PaddlePaddle C++ prediction library provides different precompiled versions for different `CPU` and `CUDA` versions. Please download according to the actual situation: [C++ prediction library download list](https://www.paddlepaddle.org.cn/inference/master/guides/install/download_lib.html#windows) + +After decompression, the `D:\projects\paddle_inference` directory contains the following contents: + +``` +paddle_inference +├── paddle # paddle core library and header files +| +├── third_party # third-party dependent libraries and header files +| +└── version.txt # version and compilation information +``` + +#### 1.2.2 Install and configure OpenCV + +1. Download Opencv for Windows platform from the OpenCV official website, [Download address](https://github.com/opencv/opencv/releases) +2. Run the downloaded executable file and unzip OpenCV to the specified directory, such as `D:\projects\cpp\opencv` + +#### 1.2.3 Download PaddleOCR code + +```bash linenums="1" +git clone -b dygraph https://github.com/PaddlePaddle/PaddleOCR +``` + +## 2. Start running + +### Step1: Build Visual Studio project + +After cmake is installed, there will be a cmake-gui program in the system. Open cmake-gui, fill in the source code path in the first input box, and fill in the compilation output path in the second input box + +![step1](./images/cmake_step1.jpg) + +### Step2: Execute cmake configuration + +Click the `Configure` button at the bottom of the interface. The first click will pop up a prompt box for Visual Studio configuration, as shown below. Select your Visual Studio version is fine, and the target platform is x64. Then click the `finish` button to start the automatic configuration. + +![step2](./images/cmake_step2.jpg) + +The first execution will report an error, which is normal. 
Next, configure OpenCV and the prediction library
+
+- For the CPU version, only the three parameters OPENCV_DIR, OpenCV_DIR, and PADDLE_LIB need to be set
+
+- OPENCV_DIR: Fill in the location of the opencv lib folder
+
+- OpenCV_DIR: Fill in the location of the opencv lib folder
+
+- PADDLE_LIB: The location of the paddle_inference folder
+
+- For the GPU version, in addition to the CPU settings, the following variables also need to be filled in:
+CUDA_LIB, CUDNN_LIB, TENSORRT_DIR, WITH_GPU, WITH_TENSORRT
+
+- CUDA_LIB: The CUDA library path, such as `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\lib\x64`
+
+- CUDNN_LIB: The same as CUDA_LIB
+
+- TENSORRT_DIR: The location where TensorRT was unzipped after downloading, such as `D:\TensorRT-8.0.1.6`
+- WITH_GPU: Check
+- WITH_TENSORRT: Check
+
+The configured screenshot is as follows
+
+![step3](./images/cmake_step3.jpg)
+
+After the configuration is completed, click the `Configure` button again.
+
+**Note:**
+
+1. If you are using the `openblas` version, please uncheck `WITH_MKL`
+2. If you encounter the error `unable to access 'https://github.com/LDOUBLEV/AutoLog.git/': gnutls_handshake() failed: The TLS connection was non-properly terminated.`, change the GitHub address in `deploy/cpp_infer/external-cmake/auto-log.cmake` to a reachable mirror address.
+
+### Step3: Generate Visual Studio Project
+
+Click the `Generate` button to generate the sln file of the Visual Studio project.
+![step4](./images/cmake_step4.jpg)
+
+Click the `Open Project` button to open the project in Visual Studio. The screenshot after opening is as follows
+
+![step5](./images/vs_step1.jpg)
+
+Before starting to generate the solution, perform the following steps:
+
+1. Change `Debug` to `Release`
+
+2. Download [dirent.h](https://paddleocr.bj.bcebos.com/deploy/cpp_infer/cpp_files/dirent.h) and copy it to the include folder of Visual Studio, such as `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\VS\include`.
+
+Click `Build->Generate Solution`, and you can see the `ppocr.exe` file in the `build/Release/` folder.
+
+Before running, copy the following files to the `build/Release/` folder
+
+1. `paddle_inference/paddle/lib/paddle_inference.dll`
+
+2. `paddle_inference/third_party/install/onnxruntime/lib/onnxruntime.dll`
+
+3. `paddle_inference/third_party/install/paddle2onnx/lib/paddle2onnx.dll`
+
+4. `opencv/build/x64/vc15/bin/opencv_world455.dll`
+
+5. If you use the prediction library of the openblas version, you also need to copy `paddle_inference/third_party/install/openblas/lib/openblas.dll`
+
+### Step4: Prediction
+
+The executable file produced by the above `Visual Studio 2019` build is located in the `build/Release/` directory. Open `cmd` and switch to `D:\projects\cpp\PaddleOCR\deploy\cpp_infer\`:
+
+cd /d D:\projects\cpp\PaddleOCR\deploy\cpp_infer
+
+The executable file `ppocr.exe` is the sample prediction program. Its main usage is as follows. For more usage, please refer to the [Instructions](./cpp_infer.en.md) section of running demo.
+ +```bash linenums="1" +# Switch terminal encoding to utf8 +CHCP 65001 +# Execute prediction +.\build\Release\ppocr.exe system --det_model_dir=D:\projects\cpp\ch_PP-OCRv2_det_slim_quant_infer --rec_model_dir=D:\projects\cpp\ch_PP-OCRv2_rec_slim_quant_infer --image_dir=D:\projects\cpp\PaddleOCR\doc\imgs\11.jpg +``` + +The recognition result is as follows +![result](./images/result.jpg) + +## FAQ + +- When running, a pop-up window prompts `The application cannot be started normally (0xc0000142)`, and the `cmd` window prompts `You are using Paddle compiled with TensorRT, but TensorRT dynamic library is not found.`, copy all the dll files in the lib in the tensor directory to the release directory, and run it again. diff --git a/docs/ppocr/infer_deploy/windows_vs2019_build.md b/docs/ppocr/infer_deploy/windows_vs2019_build.md new file mode 100644 index 0000000000..61c7c0fc36 --- /dev/null +++ b/docs/ppocr/infer_deploy/windows_vs2019_build.md @@ -0,0 +1,137 @@ +--- +comments: true +--- + +# Visual Studio 2019 Community CMake 编译指南 + +PaddleOCR在Windows 平台下基于`Visual Studio 2019 Community` 进行了测试。微软从`Visual Studio 2017`开始即支持直接管理`CMake`跨平台编译项目,但是直到`2019`才提供了稳定和完全的支持,所以如果你想使用CMake管理项目编译构建,我们推荐你使用`Visual Studio 2019`环境下构建。 + +**下面所有示例以工作目录为 `D:\projects\cpp`演示**。 + +## 1. 环境准备 + +### 1.1 安装必须环境 + +- Visual Studio 2019 +- CUDA 10.2,cudnn 7+ (仅在使用GPU版本的预测库时需要) +- CMake 3.22+ + +请确保系统已经安装好上述基本软件,我们使用的是`VS2019`的社区版。 + +### 1.2 下载 PaddlePaddle C++ 预测库和 Opencv + +#### 1.2.1 下载 PaddlePaddle C++ 预测库 + +PaddlePaddle C++ 预测库针对不同的`CPU`和`CUDA`版本提供了不同的预编译版本,请根据实际情况下载: [C++预测库下载列表](https://www.paddlepaddle.org.cn/inference/master/guides/install/download_lib.html#windows) + +解压后`D:\projects\paddle_inference`目录包含内容为: + +``` +paddle_inference +├── paddle # paddle核心库和头文件 +| +├── third_party # 第三方依赖库和头文件 +| +└── version.txt # 版本和编译信息 +``` + +#### 1.2.2 安装配置OpenCV + +1. 在OpenCV官网下载适用于Windows平台的Opencv, [下载地址](https://github.com/opencv/opencv/releases) +2. 运行下载的可执行文件,将OpenCV解压至指定目录,如`D:\projects\cpp\opencv` + +#### 1.2.3 下载PaddleOCR代码 + +```bash linenums="1" +git clone -b dygraph https://github.com/PaddlePaddle/PaddleOCR +``` + +## 2. 开始运行 + +### Step1: 构建Visual Studio项目 + +cmake安装完后后系统里会有一个cmake-gui程序,打开cmake-gui,在第一个输入框处填写源代码路径,第二个输入框处填写编译输出路径 + +![step1](./images/cmake_step1.jpg) + +### Step2: 执行cmake配置 + +点击界面下方的`Configure`按钮,第一次点击会弹出提示框进行Visual Studio配置,如下图,选择你的Visual Studio版本即可,目标平台选择x64。然后点击`finish`按钮即开始自动执行配置。 + +![step2](./images/cmake_step2.jpg) + +第一次执行会报错,这是正常现象,接下来进行Opencv和预测库的配置 + +- cpu版本,仅需考虑OPENCV_DIR、OpenCV_DIR、PADDLE_LIB三个参数 + + - OPENCV_DIR:填写opencv lib文件夹所在位置 + - OpenCV_DIR:同填写opencv lib文件夹所在位置 + - PADDLE_LIB:paddle_inference文件夹所在位置 + +- GPU版本,在cpu版本的基础上,还需填写以下变量 +CUDA_LIB、CUDNN_LIB、TENSORRT_DIR、WITH_GPU、WITH_TENSORRT + +- CUDA_LIB: CUDA地址,如 `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\lib\x64` +- CUDNN_LIB: 和CUDA_LIB一致 +- TENSORRT_DIR:TRT下载后解压缩的位置,如 `D:\TensorRT-8.0.1.6` +- WITH_GPU: 打钩 +- WITH_TENSORRT:打勾 + +配置好的截图如下 + +![step3](./images/cmake_step3.jpg) + +配置完成后,再次点击`Configure`按钮。 + +**注意:** + + 1. 如果使用的是`openblas`版本,请把`WITH_MKL`勾去掉 + 2. 遇到报错 `unable to access 'https://github.com/LDOUBLEV/AutoLog.git/': gnutls_handshake() failed: The TLS connection was non-properly terminated.`, 将 `deploy/cpp_infer/external-cmake/auto-log.cmake` 中的github地址改为 地址即可。 + +### Step3: 生成Visual Studio 项目 + +点击`Generate`按钮即可生成Visual Studio 项目的sln文件。 +![step4](./images/cmake_step4.jpg) + +点击`Open Project`按钮即可在Visual Studio 中打开项目。打开后截图如下 + +![step5](./images/vs_step1.jpg) + +在开始生成解决方案之前,执行下面步骤: + +1. 
将`Debug`改为`Release` +2. 下载[dirent.h](https://paddleocr.bj.bcebos.com/deploy/cpp_infer/cpp_files/dirent.h),并拷贝到 Visual Studio 的 include 文件夹下,如`C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\VS\include`。 + +点击`生成->生成解决方案`,即可在`build/Release/`文件夹下看见`ppocr.exe`文件。 + +运行之前,将下面文件拷贝到`build/Release/`文件夹下 + +1. `paddle_inference/paddle/lib/paddle_inference.dll` +2. `paddle_inference/third_party/install/onnxruntime/lib/onnxruntime.dll` +3. `paddle_inference/third_party/install/paddle2onnx/lib/paddle2onnx.dll` +4. `opencv/build/x64/vc15/bin/opencv_world455.dll` +5. 如果使用openblas版本的预测库还需要拷贝 `paddle_inference/third_party/install/openblas/lib/openblas.dll` + +### Step4: 预测 + +上述`Visual Studio 2019`编译产出的可执行文件在`build/Release/`目录下,打开`cmd`,并切换到`D:\projects\cpp\PaddleOCR\deploy\cpp_infer\`: + +```bash linenums="1" +cd /d D:\projects\cpp\PaddleOCR\deploy\cpp_infer +``` + +可执行文件`ppocr.exe`即为样例的预测程序,其主要使用方法如下,更多使用方法可以参考[说明文档](./cpp_infer.md)`运行demo`部分。 + +```bash linenums="1" +# 切换终端编码为utf8 +CHCP 65001 +# 执行预测 +.\build\Release\ppocr.exe system --det_model_dir=D:\projects\cpp\ch_PP-OCRv2_det_slim_quant_infer --rec_model_dir=D:\projects\cpp\ch_PP-OCRv2_rec_slim_quant_infer --image_dir=D:\projects\cpp\PaddleOCR\doc\imgs\11.jpg +``` + +识别结果如下 +![result](./images/result.jpg) + +## FAQ + +- 运行时,弹窗报错提示`应用程序无法正常启动(0xc0000142)`,并且`cmd`窗口内提示`You are using Paddle compiled with TensorRT, but TensorRT dynamic library is not found.`,把tensort目录下的lib里面的所有dll文件复制到release目录下,再次运行即可。 diff --git a/docs/ppocr/installation.en.md b/docs/ppocr/installation.en.md new file mode 100644 index 0000000000..3ac7533d34 --- /dev/null +++ b/docs/ppocr/installation.en.md @@ -0,0 +1,85 @@ +--- +comments: true +--- + +## Quick Installation + +After testing, PaddleOCR can run on glibc 2.23. You can also test other glibc versions or install glibc 2.23 for the best compatibility. + +PaddleOCR working environment: + +- PaddlePaddle 2.0.0 +- Python 3.7 +- glibc 2.23 + +It is recommended to use the docker provided by us to run PaddleOCR. Please refer to the docker tutorial [link](https://www.runoob.com/docker/docker-tutorial.html/). + +*If you want to directly run the prediction code on Mac or Windows, you can start from step 2.* + +### 1. (Recommended) Prepare a docker environment + +For the first time you use this docker image, it will be downloaded automatically. Please be patient. + +```bash linenums="1" +# Switch to the working directory +cd /home/Projects +# You need to create a docker container for the first run, and do not need to run the current command when you run it again +# Create a docker container named ppocr and map the current directory to the /paddle directory of the container + +#If using CPU, use docker instead of nvidia-docker to create docker +sudo docker run --name ppocr -v $PWD:/paddle --network=host -it paddlepaddle/paddle:latest-dev-cuda10.1-cudnn7-gcc82 /bin/bash +``` + +With CUDA10, please run the following command to create a container. +It is recommended to set a shared memory greater than or equal to 32G through the --shm-size parameter: + +```bash linenums="1" +sudo nvidia-docker run --name ppocr -v $PWD:/paddle --shm-size=64G --network=host -it paddlepaddle/paddle:latest-dev-cuda10.1-cudnn7-gcc82 /bin/bash +``` + +You can also visit [DockerHub](https://hub.docker.com/r/paddlepaddle/paddle/tags/) to get the image that fits your machine. + +```bash linenums="1" +# ctrl+P+Q to exit docker, to re-enter docker using the following command: +sudo docker container exec -it ppocr /bin/bash +``` + +### 2. 
Install PaddlePaddle 2.0 + +```bash linenums="1" +pip3 install --upgrade pip + +# If you have cuda9 or cuda10 installed on your machine, please run the following command to install +python3 -m pip install paddlepaddle-gpu==2.0.0 -i https://mirror.baidu.com/pypi/simple + +# If you only have cpu on your machine, please run the following command to install +python3 -m pip install paddlepaddle==2.0.0 -i https://mirror.baidu.com/pypi/simple +``` + +For more software version requirements, please refer to the instructions in [Installation Document](https://www.paddlepaddle.org.cn/install/quick) for operation. + +### 3. Clone PaddleOCR repo + +```bash linenums="1" +# Recommend +git clone https://github.com/PaddlePaddle/PaddleOCR + +# If you cannot pull successfully due to network problems, you can switch to the mirror hosted on Gitee: + +git clone https://gitee.com/paddlepaddle/PaddleOCR + +# Note: The mirror on Gitee may not keep in synchronization with the latest update with the project on GitHub. There might be a delay of 3-5 days. Please try GitHub at first. +``` + +### 4. Install third-party libraries + +```bash linenums="1" +cd PaddleOCR +pip3 install -r requirements.txt +``` + +If you getting this error `OSError: [WinError 126] The specified module could not be found` when you install shapely on windows. + +Please try to download Shapely whl file from [http://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely](http://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely). + +Reference: [Solve shapely installation on windows](https://stackoverflow.com/questions/44398265/install-shapely-oserror-winerror-126-the-specified-module-could-not-be-found) diff --git a/docs/ppocr/installation.md b/docs/ppocr/installation.md new file mode 100644 index 0000000000..06de5e4b3c --- /dev/null +++ b/docs/ppocr/installation.md @@ -0,0 +1,76 @@ +--- +comments: true +--- + +## 快速安装 + +经测试PaddleOCR可在glibc 2.23上运行,您也可以测试其他glibc版本或安装glic 2.23 +PaddleOCR 工作环境 + +- PaddlePaddle 2.0.0 +- python3.7 +- glibc 2.23 +- cuDNN 7.6+ (GPU) + +建议使用我们提供的docker运行PaddleOCR,有关docker、nvidia-docker使用请参考[链接](https://www.runoob.com/docker/docker-tutorial.html/)。 + +*如您希望使用 mac 或 windows直接运行预测代码,可以从第2步开始执行。* + +### 1. (建议)准备docker环境 + +第一次使用这个镜像,会自动下载该镜像,请耐心等待 + +```bash linenums="1" +# 切换到工作目录下 +cd /home/Projects +# 首次运行需创建一个docker容器,再次运行时不需要运行当前命令 +# 创建一个名字为ppocr的docker容器,并将当前目录映射到容器的/paddle目录下 + +如果您希望在CPU环境下使用docker,使用docker而不是nvidia-docker创建docker +sudo docker run --name ppocr -v $PWD:/paddle --network=host -it paddlepaddle/paddle:latest-dev-cuda10.1-cudnn7-gcc82 /bin/bash + +如果使用CUDA10,请运行以下命令创建容器,设置docker容器共享内存shm-size为64G,建议设置32G以上 +sudo nvidia-docker run --name ppocr -v $PWD:/paddle --shm-size=64G --network=host -it paddlepaddle/paddle:latest-dev-cuda10.1-cudnn7-gcc82 /bin/bash + +您也可以访问[DockerHub](https://hub.docker.com/r/paddlepaddle/paddle/tags/)获取与您机器适配的镜像。 + +# ctrl+P+Q可退出docker 容器,重新进入docker 容器使用如下命令 +sudo docker container exec -it ppocr /bin/bash +``` + +### 2. 安装PaddlePaddle 2.0 + +```bash linenums="1" +pip3 install --upgrade pip + +# 如果您的机器安装的是CUDA9或CUDA10,请运行以下命令安装 +python3 -m pip install paddlepaddle-gpu==2.0.0 -i https://mirror.baidu.com/pypi/simple + +# 如果您的机器是CPU,请运行以下命令安装 +python3 -m pip install paddlepaddle==2.0.0 -i https://mirror.baidu.com/pypi/simple + +# 更多的版本需求,请参照[安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。 +``` + +### 3. 
克隆PaddleOCR repo代码 + +```bash linenums="1" +#【推荐】 +git clone https://github.com/PaddlePaddle/PaddleOCR + +# 如果因为网络问题无法pull成功,也可选择使用码云上的托管: + +git clone https://gitee.com/paddlepaddle/PaddleOCR + +# 注:码云托管代码可能无法实时同步本github项目更新,存在3~5天延时,请优先使用推荐方式。 +``` + +### 4. 安装第三方库 + +```bash linenums="1" +cd PaddleOCR +pip3 install -r requirements.txt +``` + +注意,windows环境下,建议从[这里](https://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely)下载shapely安装包完成安装, +直接通过pip安装的shapely库可能出现`[winRrror 126] 找不到指定模块的问题`。 diff --git a/docs/ppocr/model_compress/knowledge_distillation.en.md b/docs/ppocr/model_compress/knowledge_distillation.en.md new file mode 100755 index 0000000000..8b54e6f108 --- /dev/null +++ b/docs/ppocr/model_compress/knowledge_distillation.en.md @@ -0,0 +1,610 @@ +--- +comments: true +--- + +# Knowledge Distillation + +## 1. Introduction + +### 1.1 Introduction to Knowledge Distillation + +In recent years, deep neural networks have been proved to be an extremely effective method for solving problems in the fields of computer vision and natural language processing. +By constructing a suitable neural network and training it, the performance metrics of the final network model will basically exceed the traditional algorithm. +When the amount of data is large enough, increasing the amount of parameters by constructing a reasonable network model can significantly improve the performance of the model, +but this brings about the problem of a sharp increase in the complexity of the model. Large models are more expensive to use in actual scenarios. +Deep neural networks generally have more parameter redundancy. At present, there are several main methods to compress the model and reduce the amount of its parameters. +Such as pruning, quantification, knowledge distillation, etc., where knowledge distillation refers to the use of teacher models to guide student models to learn specific tasks, +to ensure that the small model obtains a relatively large performance improvement under the condition of unchanged parameters. +In addition, in the knowledge distillation task, a mutual learning model training method was also derived. +The paper [Deep Mutual Learning](https://arxiv.org/abs/1706.00384) pointed out that using two identical models to supervise each other during the training process can achieve better results than a single model training. + +### 1.2 Introduction to PaddleOCR Knowledge Distillation + +Whether it is a large model distilling a small model, or a small model learning from each other and updating parameters, +they are essentially the output between different models or mutual supervision between feature maps. +The only difference is (1) whether the model requires fixed parameters. (2) Whether the model needs to be loaded with a pre-trained model. +For the case where a large model distills a small model, the large model generally needs to load the pre-trained model and fix the parameters. +For the situation where small models distill each other, the small models generally do not load the pre-trained model, and the parameters are also in a learnable state. + +In the task of knowledge distillation, it is not only the distillation between two models, but also the situation where multiple models learn from each other. +Therefore, in the knowledge distillation code framework, it is also necessary to support this type of distillation method. + +The algorithm of knowledge distillation is integrated in PaddleOCR. 
Specifically, it has the following main features: + +- It supports mutual learning of any network, and does not require the sub-network structure to be completely consistent or to have a pre-trained model. At the same time, there is no limit to the number of sub-networks, just add it in the configuration file. +- Support arbitrarily configuring the loss function through the configuration file, not only can use a certain loss, but also a combination of multiple losses. +- Support all model-related environments such as knowledge distillation training, prediction, evaluation, and export, which is convenient for use and deployment. + +Through knowledge distillation, in the common Chinese and English text recognition task, without adding any time-consuming prediction, +the accuracy of the model can be improved by more than 3%. Combining the learning rate adjustment strategy and the model structure fine-tuning strategy, +the final improvement is more than 5%. + +## 2. Configuration File Analysis + +In the process of knowledge distillation training, there is no change in data preprocessing, optimizer, learning rate, and some global attributes. +The configuration files of the model structure, loss function, post-processing, metric calculation and other modules need to be fine-tuned. + +The following takes the knowledge distillation configuration file for recognition and detection as an example to analyze the training and configuration of knowledge distillation. + +### 2.1 Recognition Model Configuration File Analysis + +The configuration file is in [ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml). + +#### 2.1.1 Model Structure + +In the knowledge distillation task, the model structure configuration is as follows. + +```yaml linenums="1" +Architecture: + model_type: &model_type "rec" # Model category, recognition, detection, etc. + name: DistillationModel # Structure name, in the distillation task, it is DistillationModel + algorithm: Distillation # Algorithm name + Models: # Model, including the configuration information of the subnet + Teacher: # The name of the subnet, it must include at least the `pretrained` and `freeze_params` parameters, and the other parameters are the construction parameters of the subnet + pretrained: # Does this sub-network need to load pre-training weights + freeze_params: false # Do you need fixed parameters + return_all_feats: true # Do you need to return all features, if it is False, only the final output is returned + model_type: *model_type # Model category + algorithm: SVTR # The algorithm name of the sub-network. 
The remaining parameters of the sub-network are consistent with the general model training configuration + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + Student: # Another sub-network, here is a distillation example of DML, the two sub-networks have the same structure, and both need to learn parameters + pretrained: # The following parameters are the same as above + freeze_params: false + return_all_feats: true + model_type: *model_type + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length +``` + +If you want to add more sub-networks for training, you can also add the corresponding fields in the configuration file according to the way of adding `Student` and `Teacher`. +For example, if you want 3 models to supervise each other and train together, then `Architecture` can be written in the following format. + +```yaml linenums="1" +Architecture: + model_type: &model_type "rec" + name: DistillationModel + algorithm: Distillation + Models: + Teacher: + pretrained: + freeze_params: false + return_all_feats: true + model_type: *model_type + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + Student: + pretrained: + freeze_params: false + return_all_feats: true + model_type: *model_type + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + Student2: + pretrained: + freeze_params: false + return_all_feats: true + model_type: *model_type + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length +``` + +When the model is finally trained, it contains 3 sub-networks: `Teacher`, `Student`, `Student2`. + +The specific implementation code of the `DistillationModel` class can refer to [distillation_model.py](../../ppocr/modeling/architectures/distillation_model.py). +The final model output is a dictionary, the key is the name of all the sub-networks, for example, here are `Student` and `Teacher`, and the value is the output of the corresponding sub-network, +which can be `Tensor` (only the last layer of the network is returned) and `dict` (also returns the characteristic information in the middle). 
+In the recognition task, in order to add more loss functions and ensure the scalability of the distillation method, the output of each sub-network is saved as a `dict`, which contains the sub-module output. +Take the recognition model as an example. The output result of each sub-network is `dict`, the key contains `backbone_out`, `neck_out`, `head_out`, and `value` is the tensor of the corresponding module. Finally, for the above configuration file, `DistillationModel` The output format is as follows. + +```json +{ + "Teacher": { + "backbone_out": tensor, + "neck_out": tensor, + "head_out": tensor, + }, + "Student": { + "backbone_out": tensor, + "neck_out": tensor, + "head_out": tensor, + } +} +``` + +#### 2.1.2 Loss Function + +In the knowledge distillation task, the loss function configuration is as follows. + +```yaml linenums="1" +Loss: + name: CombinedLoss # Loss function name + loss_config_list: # List of loss function configuration files, mandatory functions for CombinedLoss + - DistillationCTCLoss: # CTC loss function based on distillation, inherited from standard CTC loss + weight: 1.0 # The weight of the loss function. In loss_config_list, each loss function must include this field + model_name_list: ["Student", "Teacher"] # For the prediction results of the distillation model, extract the output of these two sub-networks and calculate the CTC loss with gt + key: head_out # In the sub-network output dict, take the corresponding tensor + - DistillationDMLLoss: # DML loss function, inherited from the standard DMLLoss + weight: 1.0 + act: "softmax" # Activation function, use it to process the input, can be softmax, sigmoid or None, the default is None + model_name_pairs: # The subnet name pair used to calculate DML loss. If you want to calculate the DML loss of other subnets, you can continue to add it below the list + - ["Student", "Teacher"] + key: head_out + multi_head: True # whether to use mult_head + dis_head: ctc # assign the head name to calculate loss + name: dml_ctc # prefix name of the loss + - DistillationDMLLoss: # DML loss function, inherited from the standard DMLLoss + weight: 0.5 + act: "softmax" # Activation function, use it to process the input, can be softmax, sigmoid or None, the default is None + model_name_pairs: # The subnet name pair used to calculate DML loss. If you want to calculate the DML loss of other subnets, you can continue to add it below the list + - ["Student", "Teacher"] + key: head_out + multi_head: True # whether to use mult_head + dis_head: sar # assign the head name to calculate loss + name: dml_sar # prefix name of the loss + - DistillationDistanceLoss: # Distilled distance loss function + weight: 1.0 + mode: "l2" # Support l1, l2 or smooth_l1 + model_name_pairs: # Calculate the distance loss of the subnet name pair + - ["Student", "Teacher"] + key: backbone_out + - DistillationSARLoss: # SAR loss function based on distillation, inherited from standard SAR loss + weight: 1.0 # The weight of the loss function. 
In loss_config_list, each loss function must include this field + model_name_list: ["Student", "Teacher"] # For the prediction results of the distillation model, extract the output of these two sub-networks and calculate the SAR loss with gt + key: head_out # In the sub-network output dict, take the corresponding tensor + multi_head: True # whether it is multi-head or not, if true, SAR branch is used to calculate the loss +``` + +Among the above loss functions, all distillation loss functions are inherited from the standard loss function class. +The main functions are: Analyze the output of the distillation model, find the intermediate node (tensor) used to calculate the loss, +and then use the standard loss function class to calculate. + +Taking the above configuration as an example, the final distillation training loss function contains the following five parts. + +- CTC branch of the final output `head_out` for `Student` and `Teacher` calculates the CTC loss with gt (loss weight equals 1.0). Here, because both sub-networks need to update the parameters, both of them need to calculate the loss with gt. +- SAR branch of the final output `head_out` for `Student` and `Teacher` calculates the SAR loss with gt (loss weight equals 1.0). Here, because both sub-networks need to update the parameters, both of them need to calculate the loss with gt. +- DML loss between CTC branch of `Student` and `Teacher`'s final output `head_out` (loss weight equals 1.0). +- DML loss between SAR branch of `Student` and `Teacher`'s final output `head_out` (loss weight equals 0.5). +- L2 loss between `Student` and `Teacher`'s backbone network output `backbone_out` (loss weight equals 1.0). + +For more specific implementation of `CombinedLoss`, please refer to: [combined_loss.py](../../ppocr/losses/combined_loss.py#L23). +For more specific implementations of distillation loss functions such as `DistillationCTCLoss`, please refer to [distillation_loss.py](../../ppocr/losses/distillation_loss.py) + +#### 2.1.3 Post-processing + +In the knowledge distillation task, the post-processing configuration is as follows. + +```yaml linenums="1" +PostProcess: + name: DistillationCTCLabelDecode # CTC decoding post-processing of distillation tasks, inherited from the standard CTCLabelDecode class + model_name: ["Student", "Teacher"] # For the prediction results of the distillation model, extract the outputs of these two sub-networks and decode them + key: head_out # Take the corresponding tensor in the subnet output dict + multi_head: True # whether it is multi-head or not, if true, CTC branch is used to calculate the loss +``` + +Taking the above configuration as an example, the CTC decoding output of the two sub-networks `Student` and `Teahcer` will be calculated at the same time. +Among them, `key` is the name of the subnet, and `value` is the list of subnets. + +For more specific implementation of `DistillationCTCLabelDecode`, please refer to: [rec_postprocess.py](../../ppocr/postprocess/rec_postprocess.py#L128) + +#### 2.1.4 Metric Calculation + +In the knowledge distillation task, the metric calculation configuration is as follows. + +```yaml linenums="1" +Metric: + name: DistillationMetric # CTC decoding post-processing of distillation tasks, inherited from the standard CTCLabelDecode class + base_metric_name: RecMetric # The base class of indicator calculation. 
For the output of the model, the indicator will be calculated based on this class + main_indicator: acc # The name of the indicator + key: "Student" # Select the main_indicator of this subnet as the criterion for saving the best model + ignore_space: False # whether to ignore space during evaulation +``` + +Taking the above configuration as an example, the accuracy metric of the `Student` subnet will be used as the judgment metric for saving the best model. +At the same time, the accuracy metric of all subnets will be printed out in the log. + +For more specific implementation of `DistillationMetric`, please refer to: [distillation_metric.py](../../ppocr/metrics/distillation_metric.py#L24). + +#### 2.1.5 Fine-tuning Distillation Model + +There are two ways to fine-tune the recognition distillation task. + +1. Fine-tuning based on knowledge distillation: this situation is relatively simple, download the pre-trained model. Then configure the pre-training model path and your own data path in [ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) to perform fine-tuning training of the model. +2. Do not use knowledge distillation in fine-tuning: In this case, you need to first extract the student model parameters from the pre-training model. The specific steps are as follows. + +- First download the pre-trained model and unzip it. + +```bash linenums="1" +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar +tar -xf ch_PP-OCRv3_rec_train.tar +``` + +- Then use python to extract the student model parameters + +```python linenums="1" +import paddle +# Load the pre-trained model +all_params = paddle.load("ch_PP-OCRv3_rec_train/best_accuracy.pdparams") +# View the keys of the weight parameter +print(all_params.keys()) +# Weight extraction of student model +s_params = {key[len("Student."):]: all_params[key] for key in all_params if "Student." in key} +# View the keys of the weight parameters of the student model +print(s_params.keys()) +# Save weight parameters +paddle.save(s_params, "ch_PP-OCRv3_rec_train/student.pdparams") +``` + +After the extraction is complete, use [ch_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml) to modify the path of the pre-trained model (the path of the exported `student.pdparams` model) and your own data path to fine-tune the model. 
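+
+For reference, a fine-tuning run with the extracted student weights might be launched as sketched below. This is only an illustrative command, assuming the standard `tools/train.py` entry point and that `Global.pretrained_model` / `Global.save_model_dir` can be overridden on the command line; replace the config, weight path and output directory with your own.
+
+```bash linenums="1"
+# Sketch: fine-tune the extracted student model without distillation.
+# Global.pretrained_model points to the student.pdparams exported above (extension omitted).
+python3 tools/train.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml \
+    -o Global.pretrained_model=./ch_PP-OCRv3_rec_train/student \
+       Global.save_model_dir=./output/rec_ppocrv3_finetune/
+```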
+ +### 2.2 Detection Model Configuration File Analysis + +The configuration file of the detection model distillation is in the ```PaddleOCR/configs/det/ch_PP-OCRv3/``` directory, which contains three distillation configuration files: + +- ```ch_PP-OCRv3_det_cml.yml```, Use one large model to distill two small models, and the two small models learn from each other +- ```ch_PP-OCRv3_det_dml.yml```, Method of mutual distillation of two student models + +#### 2.2.1 Model Structure + +In the knowledge distillation task, the model structure configuration is as follows: + +```yaml linenums="1" +Architecture: + name: DistillationModel # Structure name, in the distillation task, it is DistillationModel + algorithm: Distillation # Algorithm name + Models: # Model, including the configuration information of the subnet + Student: # The name of the subnet, it must include at least the `pretrained` and `freeze_params` parameters, and the other parameters are the construction parameters of the subnet + pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained # Does this sub-network need to load pre-training weights + freeze_params: false # Do you need fixed parameters + return_all_feats: false # Do you need to return all features, if it is False, only the final output is returned + model_type: det + algorithm: DB + Backbone: + name: ResNet + in_channels: 3 + layers: 50 + Neck: + name: LKPAN + out_channels: 256 + Head: + name: DBHead + kernel_list: [7,2,2] + k: 50 + Teacher: # Another sub-network, here is a distillation example of a large model distill a small model + pretrained: ./pretrain_models/ch_ppocr_server_v2.0_det_train/best_accuracy + return_all_feats: false + model_type: det + algorithm: DB + Transform: + Backbone: + name: ResNet + in_channels: 3 + layers: 50 + Neck: + name: LKPAN + out_channels: 256 + Head: + name: DBHead + kernel_list: [7,2,2] + k: 50 + +``` + +If DML is used, that is, the method of two small models learning from each other, the Teacher network structure in the above configuration file needs to be set to the same configuration as the Student model. +Refer to the configuration file for details. 
[ch_PP-OCRv3_det_dml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml) + +The following describes the configuration file parameters [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml): + +```yaml linenums="1" +Architecture: + name: DistillationModel + algorithm: Distillation + model_type: det + Models: + Teacher: # Teacher model configuration of CML distillation + pretrained: ./pretrain_models/ch_ppocr_server_v2.0_det_train/best_accuracy + freeze_params: true # Teacher does not train + return_all_feats: false + model_type: det + algorithm: DB + Transform: + Backbone: + name: ResNet + in_channels: 3 + layers: 50 + Neck: + name: LKPAN + out_channels: 256 + Head: + name: DBHead + kernel_list: [7,2,2] + k: 50 + Student: # Student model configuration for CML distillation + pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained + freeze_params: false + return_all_feats: false + model_type: det + algorithm: DB + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + disable_se: true + Neck: + name: RSEFPN + out_channels: 96 + shortcut: True + Head: + name: DBHead + k: 50 + Student2: # Student2 model configuration for CML distillation + pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained + freeze_params: false + return_all_feats: false + model_type: det + algorithm: DB + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + disable_se: true + Neck: + name: RSEFPN + out_channels: 96 + shortcut: True + Head: + name: DBHead + k: 50 + +``` + +The specific implementation code of the distillation model `DistillationModel` class can refer to [distillation_model.py](../../ppocr/modeling/architectures/distillation_model.py). + +The final model output is a dictionary, the key is the name of all the sub-networks, for example, here are `Student` and `Teacher`, and the value is the output of the corresponding sub-network, +which can be `Tensor` (only the last layer of the network is returned) and `dict` (also returns the characteristic information in the middle). + +In the distillation task, in order to facilitate the addition of the distillation loss function, the output of each network is saved as a `dict`, which contains the sub-module output. +The key contains `backbone_out`, `neck_out`, `head_out`, and `value` is the tensor of the corresponding module. Finally, for the above configuration file, the output format of `DistillationModel` is as follows. + +```json +{ + "Teacher": { + "backbone_out": tensor, + "neck_out": tensor, + "head_out": tensor, + }, + "Student": { + "backbone_out": tensor, + "neck_out": tensor, + "head_out": tensor, + } +} +``` + +#### 2.2.2 Loss Function + +The distillation loss function configuration(`ch_PP-OCRv3_det_cml.yml`) is shown below. + +```yaml linenums="1" +Loss: + name: CombinedLoss + loss_config_list: + - DistillationDilaDBLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + - ["Student2", "Teacher"] # 1. Calculate the loss of two Student and Teacher + key: maps + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 + - DistillationDMLLoss: # 2. Add to calculate the loss between two students + model_name_pairs: + - ["Student", "Student2"] + maps_name: "thrink_maps" + weight: 1.0 + # act: None + key: maps + - DistillationDBLoss: + weight: 1.0 + model_name_list: ["Student", "Student2"] # 3. 
Calculate the loss between two students and GT + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 +``` + +For more specific implementation of `DistillationDilaDBLoss`, please refer to: [distillation_loss.py](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.4/ppocr/losses/distillation_loss.py#L185). +For more specific implementations of distillation loss functions such as `DistillationDBLoss`, please refer to: [distillation_loss.py](https://github.com/PaddlePaddle/PaddleOCR/blob/04c44974b13163450dfb6bd2c327863f8a194b3c/ppocr/losses/distillation_loss.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L148) + +#### 2.2.3 Post-processing + +In the task of detecting knowledge distillation, the post-processing configuration of detecting distillation is as follows. + +```yaml linenums="1" +PostProcess: + name: DistillationDBPostProcess # The post-processing of the DB detection distillation task, inherited from the standard DBPostProcess class + model_name: ["Student", "Student2", "Teacher"] # Extract the output of multiple sub-networks and decode them. The network that does not require post-processing is not set in model_name + thresh: 0.3 + box_thresh: 0.6 + max_candidates: 1000 + unclip_ratio: 1.5 +``` + +Taking the above configuration as an example, the output of the three subnets `Student`, `Student2` and `Teacher` will be calculated at the same time for post-processing calculations. +Since there are multiple inputs, there are also multiple outputs returned by post-processing. +For a more specific implementation of `DistillationDBPostProcess`, please refer to: [db_postprocess.py](../../ppocr/postprocess/db_postprocess.py#L195) + +#### 2.2.4 Metric Calculation + +In the knowledge distillation task, the metric calculation configuration is as follows. + +```yaml linenums="1" +Metric: + name: DistillationMetric + base_metric_name: DetMetric + main_indicator: hmean + key: "Student" +``` + +Since distillation needs to include multiple networks, only one network metrics needs to be calculated when calculating the metrics. +The `key` field is set to `Student`, it means that only the metrics of the `Student` network is calculated. +Model Structure + +#### 2.2.5 Fine-tuning Distillation Model + +There are three ways to fine-tune the detection distillation task: + +- `ch_PP-OCRv3_det_distill.yml`, The teacher model is set to the model provided by PaddleOCR or the large model you have trained. +- `ch_PP-OCRv3_det_cml.yml`, Use cml distillation. Similarly, the Teacher model is set to the model provided by PaddleOCR or the large model you have trained. +- `ch_PP-OCRv3_det_dml.yml`, Distillation using DML. The method of mutual distillation of the two Student models has an accuracy improvement of about 1.7% on the data set used by PaddleOCR. + +In fine-tune, you need to set the pre-trained model to be loaded in the `pretrained` parameter of the network structure. + +In terms of accuracy improvement, `cml` > `dml` > `distill`. When the amount of data is insufficient or the accuracy of the teacher model is similar to that of the student, this conclusion may change. 
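+
+For illustration, a CML fine-tuning run might be launched roughly as shown below, assuming the standard `tools/train.py` entry point. The `pretrained` fields of the `Teacher`, `Student` and `Student2` sub-networks are assumed to be set inside the config file as described above; the output directory override is only an example.
+
+```bash linenums="1"
+# Sketch: launch CML distillation fine-tuning for detection.
+# The teacher/student pretrained paths are assumed to be filled in the yml beforehand.
+python3 tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml \
+    -o Global.save_model_dir=./output/ch_PP-OCRv3_det_cml_finetune/
+```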
+ +In addition, since the distillation pre-training model provided by PaddleOCR contains multiple model parameters, if you want to extract the parameters of the student model, you can refer to the following code: + +```sh +# Download the parameters of the distillation training model +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar +``` + +```python linenums="1" +import paddle +# Load the pre-trained model +all_params = paddle.load("ch_PP-OCRv3_det_distill_train/best_accuracy.pdparams") +# View the keys of the weight parameter +print(all_params.keys()) +# Extract the weights of the student model +s_params = {key[len("Student."):]: all_params[key] for key in all_params if "Student." in key} +# View the keys of the weight parameters of the student model +print(s_params.keys()) +# Save +paddle.save(s_params, "ch_PP-OCRv3_det_distill_train/student.pdparams") +``` + +Finally, the parameters of the student model will be saved in `ch_PP-OCRv3_det_distill_train/student.pdparams` for the fine-tune of the model. diff --git a/docs/ppocr/model_compress/knowledge_distillation.md b/docs/ppocr/model_compress/knowledge_distillation.md new file mode 100644 index 0000000000..7b67ef9fac --- /dev/null +++ b/docs/ppocr/model_compress/knowledge_distillation.md @@ -0,0 +1,593 @@ +--- +comments: true +--- + +# 知识蒸馏 + +## 1. 简介 + +### 1.1 知识蒸馏介绍 + +近年来,深度神经网络在计算机视觉、自然语言处理等领域被验证是一种极其有效的解决问题的方法。通过构建合适的神经网络,加以训练,最终网络模型的性能指标基本上都会超过传统算法。 + +在数据量足够大的情况下,通过合理构建网络模型的方式增加其参数量,可以显著改善模型性能,但是这又带来了模型复杂度急剧提升的问题。大模型在实际场景中使用的成本较高。 + +深度神经网络一般有较多的参数冗余,目前有几种主要的方法对模型进行压缩,减小其参数量。如裁剪、量化、知识蒸馏等,其中知识蒸馏是指使用教师模型(teacher model)去指导学生模型(student model)学习特定任务,保证小模型在参数量不变的情况下,得到比较大的性能提升。 + +此外,在知识蒸馏任务中,也衍生出了互学习的模型训练方法,论文[Deep Mutual Learning](https://arxiv.org/abs/1706.00384)中指出,使用两个完全相同的模型在训练的过程中互相监督,可以达到比单个模型训练更好的效果。 + +### 1.2 PaddleOCR知识蒸馏简介 + +无论是大模型蒸馏小模型,还是小模型之间互相学习,更新参数,他们本质上是都是不同模型之间输出或者特征图(feature map)之间的相互监督,区别仅在于 (1) 模型是否需要固定参数。(2) 模型是否需要加载预训练模型。 + +对于大模型蒸馏小模型的情况,大模型一般需要加载预训练模型并固定参数;对于小模型之间互相蒸馏的情况,小模型一般都不加载预训练模型,参数也都是可学习的状态。 + +在知识蒸馏任务中,不只有2个模型之间进行蒸馏的情况,多个模型之间互相学习的情况也非常普遍。因此在知识蒸馏代码框架中,也有必要支持该种类别的蒸馏方法。 + +PaddleOCR中集成了知识蒸馏的算法,具体地,有以下几个主要的特点: + +- 支持任意网络的互相学习,不要求子网络结构完全一致或者具有预训练模型;同时子网络数量也没有任何限制,只需要在配置文件中添加即可。 +- 支持loss函数通过配置文件任意配置,不仅可以使用某种loss,也可以使用多种loss的组合 +- 支持知识蒸馏训练、预测、评估与导出等所有模型相关的环境,方便使用与部署。 + +通过知识蒸馏,在中英文通用文字识别任务中,不增加任何预测耗时的情况下,可以给模型带来3%以上的精度提升,结合学习率调整策略以及模型结构微调策略,最终提升提升超过5%。 + +## 2. 
配置文件解析 + +在知识蒸馏训练的过程中,数据预处理、优化器、学习率、全局的一些属性没有任何变化。模型结构、损失函数、后处理、指标计算等模块的配置文件需要进行微调。 + +下面以识别与检测的知识蒸馏配置文件为例,对知识蒸馏的训练与配置进行解析。 + +### 2.1 识别配置文件解析 + +配置文件在[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)。 + +#### 2.1.1 模型结构 + +知识蒸馏任务中,模型结构配置如下所示。 + +```yaml linenums="1" +Architecture: + model_type: &model_type "rec" # 模型类别,rec、det等,每个子网络的模型类别 + name: DistillationModel # 结构名称,蒸馏任务中,为DistillationModel,用于构建对应的结构 + algorithm: Distillation # 算法名称 + Models: # 模型,包含子网络的配置信息 + Teacher: # 子网络名称,至少需要包含`pretrained`与`freeze_params`信息,其他的参数为子网络的构造参数 + pretrained: # 该子网络是否需要加载预训练模型 + freeze_params: false # 是否需要固定参数 + return_all_feats: true # 子网络的参数,表示是否需要返回所有的features,如果为False,则只返回最后的输出 + model_type: *model_type # 模型类别 + algorithm: SVTR # 子网络的算法名称,该子网络其余参数均为构造参数,与普通的模型训练配置一致 + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + Student: + pretrained: + freeze_params: false + return_all_feats: true + model_type: *model_type + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length +``` + +当然,这里如果希望添加更多的子网络进行训练,也可以按照`Student`与`Teacher`的添加方式,在配置文件中添加相应的字段。比如说如果希望有3个模型互相监督,共同训练,那么`Architecture`可以写为如下格式。 + +```yaml linenums="1" +Architecture: + model_type: &model_type "rec" + name: DistillationModel + algorithm: Distillation + Models: + Teacher: + pretrained: + freeze_params: false + return_all_feats: true + model_type: *model_type + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + Student: + pretrained: + freeze_params: false + return_all_feats: true + model_type: *model_type + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length + Student2: + pretrained: + freeze_params: false + return_all_feats: true + model_type: *model_type + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Head: + name: MultiHead + head_list: + - CTCHead: + Neck: + name: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + fc_decay: 0.00001 + - SARHead: + enc_dim: 512 + max_text_length: *max_text_length +``` + +最终该模型训练时,包含3个子网络:`Teacher`, `Student`, `Student2`。 + +蒸馏模型`DistillationModel`类的具体实现代码可以参考[distillation_model.py](../../ppocr/modeling/architectures/distillation_model.py)。 + +最终模型`forward`输出为一个字典,key为所有的子网络名称,例如这里为`Student`与`Teacher`,value为对应子网络的输出,可以为`Tensor`(只返回该网络的最后一层)和`dict`(也返回了中间的特征信息)。 + 
+在识别任务中,为了添加更多损失函数,保证蒸馏方法的可扩展性,将每个子网络的输出保存为`dict`,其中包含子模块输出。以该识别模型为例,每个子网络的输出结果均为`dict`,key包含`backbone_out`,`neck_out`, `head_out`,`value`为对应模块的tensor,最终对于上述配置文件,`DistillationModel`的输出格式如下: + +```json +{ + "Teacher": { + "backbone_out": tensor, + "neck_out": tensor, + "head_out": tensor, + }, + "Student": { + "backbone_out": tensor, + "neck_out": tensor, + "head_out": tensor, + } +} +``` + +#### 2.1.2 损失函数 + +知识蒸馏任务中,损失函数配置如下所示。 + +```yaml linenums="1" +Loss: + name: CombinedLoss + loss_config_list: + - DistillationDMLLoss: # 蒸馏的DML损失函数,继承自标准的DMLLoss + weight: 1.0 # 权重 + act: "softmax" # 激活函数,对输入使用激活函数处理,可以为softmax, sigmoid或者为None,默认为None + use_log: true # 对输入计算log,如果函数已经 + model_name_pairs: # 用于计算DML loss的子网络名称对,如果希望计算其他子网络的DML loss,可以在列表下面继续填充 + - ["Student", "Teacher"] + key: head_out # 取子网络输出dict中,该key对应的tensor + multi_head: True # 是否为多头结构 + dis_head: ctc # 指定用于计算损失函数的head + name: dml_ctc # 蒸馏loss的前缀名称,避免不同loss之间的命名冲突 + - DistillationDMLLoss: # 蒸馏的DML损失函数,继承自标准的DMLLoss + weight: 0.5 # 权重 + act: "softmax" # 激活函数,对输入使用激活函数处理,可以为softmax, sigmoid或者为None,默认为None + use_log: true # 对输入计算log,如果函数已经 + model_name_pairs: # 用于计算DML loss的子网络名称对,如果希望计算其他子网络的DML loss,可以在列表下面继续填充 + - ["Student", "Teacher"] + key: head_out # 取子网络输出dict中,该key对应的tensor + multi_head: True # 是否为多头结构 + dis_head: sar # 指定用于计算损失函数的head + name: dml_sar # 蒸馏loss的前缀名称,避免不同loss之间的命名冲突 + - DistillationDistanceLoss: # 蒸馏的距离损失函数 + weight: 1.0 # 权重 + mode: "l2" # 距离计算方法,目前支持l1, l2, smooth_l1 + model_name_pairs: # 用于计算distance loss的子网络名称对 + - ["Student", "Teacher"] + key: backbone_out # 取子网络输出dict中,该key对应的tensor + - DistillationCTCLoss: # 基于蒸馏的CTC损失函数,继承自标准的CTC loss + weight: 1.0 # 损失函数的权重,loss_config_list中,每个损失函数的配置都必须包含该字段 + model_name_list: ["Student", "Teacher"] # 对于蒸馏模型的预测结果,提取这两个子网络的输出,与gt计算CTC loss + key: head_out # 取子网络输出dict中,该key对应的tensor + - DistillationSARLoss: # 基于蒸馏的SAR损失函数,继承自标准的SARLoss + weight: 1.0 # 损失函数的权重,loss_config_list中,每个损失函数的配置都必须包含该字段 + model_name_list: ["Student", "Teacher"] # 对于蒸馏模型的预测结果,提取这两个子网络的输出,与gt计算CTC loss + key: head_out # 取子网络输出dict中,该key对应的tensor + multi_head: True # 是否为多头结构,为true时,取出其中的SAR分支计算损失函数 +``` + +上述损失函数中,所有的蒸馏损失函数均继承自标准的损失函数类,主要功能为: 对蒸馏模型的输出进行解析,找到用于计算损失的中间节点(tensor),再使用标准的损失函数类去计算。 + +以上述配置为例,最终蒸馏训练的损失函数包含下面5个部分。 + +- `Student`和`Teacher`最终输出(`head_out`)的CTC分支与gt的CTC loss,权重为1。在这里因为2个子网络都需要更新参数,因此2者都需要计算与g的loss。 +- `Student`和`Teacher`最终输出(`head_out`)的SAR分支与gt的SAR loss,权重为1.0。在这里因为2个子网络都需要更新参数,因此2者都需要计算与g的loss。 +- `Student`和`Teacher`最终输出(`head_out`)的CTC分支之间的DML loss,权重为1。 +- `Student`和`Teacher`最终输出(`head_out`)的SAR分支之间的DML loss,权重为0.5。 +- `Student`和`Teacher`的骨干网络输出(`backbone_out`)之间的l2 loss,权重为1。 + +关于`CombinedLoss`更加具体的实现可以参考: [combined_loss.py](../../ppocr/losses/combined_loss.py#L23)。关于`DistillationCTCLoss`等蒸馏损失函数更加具体的实现可以参考[distillation_loss.py](../../ppocr/losses/distillation_loss.py)。 + +#### 2.1.3 后处理 + +知识蒸馏任务中,后处理配置如下所示。 + +```yaml linenums="1" +PostProcess: + name: DistillationCTCLabelDecode # 蒸馏任务的CTC解码后处理,继承自标准的CTCLabelDecode类 + model_name: ["Student", "Teacher"] # 对于蒸馏模型的预测结果,提取这两个子网络的输出,进行解码 + key: head_out # 取子网络输出dict中,该key对应的tensor + multi_head: True # 多头结构时,会取出其中的CTC分支进行计算 +``` + +以上述配置为例,最终会同时计算`Student`和`Teahcer` 2个子网络的CTC解码输出,返回一个`dict`,`key`为用于处理的子网络名称,`value`为用于处理的子网络列表。 + +关于`DistillationCTCLabelDecode`更加具体的实现可以参考: [rec_postprocess.py](../../ppocr/postprocess/rec_postprocess.py#L128) + +#### 2.1.4 指标计算 + +知识蒸馏任务中,指标计算配置如下所示。 + +```yaml linenums="1" +Metric: + name: DistillationMetric # 蒸馏任务的CTC解码后处理,继承自标准的CTCLabelDecode类 + base_metric_name: RecMetric # 
指标计算的基类,对于模型的输出,会基于该类,计算指标 + main_indicator: acc # 指标的名称 + key: "Student" # 选取该子网络的 main_indicator 作为作为保存保存best model的判断标准 + ignore_space: False # 评估时是否忽略空格的影响 +``` + +以上述配置为例,最终会使用`Student`子网络的acc指标作为保存best model的判断指标,同时,日志中也会打印出所有子网络的acc指标。 + +关于`DistillationMetric`更加具体的实现可以参考: [distillation_metric.py](../../ppocr/metrics/distillation_metric.py#L24)。 + +#### 2.1.5 蒸馏模型微调 + +对蒸馏得到的识别蒸馏进行微调有2种方式。 + +(1)基于知识蒸馏的微调:这种情况比较简单,下载预训练模型,在[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)中配置好预训练模型路径以及自己的数据路径,即可进行模型微调训练。 + +(2)微调时不使用知识蒸馏:这种情况,需要首先将预训练模型中的学生模型参数提取出来,具体步骤如下: + +- 首先下载预训练模型并解压。 + +```bash linenums="1" +# 下面预训练模型并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar +tar -xf ch_PP-OCRv3_rec_train.tar +``` + +- 然后使用python,对其中的学生模型参数进行提取 + +```python linenums="1" +import paddle +# 加载预训练模型 +all_params = paddle.load("ch_PP-OCRv3_rec_train/best_accuracy.pdparams") +# 查看权重参数的keys +print(all_params.keys()) +# 学生模型的权重提取 +s_params = {key[len("Student."):]: all_params[key] for key in all_params if "Student." in key} +# 查看学生模型权重参数的keys +print(s_params.keys()) +# 保存 +paddle.save(s_params, "ch_PP-OCRv3_rec_train/student.pdparams") +``` + +转化完成之后,使用[ch_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml),修改预训练模型的路径(为导出的`student.pdparams`模型路径)以及自己的数据路径,即可进行模型微调。 + +### 2.2 检测配置文件解析 + +检测模型蒸馏的配置文件在PaddleOCR/configs/det/ch_PP-OCRv3/目录下,包含两个个蒸馏配置文件: + +- ch_PP-OCRv3_det_cml.yml,采用cml蒸馏,采用一个大模型蒸馏两个小模型,且两个小模型互相学习的方法 +- ch_PP-OCRv3_det_dml.yml,采用DML的蒸馏,两个Student模型互蒸馏的方法 + +#### 2.2.1 模型结构 + +知识蒸馏任务中,模型结构配置如下所示: + +```yaml linenums="1" +Architecture: + name: DistillationModel # 结构名称,蒸馏任务中,为DistillationModel,用于构建对应的结构 + algorithm: Distillation # 算法名称 + Models: # 模型,包含子网络的配置信息 + Student: # 子网络名称,至少需要包含`pretrained`与`freeze_params`信息,其他的参数为子网络的构造参数 + freeze_params: false # 是否需要固定参数 + return_all_feats: false # 子网络的参数,表示是否需要返回所有的features,如果为False,则只返回最后的输出 + model_type: det + algorithm: DB + Backbone: + name: ResNet + in_channels: 3 + layers: 50 + Neck: + name: LKPAN + out_channels: 256 + Head: + name: DBHead + kernel_list: [7,2,2] + k: 50 + Teacher: # 另外一个子网络,这里给的是DML蒸馏示例, + freeze_params: true + return_all_feats: false + model_type: det + algorithm: DB + Transform: + Backbone: + name: ResNet + in_channels: 3 + layers: 50 + Neck: + name: LKPAN + out_channels: 256 + Head: + name: DBHead + kernel_list: [7,2,2] + k: 50 + +``` + +如果是采用DML,即两个小模型互相学习的方法,上述配置文件里的Teacher网络结构需要设置为Student模型一样的配置,具体参考配置文件[ch_PP-OCRv3_det_dml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml)。 + +下面介绍[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)的配置文件参数: + +```yaml linenums="1" +Architecture: + name: DistillationModel + algorithm: Distillation + model_type: det + Models: + Teacher: # CML蒸馏的Teacher模型配置 + pretrained: ./pretrain_models/ch_ppocr_server_v2.0_det_train/best_accuracy + freeze_params: true # Teacher 不训练 + return_all_feats: false + model_type: det + algorithm: DB + Transform: + Backbone: + name: ResNet + in_channels: 3 + layers: 50 + Neck: + name: LKPAN + out_channels: 256 + Head: + name: DBHead + kernel_list: [7,2,2] + k: 50 + Student: # CML蒸馏的Student模型配置 + pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained + freeze_params: false + return_all_feats: false + model_type: det + algorithm: DB + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + disable_se: true + Neck: + name: RSEFPN + out_channels: 96 + shortcut: True + Head: + name: DBHead + k: 50 + Student2: # 
CML蒸馏的Student2模型配置 + pretrained: ./pretrain_models/MobileNetV3_large_x0_5_pretrained + freeze_params: false + return_all_feats: false + model_type: det + algorithm: DB + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + disable_se: true + Neck: + name: RSEFPN + out_channels: 96 + shortcut: True + Head: + name: DBHead + k: 50 + +``` + +蒸馏模型`DistillationModel`类的具体实现代码可以参考[distillation_model.py](../../ppocr/modeling/architectures/distillation_model.py)。 + +最终模型`forward`输出为一个字典,key为所有的子网络名称,例如这里为`Student`与`Teacher`,value为对应子网络的输出,可以为`Tensor`(只返回该网络的最后一层)和`dict`(也返回了中间的特征信息)。 + +在蒸馏任务中,为了方便添加蒸馏损失函数,每个网络的输出保存为`dict`,其中包含子模块输出。每个子网络的输出结果均为`dict`,key包含`backbone_out`,`neck_out`, `head_out`,`value`为对应模块的tensor,最终对于上述配置文件,`DistillationModel`的输出格式如下: + +```json +{ + "Teacher": { + "backbone_out": tensor, + "neck_out": tensor, + "head_out": tensor, + }, + "Student": { + "backbone_out": tensor, + "neck_out": tensor, + "head_out": tensor, + } +} +``` + +#### 2.2.2 损失函数 + +检测ch_PP-OCRv3_det_cml.yml蒸馏损失函数配置如下所示。 + +```yaml linenums="1" +Loss: + name: CombinedLoss + loss_config_list: + - DistillationDilaDBLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + - ["Student2", "Teacher"] # 改动1,计算两个Student和Teacher的损失 + key: maps + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 + - DistillationDMLLoss: # 改动2,增加计算两个Student之间的损失 + model_name_pairs: + - ["Student", "Student2"] + maps_name: "thrink_maps" + weight: 1.0 + # act: None + key: maps + - DistillationDBLoss: + weight: 1.0 + model_name_list: ["Student", "Student2"] # 改动3,计算两个Student和GT之间的损失 + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 + +``` + +关于`DistillationDilaDBLoss`更加具体的实现可以参考: [distillation_loss.py](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.4/ppocr/losses/distillation_loss.py#L185)。关于`DistillationDBLoss`等蒸馏损失函数更加具体的实现可以参考[distillation_loss.py](https://github.com/PaddlePaddle/PaddleOCR/blob/04c44974b13163450dfb6bd2c327863f8a194b3c/ppocr/losses/distillation_loss.py?_pjax=%23js-repo-pjax-container%2C%20div%5Bitemtype%3D%22http%3A%2F%2Fschema.org%2FSoftwareSourceCode%22%5D%20main%2C%20%5Bdata-pjax-container%5D#L148)。 + +#### 2.2.3 后处理 + +知识蒸馏任务中,检测蒸馏后处理配置如下所示。 + +```yaml linenums="1" +PostProcess: + name: DistillationDBPostProcess # DB检测蒸馏任务的CTC解码后处理,继承自标准的DBPostProcess类 + model_name: ["Student", "Student2", "Teacher"] # 对于蒸馏模型的预测结果,提取多个子网络的输出,进行解码,不需要后处理的网络可以不在model_name中设置 + thresh: 0.3 + box_thresh: 0.6 + max_candidates: 1000 + unclip_ratio: 1.5 +``` + +以上述配置为例,最终会同时计算`Student`,`Student2`和`Teacher` 3个子网络的输出做后处理计算。同时,由于有多个输入,后处理返回的输出也有多个, + +关于`DistillationDBPostProcess`更加具体的实现可以参考: [db_postprocess.py](../../ppocr/postprocess/db_postprocess.py#L195) + +#### 2.2.4 蒸馏指标计算 + +知识蒸馏任务中,检测蒸馏指标计算配置如下所示。 + +```yaml linenums="1" +Metric: + name: DistillationMetric + base_metric_name: DetMetric + main_indicator: hmean + key: "Student" +``` + +由于蒸馏需要包含多个网络,甚至多个Student网络,在计算指标的时候只需要计算一个Student网络的指标即可,`key`字段设置为`Student`则表示只计算`Student`网络的精度。 + +#### 2.2.5 检测蒸馏模型finetune + +PP-OCRv3检测蒸馏有两种方式: + +- 采用ch_PP-OCRv3_det_cml.yml,采用cml蒸馏,同样Teacher模型设置为PaddleOCR提供的模型或者您训练好的大模型 +- 采用ch_PP-OCRv3_det_dml.yml,采用DML的蒸馏,两个Student模型互蒸馏的方法,在PaddleOCR采用的数据集上相比单独训练Student模型有1%-2%的提升。 + +在具体fine-tune时,需要在网络结构的`pretrained`参数中设置要加载的预训练模型。 + +在精度提升方面,cml的精度>dml的精度蒸馏方法的精度。当数据量不足或者Teacher模型精度与Student精度相差不大的时候,这个结论或许会改变。 + +另外,由于PaddleOCR提供的蒸馏预训练模型包含了多个模型的参数,如果您希望提取Student模型的参数,可以参考如下代码: + +```bash linenums="1" +# 下载蒸馏训练模型的参数 +wget 
https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv3_det_distill_train.tar +``` + +```python linenums="1" +import paddle +# 加载预训练模型 +all_params = paddle.load("ch_PP-OCRv3_det_distill_train/best_accuracy.pdparams") +# 查看权重参数的keys +print(all_params.keys()) +# 学生模型的权重提取 +s_params = {key[len("Student."):]: all_params[key] for key in all_params if "Student." in key} +# 查看学生模型权重参数的keys +print(s_params.keys()) +# 保存 +paddle.save(s_params, "ch_PP-OCRv3_det_distill_train/student.pdparams") +``` + +最终`Student`模型的参数将会保存在`ch_PP-OCRv3_det_distill_train/student.pdparams`中,用于模型的fine-tune。 diff --git a/docs/ppocr/model_compress/prune.en.md b/docs/ppocr/model_compress/prune.en.md new file mode 100644 index 0000000000..7579f347d8 --- /dev/null +++ b/docs/ppocr/model_compress/prune.en.md @@ -0,0 +1,75 @@ +--- +comments: true +--- + +# PP-OCR Models Pruning + +Generally, a more complex model would achieve better performance in the task, but it also leads to some redundancy in the model. Model Pruning is a technique that reduces this redundancy by removing the sub-models in the neural network model, so as to reduce model calculation complexity and improve model inference performance. + +This example uses PaddleSlim provided[APIs of Pruning](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/docs/zh_cn/api_cn/dygraph/pruners) to compress the OCR model. +[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim), an open source library which integrates model pruning, quantization (including quantization training and offline quantization), distillation, neural network architecture search, and many other commonly used and leading model compression technique in the industry. + +It is recommended that you could understand following pages before reading this example: + +1. [PaddleOCR training methods](../model_train/training.en.md) +2. [The demo of prune](https://github.com/PaddlePaddle/PaddleSlim/blob/release%2F2.0.0/docs/zh_cn/tutorials/pruning/dygraph/filter_pruning.md) + +## Quick start + +### 1. Install PaddleSlim + +```bash linenums="1" +git clone https://github.com/PaddlePaddle/PaddleSlim.git +cd PaddleSlim +git checkout develop +python3 setup.py install +``` + +### 2. Download Pre-trained Model + +Model prune needs to load pre-trained models. +PaddleOCR also provides a series of [models](../model_list.en.md). Developers can choose their own models or use their own models according to their needs. + +### 3. Pruning sensitivity analysis + +After the pre-trained model is loaded, sensitivity analysis is performed on each network layer of the model to understand the redundancy of each network layer, and save a sensitivity file which named: sen.pickle. After that, user could load the sensitivity file via the [methods provided by PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/paddleslim/prune/sensitive.py#L221) and determining the pruning ratio of each network layer automatically. 
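+
+As a rough illustration, loading that file and turning it into per-layer pruning ratios could look like the sketch below. It assumes the `load_sensitivities` and `get_ratios_by_loss` helpers exported from the linked `sensitive.py`; the exact import path may differ between PaddleSlim versions, and the 3% loss threshold is only an example.
+
+```python linenums="1"
+from paddleslim.prune import load_sensitivities, get_ratios_by_loss
+
+# Load the sensitivity data produced by deploy/slim/prune/sensitivity_anal.py
+sens = load_sensitivities("sen.pickle")
+
+# For each layer, pick the largest pruning ratio whose accuracy loss stays below 3%
+ratios = get_ratios_by_loss(sens, loss=0.03)
+print(ratios)  # illustrative output: {'conv10_expand_weights': 0.4, ...}
+```
+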
For specific details of sensitivity analysis, see:[Sensitivity analysis](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/docs/en/tutorials/image_classification_sensitivity_analysis_tutorial_en.md) +The data format of sensitivity file: + +```python linenums="1" +sen.pickle(Dict){ + 'layer_weight_name_0': sens_of_each_ratio(Dict){'pruning_ratio_0': acc_loss, 'pruning_ratio_1': acc_loss} + 'layer_weight_name_1': sens_of_each_ratio(Dict){'pruning_ratio_0': acc_loss, 'pruning_ratio_1': acc_loss} + } +``` + +example: + +```python linenums="1" +{ + 'conv10_expand_weights': {0.1: 0.006509952684312718, 0.2: 0.01827734339798862, 0.3: 0.014528405644659832, 0.6: 0.06536008804270439, 0.8: 0.11798612250664964, 0.7: 0.12391408417493704, 0.4: 0.030615754498018757, 0.5: 0.047105205602406594} + 'conv10_linear_weights': {0.1: 0.05113190831455035, 0.2: 0.07705573833558801, 0.3: 0.12096721757739311, 0.6: 0.5135061352930738, 0.8: 0.7908166677143281, 0.7: 0.7272187676899062, 0.4: 0.1819252083008504, 0.5: 0.3728054727792405} +} +``` + +The function would return a dict after loading the sensitivity file. The keys of the dict are name of parameters in each layer. And the value of key is the information about pruning sensitivity of corresponding layer. In example, pruning 10% filter of the layer corresponding to conv10_expand_weights would lead to 0.65% degradation of model performance. The details could be seen at: [Sensitivity analysis](https://github.com/PaddlePaddle/PaddleSlim/blob/release/2.0-alpha/docs/zh_cn/algo/algo.md) + +The function would return a dict after loading the sensitivity file. The keys of the dict are name of parameters in each layer. And the value of key is the information about pruning sensitivity of corresponding layer. In example, pruning 10% filter of the layer corresponding to conv10_expand_weights would lead to 0.65% degradation of model performance. The details could be seen at: [Sensitivity analysis](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/docs/zh_cn/algo/algo.md#2-%E5%8D%B7%E7%A7%AF%E6%A0%B8%E5%89%AA%E8%A3%81%E5%8E%9F%E7%90%86) + +Enter the PaddleOCR root directory,perform sensitivity analysis on the model with the following command: + +```bash linenums="1" +python3.7 deploy/slim/prune/sensitivity_anal.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model="your trained model" Global.save_model_dir=./output/prune_model/ +``` + +### 5. Export inference model and deploy it + +We can export the pruned model as inference_model for deployment: + +```bash linenums="1" +python deploy/slim/prune/export_prune_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model=./output/det_db/best_accuracy Global.save_inference_dir=./prune/prune_inference_model +``` + +Reference for prediction and deployment of inference model: + +1. [inference model python prediction](../infer_deploy/python_infer.en.md) +2. [inference model C++ prediction](../infer_deploy/cpp_infer.en.md) diff --git a/docs/ppocr/model_compress/prune.md b/docs/ppocr/model_compress/prune.md new file mode 100644 index 0000000000..ea3103ba80 --- /dev/null +++ b/docs/ppocr/model_compress/prune.md @@ -0,0 +1,71 @@ +--- +comments: true +--- + +# PP-OCR模型裁剪 + +复杂的模型有利于提高模型的性能,但也导致模型中存在一定冗余,模型裁剪通过移出网络模型中的子模型来减少这种冗余,达到减少模型计算复杂度,提高模型推理性能的目的。 +本教程将介绍如何使用飞桨模型压缩库PaddleSlim做PaddleOCR模型的压缩。 +[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim)集成了模型剪枝、量化(包括量化训练和离线量化)、蒸馏和神经网络搜索等多种业界常用且领先的模型压缩功能,如果您感兴趣,可以关注并了解。 + +在开始本教程之前,建议先了解: + +1. 
[PaddleOCR模型的训练方法](../model_train/training.md) +2. [模型裁剪教程](https://github.com/PaddlePaddle/PaddleSlim/blob/release%2F2.0.0/docs/zh_cn/tutorials/pruning/dygraph/filter_pruning.md) + +## 快速开始 + +### 1.安装PaddleSlim + +```bash linenums="1" +git clone https://github.com/PaddlePaddle/PaddleSlim.git +cd PaddleSlim +git checkout develop +python3 setup.py install +``` + +### 2.获取预训练模型 + +模型裁剪需要加载事先训练好的模型,PaddleOCR也提供了一系列[模型](../model_list.md),开发者可根据需要自行选择模型或使用自己的模型。 + +### 3.敏感度分析训练 + +加载预训练模型后,通过对现有模型的每个网络层进行敏感度分析,得到敏感度文件:sen.pickle,可以通过PaddleSlim提供的[接口](https://github.com/PaddlePaddle/PaddleSlim/blob/9b01b195f0c4bc34a1ab434751cb260e13d64d9e/paddleslim/dygraph/prune/filter_pruner.py#L75)加载文件,获得各网络层在不同裁剪比例下的精度损失。从而了解各网络层冗余度,决定每个网络层的裁剪比例。 +敏感度文件内容格式: + +```python linenums="1" +sen.pickle(Dict){ + 'layer_weight_name_0': sens_of_each_ratio(Dict){'pruning_ratio_0': acc_loss, 'pruning_ratio_1': acc_loss} + 'layer_weight_name_1': sens_of_each_ratio(Dict){'pruning_ratio_0': acc_loss, 'pruning_ratio_1': acc_loss} + } +``` + +例子: + +```python linenums="1" +{ + 'conv10_expand_weights': {0.1: 0.006509952684312718, 0.2: 0.01827734339798862, 0.3: 0.014528405644659832, 0.6: 0.06536008804270439, 0.8: 0.11798612250664964, 0.7: 0.12391408417493704, 0.4: 0.030615754498018757, 0.5: 0.047105205602406594} + 'conv10_linear_weights': {0.1: 0.05113190831455035, 0.2: 0.07705573833558801, 0.3: 0.12096721757739311, 0.6: 0.5135061352930738, 0.8: 0.7908166677143281, 0.7: 0.7272187676899062, 0.4: 0.1819252083008504, 0.5: 0.3728054727792405} +} +``` + +加载敏感度文件后会返回一个字典,字典中的keys为网络模型参数模型的名字,values为一个字典,里面保存了相应网络层的裁剪敏感度信息。例如在例子中,conv10_expand_weights所对应的网络层在裁掉10%的卷积核后模型性能相较原模型会下降0.65%,详细信息可见[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/docs/zh_cn/algo/algo.md#2-%E5%8D%B7%E7%A7%AF%E6%A0%B8%E5%89%AA%E8%A3%81%E5%8E%9F%E7%90%86) + +进入PaddleOCR根目录,通过以下命令对模型进行敏感度分析训练: + +```bash linenums="1" +python3.7 deploy/slim/prune/sensitivity_anal.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model="your trained model" Global.save_model_dir=./output/prune_model/ +``` + +### 4.导出模型、预测部署 + +在得到裁剪训练保存的模型后,我们可以将其导出为inference_model: + +```bash linenums="1" +pytho3.7 deploy/slim/prune/export_prune_model.py -c configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml -o Global.pretrained_model=./output/det_db/best_accuracy Global.save_inference_dir=./prune/prune_inference_model +``` + +inference model的预测和部署参考: + +1. [inference model python端预测](../infer_deploy/python_infer.md) +2. [inference model C++预测](../infer_deploy/cpp_infer.md) diff --git a/docs/ppocr/model_compress/quantization.en.md b/docs/ppocr/model_compress/quantization.en.md new file mode 100644 index 0000000000..78163ce2d7 --- /dev/null +++ b/docs/ppocr/model_compress/quantization.en.md @@ -0,0 +1,71 @@ +--- +comments: true +--- + +# PP-OCR Models Quantization + +Generally, a more complex model would achieve better performance in the task, but it also leads to some redundancy in the model. +Quantization is a technique that reduces this redundancy by reducing the full precision data to a fixed number, +so as to reduce model calculation complexity and improve model inference performance. + +This example uses PaddleSlim provided [APIs of Quantization](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/docs/zh_cn/api_cn/dygraph/quanter/qat.rst) to compress the OCR model. 
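+
+Under the hood, these APIs wrap an ordinary Paddle model with fake-quantization ops during training. The sketch below is only an illustration of that flow, roughly what PaddleOCR's `deploy/slim/quantization/quant.py` automates for you; the tiny `Sequential` model is a stand-in for a real OCR network, and the config keys shown are the commonly used ones, so check the PaddleSlim QAT documentation of your installed version for the full list.
+
+```python linenums="1"
+import paddle
+from paddleslim.dygraph.quant import QAT
+
+# Commonly used quantization-aware training settings
+quant_config = {
+    "weight_quantize_type": "channel_wise_abs_max",
+    "activation_quantize_type": "moving_average_abs_max",
+    "weight_bits": 8,
+    "activation_bits": 8,
+    "quantizable_layer_type": ["Conv2D", "Linear"],
+}
+
+# Stand-in for a real OCR model (any paddle.nn.Layer works the same way)
+model = paddle.nn.Sequential(
+    paddle.nn.Conv2D(3, 8, 3, padding=1),
+    paddle.nn.ReLU(),
+)
+
+quanter = QAT(config=quant_config)
+quanter.quantize(model)  # insert fake-quant ops into the network in place
+
+# ... run the normal training loop on `model` to fine-tune under quantization ...
+
+# Export the quantized model for inference deployment
+quanter.save_quantized_model(
+    model,
+    "./output/quant_inference_model/inference",
+    input_spec=[paddle.static.InputSpec(shape=[None, 3, 640, 640], dtype="float32")],
+)
+```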
+
+It is recommended that you understand the following pages before reading this example:
+
+- [The training strategy of OCR model](../model_train/training.en.md)
+- [PaddleSlim Document](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/docs/zh_cn/api_cn/dygraph/quanter/qat.rst)
+
+## Quick Start
+
+Quantization is mostly suitable for deploying lightweight models on mobile devices.
+After training, if you want to further compress the model size and accelerate prediction, you can quantize the model by following the steps below.
+
+1. Install PaddleSlim
+2. Prepare the trained model
+3. Quantization-Aware Training
+4. Export inference model
+5. Deploy quantization inference model
+
+### 1. Install PaddleSlim
+
+```bash linenums="1"
+pip3 install paddleslim==2.3.2
+```
+
+### 2. Download Pre-trained Model
+
+PaddleOCR provides a series of pre-trained [models](../model_list.en.md).
+If the model to be quantized is not in the list, you need to follow the [Regular Training](../quick_start.en.md) method to get a trained model.
+
+### 3. Quant-Aware Training
+
+Quantization training includes offline quantization and online (quantization-aware) training, and online training is generally more effective.
+It requires loading the pre-trained model; after the quantization strategy is defined, the model can be quantized.
+
+The quantization training code is located in `deploy/slim/quantization/quant.py`. For example, the command for quantization-aware training of the PP-OCRv3 detection model is as follows:
+
+```bash linenums="1"
+# download provided model
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar
+tar xf ch_PP-OCRv3_det_distill_train.tar
+
+python deploy/slim/quantization/quant.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.pretrained_model='./ch_PP-OCRv3_det_distill_train/best_accuracy' Global.save_model_dir=./output/quant_model_distill/
+```
+
+To quantize the text recognition model instead, modify the configuration file and the loaded model parameters accordingly.
+
+### 4. Export inference model
+
+Once the quantization-aware training is finished, we can export the model as an inference model for deployment:
+
+```bash linenums="1"
+python deploy/slim/quantization/export_model.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.checkpoints=output/quant_model/best_accuracy Global.save_inference_dir=./output/quant_inference_model
+```
+
+### 5. Deploy
+
+The parameters of the quantized model exported above are still stored as FP32, but their values are constrained to the int8 range.
+The exported model can be converted through the `opt` tool of Paddle-Lite.
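+
+As an illustration of that conversion step, the sketch below uses the `Opt` helper from the `paddlelite` Python wheel. Treat it as an assumption: the wrapper's method names can differ between Paddle-Lite versions (the `paddle_lite_opt` command-line tool performs the same conversion), and the paths are only examples.
+
+```python linenums="1"
+from paddlelite.lite import Opt
+
+opt = Opt()
+# Point these at the quantized inference model exported in step 4 (example paths)
+opt.set_model_file("./output/quant_inference_model/inference.pdmodel")
+opt.set_param_file("./output/quant_inference_model/inference.pdiparams")
+opt.set_valid_places("arm")                       # target hardware for the nb model
+opt.set_optimize_out("./output/quant_model_opt")  # writes ./output/quant_model_opt.nb
+opt.run()
+```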
+ +For quantitative model deployment, please refer to [Mobile terminal model deployment](../infer_deploy/lite.en.md) diff --git a/docs/ppocr/model_compress/quantization.md b/docs/ppocr/model_compress/quantization.md new file mode 100644 index 0000000000..eb72cfb3e1 --- /dev/null +++ b/docs/ppocr/model_compress/quantization.md @@ -0,0 +1,67 @@ +--- +comments: true +--- + +# PP-OCR模型量化 + +复杂的模型有利于提高模型的性能,但也导致模型中存在一定冗余,模型量化将全精度缩减到定点数减少这种冗余,达到减少模型计算复杂度,提高模型推理性能的目的。 +模型量化可以在基本不损失模型的精度的情况下,将FP32精度的模型参数转换为Int8精度,减小模型参数大小并加速计算,使用量化后的模型在移动端等部署时更具备速度优势。 + +本教程将介绍如何使用飞桨模型压缩库PaddleSlim做PaddleOCR模型的压缩。 +[PaddleSlim](https://github.com/PaddlePaddle/PaddleSlim) 集成了模型剪枝、量化(包括量化训练和离线量化)、蒸馏和神经网络搜索等多种业界常用且领先的模型压缩功能,如果您感兴趣,可以关注并了解。 + +在开始本教程之前,建议先了解[PaddleOCR模型的训练方法](../model_train/training.md)以及[PaddleSlim](https://paddleslim.readthedocs.io/zh_CN/latest/index.html) + +## 快速开始 + +量化多适用于轻量模型在移动端的部署,当训练出一个模型后,如果希望进一步的压缩模型大小并加速预测,可使用量化的方法压缩模型。 + +模型量化主要包括五个步骤: + +1. 安装 PaddleSlim +2. 准备训练好的模型 +3. 量化训练 +4. 导出量化推理模型 +5. 量化模型预测部署 + +### 1. 安装PaddleSlim + +```bash linenums="1" +pip3 install paddleslim==2.3.2 +``` + +### 2. 准备训练好的模型 + +PaddleOCR提供了一系列训练好的[模型](../model_list.md),如果待量化的模型不在列表中,需要按照[常规训练](../quick_start.md)方法得到训练好的模型。 + +### 3. 量化训练 + +量化训练包括离线量化训练和在线量化训练,在线量化训练效果更好,需加载预训练模型,在定义好量化策略后即可对模型进行量化。 + +量化训练的代码位于slim/quantization/quant.py 中,比如训练检测模型,以PPOCRv3检测模型为例,训练指令如下: + +```bash linenums="1" +# 下载检测预训练模型: +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar +tar xf ch_PP-OCRv3_det_distill_train.tar + +python deploy/slim/quantization/quant.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.pretrained_model='./ch_PP-OCRv3_det_distill_train/best_accuracy' Global.save_model_dir=./output/quant_model_distill/ +``` + +如果要训练识别模型的量化,修改配置文件和加载的模型参数即可。 + +### 4. 导出模型 + +在得到量化训练保存的模型后,我们可以将其导出为inference_model,用于预测部署: + +```bash linenums="1" +python deploy/slim/quantization/export_model.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.checkpoints=output/quant_model/best_accuracy Global.save_inference_dir=./output/quant_inference_model +``` + +### 5. 量化模型部署 + +上述步骤导出的量化模型,参数精度仍然是FP32,但是参数的数值范围是int8,导出的模型可以通过PaddleLite的opt模型转换工具完成模型转换。 + +量化模型移动端部署的可参考 [移动端模型部署](../infer_deploy/lite.md) + +备注:量化训练后的模型参数是float32类型,转inference model预测时相对不量化无加速效果,原因是量化后模型结构之间存在量化和反量化算子,如果要使用量化模型部署,建议使用TensorRT并设置precision为INT8加速量化模型的预测时间。 diff --git a/docs/ppocr/model_list.en.md b/docs/ppocr/model_list.en.md new file mode 100644 index 0000000000..b0750246e3 --- /dev/null +++ b/docs/ppocr/model_list.en.md @@ -0,0 +1,121 @@ +--- +comments: true +--- + +# OCR Model List(V3, updated on 2022.4.28) +> +> **Note** +> +> 1. Compared with model v2, the 3rd version of the detection model has an improvement in accuracy, and the 2.1 version of the recognition model has optimizations in accuracy and speed with CPU. +> 2. Compared with [models 1.1](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_en/models_list_en.md), which are trained with static graph programming paradigm, models 2.0 or higher are the dynamic graph trained version and achieve close performance. +> 3. All models in this tutorial are from the PaddleOCR series, for more introduction to algorithms and models based on the public dataset, you can refer to [algorithm overview tutorial](../algorithm/overview.en.md). + +The downloadable models provided by PaddleOCR include the `inference model`, `trained model`, `pre-trained model` and `nb model`. 
The differences between the models are as follows: + +|model type|model format|description| +|--- | --- | --- | +|inference model|inference.pdmodel、inference.pdiparams|Used for inference based on Paddle inference engine,[detail](./infer_deploy/python_infer.en.md)| +|trained model, pre-trained model|\*.pdparams、\*.pdopt、\*.states |The checkpoints model saved in the training process, which stores the parameters of the model, is mostly used for model evaluation and continuous training.| +|nb model|\*.nb| Model optimized by Paddle-Lite, which is suitable for mobile-side deployment scenarios (Paddle-Lite is needed for nb model deployment). | + +The relationship of the above models is as follows. + +![](../imgs_en/model_prod_flow_en.png) + +## 1. Text Detection Model + +### 1. Chinese Detection Model + +|model name|description|config|model size|download| +| --- | --- | --- | --- | --- | +|ch_PP-OCRv3_det_slim| [New] slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 1.1M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_distill_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb)| +|ch_PP-OCRv3_det| [New] Original lightweight model, supporting Chinese, English, multilingual text detection |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar)| +|ch_PP-OCRv2_det_slim| [New] slim quantization with distillation lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)| 3.0M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)| +|ch_PP-OCRv2_det| [New] Original lightweight model, supporting Chinese, English, multilingual text detection|[ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml)|3.0M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)| +|ch_ppocr_mobile_slim_v2.0_det|Slim pruned lightweight model, supporting Chinese, English, multilingual text detection|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)|2.6M |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_det_prune_infer.tar)| +|ch_ppocr_mobile_v2.0_det|Original lightweight model, supporting Chinese, English, multilingual text detection|[ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml)|3.0M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)| +|ch_ppocr_server_v2.0_det|General model, which is larger than the lightweight model, but achieved better performance|[ch_det_res18_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_res18_db_v2.0.yml)|47.0M|[inference 
model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar)| + +### 1.2 English Detection Model + +|model name|description|config|model size|download| +| --- | --- | --- | --- | --- | +|en_PP-OCRv3_det_slim | [New] Slim quantization with distillation lightweight detection model, supporting English | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_distill_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.nb) | +|en_PP-OCRv3_det | [New] Original lightweight detection model, supporting English |[ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) | + +* Note: English configuration file is the same as Chinese except for training data, here we only provide one configuration file. + +### 1.3 Multilingual Detection Model + +|model name|description|config|model size|download| +| --- | --- | --- | --- | --- | +| ml_PP-OCRv3_det_slim | [New] Slim quantization with distillation lightweight detection model, supporting English | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_distill_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.nb) | +| ml_PP-OCRv3_det |[New] Original lightweight detection model, supporting English | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml)| 3.8M | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_distill_train.tar) | + +* Note: English configuration file is the same as Chinese except for training data, here we only provide one configuration file. + +## 2. 
Text Recognition Model + +### 2.1 Chinese Recognition Model + +|model name|description|config|model size|download| +| --- | --- | --- | --- | --- | +|ch_PP-OCRv3_rec_slim | [New] Slim quantization with distillation lightweight model, supporting Chinese, English text recognition |[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 4.9M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb) | +|ch_PP-OCRv3_rec| [New] Original lightweight model, supporting Chinese, English, multilingual text recognition |[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml)| 12.4M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) | +|ch_PP-OCRv2_rec_slim| Slim quantization with distillation lightweight model, supporting Chinese, English text recognition|[ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml)| 9.0M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) | +|ch_PP-OCRv2_rec| Original lightweight model, supporting Chinese, English, and multilingual text recognition |[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)|8.5M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) | +|ch_ppocr_mobile_slim_v2.0_rec|Slim pruned and quantized lightweight model, supporting Chinese, English and number recognition|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)| 6.0M | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_train.tar) | +|ch_ppocr_mobile_v2.0_rec|Original lightweight model, supporting Chinese, English and number recognition|[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)|5.2M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) | +|ch_ppocr_server_v2.0_rec|General model, supporting Chinese, English and number recognition|[rec_chinese_common_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml)|94.8M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_train.tar) / [pre-trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) | + +**Note:** The `trained model` is fine-tuned on the `pre-trained model` with real data and synthesized vertical text data, which achieved better performance in the 
real scene. The `pre-trained model` is directly trained on the full amount of real data and synthesized data, which is more suitable for fine-tuning your dataset. + +### 2.2 English Recognition Model + +|model name|description|config|model size|download| +| --- | --- | --- | --- | --- | +|en_PP-OCRv3_rec_slim | [New] Slim quantization with distillation lightweight model, supporting English, English text recognition |[en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml)| 3.2M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.nb) | +|en_PP-OCRv3_rec| [New] Original lightweight model, supporting English, English, multilingual text recognition |[en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml)| 9.6M |[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) | +|en_number_mobile_slim_v2.0_rec|Slim pruned and quantized lightweight model, supporting English and number recognition|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)| 2.7M | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_train.tar) | +|en_number_mobile_v2.0_rec|Original lightweight model, supporting English and number recognition|[rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml)|2.6M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_train.tar) | + +**Note:** Dictionary file of all English recognition models is `ppocr/utils/en_dict.txt`. + +### 2.3 Multilingual Recognition Model(Updating...) 
+ +|model name| dict file | description|config|model size|download| +| --- | --- | --- |--- | --- | --- | +| korean_PP-OCRv3_rec | ppocr/utils/dict/korean_dict.txt |Lightweight model for Korean recognition|[korean_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml)|11.0M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/korean_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/korean_PP-OCRv3_rec_train.tar) | +| japan_PP-OCRv3_rec | ppocr/utils/dict/japan_dict.txt |Lightweight model for Japanese recognition|[japan_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml)|11.0M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/japan_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/japan_PP-OCRv3_rec_train.tar) | +| chinese_cht_PP-OCRv3_rec | ppocr/utils/dict/chinese_cht_dict.txt | Lightweight model for chinese cht|[chinese_cht_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml)|12.0M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/chinese_cht_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/chinese_cht_PP-OCRv3_rec_train.tar) | +| te_PP-OCRv3_rec | ppocr/utils/dict/te_dict.txt | Lightweight model for Telugu recognition |[te_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml)|9.6M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/te_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/te_PP-OCRv3_rec_train.tar) | +| ka_PP-OCRv3_rec | ppocr/utils/dict/ka_dict.txt | Lightweight model for Kannada recognition |[ka_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml)|9.9M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_train.tar) | +| ta_PP-OCRv3_rec | ppocr/utils/dict/ta_dict.txt |Lightweight model for Tamil recognition|[ta_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml)|9.6M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_train.tar) | +| latin_PP-OCRv3_rec | ppocr/utils/dict/latin_dict.txt | Lightweight model for latin recognition | [latin_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml) |9.7M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_train.tar) | +| arabic_PP-OCRv3_rec | ppocr/utils/dict/arabic_dict.txt | Lightweight model for arabic recognition | [arabic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml) |9.6M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/arabic_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/arabic_PP-OCRv3_rec_train.tar) | +| cyrillic_PP-OCRv3_rec | ppocr/utils/dict/cyrillic_dict.txt | Lightweight model for cyrillic recognition | [cyrillic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml) |9.6M|[inference 
model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_PP-OCRv3_rec_train.tar) | +| devanagari_PP-OCRv3_rec | ppocr/utils/dict/devanagari_dict.txt | Lightweight model for devanagari recognition | [devanagari_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml) |9.9M|[inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/devanagari_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/devanagari_PP-OCRv3_rec_train.tar) | + +For a complete list of languages ​​and tutorials, please refer to [Multi-language model](./blog/multi_languages.en.md) + +## 3. Text Angle Classification Model + +|model name|description|config|model size|download| +| --- | --- | --- | --- | --- | +|ch_ppocr_mobile_slim_v2.0_cls|Slim quantized model for text angle classification|[cls_mv3.yml](../../configs/cls/cls_mv3.yml)| 2.1M | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_slim_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_slim_train.tar) / [nb model](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb) | +|ch_ppocr_mobile_v2.0_cls|Original model for text angle classification|[cls_mv3.yml](../../configs/cls/cls_mv3.yml)|1.38M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | + +## 4. Paddle-Lite Model + +Paddle Lite is an updated version of Paddle-Mobile, an open-open source deep learning framework designed to make it easy to perform inference on mobile, embedded, and IoT devices. It can further optimize the inference model and generate the `nb model` used for edge devices. It's suggested to optimize the quantization model using Paddle-Lite because the `INT8` format is used for the model storage and inference. + +This chapter lists OCR nb models with PP-OCRv2 or earlier versions. You can access the latest nb models from the above tables. 
+ +|Version|Introduction|Model size|Detection model|Text Direction model|Recognition model|Paddle-Lite branch| +|---|---|---|---|---|---|---| +|PP-OCRv2|extra-lightweight chinese OCR optimized model|11.0M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_infer_opt.nb)|v2.10| +|PP-OCRv2(slim)|extra-lightweight chinese OCR optimized model|4.6M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_slim_opt.nb)|v2.10| +|PP-OCRv2|extra-lightweight chinese OCR optimized model|11.0M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer_opt.nb)|v2.9| +|PP-OCRv2(slim)|extra-lightweight chinese OCR optimized model|4.9M|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_opt.nb)|v2.9| +|V2.0|ppocr_v2.0 extra-lightweight chinese OCR optimized model|7.8M|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_det_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_rec_opt.nb)|v2.9| +|V2.0(slim)|ppovr_v2.0 extra-lightweight chinese OCR optimized model|3.3M|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_det_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb)|[download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_rec_slim_opt.nb)|v2.9| diff --git a/docs/ppocr/model_list.md b/docs/ppocr/model_list.md new file mode 100644 index 0000000000..2516a70485 --- /dev/null +++ b/docs/ppocr/model_list.md @@ -0,0 +1,130 @@ +--- +comments: true +--- + +# PP-OCR系列模型列表(V4,2023年8月1日更新) + +> **说明** +> +> 1. V4版模型相比V3版模型,在模型精度上有进一步提升 +> +> 2. V3版模型相比V2版模型,在模型精度上有进一步提升 +> +> 3. 2.0+版模型和[1.1版模型](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/models_list.md) 的主要区别在于动态图训练vs.静态图训练,模型性能上无明显差距。 +> +> 4. 本文档提供的是PPOCR自研模型列表,更多基于公开数据集的算法介绍与预训练模型可以参考:[算法概览文档](../algorithm/overview.md)。 + +PaddleOCR提供的可下载模型包括`推理模型`、`训练模型`、`预训练模型`、`nb模型`,模型区别说明如下: + +| 模型类型 | 模型格式 | 简介 | +| ---- | ----- | ----| +| 推理模型 | inference.pdmodel、inference.pdiparams | 用于预测引擎推理,[详情](./infer_deploy/python_infer.md) | +| 训练模型、预训练模型 | \*.pdparams、\*.pdopt、\*.states | 训练过程中保存的模型的参数、优化器状态和训练中间信息,多用于模型指标评估和恢复训练 | +| nb模型 | \*.nb | 经过飞桨Paddle-Lite工具优化后的模型,适用于移动端/IoT端等端侧部署场景(需使用飞桨Paddle Lite部署)。 | + +各个模型的关系如下面的示意图所示。 + +![](./images/model_prod_flow_ch.png) + +## 1. 
文本检测模型 + +### 1.1 中文检测模型 + +| 模型名称 | 模型简介 | 配置文件 | 推理模型大小 | 下载地址 | +| ------- | --------- | ------- | ------------ | ------- | +| ch_PP-OCRv4_det | 【最新】原始超轻量模型,支持中英文、多语种文本检测 | [ch_PP-OCRv4_det_cml.yml](../../configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_cml.yml) | 4.70M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_train.tar) | +| ch_PP-OCRv4_server_det | 【最新】原始高精度模型,支持中英文、多语种文本检测 | [ch_PP-OCRv4_det_teacher.yml](../../configs/det/ch_PP-OCRv4/ch_PP-OCRv4_det_teacher.yml) | 110M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_server_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_det_server_train.tar) | +| ch_PP-OCRv3_det_slim | slim量化+蒸馏版超轻量模型,支持中英文、多语种文本检测 | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_distill_train.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_slim_infer.nb) | +| ch_PP-OCRv3_det | 原始超轻量模型,支持中英文、多语种文本检测 | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 3.80M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar) | +| ch_PP-OCRv2_det_slim | slim量化+蒸馏版超轻量模型,支持中英文、多语种文本检测 | [ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml) | 3.0M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_quant_infer.tar)| +| ch_PP-OCRv2_det | 原始超轻量模型,支持中英文、多语种文本检测 | [ch_PP-OCRv2_det_cml.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_cml.yml) | 3.0M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar) | +| ch_ppocr_mobile_slim_v2.0_det | slim裁剪版超轻量模型,支持中英文、多语种文本检测 | [ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml) | 2.60M | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/slim/ch_ppocr_mobile_v2.0_det_prune_infer.tar) | +| ch_ppocr_mobile_v2.0_det | 原始超轻量模型,支持中英文、多语种文本检测 | [ch_det_mv3_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_mv3_db_v2.0.yml) | 3.0M | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar)| +| ch_ppocr_server_v2.0_det | 通用模型,支持中英文、多语种文本检测,比超轻量模型更大,但效果更好 | [ch_det_res18_db_v2.0.yml](../../configs/det/ch_ppocr_v2.0/ch_det_res18_db_v2.0.yml) | 47.0M | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar)| + +### 1.2 英文检测模型 + +| 模型名称 | 模型简介 | 配置文件 | 推理模型大小 | 下载地址| +| -------- | ----------- | --------- | ------------ | --------- | +| en_PP-OCRv3_det_slim | 【最新】slim量化版超轻量模型,支持英文、数字检测 | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_distill_train.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_slim_infer.nb) | +| 
en_PP-OCRv3_det | 【最新】原始超轻量模型,支持英文、数字检测 | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 3.8M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) | + +* 注:英文检测模型与中文检测模型结构完全相同,只有训练数据不同,在此仅提供相同的配置文件。 + +### 1.3 多语言检测模型 + +| 模型名称 | 模型简介 | 配置文件 | 推理模型大小 | 下载地址 | +| ------ | --------| --------- | ------------ | --------- | +| ml_PP-OCRv3_det_slim | 【最新】slim量化版超轻量模型,支持多语言检测 | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 1.1M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_distill_train.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_slim_infer.nb) | +| ml_PP-OCRv3_det | 【最新】原始超轻量模型,支持多语言检测 | [ch_PP-OCRv3_det_cml.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) | 3.8M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/Multilingual_PP-OCRv3_det_distill_train.tar) | + +* 注:多语言检测模型与中文检测模型结构完全相同,只有训练数据不同,在此仅提供相同的配置文件。 + +## 2. 文本识别模型 + +### 2.1 中文识别模型 + +| 模型名称 | 模型简介 | 配置文件 | 推理模型大小 | 下载地址 | +| ----- | --------- | ------ | ------------ | ------- | +| ch_PP-OCRv4_rec | 【最新】超轻量模型,支持中英文、数字识别 | [ch_PP-OCRv4_rec_distill.yml](../../configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_distill.yml) | 10M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_train.tar) | +| ch_PP-OCRv4_server_rec | 【最新】高精度模型,支持中英文、数字识别 | [ch_PP-OCRv4_rec_hgnet.yml](../../configs/rec/PP-OCRv4/ch_PP-OCRv4_rec_hgnet.yml) | 88M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_server_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_server_train.tar) | +| ch_PP-OCRv3_rec_slim | slim量化版超轻量模型,支持中英文、数字识别 | [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) | 4.9M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_train.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_slim_infer.nb) | +| ch_PP-OCRv3_rec | 原始超轻量模型,支持中英文、数字识别 | [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml) | 12.4M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) | +| ch_PP-OCRv2_rec_slim | slim量化版超轻量模型,支持中英文、数字识别 | [ch_PP-OCRv2_rec.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec.yml) | 9.0M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_quant_train.tar) | +| ch_PP-OCRv2_rec | 原始超轻量模型,支持中英文、数字识别 | [ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml) | 8.50M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) | +| ch_ppocr_mobile_slim_v2.0_rec | 
slim裁剪量化版超轻量模型,支持中英文、数字识别 | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | 6.0M | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_train.tar) | +| ch_ppocr_mobile_v2.0_rec | 原始超轻量模型,支持中英文、数字识别 | [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) | 5.20M | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) | +| ch_ppocr_server_v2.0_rec | 通用模型,支持中英文、数字识别 | [rec_chinese_common_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_common_train_v2.0.yml) | 94.8M | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_train.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) | + +**说明:** `训练模型`是基于预训练模型在真实数据与竖排合成文本数据上finetune得到的模型,在真实应用场景中有着更好的表现,`预训练模型`则是直接基于全量真实数据与合成数据训练得到,更适合用于在自己的数据集上finetune。 + +### 2.2 英文识别模型 + +| 模型名称 | 模型简介 | 配置文件| 推理模型大小 | 下载地址| +| ------- | -------- | ------ | ------------ | ----- | +| en_PP-OCRv4_rec | 【最新】原始超轻量模型,支持英文、数字识别 | [en_PP-OCRv4_rec.yml](../../configs/rec/PP-OCRv4/en_PP-OCRv4_rec.yml) | 9.7M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_train.tar) | +| en_PP-OCRv3_rec_slim | slim量化版超轻量模型,支持英文、数字识别 | [en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml) | 3.2M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_train.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_slim_infer.nb) | +| en_PP-OCRv3_rec | 原始超轻量模型,支持英文、数字识别 | [en_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml) | 9.6M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) | +| en_number_mobile_slim_v2.0_rec | slim裁剪量化版超轻量模型,支持英文、数字识别 | [rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml) | 2.7M | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/en_number_mobile_v2.0_rec_slim_train.tar) | +| en_number_mobile_v2.0_rec | 原始超轻量模型,支持英文、数字识别 | [rec_en_number_lite_train.yml](../../configs/rec/multi_language/rec_en_number_lite_train.yml) | 2.6M | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/multilingual/en_number_mobile_v2.0_rec_train.tar) | + +**注意:** 所有英文识别模型的字典文件均为`ppocr/utils/en_dict.txt` + +### 2.3 多语言识别模型(更多语言持续更新中...) 
+ +| 模型名称 | 字典文件 | 模型简介 | 配置文件 | 推理模型大小 | 下载地址 | +| ----- | ------ | ------------ | ------ | ------------ | ---- | +| korean_PP-OCRv3_rec | ppocr/utils/dict/korean_dict.txt | 韩文识别 | [korean_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/korean_PP-OCRv3_rec.yml) | 11.0M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/korean_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/korean_PP-OCRv3_rec_train.tar) | +| japan_PP-OCRv3_rec | ppocr/utils/dict/japan_dict.txt | 日文识别 | [japan_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/japan_PP-OCRv3_rec.yml) | 11.0M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/japan_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/japan_PP-OCRv3_rec_train.tar) | +| chinese_cht_PP-OCRv3_rec | ppocr/utils/dict/chinese_cht_dict.txt | 中文繁体识别 | [chinese_cht_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/chinese_cht_PP-OCRv3_rec.yml) | 12.0M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/chinese_cht_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/chinese_cht_PP-OCRv3_rec_train.tar) | +| te_PP-OCRv3_rec | ppocr/utils/dict/te_dict.txt | 泰卢固文识别 | [te_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/te_PP-OCRv3_rec.yml) | 9.6M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/te_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/te_PP-OCRv3_rec_train.tar) | +| ka_PP-OCRv3_rec | ppocr/utils/dict/ka_dict.txt | 卡纳达文识别 | [ka_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ka_PP-OCRv3_rec.yml) | 9.9M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ka_PP-OCRv3_rec_train.tar) | +| ta_PP-OCRv3_rec | ppocr/utils/dict/ta_dict.txt | 泰米尔文识别 | [ta_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/ta_PP-OCRv3_rec.yml) | 9.6M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/ta_PP-OCRv3_rec_train.tar) | +| latin_PP-OCRv3_rec | ppocr/utils/dict/latin_dict.txt | 拉丁文识别 | [latin_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/latin_PP-OCRv3_rec.yml) | 9.7M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_train.tar) | +| arabic_PP-OCRv3_rec | ppocr/utils/dict/arabic_dict.txt | 阿拉伯字母 | [arabic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/arabic_PP-OCRv3_rec.yml) | 9.6M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/arabic_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/arabic_PP-OCRv3_rec_train.tar) | +| cyrillic_PP-OCRv3_rec | ppocr/utils/dict/cyrillic_dict.txt | 斯拉夫字母 | [cyrillic_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/cyrillic_PP-OCRv3_rec.yml) | 9.6M | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/cyrillic_PP-OCRv3_rec_train.tar) | +| devanagari_PP-OCRv3_rec | ppocr/utils/dict/devanagari_dict.txt | 梵文字母 | [devanagari_PP-OCRv3_rec.yml](../../configs/rec/PP-OCRv3/multi_language/devanagari_PP-OCRv3_rec.yml) | 9.9M | 
[推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/devanagari_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/devanagari_PP-OCRv3_rec_train.tar) | + +查看完整语种列表与使用教程请参考: [多语言模型](./blog/multi_languages.md) + +## 3. 文本方向分类模型 + +| 模型名称 | 模型简介 | 配置文件 | 推理模型大小 | 下载地址 | +| ---------- | ------------- | ------------- | ------------ | -- | +| ch_ppocr_mobile_slim_v2.0_cls | slim量化版模型,对检测到的文本行文字角度分类 | [cls_mv3.yml](../../configs/cls/cls_mv3.yml) | 2.1M | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_slim_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_slim_infer.tar) / [nb模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb) | +| ch_ppocr_mobile_v2.0_cls | 原始分类器模型,对检测到的文本行文字角度分类 | [cls_mv3.yml](../../configs/cls/cls_mv3.yml) | 1.38M | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | + +## 4. Paddle-Lite 模型 + +Paddle-Lite 是一个高性能、轻量级、灵活性强且易于扩展的深度学习推理框架,它可以对inference模型进一步优化,得到适用于移动端/IoT端等端侧部署场景的`nb模型`。一般建议基于量化模型进行转换,因为可以将模型以INT8形式进行存储与推理,从而进一步减小模型大小,提升模型速度。 + +本节主要列出PP-OCRv2以及更早版本的检测与识别nb模型,最新版本的nb模型可以直接从上面的模型列表中获得。 + +| 模型版本 | 模型简介| 模型大小 | 检测模型 | 文本方向分类模型 | 识别模型 | Paddle-Lite版本 | +| ----- | --- | ---- | --- | --- | --- | --- | +| PP-OCRv2 | 蒸馏版超轻量中文OCR移动端模型 | 11.0M | [下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_infer_opt.nb) | [下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_infer_opt.nb) | [下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_infer_opt.nb) | v2.10 | +| PP-OCRv2(slim) | 蒸馏版超轻量中文OCR移动端模型 | 4.6M | [下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_det_slim_opt.nb) | [下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb) | [下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/lite/ch_PP-OCRv2_rec_slim_opt.nb) | v2.10 | +| PP-OCRv2 | 蒸馏版超轻量中文OCR移动端模型 | 11.0M | [下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer_opt.nb) | [下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_opt.nb) | [下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer_opt.nb) | v2.9 | +| PP-OCRv2(slim) | 蒸馏版超轻量中文OCR移动端模型 | 4.9M | [下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_slim_opt.nb) | [下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb) | [下载地址](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_slim_opt.nb) | v2.9 | +| V2.0 | ppocr_v2.0超轻量中文OCR移动端模型 | 7.8M | [下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_det_opt.nb) | [下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_opt.nb) | [下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_rec_opt.nb) | v2.9 | +| V2.0(slim) | ppocr_v2.0超轻量中文OCR移动端模型 | 3.3M | [下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_det_slim_opt.nb) | [下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_cls_slim_opt.nb) | [下载地址](https://paddleocr.bj.bcebos.com/dygraph_v2.0/lite/ch_ppocr_mobile_v2.0_rec_slim_opt.nb) | v2.9 | diff --git a/docs/ppocr/model_train/PPOCRv3_det_train.en.md b/docs/ppocr/model_train/PPOCRv3_det_train.en.md new file mode 100644 index 0000000000..836a61232a --- /dev/null +++ 
b/docs/ppocr/model_train/PPOCRv3_det_train.en.md @@ -0,0 +1,254 @@ +--- +comments: true +--- + +# PP-OCRv3 text detection model training + +## 1. Introduction + +PP-OCRv3 is a further upgrade of PP-OCRv2. This section introduces the training steps of the PP-OCRv3 detection model. For an introduction to the PP-OCRv3 strategy, refer to [document](../blog/PP-OCRv3_introduction.md). + +## 2. Detection training + +The PP-OCRv3 detection model is an upgrade of the [CML](https://arxiv.org/pdf/2109.03144.pdf) (Collaborative Mutual Learning) collaborative mutual learning text detection distillation strategy in PP-OCRv2. PP-OCRv3 further optimizes the detection teacher model and student model. Among them, when optimizing the teacher model, the PAN structure LK-PAN with a large receptive field and the DML (Deep Mutual Learning) distillation strategy are proposed; when optimizing the student model, the FPN structure RSE-FPN with a residual attention mechanism is proposed. + +PP-OCRv3 detection training includes two steps: + +- Step 1: Use DML distillation method to train detection teacher model + +- Step 2: Use the teacher model obtained in step 1 to train a lightweight student model using CML method + +### 2.1 Prepare data and operating environment + +The training data uses icdar2015 data. For the steps of preparing the training set, refer to [ocr_dataset](./dataset/ocr_datasets.md). + +For the preparation of the operating environment, refer to [document](./installation.md). + +### 2.2 Train the teacher model + +The configuration file for teacher model training is [ch_PP-OCRv3_det_dml.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.5/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml). The Backbone, Neck, and Head of the teacher model structure are Resnet50, LKPAN, and DBHead respectively, and are trained using the DML distillation method. For a detailed introduction to the configuration file, refer to [Document](./knowledge_distillation.md). 
+ +Download ImageNet pre-trained model: + +```bash linenums="1" +# Download ResNet50_vd pre-trained model +wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/ResNet50_vd_ssld_pretrained.pdparams +``` + +**Start training** + +```bash linenums="1" +# Single card training +python3 tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml \ +-o Architecture.Models.Student.pretrained=./pretrain_models/ResNet50_vd_ssld_pretrained \ +Architecture.Models.Student2.pretrained=./pretrain_models/ResNet50_vd_ssld_pretrained \ +Global.save_model_dir=./output/ +# If you want to use multi-GPU distributed training, please use the following command: +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml \ +-o Architecture.Models.Student.pretrained=./pretrain_models/ResNet50_vd_ssld_pretrained \ +Architecture.Models.Student2.pretrained=./pretrain_models/ResNet50_vd_ssld_pretrained \ +Global.save_model_dir=./output/ +``` + +The model saved during training is in the output directory, which contains the following files: + +```bash linenums="1" +best_accuracy.states +best_accuracy.pdparams # The model parameters with the best accuracy are saved by default +best_accuracy.pdopt # The optimizer-related parameters with the best accuracy are saved by default +latest.states +latest.pdparams # The latest model parameters saved by default +latest.pdopt # The optimizer-related parameters of the latest model saved by default +``` + +Among them, best_accuracy is the model parameter with the highest accuracy saved, and the model can be directly used for evaluation. + +The model evaluation command is as follows: + +```bash linenums="1" +python3 tools/eval.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml -o Global.checkpoints=./output/best_accuracy +``` + +The trained teacher model has a larger structure and higher accuracy, which is used to improve the accuracy of the student model. + +**Extract teacher model parameters** +best_accuracy contains the parameters of two models, corresponding to Student and Student2 in the configuration file. The method to extract the parameters of Student is as follows: + +```bash linenums="1" +import paddle +# Load pre-trained model +all_params = paddle.load("output/best_accuracy.pdparams") +# View the keys of weight parameters +print(all_params.keys()) +# Model weight extraction +s_params = {key[len("Student."):]: all_params[key] for key in all_params if "Student." in key} +# View the keys of model weight parameters +print(s_params.keys()) +# Save +paddle.save(s_params, "./pretrain_models/dml_teacher.pdparams") +``` + +The extracted model parameters can be used for further fine-tuning or distillation training of the model. + +### 2.3 Training the student model + +The configuration file for training the student model is [ch_PP-OCRv3_det_cml.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.5/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) +The teacher model trained in the previous section is used as supervision, and the CML method is used to train a lightweight student model. 
+ +Download the ImageNet pre-trained model of the student model: + +```bash linenums="1" +# Download the pre-trained model of MobileNetV3 +wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/MobileNetV3_large_x0_5_pretrained.pdparams +``` + +**Start training** + +```bash linenums="1" +# Single card training +python3 tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml \ +-o Architecture.Models.Student.pretrained=./pretrain_models/MobileNetV3_large_x0_5_pretrained \ +Architecture.Models.Student2.pretrained=./pretrain_models/MobileNetV3_large_x0_5_pretrained \ +Architecture.Models.Teacher.pretrained=./pretrain_models/dml_teacher \ +Global.save_model_dir=./output/ +# If you want to use multi-GPU distributed training, please use the following command: +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml \ +-o Architecture.Models.Student.pretrained=./pretrain_models/MobileNetV3_large_x0_5_pretrained \ +Architecture.Models.Student2.pretrained=./pretrain_models/MobileNetV3_large_x0_5_pretrained \ +Architecture.Models.Teacher.pretrained=./pretrain_models/dml_teacher \ +Global.save_model_dir=./output/ +``` + +The model saved during the training process is in the output directory. +The model evaluation command is as follows: + +```bash linenums="1" +python3 tools/eval.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.checkpoints=./output/best_accuracy +``` + +best_accuracy contains the parameters of three models, corresponding to Student, Student2, and Teacher in the configuration file. The method to extract Student parameters is as follows: + +```bash linenums="1" +import paddle +# Load pre-trained model +all_params = paddle.load("output/best_accuracy.pdparams") +# View the keys of weight parameters +print(all_params.keys()) +# Model weight extraction +s_params = {key[len("Student."):]: all_params[key] for key in all_params if "Student." in key} +# View the keys of model weight parameters +print(s_params.keys()) +# Save +paddle.save(s_params, "./pretrain_models/cml_student.pdparams") +``` + +The extracted Student parameters can be used for model deployment or further fine-tuning training. + +## 3. Fine-tune training based on PP-OCRv3 detection + +This section describes how to use the PP-OCRv3 detection model for fine-tune training in other scenarios. + +Fine-tune training is applicable to three scenarios: + +- Fine-tune training based on the CML distillation method is applicable to scenarios where the teacher model has higher accuracy than the PP-OCRv3 detection model in the usage scenario and a lightweight detection model is desired. + +- Fine-tune training based on the PP-OCRv3 lightweight detection model does not require the training of the teacher model and is intended to improve the accuracy of the usage scenario based on the PP-OCRv3 detection model. + +- Fine-tune training based on the DML distillation method is applicable to scenarios where the DML method is used to further improve accuracy. + +**Finetune training based on CML distillation method** + +Download PP-OCRv3 training model: + +```bash linenums="1" +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar +tar xf ch_PP-OCRv3_det_distill_train.tar +``` + +ch_PP-OCRv3_det_distill_train/best_accuracy.pdparams contains the parameters of Student, Student2, and Teacher models in the CML configuration file. 
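+
+If it is unclear which sub-models a downloaded distillation checkpoint actually contains, the top-level prefixes can be listed before training. The snippet below is only an illustrative sketch (it is not part of the official tooling) and assumes the checkpoint path used above:
+
+```python linenums="1"
+import paddle
+
+# Load the CML distillation checkpoint and list the sub-model prefixes it contains
+all_params = paddle.load("ch_PP-OCRv3_det_distill_train/best_accuracy.pdparams")
+prefixes = sorted({key.split(".")[0] for key in all_params})
+print(prefixes)  # expected to include Student, Student2 and Teacher for this checkpoint
+```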
+ +Start training: + +```bash linenums="1" +# Single card training +python3 tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml \ +-o Global.pretrained_model=./ch_PP-OCRv3_det_distill_train/best_accuracy \ +Global.save_model_dir=./output/ +# If you want to use multi-GPU distributed training, please use the following command: +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml \ +-o Global.pretrained_model=./ch_PP-OCRv3_det_distill_train/best_accuracy \ +Global.save_model_dir=./output/ +``` + +**Finetune training based on PP-OCRv3 lightweight detection model** + +Download PP-OCRv3 training model and extract model parameters of Student structure: + +``` +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar +tar xf ch_PP-OCRv3_det_distill_train.tar +``` + +The method to extract Student parameters is as follows: + +```bash linenums="1" +import paddle +# Load pre-trained model +all_params = paddle.load("output/best_accuracy.pdparams") +# View the keys of weight parameters +print(all_params.keys()) +# Model weight extraction +s_params = {key[len("Student."):]: all_params[key] for key in all_params if "Student." in key} +# View the keys of the model weight parameters +print(s_params.keys()) +# Save +paddle.save(s_params, "./student.pdparams") +``` + +Train using the configuration file [ch_PP-OCRv3_det_student.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.5/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml). + +**Start training** + +```bash linenums="1" +# Single card training +python3 tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml \ +-o Global.pretrained_model=./student \ +Global.save_model_dir=./output/ +# If you want to use multi-GPU distributed training, please use the following command: +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml \ +-o Global.pretrained_model=./student \ +Global.save_model_dir=./output/ +``` + +**Finetune training based on DML distillation method** + +Take the Teacher model in ch_PP-OCRv3_det_distill_train as an example. First, extract the parameters of the Teacher structure. The method is as follows: + +```bash linenums="1" +import paddle +# Load pre-trained model +all_params = paddle.load("ch_PP-OCRv3_det_distill_train/best_accuracy.pdparams") +# View the keys of weight parameters +print(all_params.keys()) +# Model weight extraction +s_params = {key[len("Teacher."):]: all_params[key] for key in all_params if "Teacher." 
in key} +# View the keys of model weight parameters +print(s_params.keys()) +# Save +paddle.save(s_params, "./teacher.pdparams") +``` + +**Start training** + +```bash linenums="1" +# Single card training +python3 tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml \ +-o Architecture.Models.Student.pretrained=./teacher \ +Architecture.Models.Student2.pretrained=./teacher \ +Global.save_model_dir=./output/ +# If you want to use multi-GPU distributed training, please use the following command: +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml \ +-o Architecture.Models.Student.pretrained=./teacher \ +Architecture.Models.Student2.pretrained=./teacher \ +Global.save_model_dir=./output/ +``` diff --git a/docs/ppocr/model_train/PPOCRv3_det_train.md b/docs/ppocr/model_train/PPOCRv3_det_train.md new file mode 100644 index 0000000000..abdff65475 --- /dev/null +++ b/docs/ppocr/model_train/PPOCRv3_det_train.md @@ -0,0 +1,251 @@ +--- +comments: true +--- + +# PP-OCRv3 文本检测模型训练 + +## 1. 简介 + +PP-OCRv3在PP-OCRv2的基础上进一步升级。本节介绍PP-OCRv3检测模型的训练步骤。有关PP-OCRv3策略介绍参考[文档](../blog/PP-OCRv3_introduction.md)。 + +## 2. 检测训练 + +PP-OCRv3检测模型是对PP-OCRv2中的[CML](https://arxiv.org/pdf/2109.03144.pdf)(Collaborative Mutual Learning) 协同互学习文本检测蒸馏策略进行了升级。PP-OCRv3分别针对检测教师模型和学生模型进行进一步效果优化。其中,在对教师模型优化时,提出了大感受野的PAN结构LK-PAN和引入了DML(Deep Mutual Learning)蒸馏策略;在对学生模型优化时,提出了残差注意力机制的FPN结构RSE-FPN。 + +PP-OCRv3检测训练包括两个步骤: + +- 步骤1:采用DML蒸馏方法训练检测教师模型 +- 步骤2:使用步骤1得到的教师模型采用CML方法训练出轻量学生模型 + +### 2.1 准备数据和运行环境 + +训练数据采用icdar2015数据,准备训练集步骤参考[ocr_dataset](./dataset/ocr_datasets.md). + +运行环境准备参考[文档](./installation.md)。 + +### 2.2 训练教师模型 + +教师模型训练的配置文件是[ch_PP-OCRv3_det_dml.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.5/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml)。教师模型模型结构的Backbone、Neck、Head分别为Resnet50, LKPAN, DBHead,采用DML的蒸馏方法训练。有关配置文件的详细介绍参考[文档](./knowledge_distillation.md)。 + +下载ImageNet预训练模型: + +```bash linenums="1" +# 下载ResNet50_vd的预训练模型 +wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/ResNet50_vd_ssld_pretrained.pdparams +``` + +**启动训练** + +```bash linenums="1" +# 单卡训练 +python3 tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml \ + -o Architecture.Models.Student.pretrained=./pretrain_models/ResNet50_vd_ssld_pretrained \ + Architecture.Models.Student2.pretrained=./pretrain_models/ResNet50_vd_ssld_pretrained \ + Global.save_model_dir=./output/ +# 如果要使用多GPU分布式训练,请使用如下命令: +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml \ + -o Architecture.Models.Student.pretrained=./pretrain_models/ResNet50_vd_ssld_pretrained \ + Architecture.Models.Student2.pretrained=./pretrain_models/ResNet50_vd_ssld_pretrained \ + Global.save_model_dir=./output/ +``` + +训练过程中保存的模型在output目录下,包含以下文件: + +```bash linenums="1" +best_accuracy.states +best_accuracy.pdparams # 默认保存最优精度的模型参数 +best_accuracy.pdopt # 默认保存最优精度的优化器相关参数 +latest.states +latest.pdparams # 默认保存的最新模型参数 +latest.pdopt # 默认保存的最新模型的优化器相关参数 +``` + +其中,best_accuracy是保存的精度最高的模型参数,可以直接使用该模型评估。 + +模型评估命令如下: + +```bash linenums="1" +python3 tools/eval.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml -o Global.checkpoints=./output/best_accuracy +``` + +训练的教师模型结构更大,精度更高,用于提升学生模型的精度。 + +**提取教师模型参数** +best_accuracy包含两个模型的参数,分别对应配置文件中的Student,Student2。提取Student的参数方法如下: + +```bash linenums="1" +import paddle +# 加载预训练模型 +all_params = paddle.load("output/best_accuracy.pdparams") +# 查看权重参数的keys 
+print(all_params.keys()) +# 模型的权重提取 +s_params = {key[len("Student."):]: all_params[key] for key in all_params if "Student." in key} +# 查看模型权重参数的keys +print(s_params.keys()) +# 保存 +paddle.save(s_params, "./pretrain_models/dml_teacher.pdparams") +``` + +提取出来的模型参数可以用于模型进一步的finetune训练或者蒸馏训练。 + +### 2.3 训练学生模型 + +训练学生模型的配置文件是[ch_PP-OCRv3_det_cml.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.5/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml) +上一节训练得到的教师模型作为监督,采用CML方式训练得到轻量的学生模型。 + +下载学生模型的ImageNet预训练模型: + +```bash linenums="1" +# 下载MobileNetV3的预训练模型 +wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/MobileNetV3_large_x0_5_pretrained.pdparams +``` + +**启动训练** + +```bash linenums="1" +# 单卡训练 +python3 tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml \ + -o Architecture.Models.Student.pretrained=./pretrain_models/MobileNetV3_large_x0_5_pretrained \ + Architecture.Models.Student2.pretrained=./pretrain_models/MobileNetV3_large_x0_5_pretrained \ + Architecture.Models.Teacher.pretrained=./pretrain_models/dml_teacher \ + Global.save_model_dir=./output/ +# 如果要使用多GPU分布式训练,请使用如下命令: +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml \ + -o Architecture.Models.Student.pretrained=./pretrain_models/MobileNetV3_large_x0_5_pretrained \ + Architecture.Models.Student2.pretrained=./pretrain_models/MobileNetV3_large_x0_5_pretrained \ + Architecture.Models.Teacher.pretrained=./pretrain_models/dml_teacher \ + Global.save_model_dir=./output/ +``` + +训练过程中保存的模型在output目录下, +模型评估命令如下: + +```bash linenums="1" +python3 tools/eval.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml -o Global.checkpoints=./output/best_accuracy +``` + +best_accuracy包含三个模型的参数,分别对应配置文件中的Student,Student2,Teacher。提取Student参数的方法如下: + +```bash linenums="1" +import paddle +# 加载预训练模型 +all_params = paddle.load("output/best_accuracy.pdparams") +# 查看权重参数的keys +print(all_params.keys()) +# 模型的权重提取 +s_params = {key[len("Student."):]: all_params[key] for key in all_params if "Student." in key} +# 查看模型权重参数的keys +print(s_params.keys()) +# 保存 +paddle.save(s_params, "./pretrain_models/cml_student.pdparams") +``` + +提取出来的Student的参数可用于模型部署或者做进一步的finetune训练。 + +## 3. 
基于PP-OCRv3检测finetune训练 + +本节介绍如何使用PP-OCRv3检测模型在其他场景上的finetune训练。 + +finetune训练适用于三种场景: + +- 基于CML蒸馏方法的finetune训练,适用于教师模型在使用场景上精度高于PP-OCRv3检测模型,且希望得到一个轻量检测模型。 +- 基于PP-OCRv3轻量检测模型的finetune训练,无需训练教师模型,希望在PP-OCRv3检测模型基础上提升使用场景上的精度。 +- 基于DML蒸馏方法的finetune训练,适用于采用DML方法进一步提升精度的场景。 + +**基于CML蒸馏方法的finetune训练** + +下载PP-OCRv3训练模型: + +```bash linenums="1" +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar +tar xf ch_PP-OCRv3_det_distill_train.tar +``` + +ch_PP-OCRv3_det_distill_train/best_accuracy.pdparams包含CML配置文件中Student、Student2、Teacher模型的参数。 + +启动训练: + +```bash linenums="1" +# 单卡训练 +python3 tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml \ + -o Global.pretrained_model=./ch_PP-OCRv3_det_distill_train/best_accuracy \ + Global.save_model_dir=./output/ +# 如果要使用多GPU分布式训练,请使用如下命令: +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_cml.yml \ + -o Global.pretrained_model=./ch_PP-OCRv3_det_distill_train/best_accuracy \ + Global.save_model_dir=./output/ +``` + +**基于PP-OCRv3轻量检测模型的finetune训练** + +下载PP-OCRv3训练模型,并提取Student结构的模型参数: + +``` +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar +tar xf ch_PP-OCRv3_det_distill_train.tar +``` + +提取Student参数的方法如下: + +```bash linenums="1" +import paddle +# 加载预训练模型 +all_params = paddle.load("output/best_accuracy.pdparams") +# 查看权重参数的keys +print(all_params.keys()) +# 模型的权重提取 +s_params = {key[len("Student."):]: all_params[key] for key in all_params if "Student." in key} +# 查看模型权重参数的keys +print(s_params.keys()) +# 保存 +paddle.save(s_params, "./student.pdparams") +``` + +使用配置文件[ch_PP-OCRv3_det_student.yml](https://github.com/PaddlePaddle/PaddleOCR/blob/release%2F2.5/configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml)训练。 + +**启动训练** + +```bash linenums="1" +# 单卡训练 +python3 tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml \ + -o Global.pretrained_model=./student \ + Global.save_model_dir=./output/ +# 如果要使用多GPU分布式训练,请使用如下命令: +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml \ + -o Global.pretrained_model=./student \ + Global.save_model_dir=./output/ +``` + +**基于DML蒸馏方法的finetune训练** + +以ch_PP-OCRv3_det_distill_train中的Teacher模型为例,首先提取Teacher结构的参数,方法如下: + +```bash linenums="1" +import paddle +# 加载预训练模型 +all_params = paddle.load("ch_PP-OCRv3_det_distill_train/best_accuracy.pdparams") +# 查看权重参数的keys +print(all_params.keys()) +# 模型的权重提取 +s_params = {key[len("Teacher."):]: all_params[key] for key in all_params if "Teacher." 
in key} +# 查看模型权重参数的keys +print(s_params.keys()) +# 保存 +paddle.save(s_params, "./teacher.pdparams") +``` + +**启动训练** + +```bash linenums="1" +# 单卡训练 +python3 tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml \ + -o Architecture.Models.Student.pretrained=./teacher \ + Architecture.Models.Student2.pretrained=./teacher \ + Global.save_model_dir=./output/ +# 如果要使用多GPU分布式训练,请使用如下命令: +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_dml.yml \ + -o Architecture.Models.Student.pretrained=./teacher \ + Architecture.Models.Student2.pretrained=./teacher \ + Global.save_model_dir=./output/ +``` diff --git a/docs/ppocr/model_train/angle_class.en.md b/docs/ppocr/model_train/angle_class.en.md new file mode 100644 index 0000000000..48f50be72f --- /dev/null +++ b/docs/ppocr/model_train/angle_class.en.md @@ -0,0 +1,149 @@ +--- +comments: true +--- + +# Text Direction Classification + +## 1. Method Introduction + +The angle classification is used in the scene where the image is not 0 degrees. In this scene, it is necessary to perform a correction operation on the text line detected in the picture. In the PaddleOCR system, +The text line image obtained after text detection is sent to the recognition model after affine transformation. At this time, only a 0 and 180 degree angle classification of the text is required, so the built-in PaddleOCR text angle classifier **only supports 0 and 180 degree classification**. If you want to support more angles, you can modify the algorithm yourself to support. + +Example of 0 and 180 degree data samples: + +![img](./images/angle_class_example.jpg) + +## 2. Data Preparation + +Please organize the dataset as follows: + +The default storage path for training data is `PaddleOCR/train_data/cls`, if you already have a dataset on your disk, just create a soft link to the dataset directory: + +```bash linenums="1" +ln -sf /train_data/cls/dataset +``` + +please refer to the following to organize your data. + +### Training set + +First put the training images in the same folder (train_images), and use a txt file (cls_gt_train.txt) to store the image path and label. + +- Note: by default, the image path and image label are split with `\t`, if you use other methods to split, it will cause training error + +0 and 180 indicate that the angle of the image is 0 degrees and 180 degrees, respectively. + +```text linenums="1" +" Image file name Image annotation " + +train/word_001.jpg 0 +train/word_002.jpg 180 +``` + +The final training set should have the following file structure: + +```text linenums="1" +|-train_data + |-cls + |- cls_gt_train.txt + |- train + |- word_001.png + |- word_002.jpg + |- word_003.jpg + | ... +``` + +### Test set + +Similar to the training set, the test set also needs to be provided a folder +containing all images (test) and a cls_gt_test.txt. The structure of the test set is as follows: + +```text linenums="1" +|-train_data + |-cls + |- cls_gt_test.txt + |- test + |- word_001.jpg + |- word_002.jpg + |- word_003.jpg + | ... +``` + +## 3. Training + +Write the prepared txt file and image folder path into the configuration file under the `Train/Eval.dataset.label_file_list` and `Train/Eval.dataset.data_dir` fields, the absolute path of the image consists of the `Train/Eval.dataset.data_dir` field and the image name recorded in the txt file. + +PaddleOCR provides training scripts, evaluation scripts, and prediction scripts. 
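+
+Because a label file that is not tab-separated will cause a training error (see the note in section 2), it can be worth sanity-checking it before launching training. The standalone script below is only an illustrative sketch and not part of PaddleOCR; the paths are examples and should be replaced with your own `data_dir` and `label_file_list` values:
+
+```python linenums="1"
+import os
+
+# Example paths -- adjust to your Train.dataset.data_dir / label_file_list settings
+data_dir = "./train_data/cls"
+label_file = "./train_data/cls/cls_gt_train.txt"
+
+with open(label_file, "r", encoding="utf-8") as f:
+    for i, line in enumerate(f, 1):
+        parts = line.rstrip("\n").split("\t")
+        if len(parts) != 2 or parts[1] not in ("0", "180"):
+            print(f"line {i}: expected '<image path>\\t<0|180>', got {line!r}")
+        elif not os.path.exists(os.path.join(data_dir, parts[0])):
+            print(f"line {i}: image not found: {parts[0]}")
+```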
+ +### Start training + +```bash linenums="1" +# Set PYTHONPATH path +export PYTHONPATH=$PYTHONPATH:. +# GPU training Support single card and multi-card training, specify the card number through --gpus. +# Start training, the following command has been written into the train.sh file, just modify the configuration file path in the file +python3 -m paddle.distributed.launch --gpus '0,1,2,3,4,5,6,7' tools/train.py -c configs/cls/cls_mv3.yml +``` + +### Data Augmentation + +PaddleOCR provides a variety of data augmentation methods. If you want to add disturbance during training, Please uncomment the `RecAug` and `RandAugment` fields under `Train.dataset.transforms` in the configuration file. + +The default perturbation methods are: cvtColor, blur, jitter, Gauss noise, random crop, perspective, color reverse, RandAugment. + +Except for RandAugment, each disturbance method is selected with a 50% probability during the training process. For specific code implementation, please refer to: +[rec_img_aug.py](../../ppocr/data/imaug/rec_img_aug.py) +[randaugment.py](../../ppocr/data/imaug/randaugment.py) + +### Training + +PaddleOCR supports alternating training and evaluation. You can modify `eval_batch_step` in `configs/cls/cls_mv3.yml` to set the evaluation frequency. By default, it is evaluated every 1000 iter. The following content will be saved during training: + +```bash linenums="1" +├── best_accuracy.pdopt # Optimizer parameters for the best model +├── best_accuracy.pdparams # Parameters of the best model +├── best_accuracy.states # Metric info and epochs of the best model +├── config.yml # Configuration file for this experiment +├── latest.pdopt # Optimizer parameters for the latest model +├── latest.pdparams # Parameters of the latest model +├── latest.states # Metric info and epochs of the latest model +└── train.log # Training log +``` + +If the evaluation set is large, the test will be time-consuming. It is recommended to reduce the number of evaluations, or evaluate after training. + +**Note that the configuration file for prediction/evaluation must be consistent with the training.** + +## 4. Evaluation + +The evaluation dataset can be set by modifying the `Eval.dataset.label_file_list` field in the `configs/cls/cls_mv3.yml` file. + +```bash linenums="1" +export CUDA_VISIBLE_DEVICES=0 +# GPU evaluation, Global.checkpoints is the weight to be tested +python3 tools/eval.py -c configs/cls/cls_mv3.yml -o Global.checkpoints={path/to/weights}/best_accuracy +``` + +## 5. Prediction + +### Training engine prediction + +Using the model trained by paddleocr, you can quickly get prediction through the following script. + +Use `Global.infer_img` to specify the path of the predicted picture or folder, and use `Global.checkpoints` to specify the weight: + +```bash linenums="1" +# Predict English results +python3 tools/infer_cls.py -c configs/cls/cls_mv3.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.load_static_weights=false Global.infer_img=doc/imgs_words_en/word_10.png +``` + +Input image: + +![](./images/word_1-20240704092713071.jpg) + +Get the prediction result of the input image: + +```bash linenums="1" +infer_img: doc/imgs_words_en/word_10.png + result: ('0', 0.9999995) +``` diff --git a/docs/ppocr/model_train/angle_class.md b/docs/ppocr/model_train/angle_class.md new file mode 100644 index 0000000000..1eaf5e9bdd --- /dev/null +++ b/docs/ppocr/model_train/angle_class.md @@ -0,0 +1,151 @@ +--- +comments: true +typora-copy-images-to: images +--- + +# 文本方向分类器 + +## 1. 
方法介绍 + +文本方向分类器主要用于图片非0度的场景下,在这种场景下需要对图片里检测到的文本行进行一个转正的操作。在PaddleOCR系统内, +文字检测之后得到的文本行图片经过仿射变换之后送入识别模型,此时只需要对文字进行一个0和180度的角度分类,因此PaddleOCR内置的 +文本方向分类器**只支持了0和180度的分类**。如果想支持更多角度,可以自己修改算法进行支持。 + +0和180度数据样本例子: + +![img](./images/angle_class_example.jpg) + +## 2. 数据准备 + +请按如下步骤设置数据集: + +训练数据的默认存储路径是 `PaddleOCR/train_data/cls`,如果您的磁盘上已有数据集,只需创建软链接至数据集目录: + +```bash linenums="1" +ln -sf /train_data/cls/dataset +``` + +请参考下文组织您的数据。 + +### 训练集 + +首先建议将训练图片放入同一个文件夹,并用一个txt文件(cls_gt_train.txt)记录图片路径和标签。 + +**注意:** 默认请将图片路径和图片标签用 `\t` 分割,如用其他方式分割将造成训练报错 + +0和180分别表示图片的角度为0度和180度 + +```text linenums="1" +" 图像文件名 图像标注信息 " +train/cls/train/word_001.jpg 0 +train/cls/train/word_002.jpg 180 +``` + +最终训练集应有如下文件结构: + +```text linenums="1" +|-train_data + |-cls + |- cls_gt_train.txt + |- train + |- word_001.png + |- word_002.jpg + |- word_003.jpg + | ... +``` + +### 测试集 + +同训练集类似,测试集也需要提供一个包含所有图片的文件夹(test)和一个cls_gt_test.txt,测试集的结构如下所示: + +```text linenums="1" +|-train_data + |-cls + |- cls_gt_test.txt + |- test + |- word_001.jpg + |- word_002.jpg + |- word_003.jpg + | ... +``` + +## 3. 启动训练 + +将准备好的txt文件和图片文件夹路径分别写入配置文件的 `Train/Eval.dataset.label_file_list` 和 `Train/Eval.dataset.data_dir` 字段下,`Train/Eval.dataset.data_dir`字段下的路径和文件里记载的图片名构成了图片的绝对路径。 + +PaddleOCR提供了训练脚本、评估脚本和预测脚本。 + +### 开始训练 + +*如果您安装的是cpu版本,请将配置文件中的 `use_gpu` 字段修改为false* + +```bash linenums="1" +# GPU训练 支持单卡,多卡训练,通过 '--gpus' 指定卡号。 +# 启动训练,下面的命令已经写入train.sh文件中,只需修改文件里的配置文件路径即可 +python3 -m paddle.distributed.launch --gpus '0,1,2,3,4,5,6,7' tools/train.py -c configs/cls/cls_mv3.yml +``` + +### 数据增强 + +PaddleOCR提供了多种数据增强方式,如果您希望在训练时加入扰动,请在配置文件中取消`Train.dataset.transforms`下的`RecAug`和`RandAugment`字段的注释。 + +默认的扰动方式有:颜色空间转换(cvtColor)、模糊(blur)、抖动(jitter)、噪声(Gasuss noise)、随机切割(random crop)、透视(perspective)、颜色反转(reverse),随机数据增强(RandAugment)。 + +训练过程中除随机数据增强外每种扰动方式以50%的概率被选择,具体代码实现请参考: +[rec_img_aug.py](../../ppocr/data/imaug/rec_img_aug.py) +[randaugment.py](../../ppocr/data/imaug/randaugment.py) + +*由于OpenCV的兼容性问题,扰动操作暂时只支持linux* + +## 4. 训练 + +PaddleOCR支持训练和评估交替进行, 可以在 `configs/cls/cls_mv3.yml` 中修改 `eval_batch_step` 设置评估频率,默认每1000个iter评估一次。训练过程中将会保存如下内容: + +```bash linenums="1" +├── best_accuracy.pdopt # 最佳模型的优化器参数 +├── best_accuracy.pdparams # 最佳模型的参数 +├── best_accuracy.states # 最佳模型的指标和epoch等信息 +├── config.yml # 本次实验的配置文件 +├── latest.pdopt # 最新模型的优化器参数 +├── latest.pdparams # 最新模型的参数 +├── latest.states # 最新模型的指标和epoch等信息 +└── train.log # 训练日志 +``` + +如果验证集很大,测试将会比较耗时,建议减少评估次数,或训练完再进行评估。 + +**注意,预测/评估时的配置文件请务必与训练一致。** + +## 5. 评估 + +评估数据集可以通过修改`configs/cls/cls_mv3.yml`文件里的`Eval.dataset.label_file_list` 字段设置。 + +```bash linenums="1" +export CUDA_VISIBLE_DEVICES=0 +# GPU 评估, Global.checkpoints 为待测权重 +python3 tools/eval.py -c configs/cls/cls_mv3.yml -o Global.checkpoints={path/to/weights}/best_accuracy +``` + +## 6. 
预测 + +### 训练引擎的预测 + +使用 PaddleOCR 训练好的模型,可以通过以下脚本进行快速预测。 + +通过 `Global.infer_img` 指定预测图片或文件夹路径,通过 `Global.checkpoints` 指定权重: + +```bash linenums="1" +# 预测分类结果 +python3 tools/infer_cls.py -c configs/cls/cls_mv3.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.load_static_weights=false Global.infer_img=doc/imgs_words/ch/word_1.jpg +``` + +预测图片: + +![img](./images/word_1-20240704092713071.jpg) + +得到输入图像的预测结果: + +```bash linenums="1" +infer_img: doc/imgs_words/ch/word_1.jpg + result: ('0', 0.9998784) +``` diff --git a/docs/ppocr/model_train/detection.en.md b/docs/ppocr/model_train/detection.en.md new file mode 100644 index 0000000000..32ed048e4b --- /dev/null +++ b/docs/ppocr/model_train/detection.en.md @@ -0,0 +1,248 @@ +--- +comments: true +--- + +# Text Detection + +This section uses the icdar2015 dataset as an example to introduce the training, evaluation, and testing of the detection model in PaddleOCR. + +## 1. Data and Weights Preparation + +### 1.1 Data Preparation + +To prepare datasets, refer to [ocr_datasets](../../datasets/ocr_datasets.en.md) . + +### 1.2 Download Pre-trained Model + +First download the pre-trained model. The detection model of PaddleOCR currently supports 3 backbones, namely MobileNetV3, ResNet18_vd and ResNet50_vd. You can use the model in [PaddleClas](https://github.com/PaddlePaddle/PaddleClas/tree/release/2.0/ppcls/modeling/architectures) to replace backbone according to your needs. +And the responding download link of backbone pre-trained weights can be found in (). + +```bash linenums="1" +cd PaddleOCR/ +# Download the pre-trained model of MobileNetV3 +wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/MobileNetV3_large_x0_5_pretrained.pdparams +# or, download the pre-trained model of ResNet18_vd +wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/ResNet18_vd_pretrained.pdparams +# or, download the pre-trained model of ResNet50_vd +wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/ResNet50_vd_ssld_pretrained.pdparams + +``` + +## 2. Training + +### 2.1 Start Training + +*If CPU version installed, please set the parameter `use_gpu` to `false` in the configuration.* + +```bash linenums="1" +python3 tools/train.py -c configs/det/det_mv3_db.yml \ + -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained +``` + +In the above instruction, use `-c` to select the training to use the `configs/det/det_mv3_db.yml` configuration file. +For a detailed explanation of the configuration file, please refer to [config](../blog/config.en.md). + +You can also use `-o` to change the training parameters without modifying the yml file. For example, adjust the training learning rate to 0.0001 + +```bash linenums="1" +# single GPU training +python3 tools/train.py -c configs/det/det_mv3_db.yml -o \ + Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained \ + Optimizer.base_lr=0.0001 + +# multi-GPU training +# Set the GPU ID used by the '--gpus' parameter. +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/det/det_mv3_db.yml -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained + +# multi-Node, multi-GPU training +# Set the IPs of your nodes used by the '--ips' parameter. Set the GPU ID used by the '--gpus' parameter. 
+python3 -m paddle.distributed.launch --ips="xx.xx.xx.xx,xx.xx.xx.xx" --gpus '0,1,2,3' tools/train.py -c configs/det/det_mv3_db.yml \ + -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained +``` + +**Note:** For multi-Node multi-GPU training, you need to replace the `ips` value in the preceding command with the address of your machine, and the machines must be able to ping each other. In addition, it requires activating commands separately on multiple machines when we start the training. The command for viewing the IP address of the machine is `ifconfig`. + +If you want to further speed up the training, you can use [automatic mixed precision training](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/01_paddle2.0_introduction/basic_concept/amp_en.html). for single card training, the command is as follows: + +```bash linenums="1" +python3 tools/train.py -c configs/det/det_mv3_db.yml \ + -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained \ + Global.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True +``` + +### 2.2 Load Trained Model and Continue Training + +If you expect to load trained model and continue the training again, you can specify the parameter `Global.checkpoints` as the model path to be loaded. + +For example: + +```bash linenums="1" +python3 tools/train.py -c configs/det/det_mv3_db.yml -o Global.checkpoints=./your/trained/model +``` + +**Note**: The priority of `Global.checkpoints` is higher than that of `Global.pretrained_model`, that is, when two parameters are specified at the same time, the model specified by `Global.checkpoints` will be loaded first. If the model path specified by `Global.checkpoints` is wrong, the one specified by `Global.pretrained_model` will be loaded. + +### 2.3 Training with New Backbone + +The network part completes the construction of the network, and PaddleOCR divides the network into four parts, which are under [ppocr/modeling](../../ppocr/modeling). The data entering the network will pass through these four parts in sequence(transforms->backbones-> +necks->heads). + +```bash linenums="1" +├── architectures # Code for building network +├── transforms # Image Transformation Module +├── backbones # Feature extraction module +├── necks # Feature enhancement module +└── heads # Output module +``` + +If the Backbone to be replaced has a corresponding implementation in PaddleOCR, you can directly modify the parameters in the `Backbone` part of the configuration yml file. + +However, if you want to use a new Backbone, an example of replacing the backbones is as follows: + +1. Create a new file under the [ppocr/modeling/backbones](../../ppocr/modeling/backbones) folder, such as my_backbone.py. +2. Add code in the my_backbone.py file, the sample code is as follows: + +```python linenums="1" +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class MyBackbone(nn.Layer): + def __init__(self, *args, **kwargs): + super(MyBackbone, self).__init__() + # your init code + self.conv = nn.xxxx + + def forward(self, inputs): + # your network forward + y = self.conv(inputs) + return y +``` + +3. Import the added module in the [ppocr/modeling/backbones/\_*init\_*.py](../../ppocr/modeling/backbones/__init__.py) file. 
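+
+For step 3, a minimal sketch of the change is shown below. The exact contents of `__init__.py` vary between PaddleOCR versions, so mirror the pattern already used there for the existing backbones rather than copying this verbatim:
+
+```python linenums="1"
+# ppocr/modeling/backbones/__init__.py -- illustrative excerpt only
+from .my_backbone import MyBackbone
+
+# Also add "MyBackbone" to the list of supported backbone names used by the
+# backbone builder in this file, alongside the entries that already exist.
+```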
+ +After adding the four-part modules of the network, you only need to configure them in the configuration file to use, such as: + +```yaml linenums="1" + Backbone: + name: MyBackbone + args1: args1 +``` + +**NOTE**: More details about replace Backbone and other mudule can be found in [doc](../../algorithm/add_new_algorithm.en.md). + +### 2.4 Mixed Precision Training + +If you want to speed up your training further, you can use [Auto Mixed Precision Training](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/01_paddle2.0_introduction/basic_concept/amp_cn.html), taking a single machine and a single gpu as an example, the commands are as follows: + +```bash linenums="1" +python3 tools/train.py -c configs/det/det_mv3_db.yml \ + -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained \ + Global.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True +``` + +### 2.5 Distributed Training + +During multi-machine multi-gpu training, use the `--ips` parameter to set the used machine IP address, and the `--gpus` parameter to set the used GPU ID: + +```bash linenums="1" +python3 -m paddle.distributed.launch --ips="xx.xx.xx.xx,xx.xx.xx.xx" --gpus '0,1,2,3' tools/train.py -c configs/det/det_mv3_db.yml \ + -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained +``` + +**Note:** (1) When using multi-machine and multi-gpu training, you need to replace the ips value in the above command with the address of your machine, and the machines need to be able to ping each other. (2) Training needs to be launched separately on multiple machines. The command to view the ip address of the machine is `ifconfig`. (3) For more details about the distributed training speedup ratio, please refer to [Distributed Training Tutorial](../blog/distributed_training.en.md). + +### 2.6 Training with knowledge distillation + +Knowledge distillation is supported in PaddleOCR for text detection training process. For more details, please refer to [doc](./../model_compress/knowledge_distillation.en.md). + +### 2.7 Training on other platform(Windows/macOS/Linux DCU) + +- Windows GPU/CPU +The Windows platform is slightly different from the Linux platform: +Windows platform only supports `single gpu` training and inference, specify GPU for training `set CUDA_VISIBLE_DEVICES=0` +On the Windows platform, DataLoader only supports single-process mode, so you need to set `num_workers` to 0; + +- macOS +GPU mode is not supported, you need to set `use_gpu` to False in the configuration file, and the rest of the training evaluation prediction commands are exactly the same as Linux GPU. + +- Linux DCU +Running on a DCU device requires setting the environment variable `export HIP_VISIBLE_DEVICES=0,1,2,3`, and the rest of the training and evaluation prediction commands are exactly the same as the Linux GPU. + +### 2.8 Fine-tuning + +In actual use, it is recommended to load the official pre-trained model and fine-tune it in your own data set. For the fine-tuning method of the detection model, please refer to: [Model Fine-tuning Tutorial](./finetune_en.md). + +## 3. Evaluation and Test + +### 3.1 Evaluation + +PaddleOCR calculates three indicators for evaluating performance of OCR detection task: Precision, Recall, and Hmean(F-Score). + +Run the following code to calculate the evaluation indicators. 
The result will be saved in the test result file specified by `save_res_path` in the configuration file `det_db_mv3.yml` + +When evaluating, set post-processing parameters `box_thresh=0.6`, `unclip_ratio=1.5`. If you use different datasets, different models for training, these two parameters should be adjusted for better result. + +The model parameters during training are saved in the `Global.save_model_dir` directory by default. When evaluating indicators, you need to set `Global.checkpoints` to point to the saved parameter file. + +```bash linenums="1" +python3 tools/eval.py -c configs/det/det_mv3_db.yml -o Global.checkpoints="{path/to/weights}/best_accuracy" PostProcess.box_thresh=0.6 PostProcess.unclip_ratio=1.5 +``` + +- Note: `box_thresh` and `unclip_ratio` are parameters required for DB post-processing, and not need to be set when evaluating the EAST and SAST model. + +### 3.2 Test + +Test the detection result on a single image: + +```bash linenums="1" +python3 tools/infer_det.py -c configs/det/det_mv3_db.yml -o Global.infer_img="./doc/imgs_en/img_10.jpg" Global.pretrained_model="./output/det_db/best_accuracy" +``` + +When testing the DB model, adjust the post-processing threshold: + +```bash linenums="1" +python3 tools/infer_det.py -c configs/det/det_mv3_db.yml -o Global.infer_img="./doc/imgs_en/img_10.jpg" Global.pretrained_model="./output/det_db/best_accuracy" PostProcess.box_thresh=0.6 PostProcess.unclip_ratio=2.0 +``` + +Test the detection result on all images in the folder: + +```bash linenums="1" +python3 tools/infer_det.py -c configs/det/det_mv3_db.yml -o Global.infer_img="./doc/imgs_en/" Global.pretrained_model="./output/det_db/best_accuracy" +``` + +## 4. Inference + +The inference model (the model saved by `paddle.jit.save`) is generally a solidified model saved after the model training is completed, and is mostly used to give prediction in deployment. + +The model saved during the training process is the checkpoints model, which saves the parameters of the model and is mostly used to resume training. + +Compared with the checkpoints model, the inference model will additionally save the structural information of the model. Therefore, it is easier to deploy because the model structure and model parameters are already solidified in the inference model file, and is suitable for integration with actual systems. + +Firstly, we can convert DB trained model to inference model: + +```bash linenums="1" +python3 tools/export_model.py -c configs/det/det_mv3_db.yml -o Global.pretrained_model="./output/det_db/best_accuracy" Global.save_inference_dir="./output/det_db_inference/" +``` + +The detection inference model prediction: + +```bash linenums="1" +python3 tools/infer/predict_det.py --det_algorithm="DB" --det_model_dir="./output/det_db_inference/" --image_dir="./doc/imgs/" --use_gpu=True +``` + +If it is other detection algorithms, such as the EAST, the det_algorithm parameter needs to be modified to EAST, and the default is the DB algorithm: + +```bash linenums="1" +python3 tools/infer/predict_det.py --det_algorithm="EAST" --det_model_dir="./output/det_db_inference/" --image_dir="./doc/imgs/" --use_gpu=True +``` + +## 5. FAQ + +Q1: The prediction results of trained model and inference model are inconsistent? + +**A**: Most of the problems are caused by the inconsistency of the pre-processing and post-processing parameters during the prediction of the trained model and the pre-processing and post-processing parameters during the prediction of the inference model. 
Taking the model trained by the det_mv3_db.yml configuration file as an example, the solution to the problem of inconsistent prediction results between the training model and the inference model is as follows: + +- Check whether the [trained model preprocessing](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/configs/det/det_mv3_db.yml#L116) is consistent with the prediction [preprocessing function of the inference model](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/tools/infer/predict_det.py#L42). When the algorithm is evaluated, the input image size will affect the accuracy. In order to be consistent with the paper, the image is resized to [736, 1280] in the training icdar15 configuration file, but there is only a set of default parameters when the inference model predicts, which will be considered To predict the speed problem, the longest side of the image is limited to 960 for resize by default. The preprocessing function of the training model preprocessing and the inference model is located in [ppocr/data/imaug/operators.py](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/ppocr/data/imaug/operators.py#L147) +- Check whether the [post-processing of the trained model](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/configs/det/det_mv3_db.yml#L51) is consistent with the [post-processing parameters of the inference](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/tools/infer/utility.py#L50). diff --git a/docs/ppocr/model_train/detection.md b/docs/ppocr/model_train/detection.md new file mode 100644 index 0000000000..3980a9fd9d --- /dev/null +++ b/docs/ppocr/model_train/detection.md @@ -0,0 +1,234 @@ +--- +comments: true +--- + + +# 文字检测 + +本节以icdar2015数据集为例,介绍PaddleOCR中检测模型训练、评估、测试的使用方式。 + +## 1. 准备数据和模型 + +### 1.1 准备数据集 + +准备数据集可参考 [ocr_datasets](../../datasets/ocr_datasets.md) 。 + +### 1.2 下载预训练模型 + +首先下载模型backbone的pretrain model,PaddleOCR的检测模型目前支持两种backbone,分别是MobileNetV3、ResNet_vd系列, +您可以根据需求使用[PaddleClas](https://github.com/PaddlePaddle/PaddleClas/tree/release/2.0/ppcls/modeling/architectures)中的模型更换backbone, +对应的backbone预训练模型可以从[PaddleClas repo 主页中找到下载链接](https://github.com/PaddlePaddle/PaddleClas/blob/release%2F2.0/README_cn.md#resnet%E5%8F%8A%E5%85%B6vd%E7%B3%BB%E5%88%97)。 + +```bash linenums="1" +cd PaddleOCR/ +# 根据backbone的不同选择下载对应的预训练模型 +# 下载MobileNetV3的预训练模型 +wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/MobileNetV3_large_x0_5_pretrained.pdparams +# 或,下载ResNet18_vd的预训练模型 +wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/ResNet18_vd_pretrained.pdparams +# 或,下载ResNet50_vd的预训练模型 +wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/ResNet50_vd_ssld_pretrained.pdparams +``` + +## 2. 
开始训练 + +### 2.1 启动训练 + +*如果您安装的是cpu版本,请将配置文件中的 `use_gpu` 字段修改为false* + +```bash linenums="1" +# 单机单卡训练 mv3_db 模型 +python3 tools/train.py -c configs/det/det_mv3_db.yml \ + -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained + +# 单机多卡训练,通过 --gpus 参数设置使用的GPU ID +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/det/det_mv3_db.yml \ + -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained + +``` + +上述指令中,通过-c 选择训练使用configs/det/det_mv3_db.yml配置文件。 +有关配置文件的详细解释,请参考[链接](../blog/config.md)。 + +您也可以通过-o参数在不需要修改yml文件的情况下,改变训练的参数,比如,调整训练的学习率为0.0001 + +```bash linenums="1" +python3 tools/train.py -c configs/det/det_mv3_db.yml -o Optimizer.base_lr=0.0001 +``` + +### 2.2 断点训练 + +如果训练程序中断,如果希望加载训练中断的模型从而恢复训练,可以通过指定Global.checkpoints指定要加载的模型路径: + +```bash linenums="1" +python3 tools/train.py -c configs/det/det_mv3_db.yml -o Global.checkpoints=./your/trained/model +``` + +**注意**:`Global.checkpoints`的优先级高于`Global.pretrained_model`的优先级,即同时指定两个参数时,优先加载`Global.checkpoints`指定的模型,如果`Global.checkpoints`指定的模型路径有误,会加载`Global.pretrained_model`指定的模型。 + +### 2.3 更换Backbone 训练 + +PaddleOCR将网络划分为四部分,分别在[ppocr/modeling](../../ppocr/modeling)下。 进入网络的数据将按照顺序(transforms->backbones-> +necks->heads)依次通过这四个部分。 + +```bash linenums="1" +├── architectures # 网络的组网代码 +├── transforms # 网络的图像变换模块 +├── backbones # 网络的特征提取模块 +├── necks # 网络的特征增强模块 +└── heads # 网络的输出模块 +``` + +如果要更换的Backbone 在PaddleOCR中有对应实现,直接修改配置yml文件中`Backbone`部分的参数即可。 + +如果要使用新的Backbone,更换backbones的例子如下: + +1. 在 [ppocr/modeling/backbones](../../ppocr/modeling/backbones) 文件夹下新建文件,如my_backbone.py。 +2. 在 my_backbone.py 文件内添加相关代码,示例代码如下: + +```python linenums="1" +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class MyBackbone(nn.Layer): + def __init__(self, *args, **kwargs): + super(MyBackbone, self).__init__() + # your init code + self.conv = nn.xxxx + + def forward(self, inputs): + # your network forward + y = self.conv(inputs) + return y +``` + +3. 
在 [ppocr/modeling/backbones/\_*init\_*.py](../../ppocr/modeling/backbones/__init__.py)文件内导入添加的`MyBackbone`模块,然后修改配置文件中Backbone进行配置即可使用,格式如下: + +```yaml linenums="1" +Backbone: +name: MyBackbone +args1: args1 +``` + +**注意**:如果要更换网络的其他模块,可以参考[文档](../../algorithm/add_new_algorithm.md)。 + +### 2.4 混合精度训练 + +如果您想进一步加快训练速度,可以使用[自动混合精度训练](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/01_paddle2.0_introduction/basic_concept/amp_cn.html), 以单机单卡为例,命令如下: + +```bash linenums="1" +python3 tools/train.py -c configs/det/det_mv3_db.yml \ + -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained \ + Global.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True +``` + +### 2.5 分布式训练 + +多机多卡训练时,通过 `--ips` 参数设置使用的机器IP地址,通过 `--gpus` 参数设置使用的GPU ID: + +```bash linenums="1" +python3 -m paddle.distributed.launch --ips="xx.xx.xx.xx,xx.xx.xx.xx" --gpus '0,1,2,3' tools/train.py -c configs/det/det_mv3_db.yml \ + -o Global.pretrained_model=./pretrain_models/MobileNetV3_large_x0_5_pretrained +``` + +**注意:** (1)采用多机多卡训练时,需要替换上面命令中的ips值为您机器的地址,机器之间需要能够相互ping通;(2)训练时需要在多个机器上分别启动命令。查看机器ip地址的命令为`ifconfig`;(3)更多关于分布式训练的性能优势等信息,请参考:[分布式训练教程](../blog/distributed_training.md)。 + +### 2.6 知识蒸馏训练 + +PaddleOCR支持了基于知识蒸馏的检测模型训练过程,更多内容可以参考[知识蒸馏说明文档](../model_compress/knowledge_distillation.md)。 + +**注意:** 知识蒸馏训练目前只支持PP-OCR使用的`DB`和`CRNN`算法。 + +### 2.7 其他训练环境 + +- Windows GPU/CPU +在Windows平台上与Linux平台略有不同: +Windows平台只支持`单卡`的训练与预测,指定GPU进行训练`set CUDA_VISIBLE_DEVICES=0` +在Windows平台,DataLoader只支持单进程模式,因此需要设置 `num_workers` 为0; + +- macOS +不支持GPU模式,需要在配置文件中设置`use_gpu`为False,其余训练评估预测命令与Linux GPU完全相同。 + +- Linux DCU +DCU设备上运行需要设置环境变量 `export HIP_VISIBLE_DEVICES=0,1,2,3`,其余训练评估预测命令与Linux GPU完全相同。 + +### 2.8 模型微调 + +实际使用过程中,建议加载官方提供的预训练模型,在自己的数据集中进行微调,关于检测模型的微调方法,请参考:[模型微调教程](./finetune.md)。 + +## 3. 模型评估与预测 + +### 3.1 指标评估 + +PaddleOCR计算三个OCR检测相关的指标,分别是:Precision、Recall、Hmean(F-Score)。 + +训练中模型参数默认保存在`Global.save_model_dir`目录下。在评估指标时,需要设置`Global.checkpoints`指向保存的参数文件。 + +```bash linenums="1" +python3 tools/eval.py -c configs/det/det_mv3_db.yml -o Global.checkpoints="{path/to/weights}/best_accuracy" +``` + +### 3.2 测试检测效果 + +测试单张图像的检测效果: + +```bash linenums="1" +python3 tools/infer_det.py -c configs/det/det_mv3_db.yml -o Global.infer_img="./doc/imgs_en/img_10.jpg" Global.pretrained_model="./output/det_db/best_accuracy" +``` + +测试DB模型时,调整后处理阈值: + +```bash linenums="1" +python3 tools/infer_det.py -c configs/det/det_mv3_db.yml -o Global.infer_img="./doc/imgs_en/img_10.jpg" Global.pretrained_model="./output/det_db/best_accuracy" PostProcess.box_thresh=0.6 PostProcess.unclip_ratio=2.0 +``` + +- 注:`box_thresh`、`unclip_ratio`是DB后处理参数,其他检测模型不支持。 + +测试文件夹下所有图像的检测效果: + +```bash linenums="1" +python3 tools/infer_det.py -c configs/det/det_mv3_db.yml -o Global.infer_img="./doc/imgs_en/" Global.pretrained_model="./output/det_db/best_accuracy" +``` + +## 4. 
模型导出与预测 + +inference 模型(`paddle.jit.save`保存的模型) +一般是模型训练,把模型结构和模型参数保存在文件中的固化模型,多用于预测部署场景。 +训练过程中保存的模型是checkpoints模型,保存的只有模型的参数,多用于恢复训练等。 +与checkpoints模型相比,inference 模型会额外保存模型的结构信息,在预测部署、加速推理上性能优越,灵活方便,适合于实际系统集成。 + +检测模型转inference 模型方式: + +```bash linenums="1" +# 加载配置文件`det_mv3_db.yml`,从`output/det_db`目录下加载`best_accuracy`模型,inference模型保存在`./output/det_db_inference`目录下 +python3 tools/export_model.py -c configs/det/det_mv3_db.yml -o Global.pretrained_model="./output/det_db/best_accuracy" Global.save_inference_dir="./output/det_db_inference/" +``` + +DB检测模型inference 模型预测: + +```bash linenums="1" +python3 tools/infer/predict_det.py --det_algorithm="DB" --det_model_dir="./output/det_db_inference/" --image_dir="./doc/imgs/" --use_gpu=True +``` + +如果是其他检测,比如EAST模型,det_algorithm参数需要修改为EAST,默认为DB算法: + +```bash linenums="1" +python3 tools/infer/predict_det.py --det_algorithm="EAST" --det_model_dir="./output/det_db_inference/" --image_dir="./doc/imgs/" --use_gpu=True +``` + +更多关于推理超参数的配置与解释,请参考:[模型推理超参数解释教程](../blog/inference_args.md)。 + +## 5. FAQ + +Q1: 训练模型转inference 模型之后预测效果不一致? + +**A**:此类问题出现较多,问题多是trained model预测时候的预处理、后处理参数和inference model预测的时候的预处理、后处理参数不一致导致的。以det_mv3_db.yml配置文件训练的模型为例,训练模型、inference模型预测结果不一致问题解决方式如下: + +- 检查[trained model预处理](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/configs/det/det_mv3_db.yml#L116),和[inference model的预测预处理](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/tools/infer/predict_det.py#L42)函数是否一致。算法在评估的时候,输入图像大小会影响精度,为了和论文保持一致,训练icdar15配置文件中将图像resize到[736, 1280],但是在inference model预测的时候只有一套默认参数,会考虑到预测速度问题,默认限制图像最长边为960做resize的。训练模型预处理和inference模型的预处理函数位于[ppocr/data/imaug/operators.py](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/ppocr/data/imaug/operators.py#L147) +- 检查[trained model后处理](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/configs/det/det_mv3_db.yml#L51),和[inference 后处理参数](https://github.com/PaddlePaddle/PaddleOCR/blob/c1ed243fb68d5d466258243092e56cbae32e2c14/tools/infer/utility.py#L50)是否一致。 + +Q1: 训练EAST模型提示找不到lanms库? + +**A**:执行pip3 install lanms-nova 即可。 diff --git a/docs/ppocr/model_train/finetune.en.md b/docs/ppocr/model_train/finetune.en.md new file mode 100644 index 0000000000..b683578586 --- /dev/null +++ b/docs/ppocr/model_train/finetune.en.md @@ -0,0 +1,227 @@ +--- +comments: true +--- + +# Fine-tune + +## 1. background and meaning + +The PP-OCR series models provided by PaddleOCR have excellent performance in general scenarios and can solve detection and recognition problems in most cases. In vertical scenarios, if you want to obtain better model, you can further improve the accuracy of the PP-OCR series detection and recognition models through fine-tune. + +This article mainly introduces some precautions when fine-tuning the text detection and recognition model. Finally, you can obtain a text detection and recognition model with higher accuracy through model fine-tuning in your own scenarios. + +The core points of this article are as follows: + +1. The pre-trained model provided by PP-OCR has better generalization ability +2. Adding a small amount of real data (detection:>=500, recognition:>=5000) will greatly improve the detection and recognition effect of vertical scenes +3. When fine-tuning the model, adding real general scene data can further improve the model accuracy and generalization performance +4. 
In the text detection task, increasing the prediction shape of the image can further improve the detection effect of the smaller text area +5. When fine-tuning the model, it is necessary to properly adjust the hyperparameters (learning rate, batch size are the most important) to obtain a better fine-tuning effect. + +For more details, please refer to Chapter 2 and Chapter 3。 + +## 2. Text detection model fine-tuning + +### 2.1 Dataset + +* Dataset: It is recommended to prepare at least 500 text detection datasets for model fine-tuning. + +* Dataset annotation: single-line text annotation format, it is recommended that the labeled detection frame be consistent with the actual semantic content. For example, in the train ticket scene, the surname and first name may be far apart, but they belong to the same detection field semantically. Here, the entire name also needs to be marked as a detection frame. + +### 2.2 Model + +It is recommended to choose the PP-OCRv3 model (configuration file: [ch_PP-OCRv3_det_student.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml),pre-trained model: [ch_PP-OCRv3_det_distill_train.tar](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar), its accuracy and generalization performance is the best pre-training model currently available. + +For more PP-OCR series models, please refer to [PP-OCR Series Model Library](../models_list.en.md)。 + +Note: When using the above pre-trained model, you need to use the `student.pdparams` file in the folder as the pre-trained model, that is, only use the student model. + +### 2.3 Training hyperparameter + +When fine-tuning the model, the most important hyperparameter is the pre-training model path `pretrained_model`, `learning_rate`与`batch_size`,some hyperparameters are as follows: + +```yaml linenums="1" +Global: + pretrained_model: ./ch_PP-OCRv3_det_distill_train/student.pdparams # pre-training model path +Optimizer: + lr: + name: Cosine + learning_rate: 0.001 # learning_rate + warmup_epoch: 2 + regularizer: + name: 'L2' + factor: 0 + +Train: + loader: + shuffle: True + drop_last: False + batch_size_per_card: 8 # single gpu batch size + num_workers: 4 +``` + +In the above configuration file, you need to specify the `pretrained_model` field as the `student.pdparams` file path. + +The configuration file provided by PaddleOCR is for 8-gpu training (equivalent to a total batch size of `8*8=64`) and no pre-trained model is loaded. Therefore, in your scenario, the learning rate is the same as the total The batch size needs to be adjusted linearly, for example + +* If your scenario is single-gpu training, single gpu batch_size=8, then the total batch_size=8, it is recommended to adjust the learning rate to about `1e-4`. +* If your scenario is for single-gpu training, due to memory limitations, you can only set batch_size=4 for a single gpu, and the total batch_size=4. It is recommended to adjust the learning rate to about `5e-5`. + +### 2.4 Prediction hyperparameter + +When exporting and inferring the trained model, you can further adjust the predicted image scale to improve the detection effect of small-area text. The following are some hyperparameters during DBNet inference, which can be adjusted appropriately to improve the effect. 
+ +| hyperparameter | type | default | meaning | +| :--: | :--: | :--: | :--: | +| det_db_thresh | float | 0.3 | In the probability map output by DB, pixels with a score greater than the threshold will be considered as text pixels | +| det_db_box_thresh | float | 0.6 | When the average score of all pixels within the frame of the detection result is greater than the threshold, the result will be considered as a text area | +| det_db_unclip_ratio | float | 1.5 | The expansion coefficient of `Vatti clipping`, using this method to expand the text area | +| max_batch_size | int | 10 | batch size | +| use_dilation | bool | False | Whether to expand the segmentation results to obtain better detection results | +| det_db_score_mode | str | "fast" | DB's detection result score calculation method supports `fast` and `slow`. `fast` calculates the average score based on all pixels in the polygon’s circumscribed rectangle border, and `slow` calculates the average score based on all pixels in the original polygon. The calculation speed is relatively slower, but more accurate. | + +For more information on inference methods, please refer to[Paddle Inference doc](../infer_deploy/python_infer.en.md)。 + +## 3. Text recognition model fine-tuning + +### 3.1 Dataset + +* Dataset:If the dictionary is not changed, it is recommended to prepare at least 5,000 text recognition datasets for model fine-tuning; if the dictionary is changed (not recommended), more quantities are required. + +* Data distribution: It is recommended that the distribution be as consistent as possible with the actual measurement scenario. If the actual scene contains a lot of short text, it is recommended to include more short text in the training data. If the actual scene has high requirements for the recognition effect of spaces, it is recommended to include more text content with spaces in the training data. + +* Data synthesis: In the case of some character recognition errors, it is recommended to obtain a batch of specific character dataset, add it to the original dataset and use a small learning rate for fine-tuning. The ratio of original dataset to new dataset can be 10:1 to 5:1 to avoid overfitting of the model caused by too much data in a single scene. At the same time, try to balance the word frequency of the corpus to ensure that the frequency of common words will not be too low. + + Specific characters can be generated using the TextRenderer tool, for synthesis examples, please refer to [data synthesis](../../applications/光功率计数码管字符识别.md) + . The synthetic data corpus should come from real usage scenarios as much as possible, and keep the richness of fonts and backgrounds on the basis of being close to the real scene, which will help improve the model effect. + +* Common Chinese and English data: During training, common real data can be added to the training set (for example, in the fine-tuning scenario without changing the dictionary, it is recommended to add real data such as LSVT, RCTW, MTWI) to further improve the generalization performance of the model. + +### 3.2 Model + +It is recommended to choose the PP-OCRv3 model (configuration file: [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml),pre-trained model: [ch_PP-OCRv3_rec_train.tar](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar),its accuracy and generalization performance is the best pre-training model currently available. 
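As a minimal, illustrative sketch (only the download URL above comes from this document; the `./pretrain_models/` target directory is an assumed choice), the recommended recognition pre-trained model can be fetched and unpacked like this:

```bash linenums="1"
# Illustrative only: download the recommended recognition pre-trained model and unpack it.
# The ./pretrain_models/ directory is an assumption; adjust it to your own project layout.
wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar
cd ./pretrain_models/ && tar -xf ch_PP-OCRv3_rec_train.tar && cd ..
```

The unpacked folder provides `ch_PP-OCRv3_rec_train/best_accuracy.pdparams`, which is the path referenced by the `pretrained_model` field in Section 3.3.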
+ +For more PP-OCR series models, please refer to [PP-OCR Series Model Library](../model_list.en.md)。 + +The PP-OCRv3 model uses the GTC strategy. The SAR branch has a large number of parameters. When the training data is a simple scene, the model is easy to overfit, resulting in poor fine-tuning effect. It is recommended to remove the GTC strategy. The configuration file of the model structure is modified as follows: + +```yaml linenums="1" +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: False + Head: + name: CTCHead + fc_decay: 0.00001 +Loss: + name: CTCLoss + +Train: + dataset: + ...... + transforms: + # remove RecConAug + # - RecConAug: + # prob: 0.5 + # ext_data_num: 2 + # image_shape: [48, 320, 3] + # max_text_length: *max_text_length + - RecAug: + # modify Encode + - CTCLabelEncode: + - KeepKeys: + keep_keys: + - image + - label + - length +... + +Eval: + dataset: + ... + transforms: + ... + - CTCLabelEncode: + - KeepKeys: + keep_keys: + - image + - label + - length +... + + +``` + +### 3.3 Training hyperparameter + +Similar to text detection task fine-tuning, when fine-tuning the recognition model, the most important hyperparameters are the pre-trained model path `pretrained_model`, `learning_rate` and `batch_size`, some default configuration files are shown below. + +```yaml linenums="1" +Global: + pretrained_model: # pre-training model path +Optimizer: + lr: + name: Piecewise + decay_epochs : [700, 800] + values : [0.001, 0.0001] # learning_rate + warmup_epoch: 5 + regularizer: + name: 'L2' + factor: 0 + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: + - ./train_data/train_list.txt + ratio_list: [1.0] # Sampling ratio, the default value is [1.0] + loader: + shuffle: True + drop_last: False + batch_size_per_card: 128 # single gpu batch size + num_workers: 8 + +``` + +In the above configuration file, you first need to specify the `pretrained_model` field as the `ch_PP-OCRv3_rec_train/best_accuracy.pdparams` file path decompressed in Chapter 3.2. + +The configuration file provided by PaddleOCR is for 8-gpu training (equivalent to a total batch size of `8*128=1024`) and no pre-trained model is loaded. Therefore, in your scenario, the learning rate is the same as the total The batch size needs to be adjusted linearly, for example: + +* If your scenario is single-gpu training, single gpu batch_size=128, then the total batch_size=128, in the case of loading the pre-trained model, it is recommended to adjust the learning rate to about `[1e-4, 2e-5]` (For the piecewise learning rate strategy, two values need to be set, the same below). +* If your scenario is for single-gpu training, due to memory limitations, you can only set batch_size=64 for a single gpu, and the total batch_size=64. When loading the pre-trained model, it is recommended to adjust the learning rate to `[5e-5 , 1e-5]`about. + +If there is general real scene data added, it is recommended that in each epoch, the amount of vertical scene data and real scene data should be kept at about 1:1. + +For example: your own vertical scene recognition data volume is 1W, the data label file is `vertical.txt`, the collected general scene recognition data volume is 10W, and the data label file is `general.txt`. 
+ +Then, the `label_file_list` and `ratio_list` parameters can be set as shown below. In each epoch, `vertical.txt` will be fully sampled (sampling ratio is 1.0), including 1W pieces of data; `general.txt` will be sampled according to a sampling ratio of 0.1, including `10W*0.1=1W` pieces of data, the final ratio of the two is `1:1`. + +```yaml linenums="1" +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: + - vertical.txt + - general.txt + ratio_list: [1.0, 0.1] +``` + +### 3.4 training optimization + +The training process does not happen overnight. After completing a stage of training evaluation, it is recommended to collect and analyze the badcase of the current model in the real scene, adjust the proportion of training data in a targeted manner, or further add synthetic data. Through multiple iterations of training, the model effect is continuously optimized. + +If you modify the custom dictionary during training, since the parameters of the last layer of FC cannot be loaded, it is normal for acc=0 at the beginning of the iteration. Don't worry, loading the pre-trained model can still speed up the model convergence. diff --git a/docs/ppocr/model_train/finetune.md b/docs/ppocr/model_train/finetune.md new file mode 100644 index 0000000000..a04a589a77 --- /dev/null +++ b/docs/ppocr/model_train/finetune.md @@ -0,0 +1,227 @@ +--- +comments: true +--- + +# 模型微调 + +## 1. 模型微调背景与意义 + +PaddleOCR提供的PP-OCR系列模型在通用场景中性能优异,能够解决绝大多数情况下的检测与识别问题。在垂类场景中,如果希望获取更优的模型效果,可以通过模型微调的方法,进一步提升PP-OCR系列检测与识别模型的精度。 + +本文主要介绍文本检测与识别模型在模型微调时的一些注意事项,最终希望您在自己的场景中,通过模型微调,可以获取精度更高的文本检测与识别模型。 + +本文核心要点如下所示。 + +1. PP-OCR提供的预训练模型有较好的泛化能力 +2. 加入少量真实数据(检测任务>=500张, 识别任务>=5000张),会大幅提升垂类场景的检测与识别效果 +3. 在模型微调时,加入真实通用场景数据,可以进一步提升模型精度与泛化性能 +4. 在图像检测任务中,增大图像的预测尺度,能够进一步提升较小文字区域的检测效果 +5. 在模型微调时,需要适当调整超参数(学习率,batch size最为重要),以获得更优的微调效果。 + +更多详细内容,请参考第2章与第3章。 + +## 2. 
文本检测模型微调 + +### 2.1 数据选择 + +* 数据量:建议至少准备500张的文本检测数据集用于模型微调。 + +* 数据标注:单行文本标注格式,建议标注的检测框与实际语义内容一致。如在火车票场景中,姓氏与名字可能离得较远,但是它们在语义上属于同一个检测字段,这里也需要将整个姓名标注为1个检测框。 + +### 2.2 模型选择 + +建议选择PP-OCRv3模型(配置文件:[ch_PP-OCRv3_det_student.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml),预训练模型:[ch_PP-OCRv3_det_distill_train.tar](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar))进行微调,其精度与泛化性能是目前提供的最优预训练模型。 + +更多PP-OCR系列模型,请参考[PP-OCR 系列模型库](../model_list.md)。 + +注意:在使用上述预训练模型的时候,需要使用文件夹中的`student.pdparams`文件作为预训练模型,即,仅使用学生模型。 + +### 2.3 训练超参选择 + +在模型微调的时候,最重要的超参就是预训练模型路径`pretrained_model`, 学习率`learning_rate`与`batch_size`,部分配置文件如下所示。 + +```yaml linenums="1" +Global: + pretrained_model: ./ch_PP-OCRv3_det_distill_train/student.pdparams # 预训练模型路径 +Optimizer: + lr: + name: Cosine + learning_rate: 0.001 # 学习率 + warmup_epoch: 2 + regularizer: + name: 'L2' + factor: 0 + +Train: + loader: + shuffle: True + drop_last: False + batch_size_per_card: 8 # 单卡batch size + num_workers: 4 +``` + +上述配置文件中,首先需要将`pretrained_model`字段指定为`student.pdparams`文件路径。 + +PaddleOCR提供的配置文件是在8卡训练(相当于总的batch size是`8*8=64`)、且没有加载预训练模型情况下的配置文件,因此您的场景中,学习率与总的batch size需要对应线性调整,例如 + +* 如果您的场景中是单卡训练,单卡batch_size=8,则总的batch_size=8,建议将学习率调整为`1e-4`左右。 +* 如果您的场景中是单卡训练,由于显存限制,只能设置单卡batch_size=4,则总的batch_size=4,建议将学习率调整为`5e-5`左右。 + +### 2.4 预测超参选择 + +对训练好的模型导出并进行推理时,可以通过进一步调整预测的图像尺度,来提升小面积文本的检测效果,下面是DBNet推理时的一些超参数,可以通过适当调整,提升效果。 + +| 参数名称 | 类型 | 默认值 | 含义 | +| :--: | :--: | :--: | :--: | +| det_db_thresh | float | 0.3 | DB输出的概率图中,得分大于该阈值的像素点才会被认为是文字像素点 | +| det_db_box_thresh | float | 0.6 | 检测结果边框内,所有像素点的平均得分大于该阈值时,该结果会被认为是文字区域 | +| det_db_unclip_ratio | float | 1.5 | `Vatti clipping`算法的扩张系数,使用该方法对文字区域进行扩张 | +| max_batch_size | int | 10 | 预测的batch size | +| use_dilation | bool | False | 是否对分割结果进行膨胀以获取更优检测效果 | +| det_db_score_mode | str | "fast" | DB的检测结果得分计算方法,支持`fast`和`slow`,`fast`是根据polygon的外接矩形边框内的所有像素计算平均得分,`slow`是根据原始polygon内的所有像素计算平均得分,计算速度相对较慢一些,但是更加准确一些。 | + +更多关于推理方法的介绍可以参考[Paddle Inference推理教程](../infer_deploy/python_infer.md)。 + +## 3. 文本识别模型微调 + +### 3.1 数据选择 + +* 数据量:不更换字典的情况下,建议至少准备5000张的文本识别数据集用于模型微调;如果更换了字典(不建议),需要的数量更多。 + +* 数据分布:建议分布与实测场景尽量一致。如果实测场景包含大量短文本,则训练数据中建议也包含较多短文本,如果实测场景对于空格识别效果要求较高,则训练数据中建议也包含较多带空格的文本内容。 + +* 数据合成:针对部分字符识别有误的情况,建议获取一批特定字符数据,加入到原数据中使用小学习率微调。其中原始数据与新增数据比例可尝试 10:1 ~ 5:1, 避免单一场景数据过多导致模型过拟合,同时尽量平衡语料词频,确保常用字的出现频率不会过低。 + + 特定字符生成可以使用 TextRenderer 工具,合成例子可参考 [数码管数据合成](../../applications/光功率计数码管字符识别.md) + ,合成数据语料尽量来自真实使用场景,在贴近真实场景的基础上保持字体、背景的丰富性,有助于提升模型效果。 + +* 通用中英文数据:在训练的时候,可以在训练集中添加通用真实数据(如在不更换字典的微调场景中,建议添加LSVT、RCTW、MTWI等真实数据),进一步提升模型的泛化性能。 + +### 3.2 模型选择 + +建议选择PP-OCRv3模型(配置文件:[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml),预训练模型:[ch_PP-OCRv3_rec_train.tar](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar))进行微调,其精度与泛化性能是目前提供的最优预训练模型。 + +更多PP-OCR系列模型,请参考[PP-OCR 系列模型库](../model_list.md)。 + +PP-OCRv3 模型使用了GTC策略,其中SAR分支参数量大,当训练数据为简单场景时模型容易过拟合,导致微调效果不佳,建议去除GTC策略,模型结构部分配置文件修改如下: + +```yaml linenums="1" +Architecture: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: False + Head: + name: CTCHead + fc_decay: 0.00001 +Loss: + name: CTCLoss + +Train: + dataset: + ...... 
+ transforms: + # 去除 RecConAug 增广 + # - RecConAug: + # prob: 0.5 + # ext_data_num: 2 + # image_shape: [48, 320, 3] + # max_text_length: *max_text_length + - RecAug: + # 修改 Encode 方式 + - CTCLabelEncode: + - KeepKeys: + keep_keys: + - image + - label + - length +... + +Eval: + dataset: + ... + transforms: + ... + - CTCLabelEncode: + - KeepKeys: + keep_keys: + - image + - label + - length +... + + +``` + +### 3.3 训练超参选择 + +与文本检测任务微调相同,在识别模型微调的时候,最重要的超参就是预训练模型路径`pretrained_model`, 学习率`learning_rate`与`batch_size`,部分默认配置文件如下所示。 + +```yaml linenums="1" +Global: + pretrained_model: # 预训练模型路径 +Optimizer: + lr: + name: Piecewise + decay_epochs : [700, 800] + values : [0.001, 0.0001] # 学习率 + warmup_epoch: 5 + regularizer: + name: 'L2' + factor: 0 + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: + - ./train_data/train_list.txt + ratio_list: [1.0] # 采样比例,默认值是[1.0] + loader: + shuffle: True + drop_last: False + batch_size_per_card: 128 # 单卡batch size + num_workers: 8 + +``` + +上述配置文件中,首先需要将`pretrained_model`字段指定为3.2章节中解压得到的`ch_PP-OCRv3_rec_train/best_accuracy.pdparams`文件路径。 + +PaddleOCR提供的配置文件是在8卡训练(相当于总的batch size是`8*128=1024`)、且没有加载预训练模型情况下的配置文件,因此您的场景中,学习率与总的batch size需要对应线性调整,例如: + +* 如果您的场景中是单卡训练,单卡batch_size=128,则总的batch_size=128,在加载预训练模型的情况下,建议将学习率调整为`[1e-4, 2e-5]`左右(piecewise学习率策略,需设置2个值,下同)。 +* 如果您的场景中是单卡训练,因为显存限制,只能设置单卡batch_size=64,则总的batch_size=64,在加载预训练模型的情况下,建议将学习率调整为`[5e-5, 1e-5]`左右。 + +如果有通用真实场景数据加进来,建议每个epoch中,垂类场景数据与真实场景的数据量保持在1:1左右。 + +比如:您自己的垂类场景识别数据量为1W,数据标签文件为`vertical.txt`,收集到的通用场景识别数据量为10W,数据标签文件为`general.txt`, + +那么,可以设置`label_file_list`和`ratio_list`参数如下所示。每个epoch中,`vertical.txt`中会进行全采样(采样比例为1.0),包含1W条数据;`general.txt`中会按照0.1的采样比例进行采样,包含`10W*0.1=1W`条数据,最终二者的比例为`1:1`。 + +```yaml linenums="1" +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: + - vertical.txt + - general.txt + ratio_list: [1.0, 0.1] +``` + +### 3.4 训练调优 + +训练过程并非一蹴而就的,完成一个阶段的训练评估后,建议收集分析当前模型在真实场景中的 badcase,有针对性的调整训练数据比例,或者进一步新增合成数据。通过多次迭代训练,不断优化模型效果。 + +如果在训练时修改了自定义字典,由于无法加载最后一层FC的参数,在迭代初期acc=0是正常的情况,不必担心,加载预训练模型依然可以加快模型收敛。 diff --git a/docs/ppocr/model_train/images/angle_class_example.jpg b/docs/ppocr/model_train/images/angle_class_example.jpg new file mode 100644 index 0000000000..8e683be32c Binary files /dev/null and b/docs/ppocr/model_train/images/angle_class_example.jpg differ diff --git a/docs/ppocr/model_train/images/icdar_rec.png b/docs/ppocr/model_train/images/icdar_rec.png new file mode 100644 index 0000000000..a840d6af59 Binary files /dev/null and b/docs/ppocr/model_train/images/icdar_rec.png differ diff --git a/docs/ppocr/model_train/images/image-20240710082046188.jpg b/docs/ppocr/model_train/images/image-20240710082046188.jpg new file mode 100644 index 0000000000..4f3e945978 Binary files /dev/null and b/docs/ppocr/model_train/images/image-20240710082046188.jpg differ diff --git a/docs/ppocr/model_train/images/word_1-20240704092705543.png b/docs/ppocr/model_train/images/word_1-20240704092705543.png new file mode 100644 index 0000000000..7b915fd6da Binary files /dev/null and b/docs/ppocr/model_train/images/word_1-20240704092705543.png differ diff --git a/docs/ppocr/model_train/images/word_1-20240704092713071.jpg b/docs/ppocr/model_train/images/word_1-20240704092713071.jpg new file mode 100644 index 0000000000..cb5451e15a Binary files /dev/null and b/docs/ppocr/model_train/images/word_1-20240704092713071.jpg differ diff --git a/docs/ppocr/model_train/kie.en.md b/docs/ppocr/model_train/kie.en.md new file mode 100644 index 
0000000000..8b3be83795 --- /dev/null +++ b/docs/ppocr/model_train/kie.en.md @@ -0,0 +1,476 @@ +--- +comments: true +typora-copy-images-to: images +--- + +# Key Information Extraction + +This tutorial provides a guide to the whole process of key information extraction using PaddleOCR, including data preparation, model training, optimization, evaluation, prediction of semantic entity recognition (SER) and relationship extraction (RE) tasks. + +## 1. Data Preparation + +### 1.1. Prepare for dataset + +PaddleOCR supports the following data format when training KIE models. + +- `general data` is used to train a dataset whose annotation is stored in a text file (SimpleDataset). + +The default storage path of training data is `PaddleOCR/train_data`. If you already have datasets on your disk, you only need to create a soft link to the dataset directory. + +```bash linenums="1" +# linux and mac os +ln -sf /train_data/dataset +# windows +mklink /d /train_data/dataset +``` + +### 1.2. Custom Dataset + +The training process generally includes the training set and the evaluation set. The data formats of the two sets are same. + +#### (1) Training set + +It is recommended to put the training images into the same folder, record the path and annotation of images in a text file. The contents of the text file are as follows: + +```text linenums="1" +" image path annotation information " +zh_train_0.jpg [{"transcription": "汇丰晋信", "label": "other", "points": [[104, 114], [530, 114], [530, 175], [104, 175]], "id": 1, "linking": []}, {"transcription": "受理时间:", "label": "question", "points": [[126, 267], [266, 267], [266, 305], [126, 305]], "id": 7, "linking": [[7, 13]]}, {"transcription": "2020.6.15", "label": "answer", "points": [[321, 239], [537, 239], [537, 285], [321, 285]], "id": 13, "linking": [[7, 13]]}] +zh_train_1.jpg [{"transcription": "中国人体器官捐献", "label": "other", "points": [[544, 459], [954, 459], [954, 517], [544, 517]], "id": 1, "linking": []}, {"transcription": ">编号:MC545715483585", "label": "other", "points": [[1462, 470], [2054, 470], [2054, 543], [1462, 543]], "id": 10, "linking": []}, {"transcription": "CHINAORGANDONATION", "label": "other", "points": [[543, 516], [958, 516], [958, 551], [543, 551]], "id": 14, "linking": []}, {"transcription": "中国人体器官捐献志愿登记表", "label": "header", "points": [[635, 793], [1892, 793], [1892, 904], [635, 904]], "id": 18, "linking": []}] +... +``` + +**Note:** In the text file, please split the image path and annotation with `\t`. Otherwise, error will happen when training. + +The annotation can be parsed by `json` into a list of sub-annotations. Each element in the list is a dict, which stores the required information of each text line. The required fields are as follows. + +- transcription: stores the text content of the text line +- label: the category of the text line content +- points: stores the four point position information of the text line +- id: stores the ID information of the text line for RE model training +- linking: stores the connection information between text lines for RE model training + +#### (2) Evaluation set + +The evaluation set is constructed in the same way as the training set. + +#### (3) Dictionary file + +The textlines in the training set and the evaluation set contain label information. The list of all labels is stored in the dictionary file (such as `class_list.txt`). Each line in the dictionary file is represented as a label name. + +For example, FUND_zh data contains four categories. 
The contents of the dictionary file are as follows. + +```text linenums="1" +OTHER +QUESTION +ANSWER +HEADER +``` + +In the annotation file, the `label` field of each annotated text line must be one of the categories listed in the dictionary file. + +The final dataset should have the following file structure. + +```text linenums="1" +|-train_data + |-data_name + |- train.json + |- train + |- zh_train_0.png + |- zh_train_1.jpg + | ... + |- val.json + |- val + |- zh_val_0.png + |- zh_val_1.jpg + | ... +``` + +**Note:** + +- The category information in the annotation file is not case sensitive. For example, 'HEADER' and 'header' will be seen as the same category ID. + +- In the dictionary file, it is recommended to put the `other` category (other textlines that need not be paid attention to can be labeled as `other`) on the first line. When parsing, the category ID of the 'other' category will be resolved to 0, and the textlines predicted as `other` will not be visualized later. + +### 1.3. Download data + +If you do not have a local dataset, you can download the source files of [XFUND](https://github.com/doc-analysis/XFUND) or [FUNSD](https://guillaumejaume.github.io/FUNSD) and use the scripts of [XFUND](../../ppstructure/kie/tools/trans_xfun_data.py) or [FUNSD](../../ppstructure/kie/tools/trans_funsd_label.py) to transform them into the PaddleOCR format. Then you can use these public datasets to quickly experience KIE. + +For more information about public KIE datasets, please refer to [KIE dataset tutorial](../../datasets/kie_datasets.en.md). + +PaddleOCR also supports annotating data for KIE models. Please refer to [PPOCRLabel tutorial](https://github.com/PFCCLab/PPOCRLabel/blob/main/README.md). + +## 2. Training + +PaddleOCR provides training, evaluation and inference scripts. This section takes the VI-LayoutXLM multimodal pre-trained model as an example. + +> If you want to use the SDMGR-based KIE algorithm, please refer to: [SDMGR tutorial](../../algorithm/kie/algorithm_kie_sdmgr.en.md). + +### 2.1. Start Training + +If you do not use a custom dataset, you can use the XFUND_zh dataset that has already been processed into the PaddleOCR format for a quick experience. + +```bash linenums="1" +mkdir train_data +cd train_data +wget https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar && tar -xf XFUND.tar +cd .. +``` + +If you don't want to train and just want to directly experience the process of model evaluation, prediction, and inference, you can download the trained models provided in PaddleOCR and skip section 2.1. + +Use the following commands to download the trained models. + +```bash linenums="1" +mkdir pretrained_model +cd pretrained_model +# download and uncompress SER model +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar && tar -xf ser_vi_layoutxlm_xfund_pretrained.tar + +# download and uncompress RE model +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar && tar -xf re_vi_layoutxlm_xfund_pretrained.tar +``` + +Start training: + +- If you installed the CPU version of PaddlePaddle, you need to set `Global.use_gpu=False` in your config file. +- During training, PaddleOCR will download the VI-LayoutXLM pre-trained model by default. There is no need to download it in advance. 
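For reference, config options such as `Global.use_gpu` can also be overridden from the command line with `-o` instead of editing the yml file. A minimal sketch, assuming the same `-o` override syntax used by the other commands in this document:

```bash linenums="1"
# Illustrative only: force CPU training by overriding Global.use_gpu at launch time
# rather than editing the yml file; -o key=value overrides follow the config structure.
python3 tools/train.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Global.use_gpu=False
```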
+ +```bash linenums="1" +# GPU training, support single card and multi-cards +# The training log will be save in "{Global.save_model_dir}/train.log" + +# train SER model using single card +python3 tools/train.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml + +# train SER model using multi-cards, you can use --gpus to assign the GPU ids. +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml + +# train RE model using single card +python3 tools/train.py -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml +``` + +Take the SER model training as an example. After the training is started, you will see the following log output. + +```bash linenums="1" +[2022/08/08 16:28:28] ppocr INFO: epoch: [1/200], global_step: 10, lr: 0.000006, loss: 1.871535, avg_reader_cost: 0.28200 s, avg_batch_cost: 0.82318 s, avg_samples: 8.0, ips: 9.71838 samples/s, eta: 0:51:59 +[2022/08/08 16:28:33] ppocr INFO: epoch: [1/200], global_step: 19, lr: 0.000018, loss: 1.461939, avg_reader_cost: 0.00042 s, avg_batch_cost: 0.32037 s, avg_samples: 6.9, ips: 21.53773 samples/s, eta: 0:37:55 +[2022/08/08 16:28:39] ppocr INFO: cur metric, precision: 0.11526348939743859, recall: 0.19776657060518732, hmean: 0.14564265817747712, fps: 34.008392345050055 +[2022/08/08 16:28:45] ppocr INFO: save best model is to ./output/ser_vi_layoutxlm_xfund_zh/best_accuracy +[2022/08/08 16:28:45] ppocr INFO: best metric, hmean: 0.14564265817747712, precision: 0.11526348939743859, recall: 0.19776657060518732, fps: 34.008392345050055, best_epoch: 1 +[2022/08/08 16:28:51] ppocr INFO: save model in ./output/ser_vi_layoutxlm_xfund_zh/latest +``` + +The following information will be automatically printed. + +|Field | meaning| +| :----: | :------: | +|epoch | current iteration round| +|iter | current iteration times| +|lr | current learning rate| +|loss | current loss function| +| reader_cost | current batch data processing time| +| batch_ Cost | total current batch time| +|samples | number of samples in the current batch| +|ips | number of samples processed per second| + +PaddleOCR supports evaluation during training. you can modify `eval_batch_step` in the config file `configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml` (default as 19 iters). Trained model with best hmean will be saved as `output/ser_vi_layoutxlm_xfund_zh/best_accuracy/`. + +If the evaluation dataset is very large, it's recommended to enlarge the eval interval or evaluate the model after training. + +**Note:** for more KIE models training and configuration files, you can go into `configs/kie/` or refer to [Frontier KIE algorithms](./algorithm_overview_en.md). + +If you want to train model on your own dataset, you need to modify the data path, dictionary file and category number in the configuration file. + +Take `configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml` as an example, contents we need to fix is as follows. + +```yaml linenums="1" +Architecture: + # ... 
+ Backbone: + name: LayoutXLMForSer + pretrained: True + mode: vi + # Assuming that n categroies are included in the dictionary file (other is included), the the num_classes is set as 2n-1 + num_classes: &num_classes 7 + +PostProcess: + name: kieSerTokenLayoutLMPostProcess + # Modify the dictionary file path for your custom dataset + class_path: &class_path train_data/XFUND/class_list_xfun.txt + +Train: + dataset: + name: SimpleDataSet + # Modify the data path for your training dataset + data_dir: train_data/XFUND/zh_train/image + # Modify the data annotation path for your training dataset + label_file_list: + - train_data/XFUND/zh_train/train.json + ... + loader: + # batch size for single card when training + batch_size_per_card: 8 + ... + +Eval: + dataset: + name: SimpleDataSet + # Modify the data path for your evaluation dataset + data_dir: train_data/XFUND/zh_val/image + # Modify the data annotation path for your evaluation dataset + label_file_list: + - train_data/XFUND/zh_val/val.json + ... + loader: + # batch size for single card when evaluation + batch_size_per_card: 8 +``` + +**Note that the configuration file for prediction/evaluation must be consistent with the training file.** + +### 2.2. Resume Training + +If the training process is interrupted and you want to load the saved model to resume training, you can specify the path of the model to be loaded by specifying `Architecture.Backbone.checkpoints`. + +```bash linenums="1" +python3 tools/train.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/ser_vi_layoutxlm_xfund_zh/best_accuracy +``` + +**Note:** + +- Priority of `Architecture.Backbone.checkpoints` is higher than`Architecture.Backbone.pretrained`. You need to set `Architecture.Backbone.checkpoints` for model finetuning, resume and evalution. If you want to train with the NLP pretrained model, you need to set `Architecture.Backbone.pretrained` as `True` and set `Architecture.Backbone.checkpoints` as null (`null`). +- PaddleNLP pretrained models are used here for LayoutXLM series models, the model loading and saving logic is same as those in PaddleNLP. Therefore we do not need to set `Global.pretrained_model` or `Global.checkpoints` here. +- If you use knowledge distillation to train the LayoutXLM series models, resuming training is not supported now. + +### 2.3. Mixed Precision Training + +coming soon! + +### 2.4. Distributed Training + +During multi-machine multi-gpu training, use the `--ips` parameter to set the used machine IP address, and the `--gpus` parameter to set the used GPU ID: + +```bash linenums="1" +python3 -m paddle.distributed.launch --ips="xx.xx.xx.xx,xx.xx.xx.xx" --gpus '0,1,2,3' tools/train.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml +``` + +**Note:** (1) When using multi-machine and multi-gpu training, you need to replace the ips value in the above command with the address of your machine, and the machines need to be able to ping each other. (2) Training needs to be launched separately on multiple machines. The command to view the ip address of the machine is `ifconfig`. (3) For more details about the distributed training speedup ratio, please refer to [Distributed Training Tutorial](../blog/distributed_training.en.md). + +### 2.5. Train with Knowledge Distillation + +Knowledge distillation is supported in PaddleOCR for KIE model training process. The configuration file is [ser_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml). 
For more information, please refer to [doc](../model_compress/knowledge_distillation.en.md). + +**Note:** The saving and loading logic of the LayoutXLM series KIE models in PaddleOCR is consistent with PaddleNLP, so only the parameters of the student model are saved in the distillation process. If you want to use the saved model for evaluation, you need to use the configuration of the student model (the student model corresponding to the distillation file above is [ser_vi_layoutxlm_xfund_zh.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml). + +### 2.6. Training on other platform + +- Windows GPU/CPU +The Windows platform is slightly different from the Linux platform: +Windows platform only supports `single gpu` training and inference, specify GPU for training `set CUDA_VISIBLE_DEVICES=0` +On the Windows platform, DataLoader only supports single-process mode, so you need to set `num_workers` to 0; + +- macOS +GPU mode is not supported, you need to set `use_gpu` to False in the configuration file, and the rest of the training evaluation prediction commands are exactly the same as Linux GPU. + +- Linux DCU +Running on a DCU device requires setting the environment variable `export HIP_VISIBLE_DEVICES=0,1,2,3`, and the rest of the training and evaluation prediction commands are exactly the same as the Linux GPU. + +## 3. Evaluation and Test + +### 3.1. Evaluation + +The trained model will be saved in `Global.save_model_dir`. When evaluation, you need to set `Architecture.Backbone.checkpoints` as your model directroy. The evaluation dataset can be set by modifying the `Eval.dataset.label_file_list` field in the `configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml` file. + +```bash linenums="1" +# GPU evaluation, Global.checkpoints is the weight to be tested +python3 tools/eval.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/ser_vi_layoutxlm_xfund_zh/best_accuracy +``` + +The following information will be printed such as precision, recall, hmean and so on. + +```bash linenums="1" +[2022/08/09 07:59:28] ppocr INFO: metric eval *************** +[2022/08/09 07:59:28] ppocr INFO: precision:0.697476609016161 +[2022/08/09 07:59:28] ppocr INFO: recall:0.8861671469740634 +[2022/08/09 07:59:28] ppocr INFO: hmean:0.7805806758686339 +[2022/08/09 07:59:28] ppocr INFO: fps:17.367364606899105 +``` + +### 3.2. Test + +Using the model trained by PaddleOCR, we can quickly get prediction through the following script. + +The default prediction image is stored in `Global.infer_img`, and the trained model weight is specified via `-o Global.checkpoints`. + +According to the `Global.save_model_dir` and `save_epoch_step` fields set in the configuration file, the following parameters will be saved. + +```text linenums="1" +output/ser_vi_layoutxlm_xfund_zh/ +├── best_accuracy + ├── metric.states + ├── model_config.json + ├── model_state.pdparams +├── best_accuracy.pdopt +├── config.yml +├── train.log +├── latest + ├── metric.states + ├── model_config.json + ├── model_state.pdparams +├── latest.pdopt +``` + +Among them, best_accuracy.*is the best model on the evaluation set; latest.* is the model of the last epoch. + +The configuration file for prediction must be consistent with the training file. If you finish the training process using `python3 tools/train.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml`. You can use the following command for prediction. 
+ +```bash linenums="1" +python3 tools/infer_kie_token_ser.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/ser_vi_layoutxlm_xfund_zh/best_accuracy Global.infer_img=./ppstructure/docs/kie/input/zh_val_42.jpg +``` + +The output image is as follows, which is also saved in `Global.save_res_path`. + +![image-20240710082046188](./images/image-20240710082046188.jpg) + +During the prediction process, the detection and recognition model of PP-OCRv3 will be loaded by default for information extraction of OCR. If you want to load the OCR results obtained in advance, you can use the following method to predict, and specify `Global.infer_img` as the annotation file, which contains the image path and OCR information, and specifies `Global.infer_mode` as False, indicating that the OCR inference engine is not used at this time. + +```bash linenums="1" +python3 tools/infer_kie_token_ser.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/ser_vi_layoutxlm_xfund_zh/best_accuracy Global.infer_img=./train_data/XFUND/zh_val/val.json Global.infer_mode=False +``` + +For the above image, if information extraction is performed using the labeled OCR results, the prediction results are as follows. + +![image-20240710082059968](./images/image-20240710082046188.jpg) + +It can be seen that part of the detection information is more accurate, but the overall information extraction results are basically the same. + +In RE model prediction, the SER model result needs to be given first, so the configuration file and model weight of SER need to be loaded at the same time, as shown in the following example. + +```bash linenums="1" +python3 ./tools/infer_kie_token_ser_re.py \ + -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrain_models/re_vi_layoutxlm_udml_xfund_zh/best_accuracy/ \ + Global.infer_img=./train_data/XFUND/zh_val/image/ \ + -c_ser configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o_ser Architecture.Backbone.checkpoints=pretrain_models/ \ + ser_vi_layoutxlm_udml_xfund_zh/best_accuracy/ +``` + +The result is as follows. + +![image-20240710082109713](./images/image-20240710082046188.jpg) + +If you want to load the OCR results obtained in advance, you can use the following method to predict, and specify `Global.infer_img` as the annotation file, which contains the image path and OCR information, and specifies `Global.infer_mode` as False, indicating that the OCR inference engine is not used at this time. + +```bash linenums="1" +python3 ./tools/infer_kie_token_ser_re.py \ + -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrain_models/re_vi_layoutxlm_udml_xfund_zh/best_accuracy/ \ + Global.infer_img=./train_data/XFUND/zh_val/val.json \ + Global.infer_mode=False \ + -c_ser configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o_ser Architecture.Backbone.checkpoints=pretrain_models/ser_vi_layoutxlm_udml_xfund_zh/best_accuracy/ +``` + +`c_ser` denotes SER configurations file, `o_ser` denotes the SER model configurations that will override corresponding content in the file. + +The result is as follows. + +![image-20240710082117146](./images/image-20240710082046188.jpg) + +It can be seen that the re prediction results directly using the annotated OCR results are more accurate. + +## 4. 
Model inference + +### 4.1 Export the model + +The inference model (the model saved by `paddle.jit.save`) is generally a solidified model saved after the model training is completed, and is mostly used to give prediction in deployment. + +The model saved during the training process is the checkpoints model, which saves the parameters of the model and is mostly used to resume training. + +Compared with the checkpoints model, the inference model will additionally save the structural information of the model. Therefore, it is easier to deploy because the model structure and model parameters are already solidified in the inference model file, and is suitable for integration with actual systems. + +The SER model can be converted to the inference model using the following command. + +```bash linenums="1" +# -c Set the training algorithm yml configuration file. +# -o Set optional parameters. +# Architecture.Backbone.checkpoints Set the training model address. +# Global.save_inference_dir Set the address where the converted model will be saved. +python3 tools/export_model.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/ser_vi_layoutxlm_xfund_zh/best_accuracy Global.save_inference_dir=./inference/ser_vi_layoutxlm +``` + +After the conversion is successful, there are three files in the model save directory: + +```text linenums="1" +inference/ser_vi_layoutxlm/ + ├── inference.pdiparams # The parameter file of recognition inference model + ├── inference.pdiparams.info # The parameter information of recognition inference model, which can be ignored + └── inference.pdmodel # The program file of recognition +``` + +The RE model can be converted to the inference model using the following command. + +```bash linenums="1" +# -c Set the training algorithm yml configuration file. +# -o Set optional parameters. +# Architecture.Backbone.checkpoints Set the training model address. +# Global.save_inference_dir Set the address where the converted model will be saved. +python3 tools/export_model.py -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/re_vi_layoutxlm_xfund_zh/best_accuracy Global.save_inference_dir=./inference/re_vi_layoutxlm +``` + +After the conversion is successful, there are three files in the model save directory: + +```text linenums="1" +inference/re_vi_layoutxlm/ + ├── inference.pdiparams # The parameter file of recognition inference model + ├── inference.pdiparams.info # The parameter information of recognition inference model, which can be ignored + └── inference.pdmodel # The program file of recognition +``` + +### 4.2 Model inference + +The VI layoutxlm model performs reasoning based on the ser task, and can execute the following commands: + +Using the following command to infer the VI-LayoutXLM SER model. + +```bash linenums="1" +cd ppstructure +python3 kie/predict_kie_token_ser.py \ + --kie_algorithm=LayoutXLM \ + --ser_model_dir=../inference/ser_vi_layoutxlm \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" +``` + +The visualized result will be saved in `./output`, which is shown as follows. + +![image-20240710082128694](./images/image-20240710082046188.jpg) + +Using the following command to infer the VI-LayoutXLM RE model. 
+ +```bash linenums="1" +cd ppstructure +python3 kie/predict_kie_token_ser_re.py \ + --kie_algorithm=LayoutXLM \ + --re_model_dir=../inference/re_vi_layoutxlm \ + --ser_model_dir=../inference/ser_vi_layoutxlm \ + --use_visual_backbone=False \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" +``` + +The visualized result will be saved in `./output`, which is shown as follows. + +![image-20240710082147184](./images/image-20240710082046188.jpg) + +## 5. FAQ + +Q1: After the training model is transferred to the inference model, the prediction effect is inconsistent? + +**A**:The problems are mostly caused by inconsistent preprocessing and postprocessing parameters when the trained model predicts and the preprocessing and postprocessing parameters when the inference model predicts. You can compare whether there are differences in preprocessing, postprocessing, and prediction in the configuration files used for training. diff --git a/docs/ppocr/model_train/kie.md b/docs/ppocr/model_train/kie.md new file mode 100644 index 0000000000..14e3f9a0ee --- /dev/null +++ b/docs/ppocr/model_train/kie.md @@ -0,0 +1,469 @@ +--- +comments: true +typora-copy-images-to: images +--- + +# 关键信息抽取 + +本文提供了PaddleOCR关键信息抽取的全流程指南,包括语义实体识别 (Semantic Entity Recognition) 以及关系抽取 (Relation Extraction, RE) 任务的数据准备、模型训练、调优、评估、预测,各个阶段的详细说明。 + +## 1. 数据准备 + +### 1.1. 准备数据集 + +在训练信息抽取相关模型时,PaddleOCR支持以下数据格式。 + +- `通用数据` 用于训练以文本文件存储的数据集(SimpleDataSet); + +训练数据的默认存储路径是 `PaddleOCR/train_data`,如果您的磁盘上已有数据集,只需创建软链接至数据集目录: + +```bash linenums="1" +# linux and mac os +ln -sf /train_data/dataset +# windows +mklink /d /train_data/dataset +``` + +### 1.2. 自定义数据集 + +训练过程中一般包含训练集与验证集,二者数据格式相同,下面介绍如何自定义数据集。 + +#### (1)训练集 + +建议将训练图片放入同一个文件夹,并用一个文本文件记录图片路径和标签,文本文件里的内容如下: + +```python linenums="1" +" 图像文件名 图像标注信息 " +zh_train_0.jpg [{"transcription": "汇丰晋信", "label": "other", "points": [[104, 114], [530, 114], [530, 175], [104, 175]], "id": 1, "linking": []}, {"transcription": "受理时间:", "label": "question", "points": [[126, 267], [266, 267], [266, 305], [126, 305]], "id": 7, "linking": [[7, 13]]}, {"transcription": "2020.6.15", "label": "answer", "points": [[321, 239], [537, 239], [537, 285], [321, 285]], "id": 13, "linking": [[7, 13]]}] +zh_train_1.jpg [{"transcription": "中国人体器官捐献", "label": "other", "points": [[544, 459], [954, 459], [954, 517], [544, 517]], "id": 1, "linking": []}, {"transcription": ">编号:MC545715483585", "label": "other", "points": [[1462, 470], [2054, 470], [2054, 543], [1462, 543]], "id": 10, "linking": []}, {"transcription": "CHINAORGANDONATION", "label": "other", "points": [[543, 516], [958, 516], [958, 551], [543, 551]], "id": 14, "linking": []}, {"transcription": "中国人体器官捐献志愿登记表", "label": "header", "points": [[635, 793], [1892, 793], [1892, 904], [635, 904]], "id": 18, "linking": []}] +... 
+``` + +**注意:** 文本文件中默认请将图片路径和图片标签用 `\t` 分割,如用其他方式分割将造成训练报错。 + +其中图像标注信息字符串经过json解析之后可以得到一个列表信息,列表中每个元素是一个字典,存储了每个文本行的需要信息,各个字段的含义如下。 + +- transcription: 存储了文本行的文字内容 +- label: 该文本行内容所属的类别 +- points: 存储文本行的四点位置信息 +- id: 存储文本行的id信息,用于RE任务的训练 +- linking: 存储文本行的之间的连接信息,用于RE任务的训练 + +#### (2)验证集 + +验证集构建方式与训练集相同。 + +#### (3)字典文件 + +训练集与验证集中的文本行包含标签信息,所有标签的列表存在字典文件中(如`class_list.txt`),字典文件中的每一行表示为一个类别名称。 + +以XFUND_zh数据为例,共包含4个类别,字典文件内容如下所示。 + +```text linenums="1" +OTHER +QUESTION +ANSWER +HEADER +``` + +在标注文件中,每个标注的文本行内容的`label`字段标注信息需要属于字典内容。 + +最终数据集应有如下文件结构: + +```text linenums="1" +|-train_data + |-data_name + |- train.json + |- train + |- zh_train_0.png + |- zh_train_1.jpg + | ... + |- val.json + |- val + |- zh_val_0.png + |- zh_val_1.jpg + | ... +``` + +**注:** + +- 标注文件中的类别信息不区分大小写,如`HEADER`与`header`会被解析为相同的类别id,因此在标注的时候,不能使用小写处理后相同的字符串表示不同的类别。 +- 在整理标注文件的时候,建议将other这个类别(其他,无需关注的文本行可以标注为other)放在第一行,在解析的时候,会将`other`类别的类别id解析为0,后续不会对该类进行可视化。 + +### 1.3. 数据下载 + +如果你没有本地数据集,可以从[XFUND](https://github.com/doc-analysis/XFUND)或者[FUNSD](https://guillaumejaume.github.io/FUNSD/)官网下载数据,然后使用XFUND与FUNSD的处理脚本([XFUND](../../ppstructure/kie/tools/trans_xfun_data.py), [FUNSD](../../ppstructure/kie/tools/trans_funsd_label.py)),生成用于PaddleOCR训练的数据格式,并使用公开数据集快速体验关键信息抽取的流程。 + +更多关于公开数据集的介绍,请参考[关键信息抽取数据集说明文档](../../datasets/kie_datasets.md)。 + +PaddleOCR也支持了关键信息抽取模型的标注,具体使用方法请参考:[PPOCRLabel使用文档](https://github.com/PFCCLab/PPOCRLabel/blob/main/README_ch.md)。 + +## 2. 开始训练 + +PaddleOCR提供了训练脚本、评估脚本和预测脚本,本节将以 VI-LayoutXLM 多模态预训练模型为例进行讲解。 + +> 如果希望使用基于SDMGR的关键信息抽取算法,请参考:[SDMGR使用](../../algorithm/kie/algorithm_kie_sdmgr.md)。 + +### 2.1. 启动训练 + +如果你没有使用自定义数据集,可以使用PaddleOCR中已经处理好的XFUND_zh数据集进行快速体验。 + +```bash linenums="1" +mkdir train_data +cd train_data +wget https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar && tar -xf XFUND.tar +cd .. 
+``` + +如果不希望训练,直接体验后面的模型评估、预测、动转静、推理的流程,可以下载PaddleOCR中提供的预训练模型,并跳过2.1部分。 + +使用下面的方法,下载基于XFUND数据的SER与RE任务预训练模型。 + +```bash linenums="1" +mkdir pretrained_model +cd pretrained_model +# 下载并解压SER预训练模型 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar & tar -xf ser_vi_layoutxlm_xfund_pretrained.tar + +# 下载并解压RE预训练模型 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar & tar -xf re_vi_layoutxlm_xfund_pretrained.tar +``` + +开始训练: + +- 如果您安装的是cpu版本,请将配置文件中的 `use_gpu` 字段修改为false +- PaddleOCR在训练时,会默认下载VI-LayoutXLM预训练模型,这里无需预先下载。 + +```bash linenums="1" +# GPU训练 支持单卡,多卡训练 +# 训练日志会自动保存到 配置文件中"{Global.save_model_dir}" 下的train.log文件中 + +# SER单卡训练 +python3 tools/train.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml + +# SER多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml + +# RE任务单卡训练 +python3 tools/train.py -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml +``` + +以SER任务为例,正常启动训练后,会看到以下log输出: + +```bash linenums="1" +[2022/08/08 16:28:28] ppocr INFO: epoch: [1/200], global_step: 10, lr: 0.000006, loss: 1.871535, avg_reader_cost: 0.28200 s, avg_batch_cost: 0.82318 s, avg_samples: 8.0, ips: 9.71838 samples/s, eta: 0:51:59 +[2022/08/08 16:28:33] ppocr INFO: epoch: [1/200], global_step: 19, lr: 0.000018, loss: 1.461939, avg_reader_cost: 0.00042 s, avg_batch_cost: 0.32037 s, avg_samples: 6.9, ips: 21.53773 samples/s, eta: 0:37:55 +[2022/08/08 16:28:39] ppocr INFO: cur metric, precision: 0.11526348939743859, recall: 0.19776657060518732, hmean: 0.14564265817747712, fps: 34.008392345050055 +[2022/08/08 16:28:45] ppocr INFO: save best model is to ./output/ser_vi_layoutxlm_xfund_zh/best_accuracy +[2022/08/08 16:28:45] ppocr INFO: best metric, hmean: 0.14564265817747712, precision: 0.11526348939743859, recall: 0.19776657060518732, fps: 34.008392345050055, best_epoch: 1 +[2022/08/08 16:28:51] ppocr INFO: save model in ./output/ser_vi_layoutxlm_xfund_zh/latest +``` + +log 中自动打印如下信息: + +| 字段 | 含义 | +| :----: | :------: | +| epoch | 当前迭代轮次 | +| iter | 当前迭代次数 | +| lr | 当前学习率 | +| loss | 当前损失函数 | +| reader_cost | 当前 batch 数据处理耗时 | +| batch_cost | 当前 batch 总耗时 | +| samples | 当前 batch 内的样本数 | +| ips | 每秒处理图片的数量 | + +PaddleOCR支持训练和评估交替进行, 可以在 `configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml` 中修改 `eval_batch_step` 设置评估频率,默认每19个iter评估一次。评估过程中默认将最佳hmean模型,保存为 `output/ser_vi_layoutxlm_xfund_zh/best_accuracy/` 。 + +如果验证集很大,测试将会比较耗时,建议减少评估次数,或训练完再进行评估。 + +**提示:** 可通过 -c 参数选择 `configs/kie/` 路径下的多种模型配置进行训练,PaddleOCR支持的信息抽取算法可以参考[前沿算法列表](../../algorithm/overview.md)。 + +如果你希望训练自己的数据集,需要修改配置文件中的数据配置、字典文件以及类别数。 + +以 `configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml` 为例,修改的内容如下所示。 + +```yaml linenums="1" + +Architecture: + # ... + Backbone: + name: LayoutXLMForSer + pretrained: True + mode: vi + # 由于采用BIO标注,假设字典中包含n个字段(包含other)时,则类别数为2n-1; 假设字典中包含n个字段(不含other)时,则类别数为2n+1。否则在train过程会报:IndexError: (OutOfRange) label value should less than the shape of axis dimension 。 + num_classes: &num_classes 7 + +PostProcess: + name: kieSerTokenLayoutLMPostProcess + # 修改字典文件的路径为你自定义的数据集的字典路径 + class_path: &class_path train_data/XFUND/class_list_xfun.txt + +Train: + dataset: + name: SimpleDataSet + # 修改为你自己的训练数据目录 + data_dir: train_data/XFUND/zh_train/image + # 修改为你自己的训练数据标签文件 + label_file_list: + - train_data/XFUND/zh_train/train.json + ... + loader: + # 训练时的单卡batch_size + batch_size_per_card: 8 + ... 
+ +Eval: + dataset: + name: SimpleDataSet + # 修改为你自己的验证数据目录 + data_dir: train_data/XFUND/zh_val/image + # 修改为你自己的验证数据标签文件 + label_file_list: + - train_data/XFUND/zh_val/val.json + ... + loader: + # 验证时的单卡batch_size + batch_size_per_card: 8 +``` + +**注意,预测/评估时的配置文件请务必与训练一致。** + +### 2.2. 断点训练 + +如果训练程序中断,如果希望加载训练中断的模型从而恢复训练,可以通过指定`Architecture.Backbone.checkpoints`指定要加载的模型路径: + +```bash linenums="1" +python3 tools/train.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/ser_vi_layoutxlm_xfund_zh/best_accuracy +``` + +**注意**: + +- `Architecture.Backbone.checkpoints`的优先级高于`Architecture.Backbone.pretrained`,需要加载之前训练好的训练模型进行模型微调、恢复训练、模型评估时,需要使用`Architecture.Backbone.checkpoints`指定模型参数路径;如果需要使用默认提供的通用预训练模型进行训练,则需要指定`Architecture.Backbone.pretrained`为`True`,同时指定`Architecture.Backbone.checkpoints`为空(`null`)。 +- LayoutXLM系列模型均是调用了PaddleNLP中的预训练模型,模型加载与保存的逻辑与PaddleNLP基本一致,因此在这里不需要指定`Global.pretrained_model`或者`Global.checkpoints`参数;此外,LayoutXLM系列模型的蒸馏训练目前不支持断点训练。 + +### 2.3. 混合精度训练 + +coming soon! + +### 2.4. 分布式训练 + +多机多卡训练时,通过 `--ips` 参数设置使用的机器IP地址,通过 `--gpus` 参数设置使用的GPU ID: + +```bash linenums="1" +python3 -m paddle.distributed.launch --ips="xx.xx.xx.xx,xx.xx.xx.xx" --gpus '0,1,2,3' tools/train.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml +``` + +**注意:** (1)采用多机多卡训练时,需要替换上面命令中的ips值为您机器的地址,机器之间需要能够相互ping通;(2)训练时需要在多个机器上分别启动命令。查看机器ip地址的命令为`ifconfig`;(3)更多关于分布式训练的性能优势等信息,请参考:[分布式训练教程](../blog/distributed_training.md)。 + +### 2.5. 知识蒸馏训练 + +PaddleOCR支持了基于U-DML知识蒸馏的关键信息抽取模型训练过程,配置文件请参考:[ser_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml),更多关于知识蒸馏的说明文档请参考:[知识蒸馏说明文档](../model_compress/knowledge_distillation.md)。 + +**注意**: PaddleOCR中LayoutXLM系列关键信息抽取模型的保存与加载逻辑与PaddleNLP保持一致,因此在蒸馏的过程中仅保存了学生模型的参数,如果希望使用保存的模型进行评估,需要使用学生模型的配置(上面的蒸馏文件对应的学生模型为[ser_vi_layoutxlm_xfund_zh.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml)) + +### 2.6. 其他训练环境 + +- Windows GPU/CPU +在Windows平台上与Linux平台略有不同: +Windows平台只支持`单卡`的训练与预测,指定GPU进行训练`set CUDA_VISIBLE_DEVICES=0` +在Windows平台,DataLoader只支持单进程模式,因此需要设置 `num_workers` 为0; + +- macOS +不支持GPU模式,需要在配置文件中设置`use_gpu`为False,其余训练评估预测命令与Linux GPU完全相同。 + +- Linux DCU +DCU设备上运行需要设置环境变量 `export HIP_VISIBLE_DEVICES=0,1,2,3`,其余训练评估预测命令与Linux GPU完全相同。 + +## 3. 模型评估与预测 + +### 3.1. 指标评估 + +训练中模型参数默认保存在`Global.save_model_dir`目录下。在评估指标时,需要设置`Architecture.Backbone.checkpoints`指向保存的参数文件。评估数据集可以通过 `configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml` 修改Eval中的 `label_file_path` 设置。 + +```bash linenums="1" +# GPU 评估, Global.checkpoints 为待测权重 +python3 tools/eval.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/ser_vi_layoutxlm_xfund_zh/best_accuracy +``` + +会输出以下信息,打印出precision、recall、hmean等信息。 + +```bash linenums="1" +[2022/08/09 07:59:28] ppocr INFO: metric eval *************** +[2022/08/09 07:59:28] ppocr INFO: precision:0.697476609016161 +[2022/08/09 07:59:28] ppocr INFO: recall:0.8861671469740634 +[2022/08/09 07:59:28] ppocr INFO: hmean:0.7805806758686339 +[2022/08/09 07:59:28] ppocr INFO: fps:17.367364606899105 +``` + +### 3.2. 
测试信息抽取结果 + +使用 PaddleOCR 训练好的模型,可以通过以下脚本进行快速预测。 + +默认预测的图片存储在 `infer_img` 里,通过 `-o Architecture.Backbone.checkpoints` 加载训练好的参数文件: + +根据配置文件中设置的 `save_model_dir` 和 `save_epoch_step` 字段,会有以下几种参数被保存下来: + +```text linenums="1" +output/ser_vi_layoutxlm_xfund_zh/ +├── best_accuracy + ├── metric.states + ├── model_config.json + ├── model_state.pdparams +├── best_accuracy.pdopt +├── config.yml +├── train.log +├── latest + ├── metric.states + ├── model_config.json + ├── model_state.pdparams +├── latest.pdopt +``` + +其中 best_accuracy.*是评估集上的最优模型;latest.* 是最新保存的一个模型。 + +预测使用的配置文件必须与训练一致,如您通过 `python3 tools/train.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml` 完成了模型的训练过程。 + +您可以使用如下命令进行中文模型预测。 + +```bash linenums="1" +python3 tools/infer_kie_token_ser.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/ser_vi_layoutxlm_xfund_zh/best_accuracy Global.infer_img=./ppstructure/docs/kie/input/zh_val_42.jpg +``` + +预测图片如下所示,图片会存储在`Global.save_res_path`路径中。 + +![image-20240710082046188](./images/image-20240710082046188.jpg) + +预测过程中,默认会加载PP-OCRv3的检测识别模型,用于OCR的信息抽取,如果希望加载预先获取的OCR结果,可以使用下面的方式进行预测,指定`Global.infer_img`为标注文件,其中包含图片路径以及OCR信息,同时指定`Global.infer_mode`为False,表示此时不使用OCR预测引擎。 + +```bash linenums="1" +python3 tools/infer_kie_token_ser.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/ser_vi_layoutxlm_xfund_zh/best_accuracy Global.infer_img=./train_data/XFUND/zh_val/val.json Global.infer_mode=False +``` + +对于上述图片,如果使用标注的OCR结果进行信息抽取,预测结果如下。 + +![image-20240710082059968](./images/image-20240710082046188.jpg) + +可以看出,部分检测框信息更加准确,但是整体信息抽取识别结果基本一致。 + +在RE任务模型预测时,需要先给出模型SER结果,因此需要同时加载SER的配置文件与模型权重,示例如下。 + +```bash linenums="1" +python3 ./tools/infer_kie_token_ser_re.py \ + -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrain_models/re_vi_layoutxlm_udml_xfund_zh/best_accuracy/ \ + Global.infer_img=./train_data/XFUND/zh_val/image/ \ + -c_ser configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o_ser Architecture.Backbone.checkpoints=pretrain_models/ \ + ser_vi_layoutxlm_udml_xfund_zh/best_accuracy/ +``` + +预测结果如下所示。 + +![image-20240710082109713](./images/image-20240710082046188.jpg) + +如果希望使用标注或者预先获取的OCR信息进行关键信息抽取,同上,可以指定`Global.infer_mode`为False,指定`Global.infer_img`为标注文件。 + +```bash linenums="1" +python3 ./tools/infer_kie_token_ser_re.py -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./pretrain_models/re_vi_layoutxlm_udml_xfund_zh/re_layoutxlm_xfund_zh_v4_udml/best_accuracy/ Global.infer_img=./train_data/XFUND/zh_val/val.json Global.infer_mode=False -c_ser configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o_ser Architecture.Backbone.checkpoints=pretrain_models/ser_vi_layoutxlm_udml_xfund_zh/best_accuracy/ +``` + +其中`c_ser`表示SER的配置文件,`o_ser` 后面需要加上待修改的SER模型与配置文件,如预训练权重等。 + +预测结果如下所示。 + +![image-20240710082117146](./images/image-20240710082046188.jpg) + +可以看出,直接使用标注的OCR结果的RE预测结果要更加准确一些。 + +## 4. 
模型导出与预测 + +### 4.1 模型导出 + +inference 模型(`paddle.jit.save`保存的模型) +一般是模型训练,把模型结构和模型参数保存在文件中的固化模型,多用于预测部署场景。 +训练过程中保存的模型是checkpoints模型,保存的只有模型的参数,多用于恢复训练等。 +与checkpoints模型相比,inference 模型会额外保存模型的结构信息,在预测部署、加速推理上性能优越,灵活方便,适合于实际系统集成。 + +信息抽取模型中的SER任务转inference模型步骤如下: + +```bash linenums="1" +# -c 后面设置训练算法的yml配置文件 +# -o 配置可选参数 +# Architecture.Backbone.checkpoints 参数设置待转换的训练模型地址 +# Global.save_inference_dir 参数设置转换的模型将保存的地址 + +python3 tools/export_model.py -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/ser_vi_layoutxlm_xfund_zh/best_accuracy Global.save_inference_dir=./inference/ser_vi_layoutxlm +``` + +转换成功后,在目录下有三个文件: + +```text linenums="1" +inference/ser_vi_layoutxlm/ + ├── inference.pdiparams # inference模型的参数文件 + ├── inference.pdiparams.info # inference模型的参数信息,可忽略 + └── inference.pdmodel # inference模型的模型结构文件 +``` + +信息抽取模型中的RE任务转inference模型步骤如下: + +```bash linenums="1" +# -c 后面设置训练算法的yml配置文件 +# -o 配置可选参数 +# Architecture.Backbone.checkpoints 参数设置待转换的训练模型地址 +# Global.save_inference_dir 参数设置转换的模型将保存的地址 + +python3 tools/export_model.py -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml -o Architecture.Backbone.checkpoints=./output/re_vi_layoutxlm_xfund_zh/best_accuracy Global.save_inference_dir=./inference/re_vi_layoutxlm +``` + +转换成功后,在目录下有三个文件: + +```text linenums="1" +inference/re_vi_layoutxlm/ + ├── inference.pdiparams # inference模型的参数文件 + ├── inference.pdiparams.info # inference模型的参数信息,可忽略 + └── inference.pdmodel # inference模型的模型结构文件 +``` + +### 4.2 模型推理 + +VI-LayoutXLM模型基于SER任务进行推理,可以执行如下命令: + +```bash linenums="1" +cd ppstructure +python3 kie/predict_kie_token_ser.py \ + --kie_algorithm=LayoutXLM \ + --ser_model_dir=../inference/ser_vi_layoutxlm \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" +``` + +可视化SER结果结果默认保存到`./output`文件夹里面。结果示例如下: + +![image-20240710082128694](./images/image-20240710082046188.jpg) + +VI-LayoutXLM模型基于RE任务进行推理,可以执行如下命令: + +```bash linenums="1" +cd ppstructure +python3 kie/predict_kie_token_ser_re.py \ + --kie_algorithm=LayoutXLM \ + --re_model_dir=../inference/re_vi_layoutxlm \ + --ser_model_dir=../inference/ser_vi_layoutxlm \ + --use_visual_backbone=False \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" +``` + +RE可视化结果默认保存到`./output`文件夹里面,结果示例如下: + +![image-20240710082147184](./images/image-20240710082046188.jpg) + +## 5. FAQ + +Q1: 训练模型转inference 模型之后预测效果不一致? + +**A**:该问题多是trained model预测时候的预处理、后处理参数和inference model预测的时候的预处理、后处理参数不一致导致的。可以对比训练使用的配置文件中的预处理、后处理和预测时是否存在差异。 diff --git a/docs/ppocr/model_train/recognition.en.md b/docs/ppocr/model_train/recognition.en.md new file mode 100644 index 0000000000..f22e3632e5 --- /dev/null +++ b/docs/ppocr/model_train/recognition.en.md @@ -0,0 +1,471 @@ +--- +comments: true +typora-copy-images-to: images +--- + +# Text Recognition + +## 1. Data Preparation + +### 1.1 DataSet Preparation + +To prepare datasets, refer to [ocr_datasets](../../datasets/datasets.en.md) . 
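For orientation, a recognition label file is a plain-text list in which each line holds an image path and its transcription, separated by a tab character. The entries below are only hypothetical placeholders, not part of any real label file:

```text linenums="1"
" Image file name             Image annotation "
train/word_001.png	Genaxis Theatre
train/word_002.png	[06]
```

Whichever dataset you use, keep the separator between the image path and the text as a real tab (`\t`), otherwise the labels cannot be parsed correctly.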
+ +PaddleOCR provides label files for training the icdar2015 dataset, which can be downloaded in the following ways: + +```bash linenums="1" +# Training set label +wget -P ./train_data/ic15_data https://paddleocr.bj.bcebos.com/dataset/rec_gt_train.txt +# Test Set Label +wget -P ./train_data/ic15_data https://paddleocr.bj.bcebos.com/dataset/rec_gt_test.txt +``` + +PaddleOCR also provides a data format conversion script, which can convert ICDAR official website label to a data format +supported by PaddleOCR. The data conversion tool is in `ppocr/utils/gen_label.py`, here is the training set as an example: + +```bash linenums="1" +# convert the official gt to rec_gt_label.txt +python gen_label.py --mode="rec" --input_path="{path/of/origin/label}" --output_label="rec_gt_label.txt" +``` + +The data format is as follows, (a) is the original picture, (b) is the Ground Truth text file corresponding to each picture: + +![img](./images/icdar_rec.png) + +- Multilingual dataset + +The multi-language model training method is the same as the Chinese model. The training data set is 100w synthetic data. A small amount of fonts and test data can be downloaded using the following two methods. + +- [Baidu Netdisk](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA) ,Extraction code:frgi. +- [Google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view) + +### 1.2 Dictionary + +Finally, a dictionary ({word_dict_name}.txt) needs to be provided so that when the model is trained, all the characters that appear can be mapped to the dictionary index. + +Therefore, the dictionary needs to contain all the characters that you want to be recognized correctly. {word_dict_name}.txt needs to be written in the following format and saved in the `utf-8` encoding format: + +```text linenums="1" +l +d +a +d +r +n +``` + +In `word_dict.txt`, there is a single word in each line, which maps characters and numeric indexes together, e.g "and" will be mapped to [2 5 1] + +PaddleOCR has built-in dictionaries, which can be used on demand. + +`ppocr/utils/ppocr_keys_v1.txt` is a Chinese dictionary with 6623 characters. + +`ppocr/utils/ic15_dict.txt` is an English dictionary with 36 characters + +`ppocr/utils/dict/french_dict.txt` is a French dictionary with 118 characters + +`ppocr/utils/dict/japan_dict.txt` is a Japanese dictionary with 4399 characters + +`ppocr/utils/dict/korean_dict.txt` is a Korean dictionary with 3636 characters + +`ppocr/utils/dict/german_dict.txt` is a German dictionary with 131 characters + +`ppocr/utils/en_dict.txt` is a English dictionary with 96 characters + +The current multi-language model is still in the demo stage and will continue to optimize the model and add languages. **You are very welcome to provide us with dictionaries and fonts in other languages**, +If you like, you can submit the dictionary file to [dict](../../ppocr/utils/dict) and we will thank you in the Repo. + +To customize the dict file, please modify the `character_dict_path` field in `configs/rec/rec_icdar15_train.yml` . + +- Custom dictionary + +If you need to customize dic file, please add character_dict_path field in configs/rec/rec_icdar15_train.yml to point to your dictionary path. And set character_type to ch. + +### 1.4 Add Space Category + +If you want to support the recognition of the `space` category, please set the `use_space_char` field in the yml file to `True`. + +### 1.5 Data Augmentation + +PaddleOCR provides a variety of data augmentation methods. 
All the augmentation methods are enabled by default. + +The default perturbation methods are: cvtColor, blur, jitter, Gasuss noise, random crop, perspective, color reverse, TIA augmentation. + +Each disturbance method is selected with a 40% probability during the training process. For specific code implementation, please refer to: [rec_img_aug.py](../../ppocr/data/imaug/rec_img_aug.py) + +## 2.Training + +PaddleOCR provides training scripts, evaluation scripts, and prediction scripts. In this section, the CRNN recognition model will be used as an example: + +### 2.1 Start Training + +First download the pretrain model, you can download the trained model to finetune on the icdar2015 data: + +```bash linenums="1" +cd PaddleOCR/ +# Download the pre-trained model of en_PP-OCRv3 +wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar +# Decompress model parameters +cd pretrain_models +tar -xf en_PP-OCRv3_rec_train.tar && rm -rf en_PP-OCRv3_rec_train.tar +``` + +Start training: + +```bash linenums="1" +# GPU training Support single card and multi-card training +# Training icdar15 English data and The training log will be automatically saved as train.log under "{save_model_dir}" + +#specify the single card training(Long training time, not recommended) +python3 tools/train.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml -o Global.pretrained_model=en_PP-OCRv3_rec_train/best_accuracy + +#specify the card number through --gpus +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml -o Global.pretrained_model=en_PP-OCRv3_rec_train/best_accuracy +``` + +PaddleOCR supports alternating training and evaluation. You can modify `eval_batch_step` in `configs/rec/rec_icdar15_train.yml` to set the evaluation frequency. By default, it is evaluated every 500 iter and the best acc model is saved under `output/rec_CRNN/best_accuracy` during the evaluation process. + +If the evaluation set is large, the test will be time-consuming. It is recommended to reduce the number of evaluations, or evaluate after training. + +- Tip: You can use the `-c` parameter to select multiple model configurations under the `configs/rec/` path for training. The recognition algorithms supported at [rec_algorithm](../../algorithm/overview.en.md): + +For training Chinese data, it is recommended to use +[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml). If you want to try the result of other algorithms on the Chinese data set, please refer to the following instructions to modify the configuration file: + +Take `ch_PP-OCRv3_rec_distillation.yml` as an example: + +```yaml linenums="1" +Global: + ... + # Add a custom dictionary, such as modify the dictionary, please point the path to the new dictionary + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + # Modify character type + ... + # Whether to recognize spaces + use_space_char: True + + +Optimizer: + ... + # Add learning rate decay strategy + lr: + name: Cosine + learning_rate: 0.001 + ... + +... + +Train: + dataset: + # Type of dataset,we support LMDBDataSet and SimpleDataSet + name: SimpleDataSet + # Path of dataset + data_dir: ./train_data/ + # Path of train list + label_file_list: ["./train_data/train_list.txt"] + transforms: + ... + - RecResizeImg: + # Modify image_shape to fit long text + image_shape: [3, 48, 320] + ... + loader: + ... + # Train batch_size for Single card + batch_size_per_card: 256 + ... 
+ +Eval: + dataset: + # Type of dataset,we support LMDBDataSet and SimpleDataSet + name: SimpleDataSet + # Path of dataset + data_dir: ./train_data + # Path of eval list + label_file_list: ["./train_data/val_list.txt"] + transforms: + ... + - RecResizeImg: + # Modify image_shape to fit long text + image_shape: [3, 48, 320] + ... + loader: + # Eval batch_size for Single card + batch_size_per_card: 256 + ... +``` + +**Note that the configuration file for prediction/evaluation must be consistent with the training.** + +### 2.2 Load Trained Model and Continue Training + +If you expect to load trained model and continue the training again, you can specify the parameter `Global.checkpoints` as the model path to be loaded. + +For example: + +```bash linenums="1" +python3 tools/train.py -c configs/rec/rec_icdar15_train.yml -o Global.checkpoints=./your/trained/model +``` + +**Note**: The priority of `Global.checkpoints` is higher than that of `Global.pretrained_model`, that is, when two parameters are specified at the same time, the model specified by `Global.checkpoints` will be loaded first. If the model path specified by `Global.checkpoints` is wrong, the one specified by `Global.pretrained_model` will be loaded. + +### 2.3 Training with New Backbone + +The network part completes the construction of the network, and PaddleOCR divides the network into four parts, which are under [ppocr/modeling](../../ppocr/modeling). The data entering the network will pass through these four parts in sequence(transforms->backbones-> +necks->heads). + +```bash linenums="1" +├── architectures # Code for building network +├── transforms # Image Transformation Module +├── backbones # Feature extraction module +├── necks # Feature enhancement module +└── heads # Output module +``` + +If the Backbone to be replaced has a corresponding implementation in PaddleOCR, you can directly modify the parameters in the `Backbone` part of the configuration yml file. + +However, if you want to use a new Backbone, an example of replacing the backbones is as follows: + +1. Create a new file under the [ppocr/modeling/backbones](../../ppocr/modeling/backbones) folder, such as my_backbone.py. +2. Add code in the my_backbone.py file, the sample code is as follows: + +```python linenums="1" +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class MyBackbone(nn.Layer): + def __init__(self, *args, **kwargs): + super(MyBackbone, self).__init__() + # your init code + self.conv = nn.xxxx + + def forward(self, inputs): + # your network forward + y = self.conv(inputs) + return y +``` + +3. Import the added module in the [ppocr/modeling/backbones/\__init\__.py](../../ppocr/modeling/backbones/__init__.py) file. + +After adding the four-part modules of the network, you only need to configure them in the configuration file to use, such as: + +```yaml linenums="1" + Backbone: + name: MyBackbone + args1: args1 +``` + +**NOTE**: More details about replace Backbone and other mudule can be found in [doc](../../algorithm/add_new_algorithm.en.md). 
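+
+Before wiring a new backbone into training, it can help to sanity-check it standalone. Below is a minimal, self-contained sketch that fills in the `nn.xxxx` placeholder from the skeleton above; the layer choices are arbitrary, and the `out_channels` attribute is an assumption about what downstream modules read, not a documented contract:
+
+```python linenums="1"
+import paddle
+import paddle.nn as nn
+
+
+class MyBackbone(nn.Layer):
+    def __init__(self, in_channels=3, **kwargs):
+        super(MyBackbone, self).__init__()
+        # a single conv stage, purely illustrative
+        self.conv = nn.Conv2D(in_channels, 64, kernel_size=3, stride=2, padding=1)
+        self.bn = nn.BatchNorm2D(64)
+        self.act = nn.ReLU()
+        # assumed: necks/heads typically read this attribute for the feature dimension
+        self.out_channels = 64
+
+    def forward(self, x):
+        return self.act(self.bn(self.conv(x)))
+
+
+if __name__ == "__main__":
+    net = MyBackbone()
+    feats = net(paddle.randn([1, 3, 48, 320]))  # recognition input shape used in this guide
+    print(feats.shape)
+```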
+ +### 2.4 Mixed Precision Training + +If you want to speed up your training further, you can use [Auto Mixed Precision Training](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/01_paddle2.0_introduction/basic_concept/amp_cn.html), taking a single machine and a single gpu as an example, the commands are as follows: + +```bash linenums="1" +python3 tools/train.py -c configs/rec/rec_icdar15_train.yml \ + -o Global.pretrained_model=./pretrain_models/rec_mv3_none_bilstm_ctc_v2.0_train \ + Global.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True +``` + +### 2.5 Distributed Training + +During multi-machine multi-gpu training, use the `--ips` parameter to set the used machine IP address, and the `--gpus` parameter to set the used GPU ID: + +```bash linenums="1" +python3 -m paddle.distributed.launch --ips="xx.xx.xx.xx,xx.xx.xx.xx" --gpus '0,1,2,3' tools/train.py -c configs/rec/rec_icdar15_train.yml \ + -o Global.pretrained_model=./pretrain_models/rec_mv3_none_bilstm_ctc_v2.0_train +``` + +**Note:** (1) When using multi-machine and multi-gpu training, you need to replace the ips value in the above command with the address of your machine, and the machines need to be able to ping each other. (2) Training needs to be launched separately on multiple machines. The command to view the ip address of the machine is `ifconfig`. (3) For more details about the distributed training speedup ratio, please refer to [Distributed Training Tutorial](../blog/distributed_training.en.md). + +### 2.6 Training with Knowledge Distillation + +Knowledge distillation is supported in PaddleOCR for text recognition training process. For more details, please refer to [doc](../model_compress/knowledge_distillation.en.md). + +### 2.7 Multi-language Training + +Currently, the multi-language algorithms supported by PaddleOCR are: + +| Configuration file | Algorithm name | backbone | trans | seq | pred | language | +| :--------: | :-------: | :-------: | :-------: | :-----: | :-----: | :-----: | +| rec_chinese_cht_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | chinese traditional | +| rec_en_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | English(Case sensitive) | +| rec_french_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | French | +| rec_ger_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | German | +| rec_japan_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | Japanese | +| rec_korean_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | Korean | +| rec_latin_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | Latin | +| rec_arabic_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | arabic | +| rec_cyrillic_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | cyrillic | +| rec_devanagari_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | devanagari | + +For more supported languages, please refer to : [Multi-language model](../blog/multi_languages.en.md) + +If you want to finetune on the basis of the existing model effect, please refer to the following instructions to modify the configuration file: + +Take `rec_french_lite_train` as an example: + +```yaml linenums="1" +Global: + ... + # Add a custom dictionary, such as modify the dictionary, please point the path to the new dictionary + character_dict_path: ./ppocr/utils/dict/french_dict.txt + ... 
+  # Whether to recognize spaces
+  use_space_char: True
+
+...
+
+Train:
+  dataset:
+    # Type of dataset,we support LMDBDataSet and SimpleDataSet
+    name: SimpleDataSet
+    # Path of dataset
+    data_dir: ./train_data/
+    # Path of train list
+    label_file_list: ["./train_data/french_train.txt"]
+    ...
+
+Eval:
+  dataset:
+    # Type of dataset,we support LMDBDataSet and SimpleDataSet
+    name: SimpleDataSet
+    # Path of dataset
+    data_dir: ./train_data
+    # Path of eval list
+    label_file_list: ["./train_data/french_val.txt"]
+    ...
+```
+
+### 2.8 Training on Other Platforms (Windows/macOS/Linux DCU)
+
+- Windows GPU/CPU
+The Windows platform differs slightly from Linux:
+it only supports single-GPU training and inference; specify the GPU for training with `set CUDA_VISIBLE_DEVICES=0`.
+On Windows, DataLoader only supports single-process mode, so `num_workers` must be set to 0.
+
+- macOS
+GPU mode is not supported; set `use_gpu` to False in the configuration file. The remaining training, evaluation, and prediction commands are exactly the same as for Linux GPU.
+
+- Linux DCU
+Running on a DCU device requires setting the environment variable `export HIP_VISIBLE_DEVICES=0,1,2,3`; the remaining training, evaluation, and prediction commands are exactly the same as for Linux GPU.
+
+### 2.9 Fine-tuning
+
+In practice, it is recommended to load the official pre-trained model and fine-tune it on your own dataset. For the fine-tuning method of the recognition model, please refer to: [Model Fine-tuning Tutorial](./finetune.en.md).
+
+## 3. Evaluation and Test
+
+### 3.1 Evaluation
+
+The model parameters during training are saved in the `Global.save_model_dir` directory by default. When evaluating metrics, set `Global.checkpoints` to point to the saved parameter file. The evaluation dataset can be set by modifying the `Eval.dataset.label_file_list` field in the `configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml` file.
+
+```bash linenums="1"
+# GPU evaluation, Global.checkpoints is the weight to be tested
+python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml -o Global.checkpoints={path/to/weights}/best_accuracy
+```
+
+### 3.2 Test
+
+Using a model trained with PaddleOCR, you can quickly run prediction with the following script.
+
+The default prediction image is stored in `infer_img`, and the trained weights are specified via `-o Global.pretrained_model`:
+
+According to the `save_model_dir` and `save_epoch_step` fields set in the configuration file, the following parameters will be saved:
+
+```text linenums="1"
+output/rec/
+├── best_accuracy.pdopt
+├── best_accuracy.pdparams
+├── best_accuracy.states
+├── config.yml
+├── iter_epoch_3.pdopt
+├── iter_epoch_3.pdparams
+├── iter_epoch_3.states
+├── latest.pdopt
+├── latest.pdparams
+├── latest.states
+└── train.log
+```
+
+Among them, best_accuracy.* is the best model on the evaluation set; iter_epoch_x.* are the models saved at intervals of `save_epoch_step`; latest.* is the model from the last epoch.
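+
+Any of these checkpoints can be used for prediction by passing its path prefix, i.e. the file name without the `.pdparams`/`.pdopt`/`.states` suffix. For instance, to test an intermediate checkpoint instead of the best one (a sketch; the output directory follows the listing above):
+
+```bash linenums="1"
+python3 tools/infer_rec.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml -o Global.pretrained_model=./output/rec/iter_epoch_3 Global.infer_img=doc/imgs_words/en/word_1.png
+```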
+ +```bash linenums="1" +# Predict English results +python3 tools/infer_rec.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +``` + +Input image: + +![img](./images/word_1-20240704092705543.png) + +Get the prediction result of the input image: + +```bash linenums="1" +infer_img: doc/imgs_words/en/word_1.png + result: ('joint', 0.9998967) +``` + +The configuration file used for prediction must be consistent with the training. For example, you completed the training of the Chinese model with `python3 tools/train.py -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml`, you can use the following command to predict the Chinese model: + +```bash linenums="1" +# Predict Chinese results +python3 tools/infer_rec.py -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/ch/word_1.jpg +``` + +Input image: + +![img](./images/word_1-20240704092713071.jpg) + +Get the prediction result of the input image: + +```bash linenums="1" +infer_img: doc/imgs_words/ch/word_1.jpg + result: ('韩国小馆', 0.997218) +``` + +## 4. Inference + +The inference model (the model saved by `paddle.jit.save`) is generally a solidified model saved after the model training is completed, and is mostly used to give prediction in deployment. + +The model saved during the training process is the checkpoints model, which saves the parameters of the model and is mostly used to resume training. + +Compared with the checkpoints model, the inference model will additionally save the structural information of the model. Therefore, it is easier to deploy because the model structure and model parameters are already solidified in the inference model file, and is suitable for integration with actual systems. + +The recognition model is converted to the inference model in the same way as the detection, as follows: + +```bash linenums="1" +# -c Set the training algorithm yml configuration file +# -o Set optional parameters +# Global.pretrained_model parameter Set the training model address to be converted without adding the file suffix .pdmodel, .pdopt or .pdparams. +# Global.save_inference_dir Set the address where the converted model will be saved. + +python3 tools/export_model.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml -o Global.pretrained_model=en_PP-OCRv3_rec_train/best_accuracy Global.save_inference_dir=./inference/en_PP-OCRv3_rec/ +``` + +If you have a model trained on your own dataset with a different dictionary file, please make sure that you modify the `character_dict_path` in the configuration file to your dictionary file path. 
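+
+For example, the relevant `Global` fields would be adjusted along these lines before running the export command (the dictionary path below is a placeholder for your own file):
+
+```yaml linenums="1"
+Global:
+  ...
+  # point to the custom dictionary that was used during training
+  character_dict_path: ./train_data/your_dict.txt
+  # keep this consistent with training if the space category was enabled
+  use_space_char: True
+  ...
+```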
+ +After the conversion is successful, there are three files in the model save directory: + +```text linenums="1" +inference/en_PP-OCRv3_rec/ + ├── inference.pdiparams # The parameter file of recognition inference model + ├── inference.pdiparams.info # The parameter information of recognition inference model, which can be ignored + └── inference.pdmodel # The program file of recognition model +``` + +- Text recognition model Inference using custom characters dictionary + + If the text dictionary is modified during training, when using the inference model to predict, you need to specify the dictionary path used by `--rec_char_dict_path` + + ```bash linenums="1" + python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./your inference model" --rec_image_shape="3, 32, 100" --rec_char_dict_path="your text dict path" + ``` + +## 5. FAQ + +Q1: After the training model is transferred to the inference model, the prediction effect is inconsistent? + +**A**: There are many such problems, and the problems are mostly caused by inconsistent preprocessing and postprocessing parameters when the trained model predicts and the preprocessing and postprocessing parameters when the inference model predicts. You can compare whether there are differences in preprocessing, postprocessing, and prediction in the configuration files used for training. diff --git a/docs/ppocr/model_train/recognition.md b/docs/ppocr/model_train/recognition.md new file mode 100644 index 0000000000..893b07a127 --- /dev/null +++ b/docs/ppocr/model_train/recognition.md @@ -0,0 +1,568 @@ +--- +comments: true +typora-copy-images-to: images +--- + +# 文字识别 + +本文提供了PaddleOCR文本识别任务的全流程指南,包括数据准备、模型训练、调优、评估、预测,各个阶段的详细说明: + +## 1. 数据准备 + +### 1.1. 准备数据集 + +PaddleOCR 支持两种数据格式: + +- `lmdb` 用于训练以lmdb格式存储的数据集(LMDBDataSet); +- `通用数据` 用于训练以文本文件存储的数据集(SimpleDataSet); + +训练数据的默认存储路径是 `PaddleOCR/train_data`,如果您的磁盘上已有数据集,只需创建软链接至数据集目录: + +```bash linenums="1" +# linux and mac os +ln -sf /train_data/dataset +# windows +mklink /d /train_data/dataset +``` + +### 1.2. 自定义数据集 + +下面以通用数据集为例, 介绍如何准备数据集: + +- 训练集 + +建议将训练图片放入同一个文件夹,并用一个txt文件(rec_gt_train.txt)记录图片路径和标签,txt文件里的内容如下: + +**注意:** txt文件中默认请将图片路径和图片标签用 \t 分割,如用其他方式分割将造成训练报错。 + +```text linenums="1" +" 图像文件名 图像标注信息 " + +train_data/rec/train/word_001.jpg 简单可依赖 +train_data/rec/train/word_002.jpg 用科技让复杂的世界更简单 +... +``` + +最终训练集应有如下文件结构: + +```text linenums="1" +|-train_data + |-rec + |- rec_gt_train.txt + |- train + |- word_001.png + |- word_002.jpg + |- word_003.jpg + | ... +``` + +除上述单张图像为一行格式之外,PaddleOCR也支持对离线增广后的数据进行训练,为了防止相同样本在同一个batch中被多次采样,我们可以将相同标签对应的图片路径写在一行中,以列表的形式给出,在训练中,PaddleOCR会随机选择列表中的一张图片进行训练。对应地,标注文件的格式如下: + +```text linenums="1" +["11.jpg", "12.jpg"] 简单可依赖 +["21.jpg", "22.jpg", "23.jpg"] 用科技让复杂的世界更简单 +3.jpg ocr +``` + +上述示例标注文件中,"11.jpg"和"12.jpg"的标签相同,都是`简单可依赖`,在训练的时候,对于该行标注,会随机选择其中的一张图片进行训练。 + +- 验证集 + +同训练集类似,验证集也需要提供一个包含所有图片的文件夹(test)和一个rec_gt_test.txt,验证集的结构如下所示: + +```text linenums="1" +|-train_data + |-rec + |- rec_gt_test.txt + |- test + |- word_001.jpg + |- word_002.jpg + |- word_003.jpg + | ... +``` + +### 1.3. 
数据下载 + +- ICDAR2015 + +若您本地没有数据集,可以在官网下载 [ICDAR2015](http://rrc.cvc.uab.es/?ch=4&com=downloads) 数据,用于快速验证。也可以参考[DTRB](https://github.com/clovaai/deep-text-recognition-benchmark#download-lmdb-dataset-for-traininig-and-evaluation-from-here) ,下载 benchmark 所需的lmdb格式数据集。 + +如果你使用的是icdar2015的公开数据集,PaddleOCR 提供了一份用于训练 ICDAR2015 数据集的标签文件,通过以下方式下载: + +```bash linenums="1" +# 训练集标签 +wget -P ./train_data/ic15_data https://paddleocr.bj.bcebos.com/dataset/rec_gt_train.txt +# 测试集标签 +wget -P ./train_data/ic15_data https://paddleocr.bj.bcebos.com/dataset/rec_gt_test.txt +``` + +PaddleOCR 也提供了数据格式转换脚本,可以将ICDAR官网 label 转换为PaddleOCR支持的数据格式。 数据转换工具在 `ppocr/utils/gen_label.py`, 这里以训练集为例: + +```bash linenums="1" +# 将官网下载的标签文件转换为 rec_gt_label.txt +python gen_label.py --mode="rec" --input_path="{path/of/origin/label}" --output_label="rec_gt_label.txt" +``` + +数据样式格式如下,(a)为原始图片,(b)为每张图片对应的 Ground Truth 文本文件: +![img](./images/icdar_rec.png) + +- 多语言数据集 + +多语言模型的训练数据集均为100w的合成数据,使用了开源合成工具 [text_renderer](https://github.com/Sanster/text_renderer) ,少量的字体可以通过下面两种方式下载。 + +- [百度网盘](https://pan.baidu.com/s/1bS_u207Rm7YbY33wOECKDA) 提取码:frgi +- [google drive](https://drive.google.com/file/d/18cSWX7wXSy4G0tbKJ0d9PuIaiwRLHpjA/view) + +### 1.4. 字典 + +最后需要提供一个字典({word_dict_name}.txt),使模型在训练时,可以将所有出现的字符映射为字典的索引。 + +因此字典需要包含所有希望被正确识别的字符,{word_dict_name}.txt需要写成如下格式,并以 `utf-8` 编码格式保存: + +```text linenums="1" +l +d +a +d +r +n +``` + +word_dict.txt 每行有一个单字,将字符与数字索引映射在一起,“and” 将被映射成 [2 5 1] + +- 内置字典 + +PaddleOCR内置了一部分字典,可以按需使用。 + +`ppocr/utils/ppocr_keys_v1.txt` 是一个包含6623个字符的中文字典 + +`ppocr/utils/ic15_dict.txt` 是一个包含36个字符的英文字典 + +`ppocr/utils/dict/french_dict.txt` 是一个包含118个字符的法文字典 + +`ppocr/utils/dict/japan_dict.txt` 是一个包含4399个字符的日文字典 + +`ppocr/utils/dict/korean_dict.txt` 是一个包含3636个字符的韩文字典 + +`ppocr/utils/dict/german_dict.txt` 是一个包含131个字符的德文字典 + +`ppocr/utils/en_dict.txt` 是一个包含96个字符的英文字典 + +目前的多语言模型仍处在demo阶段,会持续优化模型并补充语种,**非常欢迎您为我们提供其他语言的字典和字体**, +如您愿意可将字典文件提交至 [dict](../../ppocr/utils/dict),我们会在Repo中感谢您。 + +- 自定义字典 + +如需自定义dic文件,请在 `configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml` 中添加 `character_dict_path` 字段, 指向您的字典路径。 + +### 1.5. 添加空格类别 + +如果希望支持识别"空格"类别, 请将yml文件中的 `use_space_char` 字段设置为 `True`。 + +### 1.6. 数据增强 + +PaddleOCR提供了多种数据增强方式,默认配置文件中已经添加了数据增广。 + +默认的扰动方式有:颜色空间转换(cvtColor)、模糊(blur)、抖动(jitter)、噪声(Gasuss noise)、随机切割(random crop)、透视(perspective)、颜色反转(reverse)、TIA数据增广。 + +训练过程中每种扰动方式以40%的概率被选择,具体代码实现请参考:[rec_img_aug.py](../../ppocr/data/imaug/rec_img_aug.py) + +*由于OpenCV的兼容性问题,扰动操作暂时只支持Linux* + +## 2. 开始训练 + +PaddleOCR提供了训练脚本、评估脚本和预测脚本,本节将以 PP-OCRv3 英文识别模型为例: + +### 2.1. 
启动训练 + +首先下载pretrain model,您可以下载训练好的模型在 icdar2015 数据上进行finetune + +```bash linenums="1" +cd PaddleOCR/ +# 下载英文PP-OCRv3的预训练模型 +wget -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar +# 解压模型参数 +cd pretrain_models +tar -xf en_PP-OCRv3_rec_train.tar && rm -rf en_PP-OCRv3_rec_train.tar +``` + +开始训练: + +*如果您安装的是cpu版本,请将配置文件中的 `use_gpu` 字段修改为false* + +```bash linenums="1" +# GPU训练 支持单卡,多卡训练 +# 训练icdar15英文数据 训练日志会自动保存为 "{save_model_dir}" 下的train.log + +#单卡训练(训练周期长,不建议) +python3 tools/train.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml -o Global.pretrained_model=./pretrain_models/en_PP-OCRv3_rec_train/best_accuracy + +# 多卡训练,通过--gpus参数指定卡号 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml -o Global.pretrained_model=./pretrain_models/en_PP-OCRv3_rec_train/best_accuracy +``` + +正常启动训练后,会看到以下log输出: + +```bash linenums="1" +[2022/02/22 07:58:05] root INFO: epoch: [1/800], iter: 10, lr: 0.000000, loss: 0.754281, acc: 0.000000, norm_edit_dis: 0.000008, reader_cost: 0.55541 s, batch_cost: 0.91654 s, samples: 1408, ips: 153.62133 +[2022/02/22 07:58:13] root INFO: epoch: [1/800], iter: 20, lr: 0.000001, loss: 0.924677, acc: 0.000000, norm_edit_dis: 0.000008, reader_cost: 0.00236 s, batch_cost: 0.28528 s, samples: 1280, ips: 448.68599 +[2022/02/22 07:58:23] root INFO: epoch: [1/800], iter: 30, lr: 0.000002, loss: 0.967231, acc: 0.000000, norm_edit_dis: 0.000008, reader_cost: 0.14527 s, batch_cost: 0.42714 s, samples: 1280, ips: 299.66507 +[2022/02/22 07:58:31] root INFO: epoch: [1/800], iter: 40, lr: 0.000003, loss: 0.895318, acc: 0.000000, norm_edit_dis: 0.000008, reader_cost: 0.00173 s, batch_cost: 0.27719 s, samples: 1280, ips: 461.77252 +``` + +log 中自动打印如下信息: + +| 字段 | 含义 | +| :----: | :------: | +| epoch | 当前迭代轮次 | +| iter | 当前迭代次数 | +| lr | 当前学习率 | +| loss | 当前损失函数 | +| acc | 当前batch的准确率 | +| norm_edit_dis | 当前 batch 的编辑距离 | +| reader_cost | 当前 batch 数据处理耗时 | +| batch_cost | 当前 batch 总耗时 | +| samples | 当前 batch 内的样本数 | +| ips | 每秒处理图片的数量 | + +PaddleOCR支持训练和评估交替进行, 可以在 `configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml` 中修改 `eval_batch_step` 设置评估频率,默认每500个iter评估一次。评估过程中默认将最佳acc模型,保存为 `output/en_PP-OCRv3_rec/best_accuracy` 。 + +如果验证集很大,测试将会比较耗时,建议减少评估次数,或训练完再进行评估。 + +**提示:** 可通过 -c 参数选择 `configs/rec/` 路径下的多种模型配置进行训练,PaddleOCR支持的识别算法可以参考[前沿算法列表](../../algorithm/overview.md): + +训练中文数据,推荐使用[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml),如您希望尝试其他算法在中文数据集上的效果,请参考下列说明修改配置文件: + +以 `ch_PP-OCRv3_rec_distillation.yml` 为例: + +```yaml linenums="1" +Global: + ... + # 添加自定义字典,如修改字典请将路径指向新字典 + character_dict_path: ppocr/utils/ppocr_keys_v1.txt + ... + # 识别空格 + use_space_char: True + + +Optimizer: + ... + # 添加学习率衰减策略 + lr: + name: Cosine + learning_rate: 0.001 + ... + +... + +Train: + dataset: + # 数据集格式,支持LMDBDataSet以及SimpleDataSet + name: SimpleDataSet + # 数据集路径 + data_dir: ./train_data/ + # 训练集标签文件 + label_file_list: ["./train_data/train_list.txt"] + transforms: + ... + - RecResizeImg: + # 修改 image_shape 以适应长文本 + image_shape: [3, 48, 320] + ... + loader: + ... + # 单卡训练的batch_size + batch_size_per_card: 256 + ... + +Eval: + dataset: + # 数据集格式,支持LMDBDataSet以及SimpleDataSet + name: SimpleDataSet + # 数据集路径 + data_dir: ./train_data + # 验证集标签文件 + label_file_list: ["./train_data/val_list.txt"] + transforms: + ... + - RecResizeImg: + # 修改 image_shape 以适应长文本 + image_shape: [3, 48, 320] + ... + loader: + # 单卡验证的batch_size + batch_size_per_card: 256 + ... 
+``` + +**注意,预测/评估时的配置文件请务必与训练一致。** + +### 2.2. 断点训练 + +如果训练程序中断,如果希望加载训练中断的模型从而恢复训练,可以通过指定Global.checkpoints指定要加载的模型路径: + +```bash linenums="1" +python3 tools/train.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml -o Global.checkpoints=./your/trained/model +``` + +**注意**:`Global.checkpoints`的优先级高于`Global.pretrained_model`的优先级,即同时指定两个参数时,优先加载`Global.checkpoints`指定的模型,如果`Global.checkpoints`指定的模型路径有误,会加载`Global.pretrained_model`指定的模型。 + +### 2.3. 更换Backbone 训练 + +PaddleOCR将网络划分为四部分,分别在[ppocr/modeling](../../ppocr/modeling)下。 进入网络的数据将按照顺序(transforms->backbones->necks->heads)依次通过这四个部分。 + +```bash linenums="1" +├── architectures # 网络的组网代码 +├── transforms # 网络的图像变换模块 +├── backbones # 网络的特征提取模块 +├── necks # 网络的特征增强模块 +└── heads # 网络的输出模块 +``` + +如果要更换的Backbone 在PaddleOCR中有对应实现,直接修改配置yml文件中`Backbone`部分的参数即可。 + +如果要使用新的Backbone,更换backbones的例子如下: + +1. 在 [ppocr/modeling/backbones](../../ppocr/modeling/backbones) 文件夹下新建文件,如my_backbone.py。 +2. 在 my_backbone.py 文件内添加相关代码,示例代码如下: + +```python linenums="1" +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class MyBackbone(nn.Layer): + def __init__(self, *args, **kwargs): + super(MyBackbone, self).__init__() + # your init code + self.conv = nn.xxxx + + def forward(self, inputs): + # your network forward + y = self.conv(inputs) + return y +``` + +3. 在 [ppocr/modeling/backbones/\_*init\_*.py](../../ppocr/modeling/backbones/__init__.py)文件内导入添加的`MyBackbone`模块,然后修改配置文件中Backbone进行配置即可使用,格式如下: + +```yaml linenums="1" +Backbone: +name: MyBackbone +args1: args1 +``` + +**注意**:如果要更换网络的其他模块,可以参考[文档](../../algorithm/add_new_algorithm.md)。 + +### 2.4. 混合精度训练 + +如果您想进一步加快训练速度,可以使用[自动混合精度训练](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/01_paddle2.0_introduction/basic_concept/amp_cn.html), 以单机单卡为例,命令如下: + +```bash linenums="1" +python3 tools/train.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml \ + -o Global.pretrained_model=./pretrain_models/en_PP-OCRv3_rec_train/best_accuracy \ + Global.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True +``` + +### 2.5. 分布式训练 + +多机多卡训练时,通过 `--ips` 参数设置使用的机器IP地址,通过 `--gpus` 参数设置使用的GPU ID: + +```bash linenums="1" +python3 -m paddle.distributed.launch --ips="xx.xx.xx.xx,xx.xx.xx.xx" --gpus '0,1,2,3' tools/train.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml \ + -o Global.pretrained_model=./pretrain_models/en_PP-OCRv3_rec_train/best_accuracy +``` + +**注意:** (1)采用多机多卡训练时,需要替换上面命令中的ips值为您机器的地址,机器之间需要能够相互ping通;(2)训练时需要在多个机器上分别启动命令。查看机器ip地址的命令为`ifconfig`;(3)更多关于分布式训练的性能优势等信息,请参考:[分布式训练教程](../blog/distributed_training.md)。 + +### 2.6. 知识蒸馏训练 + +PaddleOCR支持了基于知识蒸馏的文本识别模型训练过程,更多内容可以参考[知识蒸馏说明文档](../model_compress/knowledge_distillation.md)。 + +### 2.7. 
多语言模型训练 + +PaddleOCR目前已支持80种(除中文外)语种识别,`configs/rec/multi_languages` 路径下提供了一个多语言的配置文件模版: [rec_multi_language_lite_train.yml](../../configs/rec/multi_language/rec_multi_language_lite_train.yml)。 + +按语系划分,目前PaddleOCR支持的语种有: + +| 配置文件 | 算法名称 | backbone | trans | seq | pred | language | +| :--------: | :-------: | :-------: | :-------: | :-----: | :-----: | :-----: | +| rec_chinese_cht_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 中文繁体 | +| rec_en_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 英语(区分大小写) | +| rec_french_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 法语 | +| rec_ger_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 德语 | +| rec_japan_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 日语 | +| rec_korean_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 韩语 | +| rec_latin_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 拉丁字母 | +| rec_arabic_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 阿拉伯字母 | +| rec_cyrillic_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 斯拉夫字母 | +| rec_devanagari_lite_train.yml | CRNN | Mobilenet_v3 small 0.5 | None | BiLSTM | ctc | 梵文字母 | + +更多支持语种请参考: [多语言模型](../blog/multi_languages.md) + +如您希望在现有模型效果的基础上调优,请参考下列说明修改配置文件: + +以 `rec_french_lite_train` 为例: + +```yaml linenums="1" +Global: + ... + # 添加自定义字典,如修改字典请将路径指向新字典 + character_dict_path: ./ppocr/utils/dict/french_dict.txt + ... + # 识别空格 + use_space_char: True + +... + +Train: + dataset: + # 数据集格式,支持LMDBDataSet以及SimpleDataSet + name: SimpleDataSet + # 数据集路径 + data_dir: ./train_data/ + # 训练集标签文件 + label_file_list: ["./train_data/french_train.txt"] + ... + +Eval: + dataset: + # 数据集格式,支持LMDBDataSet以及SimpleDataSet + name: SimpleDataSet + # 数据集路径 + data_dir: ./train_data + # 验证集标签文件 + label_file_list: ["./train_data/french_val.txt"] + ... +``` + +### 2.8. 其他训练环境 + +- Windows GPU/CPU +在Windows平台上与Linux平台略有不同: +Windows平台只支持`单卡`的训练与预测,指定GPU进行训练`set CUDA_VISIBLE_DEVICES=0` +在Windows平台,DataLoader只支持单进程模式,因此需要设置 `num_workers` 为0; + +- macOS +不支持GPU模式,需要在配置文件中设置`use_gpu`为False,其余训练评估预测命令与Linux GPU完全相同。 + +- Linux DCU +DCU设备上运行需要设置环境变量 `export HIP_VISIBLE_DEVICES=0,1,2,3`,其余训练评估预测命令与Linux GPU完全相同。 + +### 2.9 模型微调 + +实际使用过程中,建议加载官方提供的预训练模型,在自己的数据集中进行微调,关于识别模型的微调方法,请参考:[模型微调教程](./finetune.md)。 + +## 3. 模型评估与预测 + +### 3.1. 指标评估 + +训练中模型参数默认保存在`Global.save_model_dir`目录下。在评估指标时,需要设置`Global.checkpoints`指向保存的参数文件。评估数据集可以通过 `configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml` 修改Eval中的 `label_file_path` 设置。 + +```bash linenums="1" +# GPU 评估, Global.checkpoints 为待测权重 +python3 -m paddle.distributed.launch --gpus '0' tools/eval.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml -o Global.checkpoints={path/to/weights}/best_accuracy +``` + +### 3.2. 
测试识别效果 + +使用 PaddleOCR 训练好的模型,可以通过以下脚本进行快速预测。 + +默认预测图片存储在 `infer_img` 里,通过 `-o Global.checkpoints` 加载训练好的参数文件: + +根据配置文件中设置的 `save_model_dir` 和 `save_epoch_step` 字段,会有以下几种参数被保存下来: + +```text linenums="1" +output/rec/ +├── best_accuracy.pdopt +├── best_accuracy.pdparams +├── best_accuracy.states +├── config.yml +├── iter_epoch_3.pdopt +├── iter_epoch_3.pdparams +├── iter_epoch_3.states +├── latest.pdopt +├── latest.pdparams +├── latest.states +└── train.log +``` + +其中 best_accuracy.*是评估集上的最优模型;iter_epoch_x.* 是以 `save_epoch_step` 为间隔保存下来的模型;latest.* 是最后一个epoch的模型。 + +```bash linenums="1" +# 预测英文结果 +python3 tools/infer_rec.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/en/word_1.png +``` + +预测图片: + +![img](./images/word_1-20240704092705543.png) + +得到输入图像的预测结果: + +```bash linenums="1" +infer_img: doc/imgs_words/en/word_1.png + result: ('joint', 0.9998967) +``` + +预测使用的配置文件必须与训练一致,如您通过 `python3 tools/train.py -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml` 完成了中文模型的训练, +您可以使用如下命令进行中文模型预测。 + +```bash linenums="1" +# 预测中文结果 +python3 tools/infer_rec.py -c configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.infer_img=doc/imgs_words/ch/word_1.jpg +``` + +预测图片: + +![img](./images/word_1-20240704092713071.jpg) + +得到输入图像的预测结果: + +```bash linenums="1" +infer_img: doc/imgs_words/ch/word_1.jpg + result: ('韩国小馆', 0.997218) +``` + +## 4. 模型导出与预测 + +inference 模型(`paddle.jit.save`保存的模型) +一般是模型训练,把模型结构和模型参数保存在文件中的固化模型,多用于预测部署场景。 +训练过程中保存的模型是checkpoints模型,保存的只有模型的参数,多用于恢复训练等。 +与checkpoints模型相比,inference 模型会额外保存模型的结构信息,在预测部署、加速推理上性能优越,灵活方便,适合于实际系统集成。 + +识别模型转inference模型与检测的方式相同,如下: + +```bash linenums="1" +# -c 后面设置训练算法的yml配置文件 +# -o 配置可选参数 +# Global.pretrained_model 参数设置待转换的训练模型地址,不用添加文件后缀 .pdmodel,.pdopt或.pdparams。 +# Global.save_inference_dir参数设置转换的模型将保存的地址。 + +python3 tools/export_model.py -c configs/rec/PP-OCRv3/en_PP-OCRv3_rec.yml -o Global.pretrained_model=./pretrain_models/en_PP-OCRv3_rec_train/best_accuracy Global.save_inference_dir=./inference/en_PP-OCRv3_rec/ +``` + +**注意:**如果您是在自己的数据集上训练的模型,并且调整了中文字符的字典文件,请注意修改配置文件中的`character_dict_path`为自定义字典文件。 + +转换成功后,在目录下有三个文件: + +```text linenums="1" +inference/en_PP-OCRv3_rec/ + ├── inference.pdiparams # 识别inference模型的参数文件 + ├── inference.pdiparams.info # 识别inference模型的参数信息,可忽略 + └── inference.pdmodel # 识别inference模型的program文件 +``` + +- 自定义模型推理 + + 如果训练时修改了文本的字典,在使用inference模型预测时,需要通过`--rec_char_dict_path`指定使用的字典路径,更多关于推理超参数的配置与解释,请参考:[模型推理超参数解释教程](../blog/inference_args.md)。 + + ```bash linenums="1" + python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words_en/word_336.png" --rec_model_dir="./your inference model" --rec_image_shape="3, 48, 320" --rec_char_dict_path="your text dict path" + ``` + +## 5. FAQ + +Q1: 训练模型转inference 模型之后预测效果不一致? + +**A**:此类问题出现较多,问题多是trained model预测时候的预处理、后处理参数和inference model预测的时候的预处理、后处理参数不一致导致的。可以对比训练使用的配置文件中的预处理、后处理和预测时是否存在差异。 diff --git a/docs/ppocr/model_train/training.en.md b/docs/ppocr/model_train/training.en.md new file mode 100644 index 0000000000..88f12a2d9e --- /dev/null +++ b/docs/ppocr/model_train/training.en.md @@ -0,0 +1,132 @@ +--- +comments: true +--- + +# Model Training + +This article will introduce the basic concepts that is necessary for model training and tuning. + +At the same time, it will briefly introduce the structure of the training data and how to prepare the data to fine-tune model in vertical scenes. + +## 1. 
Yml Configuration
+
+PaddleOCR uses configuration files to control network training and evaluation parameters. In a configuration file, you can set the model, optimizer, loss function, and pre- and post-processing parameters. PaddleOCR reads these parameters and builds the complete training process from them, so a model can also be fine-tuned simply by modifying parameters in the configuration file, which is simple and convenient.
+
+For the complete configuration file description, please refer to [Configuration File](../blog/config.en.md).
+
+## 2. Basic Concepts
+
+During training, some hyper-parameters can be specified manually to obtain the best result at the lowest cost. Different data volumes may require different hyper-parameters. When you want to fine-tune a model on your own data, the following parameter-tuning strategies are worth considering:
+
+### 2.1 Learning Rate
+
+The learning rate is one of the most important hyper-parameters for training neural networks. It controls the step size taken toward the minimum of the loss function at each iteration. PaddleOCR provides a variety of learning-rate update strategies, which can be specified in the configuration file. For example,
+
+```yaml linenums="1"
+Optimizer:
+  ...
+  lr:
+    name: Piecewise
+    decay_epochs : [700, 800]
+    values : [0.001, 0.0001]
+    warmup_epoch: 5
+```
+
+`Piecewise` stands for piecewise constant decay: different learning rates are specified for different learning stages, and the learning rate stays the same within each stage.
+
+`warmup_epoch` means that in the first 5 epochs, the learning rate is increased gradually from 0 to base_lr. For all strategies, please refer to the code in [learning_rate.py](../../ppocr/optimizer/learning_rate.py).
+
+### 2.2 Regularization
+
+Regularization can effectively prevent over-fitting. PaddleOCR provides the two most widely used methods, L1 and L2 regularization. L1 regularization adds a term to the objective function that penalizes the sum of the absolute values of the parameters, while L2 regularization penalizes the sum of their squares. The configuration method is as follows:
+
+```yaml linenums="1"
+Optimizer:
+  ...
+  regularizer:
+    name: L2
+    factor: 2.0e-05
+```
+
+### 2.3 Evaluation Indicators
+
+(1) Detection stage: evaluation is based on the IoU between the detection box and the labeled box; if the IoU exceeds a certain threshold, the detection is judged correct. Note that, unlike boxes in general object detection, the detection and label boxes here are represented by polygons. Detection precision: the proportion of correct detection boxes among all detection boxes, which mainly measures detection quality. Detection recall: the proportion of correct detection boxes among all labeled boxes, which mainly measures missed detections.
+
+(2) Recognition stage: character recognition accuracy, i.e. the ratio of correctly recognized text lines to the number of labeled text lines. A text line counts as correct only if the entire line is recognized correctly.
+ +(3) End-to-end statistics: End-to-end recall rate: accurately detect and correctly identify the proportion of text lines in all labeled text lines; End-to-end accuracy rate: accurately detect and correctly identify the number of text lines in the detected text lines The standard for accurate detection is that the IOU of the detection box and the labeled box is greater than a certain threshold, and the text in the correctly identified detection box is the same as the labeled text. + +## 3. Data and Vertical Scenes + +### 3.1 Training Data + +The current open source models, data sets and magnitudes are as follows: + +- Detection: + - English data set, ICDAR2015 + - Chinese data set, LSVT street view data set training data 3w pictures + +- Identification: + - English data set, MJSynth and SynthText synthetic data, the data volume is tens of millions. + - Chinese data set, LSVT street view data set crops the image according to the truth value, and performs position calibration, a total of 30w images. In addition, based on the LSVT corpus, 500w of synthesized data. + - Small language data set, using different corpora and fonts, respectively generated 100w synthetic data set, and using ICDAR-MLT as the verification set. + +Among them, the public data sets are all open source, users can search and download by themselves, or refer to [Chinese data set](../../datasets/datasets.en.md), synthetic data is not open source, users can use open source synthesis tools to synthesize by themselves. Synthesis tools include [text_renderer](https://github.com/Sanster/text_renderer), [SynthText](https://github.com/ankush-me/SynthText), [TextRecognitionDataGenerator](https://github.com/Belval/TextRecognitionDataGenerator) etc. + +### 3.2 Vertical Scene + +PaddleOCR mainly focuses on general OCR. If you have vertical requirements, you can use PaddleOCR + vertical data to train yourself; +If there is a lack of labeled data, or if you do not want to invest in research and development costs, it is recommended to directly call the open API, which covers some of the more common vertical categories. + +### 3.3 Build Your Own Dataset + +There are several experiences for reference when constructing the data set: + +(1) The amount of data in the training set: + +a. The data required for detection is relatively small. For Fine-tune based on the PaddleOCR model, 500 sheets are generally required to achieve good results. + +b. Recognition is divided into English and Chinese. Generally, English scenarios require hundreds of thousands of data to achieve good results, while Chinese requires several million or more. + +(2) When the amount of training data is small, you can try the following three ways to get more data: + +a. Manually collect more training data, the most direct and effective way. + +b. Basic image processing or transformation based on PIL and opencv. For example, the three modules of ImageFont, Image, ImageDraw in PIL write text into the background, opencv's rotating affine transformation, Gaussian filtering and so on. + +c. Use data generation algorithms to synthesize data, such as algorithms such as pix2pix. + +## 4. FAQ + +**Q**: How to choose a suitable network input shape when training CRNN recognition? + + A: The general height is 32, the longest width is selected, there are two methods: + + (1) Calculate the aspect ratio distribution of training sample images. The selection of the maximum aspect ratio considers 80% of the training samples. + + (2) Count the number of texts in training samples. 
The selection of the longest number of characters considers the training sample that satisfies 80%. Then the aspect ratio of Chinese characters is approximately considered to be 1, and that of English is 3:1, and the longest width is estimated. + +**Q**: During the recognition training, the accuracy of the training set has reached 90, but the accuracy of the verification set has been kept at 70, what should I do? + + A: If the accuracy of the training set is 90 and the test set is more than 70, it should be over-fitting. There are two methods to try: + + (1) Add more augmentation methods or increase the [probability] of augmented prob (https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppocr/data/imaug/rec_img_aug.py#L341), The default is 0.4. + + (2) Increase the [l2 dcay value] of the system (https://github.com/PaddlePaddle/PaddleOCR/blob/a501603d54ff5513fc4fc760319472e59da25424/configs/rec/ch_ppocr_v1.1/rec_chinese_lite_train_v1.1.yml#L47) + +**Q**: When the recognition model is trained, loss can drop normally, but acc is always 0 + + A: It is normal for the acc to be 0 at the beginning of the recognition model training, and the indicator will come up after a longer training period. + +*** + +Click the following links for detailed training tutorial: + +- [text detection model training](./detection.en.md) +- [text recognition model training](./recognition.en.md) +- [text direction classification model training](./angle_class.en.md) diff --git a/docs/ppocr/model_train/training.md b/docs/ppocr/model_train/training.md new file mode 100644 index 0000000000..4ce0e2dffa --- /dev/null +++ b/docs/ppocr/model_train/training.md @@ -0,0 +1,128 @@ +--- +comments: true +--- + +# PP-OCR模型训练 + +本文将介绍模型训练时需掌握的基本概念,和训练时的调优方法。 + +同时会简单介绍PaddleOCR模型训练数据的组成部分,以及如何在垂类场景中准备数据finetune模型。 + +## 1. 配置文件说明 + +PaddleOCR模型使用配置文件管理网络训练、评估的参数。在配置文件中,可以设置组建模型、优化器、损失函数、模型前后处理的参数,PaddleOCR从配置文件中读取到这些参数,进而组建出完整的训练流程,完成模型训练,在需要对模型进行优化的时,可以通过修改配置文件中的参数完成配置,使用简单且方便修改。 + +完整的配置文件说明可以参考[配置文件](../blog/config.md) + +## 2. 基本概念 + +模型训练过程中需要手动调整一些超参数,帮助模型以最小的代价获得最优指标。不同的数据量可能需要不同的超参,当您希望在自己的数据上finetune或对模型效果调优时,有以下几个参数调整策略可供参考: + +### 2.1 学习率 + +学习率是训练神经网络的重要超参数之一,它代表在每一次迭代中梯度向损失函数最优解移动的步长。 +在PaddleOCR中提供了多种学习率更新策略,可以通过配置文件修改,例如: + +```yaml linenums="1" +Optimizer: + ... + lr: + name: Piecewise + decay_epochs : [700, 800] + values : [0.001, 0.0001] + warmup_epoch: 5 +``` + +Piecewise 代表分段常数衰减,在不同的学习阶段指定不同的学习率,在每段内学习率相同。 +warmup_epoch 代表在前5个epoch中,学习率将逐渐从0增加到base_lr。全部策略可以参考代码[learning_rate.py](../../ppocr/optimizer/learning_rate.py) 。 + +### 2.2 正则化 + +正则化可以有效的避免算法过拟合,PaddleOCR中提供了L1、L2正则方法,L1 和 L2 正则化是最常用的正则化方法。L1 正则化向目标函数添加正则化项,以减少参数的绝对值总和;而 L2 正则化中,添加正则化项的目的在于减少参数平方的总和。配置方法如下: + +```yaml linenums="1" +Optimizer: + ... + regularizer: + name: L2 + factor: 2.0e-05 +``` + +### 2.3 评估指标 + +(1)检测阶段:先按照检测框和标注框的IOU评估,IOU大于某个阈值判断为检测准确。这里检测框和标注框不同于一般的通用目标检测框,是采用多边形进行表示。检测准确率:正确的检测框个数在全部检测框的占比,主要是判断检测指标。检测召回率:正确的检测框个数在全部标注框的占比,主要是判断漏检的指标。 + +(2)识别阶段: 字符识别准确率,即正确识别的文本行占标注的文本行数量的比例,只有整行文本识别对才算正确识别。 + +(3)端到端统计: 端对端召回率:准确检测并正确识别文本行在全部标注文本行的占比; 端到端准确率:准确检测并正确识别文本行在 检测到的文本行数量 的占比; 准确检测的标准是检测框与标注框的IOU大于某个阈值,正确识别的检测框中的文本与标注的文本相同。 + +## 3. 
数据与垂类场景 + +### 3.1 训练数据 + +目前开源的模型,数据集和量级如下: + +- 检测: + - 英文数据集,ICDAR2015 + - 中文数据集,LSVT街景数据集训练数据3w张图片 + +- 识别: + - 英文数据集,MJSynth和SynthText合成数据,数据量上千万。 + - 中文数据集,LSVT街景数据集根据真值将图crop出来,并进行位置校准,总共30w张图像。此外基于LSVT的语料,合成数据500w。 + - 小语种数据集,使用不同语料和字体,分别生成了100w合成数据集,并使用ICDAR-MLT作为验证集。 + +其中,公开数据集都是开源的,用户可自行搜索下载,也可参考[中文数据集](../../datasets/datasets.md),合成数据暂不开源,用户可使用开源合成工具自行合成,可参考的合成工具包括[text_renderer](https://github.com/Sanster/text_renderer) 、[SynthText](https://github.com/ankush-me/SynthText) 、[TextRecognitionDataGenerator](https://github.com/Belval/TextRecognitionDataGenerator) 等。 + +### 3.2 垂类场景 + +PaddleOCR主要聚焦通用OCR,如果有垂类需求,您可以用PaddleOCR+垂类数据自己训练; +如果缺少带标注的数据,或者不想投入研发成本,建议直接调用开放的API,开放的API覆盖了目前比较常见的一些垂类。 + +### 3.3 自己构建数据集 + +在构建数据集时有几个经验可供参考: + +(1) 训练集的数据量: + +a. 检测需要的数据相对较少,在PaddleOCR模型的基础上进行Fine-tune,一般需要500张可达到不错的效果。 + +b. 识别分英文和中文,一般英文场景需要几十万数据可达到不错的效果,中文则需要几百万甚至更多。 + +(2)当训练数据量少时,可以尝试以下三种方式获取更多的数据: + +a. 人工采集更多的训练数据,最直接也是最有效的方式。 + +b. 基于PIL和opencv基本图像处理或者变换。例如PIL中ImageFont, Image, ImageDraw三个模块将文字写到背景中,opencv的旋转仿射变换,高斯滤波等。 + +c. 利用数据生成算法合成数据,例如pix2pix或[StyleText](https://github.com/PFCCLab/StyleText)等算法。 + +## 4. 常见问题 + +**Q**:训练CRNN识别时,如何选择合适的网络输入shape? + + A:一般高度采用32,最长宽度的选择,有两种方法: + + (1)统计训练样本图像的宽高比分布。最大宽高比的选取考虑满足80%的训练样本。 + + (2)统计训练样本文字数目。最长字符数目的选取考虑满足80%的训练样本。然后中文字符长宽比近似认为是1,英文认为3:1,预估一个最长宽度。 + +**Q**:识别训练时,训练集精度已经到达90了,但验证集精度一直在70,涨不上去怎么办? + + A:训练集精度90,测试集70多的话,应该是过拟合了,有两个可尝试的方法: + + (1)加入更多的增广方式或者调大增广prob的[概率](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppocr/data/imaug/rec_img_aug.py#L341),默认为0.4。 + + (2)调大系统的[l2 dcay值](https://github.com/PaddlePaddle/PaddleOCR/blob/a501603d54ff5513fc4fc760319472e59da25424/configs/rec/ch_ppocr_v1.1/rec_chinese_lite_train_v1.1.yml#L47) + +**Q**: 识别模型训练时,loss能正常下降,但acc一直为0 + + A:识别模型训练初期acc为0是正常的,多训一段时间指标就上来了。 + +*** + +具体的训练教程可点击下方链接跳转: + +- [文本检测模型训练](./detection.md) +- [文本识别模型训练](./recognition.md) +- [文本方向分类器训练](./angle_class.md) +- [知识蒸馏](../model_compress/knowledge_distillation.md) diff --git a/docs/ppocr/overview.en.md b/docs/ppocr/overview.en.md new file mode 100644 index 0000000000..84111afc2c --- /dev/null +++ b/docs/ppocr/overview.en.md @@ -0,0 +1,95 @@ +--- +comments: true +--- + +# PP-OCR + +## 1. Introduction + +PP-OCR is a self-developed practical ultra-lightweight OCR system, which is slimed and optimized based on the reimplemented [academic algorithms](../algorithm/overview.en.md), considering the balance between **accuracy** and **speed**. + +### PP-OCR + +PP-OCR is a two-stage OCR system, in which the text detection algorithm is [DB](../algorithm/text_detection/algorithm_det_db.en.md), and the text recognition algorithm is [CRNN](../algorithm/text_recognition/algorithm_rec_crnn.en.md). Besides, a [text direction classifier](./model_train/angle_class.en.md) is added between the detection and recognition modules to deal with text in different directions. + +PP-OCR pipeline is as follows: + +![](./images/ppocrv2_framework.jpg) + +PP-OCR system is in continuous optimization. At present, PP-OCR and PP-OCRv2 have been released: + +PP-OCR adopts 19 effective strategies from 8 aspects including backbone network selection and adjustment, prediction head design, data augmentation, learning rate transformation strategy, regularization parameter selection, pre-training model use, and automatic model tailoring and quantization to optimize and slim down the models of each module (as shown in the green box above). 
The final results are an ultra-lightweight Chinese and English OCR model with an overall size of 3.5M and a 2.8M English digital OCR model. For more details, please refer to [PP-OCR technical report](https://arxiv.org/abs/2009.09941). + +### PP-OCRv2 + +On the basis of PP-OCR, PP-OCRv2 is further optimized in five aspects. The detection model adopts CML(Collaborative Mutual Learning) knowledge distillation strategy and CopyPaste data expansion strategy. The recognition model adopts LCNet lightweight backbone network, U-DML knowledge distillation strategy and [enhanced CTC loss](./blog/enhanced_ctc_loss.en.md) function improvement (as shown in the red box above), which further improves the inference speed and prediction effect. For more details, please refer to [PP-OCRv2 technical report](https://arxiv.org/abs/2109.03144). + +### PP-OCRv3 + +PP-OCRv3 upgraded the detection model and recognition model in 9 aspects based on PP-OCRv2: + +- PP-OCRv3 detector upgrades the CML(Collaborative Mutual Learning) text detection strategy proposed in PP-OCRv2, and further optimizes the effect of teacher model and student model respectively. In the optimization of teacher model, a pan module with large receptive field named LK-PAN is proposed and the DML distillation strategy is adopted; In the optimization of student model, a FPN module with residual attention mechanism named RSE-FPN is proposed. +- PP-OCRv3 recognizer is optimized based on text recognition algorithm [SVTR](https://arxiv.org/abs/2205.00159). SVTR no longer adopts RNN by introducing transformers structure, which can mine the context information of text line image more effectively, so as to improve the ability of text recognition. PP-OCRv3 adopts lightweight text recognition network SVTR_LCNet, guided training of CTC by attention, data augmentation strategy TextConAug, better pre-trained model by self-supervised TextRotNet, UDML(Unified Deep Mutual Learning), and UIM (Unlabeled Images Mining) to accelerate the model and improve the effect. + +PP-OCRv3 pipeline is as follows: + +![](./images/ppocrv3_framework.png) + +For more details, please refer to [PP-OCRv3 technical report](https://arxiv.org/abs/2206.03001v2). + +## 2. Features + +- Ultra lightweight PP-OCRv3 series models: detection (3.6M) + direction classifier (1.4M) + recognition 12M) = 17.0M +- Ultra lightweight PP-OCRv2 series models: detection (3.1M) + direction classifier (1.4M) + recognition 8.5M) = 13.0M +- Ultra lightweight PP-OCR mobile series models: detection (3.0M) + direction classifier (1.4M) + recognition (5.0M) = 9.4M +- General PP-OCR server series models: detection (47.1M) + direction classifier (1.4M) + recognition (94.9M) = 143.4M +- Support Chinese, English, and digit recognition, vertical text recognition, and long text recognition +- Support multi-lingual recognition: about 80 languages like Korean, Japanese, German, French, etc + +## 3. benchmark + +For the performance comparison between PP-OCR series models, please check the [benchmark](./infer_deploy/benchmark.en.md) documentation. + +## 4. Visualization [more](./visualization.en.md) + +### PP-OCRv3 Chinese model + +![](./images/PP-OCRv3/ch/PP-OCRv3-pic001.jpg) +![](./images/PP-OCRv3/ch/PP-OCRv3-pic002.jpg) +![](./images/PP-OCRv3/ch/PP-OCRv3-pic003.jpg) + +### PP-OCRv3 English model + +![](./images/PP-OCRv3/en/en_1.png) +![](./images/PP-OCRv3/en/en_2.png) + +### PP-OCRv3 Multilingual model + +![](./images/PP-OCRv3/multi_lang/japan_2.jpg) +![](./images/PP-OCRv3/multi_lang/korean_1.jpg) + +## 5. 
Tutorial + +### 5.1 Quick start + +- You can also quickly experience the ultra-lightweight OCR : [Online Experience](https://www.paddlepaddle.org.cn/hub/scene/ocr) +- Mobile DEMO experience (based on EasyEdge and Paddle-Lite, supports iOS and Android systems): [Sign in to the website to obtain the QR code for installing the App](https://ai.baidu.com/easyedge/app/openSource?from=paddlelite) +- One line of code quick use: [Quick Start](./quickstart_en.md) + +### 5.2 Model training / compression / deployment + +For more tutorials, including model training, model compression, deployment, etc., please refer to [tutorials](../../README.md#Tutorials)。 + +## 6. Model zoo + +## PP-OCR Series Model List(Update on 2022.04.28) + +| Model introduction | Model name | Recommended scene | Detection model | Direction classifier | Recognition model | +| ----- | --- | --- | ------ | ---- | ------ | +| Chinese and English ultra-lightweight PP-OCRv3 model(16.2M) | ch_PP-OCRv3_xx | Mobile & Server | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) | +| English ultra-lightweight PP-OCRv3 model(13.4M) | en_PP-OCRv3_xx | Mobile & Server | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) | +| Chinese and English ultra-lightweight PP-OCRv2 model(11.6M) | ch_PP-OCRv2_xx | Mobile & Server | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) | +| Chinese and English ultra-lightweight PP-OCR model (9.4M) | ch_ppocr_mobile_v2.0_xx | Mobile & server | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [inference 
model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar) | +| Chinese and English general PP-OCR model (143.4M) | ch_ppocr_server_v2.0_xx | Server | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_train.tar) | + +For more model downloads (including multiple languages), please refer to [PP-OCR series model downloads](./model_list.en.md). diff --git a/docs/ppocr/overview.md b/docs/ppocr/overview.md new file mode 100644 index 0000000000..592df555f0 --- /dev/null +++ b/docs/ppocr/overview.md @@ -0,0 +1,104 @@ +--- +comments: true +--- + +# PP-OCR + +## 1. 简介 + +PP-OCR是PaddleOCR自研的实用的超轻量OCR系统。在实现[前沿算法](../algorithm/overview.md)的基础上,考虑精度与速度的平衡,进行**模型瘦身**和**深度优化**,使其尽可能满足产业落地需求。 + +### PP-OCR + +PP-OCR是一个两阶段的OCR系统,其中文本检测算法选用[DB](../algorithm/text_detection/algorithm_det_db.md),文本识别算法选用[CRNN](../algorithm/text_recognition/algorithm_rec_crnn.md),并在检测和识别模块之间添加[文本方向分类器](./model_train/angle_class.md),以应对不同方向的文本识别。 + +PP-OCR系统pipeline如下: + +![](./images/ppocrv2_framework.jpg) + +PP-OCR系统在持续迭代优化,目前已发布PP-OCR和PP-OCRv2两个版本: + +PP-OCR从骨干网络选择和调整、预测头部的设计、数据增强、学习率变换策略、正则化参数选择、预训练模型使用以及模型自动裁剪量化8个方面,采用19个有效策略,对各个模块的模型进行效果调优和瘦身(如绿框所示),最终得到整体大小为3.5M的超轻量中英文OCR和2.8M的英文数字OCR。更多细节请参考[PP-OCR技术报告](https://arxiv.org/abs/2009.09941)。 + +### PP-OCRv2 + +PP-OCRv2在PP-OCR的基础上,进一步在5个方面重点优化,检测模型采用CML协同互学习知识蒸馏策略和CopyPaste数据增广策略;识别模型采用LCNet轻量级骨干网络、UDML 改进知识蒸馏策略和[Enhanced CTC loss](./blog/enhanced_ctc_loss.md)损失函数改进(如上图红框所示),进一步在推理速度和预测效果上取得明显提升。更多细节请参考[PP-OCRv2技术报告](https://arxiv.org/abs/2109.03144)。 + +### PP-OCRv3 + +PP-OCRv3在PP-OCRv2的基础上,针对检测模型和识别模型,进行了共计9个方面的升级: + +- PP-OCRv3检测模型对PP-OCRv2中的CML协同互学习文本检测蒸馏策略进行了升级,分别针对教师模型和学生模型进行进一步效果优化。其中,在对教师模型优化时,提出了大感受野的PAN结构LK-PAN和引入了DML蒸馏策略;在对学生模型优化时,提出了残差注意力机制的FPN结构RSE-FPN。 +- PP-OCRv3的识别模块是基于文本识别算法[SVTR](https://arxiv.org/abs/2205.00159)优化。SVTR不再采用RNN结构,通过引入Transformers结构更加有效地挖掘文本行图像的上下文信息,从而提升文本识别能力。PP-OCRv3通过轻量级文本识别网络SVTR_LCNet、Attention损失指导CTC损失训练策略、挖掘文字上下文信息的数据增广策略TextConAug、TextRotNet自监督预训练模型、UDML联合互学习策略、UIM无标注数据挖掘方案,6个方面进行模型加速和效果提升。 + +PP-OCRv3系统pipeline如下: + +![](./images/ppocrv3_framework.png) + +更多细节请参考[PP-OCRv3技术报告](https://arxiv.org/abs/2206.03001v2) 👉[中文简洁版](./blog/PP-OCRv3_introduction.md) + +### PP-OCRv4 + +- PP-OCRv4-mobile:速度可比情况下,中文场景效果相比于 PP-OCRv3 再提升 4.5%,英文场景提升 10%,80 语种多语言模型平均识别准确率提升 8%以上 +- PP-OCRv4-server:发布了目前精度最高的 OCR 模型,中英文场景上检测模型精度提升 4.9%, 识别模型精度提升 2% + +更多细节参见:[PP-OCRv4技术报告](./blog/PP-OCRv4_introduction.md) + +## 2. 特性 + +- 超轻量PP-OCRv3系列:检测(3.6M)+ 方向分类器(1.4M)+ 识别(12M)= 17.0M +- 超轻量PP-OCRv2系列:检测(3.1M)+ 方向分类器(1.4M)+ 识别(8.5M)= 13.0M +- 超轻量PP-OCR mobile移动端系列:检测(3.0M)+方向分类器(1.4M)+ 识别(5.0M)= 9.4M +- 通用PP-OCR server系列:检测(47.1M)+方向分类器(1.4M)+ 识别(94.9M)= 143.4M +- 支持中英文数字组合识别、竖排文本识别、长文本识别 +- 支持多语言识别:韩语、日语、德语、法语等约80种语言 + +## 3. benchmark + +关于PP-OCR系列模型之间的性能对比,请查看[benchmark](./infer_deploy/benchmark.md)文档。 + +## 4. 
效果展示 [more](./visualization.md) + +### PP-OCRv3 中文模型 + +![](./images/PP-OCRv3/ch/PP-OCRv3-pic001.jpg) + +![](./images/PP-OCRv3/ch/PP-OCRv3-pic002.jpg) + +![](./images/PP-OCRv3/ch/PP-OCRv3-pic003.jpg) + +### PP-OCRv3 英文模型 + +![](./images/PP-OCRv3/en/en_1.png) +![](./images/PP-OCRv3/en/en_2.png) + +### PP-OCRv3 多语言模型 + +![](./images/PP-OCRv3/multi_lang/japan_2.jpg) +![](./images/PP-OCRv3/multi_lang/korean_1.jpg) + +## 5. 使用教程 + +### 5.1 快速体验 + +- 在线网站体验:超轻量PP-OCR mobile模型体验地址: +- 移动端demo体验:[安装包DEMO下载地址](https://ai.baidu.com/easyedge/app/openSource?from=paddlelite)(基于EasyEdge和Paddle-Lite, 支持iOS和Android系统) +- 一行命令快速使用:[快速开始(中英文/多语言)](./quick_start.md) + +### 5.2 模型训练、压缩、推理部署 + +更多教程,包括模型训练、模型压缩、推理部署等, + +## 6. 模型库 + +PP-OCR中英文模型列表如下: + +| 模型简介 | 模型名称 | 推荐场景 | 检测模型 | 方向分类器 | 识别模型 | +| ----- | ----- | --------------- | ---- | -------------- | --- | +| 中英文超轻量PP-OCRv3模型(16.2M) | ch_PP-OCRv3_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar) | +| 英文超轻量PP-OCRv3模型(13.4M) | en_PP-OCRv3_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_train.tar) | +| 中英文超轻量PP-OCRv2模型(13.0M) | ch_PP-OCRv2_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar) | +| 中英文超轻量PP-OCR mobile模型(9.4M) | ch_ppocr_mobile_v2.0_xx | 移动端&服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_det_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_pre.tar) | +| 中英文通用PP-OCR server模型(143.4M) | ch_ppocr_server_v2.0_xx | 服务器端 | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar) | 
[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_train.tar) | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_infer.tar) / [预训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_rec_pre.tar) | + +更多模型下载(包括英文数字模型、多语言模型、Paddle-Lite模型等),可以参考[PP-OCR 系列模型下载](./model_list.md)。 diff --git a/docs/ppocr/quick_start.en.md b/docs/ppocr/quick_start.en.md new file mode 100644 index 0000000000..079408e815 --- /dev/null +++ b/docs/ppocr/quick_start.en.md @@ -0,0 +1,286 @@ +--- +comments: true +--- + +# PaddleOCR Quick Start + +**Note:** This tutorial mainly introduces the usage of PP-OCR series models, please refer to [PP-Structure Quick Start](../ppstructure/overview.en.md) for the quick use of document analysis related functions. + +## 1. Installation + +### 1.1 Install PaddlePaddle + +> If you do not have a Python environment, please refer to [Environment Preparation](./environment.en.md). + +- If you have CUDA 11 installed on your machine, please run the following command to install + + ```bash linenums="1" + pip install paddlepaddle-gpu + ``` + +- If you have no available GPU on your machine, please run the following command to install the CPU version + + ```bash linenums="1" + python -m pip install paddlepaddle + ``` + +For more software version requirements, please refer to the instructions in [Installation Document](https://www.paddlepaddle.org.cn/en/install/quick) for operation. + +### 1.2 Install PaddleOCR Whl Package + +```bash linenums="1" +pip install "paddleocr>=2.0.1" # Recommend to use version 2.0.1+ +``` + +- **For windows users:** If you getting this error `OSError: [WinError 126] The specified module could not be found` when you install shapely on windows. Please try to download Shapely whl file [here](http://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely). + + Reference: [Solve shapely installation on windows](https://stackoverflow.com/questions/44398265/install-shapely-oserror-winerror-126-the-specified-module-could-not-be-found) + +## 2. Easy-to-Use + +### 2.1 Use by Command Line + +PaddleOCR provides a series of test images, click [here](https://paddleocr.bj.bcebos.com/dygraph_v2.1/ppocr_img.zip) to download, and then switch to the corresponding directory in the terminal + +```bash linenums="1" +cd /path/to/ppocr_img +``` + +If you do not use the provided test image, you can replace the following `--image_dir` parameter with the corresponding test image path + +#### 2.1.1 Chinese and English Model + +- Detection, direction classification and recognition: set the parameter`--use_gpu false` to disable the gpu device + + ```bash linenums="1" + paddleocr --image_dir ./imgs_en/img_12.jpg --use_angle_cls true --lang en --use_gpu false + ``` + + Output will be a list, each item contains bounding box, text and recognition confidence + + ```bash linenums="1" + [[[441.0, 174.0], [1166.0, 176.0], [1165.0, 222.0], [441.0, 221.0]], ('ACKNOWLEDGEMENTS', 0.9971134662628174)] + [[[403.0, 346.0], [1204.0, 348.0], [1204.0, 384.0], [402.0, 383.0]], ('We would like to thank all the designers and', 0.9761400818824768)] + [[[403.0, 396.0], [1204.0, 398.0], [1204.0, 434.0], [402.0, 433.0]], ('contributors who have been involved in the', 0.9791957139968872)] + ...... 
+ ``` + + pdf file is also supported, you can infer the first few pages by using the `page_num` parameter, the default is 0, which means infer all pages + + ```bash linenums="1" + paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2 + ``` + +- Only detection: set `--rec` to `false` + + ```bash linenums="1" + paddleocr --image_dir ./imgs_en/img_12.jpg --rec false + ``` + + Output will be a list, each item only contains bounding box + + ```bash linenums="1" + [[397.0, 802.0], [1092.0, 802.0], [1092.0, 841.0], [397.0, 841.0]] + [[397.0, 750.0], [1211.0, 750.0], [1211.0, 789.0], [397.0, 789.0]] + [[397.0, 702.0], [1209.0, 698.0], [1209.0, 734.0], [397.0, 738.0]] + ...... + ``` + +- Only recognition: set `--det` to `false` + + ```bash linenums="1" + paddleocr --image_dir ./imgs_words_en/word_10.png --det false --lang en + ``` + + Output will be a list, each item contains text and recognition confidence + + ```bash linenums="1" + ['PAIN', 0.9934559464454651] + ``` + +**Version** +paddleocr uses the PP-OCRv4 model by default(`--ocr_version PP-OCRv4`). If you want to use other versions, you can set the parameter `--ocr_version`, the specific version description is as follows: + +| version name | description | +| --- | --- | +| PP-OCRv4 | support Chinese and English detection and recognition, direction classifier, support multilingual recognition | +| PP-OCRv3 | support Chinese and English detection and recognition, direction classifier, support multilingual recognition | +| PP-OCRv2 | only supports Chinese and English detection and recognition, direction classifier, multilingual model is not updated | +| PP-OCR | support Chinese and English detection and recognition, direction classifier, support multilingual recognition | + +If you want to add your own trained model, you can add model links and keys in [paddleocr](https://github.com/PaddlePaddle/PaddleOCR/blob/c65a66c5fd37dee64916a8b2a2c84ea273d98cac/paddleocr.py) and recompile. + +More whl package usage can be found in [whl package](./blog/whl.en.md) + +#### 2.1.2 Multi-language Model + +PaddleOCR currently supports 80 languages, which can be switched by modifying the `--lang` parameter. + +``` bash +paddleocr --image_dir ./doc/imgs_en/254.jpg --lang=en +``` + +![](./images/254.jpg) + +![](./images/multi_lang/img_02.jpg) + +The result is a list, each item contains a text box, text and recognition confidence + +```text linenums="1" +[[[67.0, 51.0], [327.0, 46.0], [327.0, 74.0], [68.0, 80.0]], ('PHOCAPITAL', 0.9944712519645691)] +[[[72.0, 92.0], [453.0, 84.0], [454.0, 114.0], [73.0, 122.0]], ('107 State Street', 0.9744491577148438)] +[[[69.0, 135.0], [501.0, 125.0], [501.0, 156.0], [70.0, 165.0]], ('Montpelier Vermont', 0.9357033967971802)] +...... 
+``` + +Commonly used multilingual abbreviations include + +| Language | Abbreviation | | Language | Abbreviation | | Language | Abbreviation | +| ------------------- | ------------ | ---- | -------- | ------------ | ---- | -------- | ------------ | +| Chinese & English | ch | | French | fr | | Japanese | japan | +| English | en | | German | german | | Korean | korean | +| Chinese Traditional | chinese_cht | | Italian | it | | Russian | ru | + +A list of all languages and their corresponding abbreviations can be found in [Multi-Language Model Tutorial](./blog/multi_languages.en.md) + +### 2.2 Use by Code + +#### 2.2.1 Chinese & English Model and Multilingual Model + +- detection, angle classification and recognition: + +```python linenums="1" +from paddleocr import PaddleOCR,draw_ocr +# Paddleocr supports Chinese, English, French, German, Korean and Japanese. +# You can set the parameter `lang` as `ch`, `en`, `fr`, `german`, `korean`, `japan` +# to switch the language model in order. +ocr = PaddleOCR(use_angle_cls=True, lang='en') # need to run only once to download and load model into memory +img_path = './imgs_en/img_12.jpg' +result = ocr.ocr(img_path, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + + +# draw result +from PIL import Image +result = result[0] +image = Image.open(img_path).convert('RGB') +boxes = [line[0] for line in result] +txts = [line[1][0] for line in result] +scores = [line[1][1] for line in result] +im_show = draw_ocr(image, boxes, txts, scores, font_path='./fonts/simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + +Output will be a list, each item contains bounding box, text and recognition confidence + +```bash linenums="1" +[[[441.0, 174.0], [1166.0, 176.0], [1165.0, 222.0], [441.0, 221.0]], ('ACKNOWLEDGEMENTS', 0.9971134662628174)] + [[[403.0, 346.0], [1204.0, 348.0], [1204.0, 384.0], [402.0, 383.0]], ('We would like to thank all the designers and', 0.9761400818824768)] + [[[403.0, 396.0], [1204.0, 398.0], [1204.0, 434.0], [402.0, 433.0]], ('contributors who have been involved in the', 0.9791957139968872)] + ...... +``` + +Visualization of results + +![](./images/11_det_rec.jpg) + +If the input is a PDF file, you can refer to the following code for visualization + +```python linenums="1" +from paddleocr import PaddleOCR, draw_ocr + +# Paddleocr supports Chinese, English, French, German, Korean and Japanese. +# You can set the parameter `lang` as `ch`, `en`, `fr`, `german`, `korean`, `japan` +# to switch the language model in order. +PAGE_NUM = 10 # Set the recognition page number +pdf_path = 'default.pdf' +ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=PAGE_NUM) # need to run only once to download and load model into memory +# ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=PAGE_NUM,use_gpu=0) # To Use GPU,uncomment this line and comment the above one. 
+result = ocr.ocr(pdf_path, cls=True) +for idx in range(len(result)): + res = result[idx] + if res == None: # Skip when empty result detected to avoid TypeError:NoneType + print(f"[DEBUG] Empty page {idx+1} detected, skip it.") + continue + for line in res: + print(line) + +# draw the result +import fitz +from PIL import Image +import cv2 +import numpy as np +imgs = [] +with fitz.open(pdf_path) as pdf: + for pg in range(0, PAGE_NUM): + page = pdf[pg] + mat = fitz.Matrix(2, 2) + pm = page.get_pixmap(matrix=mat, alpha=False) + # if width or height > 2000 pixels, don't enlarge the image + if pm.width > 2000 or pm.height > 2000: + pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False) + img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) + img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) + imgs.append(img) +for idx in range(len(result)): + res = result[idx] + if res == None: + continue + image = imgs[idx] + boxes = [line[0] for line in res] + txts = [line[1][0] for line in res] + scores = [line[1][1] for line in res] + im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf') + im_show = Image.fromarray(im_show) + im_show.save('result_page_{}.jpg'.format(idx)) +``` + +- Detection and Recognition Using Sliding Windows + +To perform OCR using sliding windows, the following code snippet can be employed: + +```python linenums="1" +from paddleocr import PaddleOCR +from PIL import Image, ImageDraw, ImageFont + +# Initialize OCR engine +ocr = PaddleOCR(use_angle_cls=True, lang="en") + +img_path = "./very_large_image.jpg" +slice = {'horizontal_stride': 300, 'vertical_stride': 500, 'merge_x_thres': 50, 'merge_y_thres': 35} +results = ocr.ocr(img_path, cls=True, slice=slice) + +# Load image +image = Image.open(img_path).convert("RGB") +draw = ImageDraw.Draw(image) +font = ImageFont.truetype("./doc/fonts/simfang.ttf", size=20) # Adjust size as needed + +# Process and draw results +for res in results: + for line in res: + box = [tuple(point) for point in line[0]] + # Finding the bounding box + box = [(min(point[0] for point in box), min(point[1] for point in box)), + (max(point[0] for point in box), max(point[1] for point in box))] + txt = line[1][0] + draw.rectangle(box, outline="red", width=2) # Draw rectangle + draw.text((box[0][0], box[0][1] - 25), txt, fill="blue", font=font) # Draw text above the box + +# Save result +image.save("result.jpg") + +``` + +This example initializes the PaddleOCR instance with angle classification enabled and sets the language to English. The `ocr` method is then called with several parameters to customize the detection and recognition process, including the `slice` parameter for handling image slices. + +For a more comprehensive understanding of the slicing operation, please refer to the [slice operation documentation](./blog/slice.en.md). + +## 3. Summary + +In this section, you have mastered the use of PaddleOCR whl package. + +PaddleX provides a high-quality ecological model of the paddle. It is a one-stop full-process high-efficiency development platform for training, pressing and pushing. Its mission is to help AI technology to be implemented quickly. The vision is to make everyone an AI Developer! Currently PP-OCRv4 has been launched on PaddleX, you can enter [General OCR](https://aistudio.baidu.com/aistudio/modelsdetail?modelId=286) to experience the whole process of model training, compression and inference deployment. 
diff --git a/docs/ppocr/quick_start.md b/docs/ppocr/quick_start.md new file mode 100644 index 0000000000..ee60180626 --- /dev/null +++ b/docs/ppocr/quick_start.md @@ -0,0 +1,278 @@ +--- +comments: true +--- + +# PaddleOCR 快速开始 + +**说明:** 本文主要介绍PaddleOCR wheel包对PP-OCR系列模型的快速使用,如要体验文档分析相关功能,请参考[PP-Structure快速使用教程](../ppstructure/overview.md)。 + +## 1. 安装 + +### 1.1 安装PaddlePaddle + +> 如果您没有基础的Python运行环境,请参考[运行环境准备](./environment.md)。 + +- 您的机器安装的是CUDA 11,请运行以下命令安装 + + ```bash linenums="1" + pip install paddlepaddle-gpu + ``` + +- 您的机器是CPU,请运行以下命令安装 + + ```bash linenums="1" + pip install paddlepaddle + ``` + +更多的版本需求,请参照[飞桨官网安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。 + +### 1.2 安装PaddleOCR whl包 + +```bash linenums="1" +pip install paddleocr +``` + +- 对于Windows环境用户:直接通过pip安装的shapely库可能出现`[winRrror 126] 找不到指定模块的问题`。建议从[这里](https://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely)下载shapely安装包完成安装。 + +## 2. 便捷使用 + +### 2.1 命令行使用 + +PaddleOCR提供了一系列测试图片,点击[这里](https://paddleocr.bj.bcebos.com/dygraph_v2.1/ppocr_img.zip)下载并解压,然后在终端中切换到相应目录 + +```bash linenums="1" +cd /path/to/ppocr_img +``` + +如果不使用提供的测试图片,可以将下方`--image_dir`参数替换为相应的测试图片路径。 + +#### 2.1.1 中英文模型 + +- 检测+方向分类器+识别全流程:`--use_angle_cls true`设置使用方向分类器识别180度旋转文字,`--use_gpu false`设置不使用GPU + + ```bash linenums="1" + paddleocr --image_dir ./imgs/11.jpg --use_angle_cls true --use_gpu false + ``` + + 结果是一个list,每个item包含了文本框,文字和识别置信度 + + ```bash linenums="1" + [[[28.0, 37.0], [302.0, 39.0], [302.0, 72.0], [27.0, 70.0]], ('纯臻营养护发素', 0.9658738374710083)] + ...... + ``` + + 此外,paddleocr也支持输入pdf文件,并且可以通过指定参数`page_num`来控制推理前面几页,默认为0,表示推理所有页。 + + ```bash linenums="1" + paddleocr --image_dir ./xxx.pdf --use_angle_cls true --use_gpu false --page_num 2 + ``` + +- 单独使用检测:设置`--rec`为`false` + + ```bash linenums="1" + paddleocr --image_dir ./imgs/11.jpg --rec false + ``` + + 结果是一个list,每个item只包含文本框 + + ```bash linenums="1" + [[27.0, 459.0], [136.0, 459.0], [136.0, 479.0], [27.0, 479.0]] + [[28.0, 429.0], [372.0, 429.0], [372.0, 445.0], [28.0, 445.0]] + ...... + ``` + +- 单独使用识别:设置`--det`为`false` + + ```bash linenums="1" + paddleocr --image_dir ./imgs_words/ch/word_1.jpg --det false + ``` + + 结果是一个list,每个item只包含识别结果和识别置信度 + + ```bash linenums="1" + ['韩国小馆', 0.994467] + ``` + +**版本说明** +paddleocr默认使用PP-OCRv4模型(`--ocr_version PP-OCRv4`),如需使用其他版本可通过设置参数`--ocr_version`,具体版本说明如下: + +| 版本名称 | 版本说明 | +| -------- | -------------------------------------------------- | +| PP-OCRv4 | 支持中、英文检测和识别,方向分类器,支持多语种识别 | +| PP-OCRv3 | 支持中、英文检测和识别,方向分类器,支持多语种识别 | +| PP-OCRv2 | 支持中英文的检测和识别,方向分类器,多语言暂未更新 | +| PP-OCR | 支持中、英文检测和识别,方向分类器,支持多语种识别 | + +如需新增自己训练的模型,可以在[paddleocr](https://github.com/PaddlePaddle/PaddleOCR/blob/c65a66c5fd37dee64916a8b2a2c84ea273d98cac/paddleocr.py)中增加模型链接和字段,重新编译即可。 + +更多whl包使用可参考[whl包文档](./blog/whl.md) + +#### 2.1.2 多语言模型 + +PaddleOCR目前支持80个语种,可以通过修改`--lang`参数进行切换,对于英文模型,指定`--lang=en`。 + +``` bash +paddleocr --image_dir ./imgs_en/254.jpg --lang=en +``` + +![](./images/254.jpg) + +![](./images/multi_lang/img_02.jpg) + +结果是一个list,每个item包含了文本框,文字和识别置信度 + +```text linenums="1" +[[[67.0, 51.0], [327.0, 46.0], [327.0, 74.0], [68.0, 80.0]], ('PHOCAPITAL', 0.9944712519645691)] +[[[72.0, 92.0], [453.0, 84.0], [454.0, 114.0], [73.0, 122.0]], ('107 State Street', 0.9744491577148438)] +[[[69.0, 135.0], [501.0, 125.0], [501.0, 156.0], [70.0, 165.0]], ('Montpelier Vermont', 0.9357033967971802)] +...... 
+``` + +常用的多语言简写包括 + +| 语种 | 缩写 | | 语种 | 缩写 | | 语种 | 缩写 | +| -------- | ----------- | --- | -------- | ------ | --- | -------- | ------ | +| 中文 | ch | | 法文 | fr | | 日文 | japan | +| 英文 | en | | 德文 | german | | 韩文 | korean | +| 繁体中文 | chinese_cht | | 意大利文 | it | | 俄罗斯文 | ru | + +全部语种及其对应的缩写列表可查看[多语言模型教程](./blog/multi_languages.md) + +### 2.2 Python脚本使用 + +#### 2.2.1 中英文与多语言使用 + +通过Python脚本使用PaddleOCR whl包,whl包会自动下载ppocr轻量级模型作为默认模型。 + +- 检测+方向分类器+识别全流程 + +```python linenums="1" +from paddleocr import PaddleOCR, draw_ocr + +# Paddleocr目前支持的多语言语种可以通过修改lang参数进行切换 +# 例如`ch`, `en`, `fr`, `german`, `korean`, `japan` +ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory +img_path = './imgs/11.jpg' +result = ocr.ocr(img_path, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + +# 显示结果 +from PIL import Image +result = result[0] +image = Image.open(img_path).convert('RGB') +boxes = [line[0] for line in result] +txts = [line[1][0] for line in result] +scores = [line[1][1] for line in result] +im_show = draw_ocr(image, boxes, txts, scores, font_path='./fonts/simfang.ttf') +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + +结果是一个list,每个item包含了文本框,文字和识别置信度 + +```bash linenums="1" +[[[28.0, 37.0], [302.0, 39.0], [302.0, 72.0], [27.0, 70.0]], ('纯臻营养护发素', 0.9658738374710083)] +...... +``` + +结果可视化 + +![](./images/11_det_rec.jpg) + +如果输入是PDF文件,那么可以参考下面代码进行可视化 + +```python linenums="1" +from paddleocr import PaddleOCR, draw_ocr + +# Paddleocr目前支持的多语言语种可以通过修改lang参数进行切换 +# 例如`ch`, `en`, `fr`, `german`, `korean`, `japan` +PAGE_NUM = 10 # 将识别页码前置作为全局,防止后续打开pdf的参数和前文识别参数不一致 / Set the recognition page number +pdf_path = 'default.pdf' +ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=PAGE_NUM) # need to run only once to download and load model into memory +# ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=PAGE_NUM,use_gpu=0) # 如果需要使用GPU,请取消此行的注释 并注释上一行 / To Use GPU,uncomment this line and comment the above one. 
+result = ocr.ocr(pdf_path, cls=True) +for idx in range(len(result)): + res = result[idx] + if res == None: # 识别到空页就跳过,防止程序报错 / Skip when empty result detected to avoid TypeError:NoneType + print(f"[DEBUG] Empty page {idx+1} detected, skip it.") + continue + for line in res: + print(line) +# 显示结果 +import fitz +from PIL import Image +import cv2 +import numpy as np +imgs = [] +with fitz.open(pdf_path) as pdf: + for pg in range(0, PAGE_NUM): + page = pdf[pg] + mat = fitz.Matrix(2, 2) + pm = page.get_pixmap(matrix=mat, alpha=False) + # if width or height > 2000 pixels, don't enlarge the image + if pm.width > 2000 or pm.height > 2000: + pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False) + img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) + img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) + imgs.append(img) +for idx in range(len(result)): + res = result[idx] + if res == None: + continue + image = imgs[idx] + boxes = [line[0] for line in res] + txts = [line[1][0] for line in res] + scores = [line[1][1] for line in res] + im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf') + im_show = Image.fromarray(im_show) + im_show.save('result_page_{}.jpg'.format(idx)) +``` + +- 使用滑动窗口进行检测和识别 + +要使用滑动窗口进行光学字符识别(OCR),可以使用以下代码片段: + +```python linenums="1" +from paddleocr import PaddleOCR +from PIL import Image, ImageDraw, ImageFont + +# 初始化OCR引擎 +ocr = PaddleOCR(use_angle_cls=True, lang="en") + +img_path = "./very_large_image.jpg" +slice = {'horizontal_stride': 300, 'vertical_stride': 500, 'merge_x_thres': 50, 'merge_y_thres': 35} +results = ocr.ocr(img_path, cls=True, slice=slice) + +# 加载图像 +image = Image.open(img_path).convert("RGB") +draw = ImageDraw.Draw(image) +font = ImageFont.truetype("./doc/fonts/simfang.ttf", size=20) # 根据需要调整大小 + +# 处理并绘制结果 +for res in results: + for line in res: + box = [tuple(point) for point in line[0]] + # 找出边界框 + box = [(min(point[0] for point in box), min(point[1] for point in box)), + (max(point[0] for point in box), max(point[1] for point in box))] + txt = line[1][0] + draw.rectangle(box, outline="red", width=2) # 绘制矩形 + draw.text((box[0][0], box[0][1] - 25), txt, fill="blue", font=font) # 在矩形上方绘制文本 + +# 保存结果 +image.save("result.jpg") + +``` + +此示例初始化了启用角度分类的PaddleOCR实例,并将语言设置为英语。然后调用`ocr`方法,并使用多个参数来自定义检测和识别过程,包括处理图像切片的`slice`参数。 + +要更全面地了解切片操作,请参考[切片操作文档](./blog/slice.md)。 + +## 3. 
小结 + +通过本节内容,相信您已经熟练掌握PaddleOCR whl包的使用方法并获得了初步效果。 + +飞桨AI套件(PaddleX)提供了飞桨生态优质模型,是训压推一站式全流程高效率开发平台,其使命是助力AI技术快速落地,愿景是使人人成为AI Developer!目前PP-OCRv4已上线PaddleX,您可以进入[通用OCR](https://aistudio.baidu.com/aistudio/modelsdetail?modelId=286)体验模型训练、压缩和推理部署全流程。 diff --git a/docs/ppocr/visualization.en.md b/docs/ppocr/visualization.en.md new file mode 100644 index 0000000000..9fd6601089 --- /dev/null +++ b/docs/ppocr/visualization.en.md @@ -0,0 +1,55 @@ +--- +comments: true +--- + +# Visualization + +## PP-OCRv3 + +### PP-OCRv3 Chinese model + +![](./images/PP-OCRv3/ch/PP-OCRv3-pic001.jpg) + +![](./images/PP-OCRv3/ch/PP-OCRv3-pic002.jpg) + +![](./images/PP-OCRv3/ch/PP-OCRv3-pic003.jpg) + +### PP-OCRv3 English model + +![](./images/PP-OCRv3/en/en_1.png) + +![](./images/PP-OCRv3/en/en_2.png) + +![](./images/PP-OCRv3/en/en_3.png) + +### PP-OCRv3 Multilingual model + +![](./images/PP-OCRv3/multi_lang/japan_2.jpg) + +![](./images/PP-OCRv3/multi_lang/korean_1.jpg) + +## PP-OCRv2 + +![](./images/PP-OCRv2/PP-OCRv2-pic001.jpg) + +![](./images/PP-OCRv2/PP-OCRv2-pic002.jpg) + +![](./images/PP-OCRv2/PP-OCRv2-pic003.jpg) + +## ch_ppocr_server_2.0 + +![](./images/ch_ppocr_mobile_v2.0/00006737.jpg) + +![](./images/ch_ppocr_mobile_v2.0/00009282.jpg) + +![](./images/ch_ppocr_mobile_v2.0/00015504.jpg) + +## en_ppocr_mobile_2.0 + +![](./images/ch_ppocr_mobile_v2.0/img_12.jpg) + +## (multilingual)_ppocr_mobile_2.0 + +![](./images/multi_lang/french_0.jpg) + +![](./images/multi_lang/korean_0.jpg) diff --git a/docs/ppocr/visualization.md b/docs/ppocr/visualization.md new file mode 100644 index 0000000000..2e6180dd79 --- /dev/null +++ b/docs/ppocr/visualization.md @@ -0,0 +1,55 @@ +--- +comments: true +--- + +# 效果展示 + +## 超轻量PP-OCRv3效果展示 + +### PP-OCRv3中文模型 + +![](./images/PP-OCRv3/ch/PP-OCRv3-pic001.jpg) + +![](./images/PP-OCRv3/ch/PP-OCRv3-pic002.jpg) + +![](./images/PP-OCRv3/ch/PP-OCRv3-pic003.jpg) + +### PP-OCRv3英文数字模型 + +![](./images/PP-OCRv3/en/en_1.png) + +![](./images/PP-OCRv3/en/en_2.png) + +![](./images/PP-OCRv3/en/en_3.png) + +### PP-OCRv3多语言模型 + +![](./images/PP-OCRv3/multi_lang/japan_2.jpg) + +![](./images/PP-OCRv3/multi_lang/korean_1.jpg) + +## 超轻量PP-OCRv2效果展示 + +![](./images/PP-OCRv2/PP-OCRv2-pic001.jpg) + +![](./images/PP-OCRv2/PP-OCRv2-pic002.jpg) + +![](./images/PP-OCRv2/PP-OCRv2-pic003.jpg) + +## 通用PP-OCR server 效果展示 + +![](./images/ch_ppocr_mobile_v2.0/00006737.jpg) + +![](./images/ch_ppocr_mobile_v2.0/00009282.jpg) + +![](./images/ch_ppocr_mobile_v2.0/00015504.jpg) + +## 英文识别模型效果展示 + +![](./images/ch_ppocr_mobile_v2.0/img_12.jpg) + +## 多语言识别模型效果展示 + +![](./images/multi_lang/french_0.jpg) + +![](./images/multi_lang/korean_0.jpg) diff --git a/docs/ppstructure/blog/how_to_do_kie.en.md b/docs/ppstructure/blog/how_to_do_kie.en.md new file mode 100644 index 0000000000..2b0cd75f98 --- /dev/null +++ b/docs/ppstructure/blog/how_to_do_kie.en.md @@ -0,0 +1,150 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# Key Information Extraction Pipeline + +## 1. Introduction + +### 1.1 Background + +Key information extraction (KIE) refers to extracting key information from text or images. As the downstream task of OCR, KIE of document image has many practical application scenarios, such as form recognition, ticket information extraction, ID card information extraction, etc. However, it is time-consuming and laborious to extract key information from these document images by manpower. It's challengable but also valuable to combine multi-modal features (visual, layout, text, etc) together and complete KIE tasks. 
+
+For the document images in a specific scene, the position and layout of the key information are relatively fixed. Therefore, in the early stage of the research, there were many methods based on template matching to extract the key information. This kind of method is still widely used in many simple scenarios at present. However, it takes a long time to adjust the template for different scenarios.
+
+KIE on document images generally contains 2 subtasks, as shown below.
+
+* (1) SER: semantic entity recognition, which classifies each detected textline, such as dividing it into name and ID No., as shown in the red boxes in the following figure.
+
+* (2) RE: relationship extraction, which matches the question and answer based on SER results. As shown in the figure below, the yellow arrows match the question and answer.
+
+![](./images/184588654-d87f54f3-13ab-42c4-afc0-da79bead3f14.png)
+
+### 1.2 Mainstream Deep-learning Solutions
+
+General KIE methods are based on Named Entity Recognition (NER), but such methods only use text information and ignore location and visual feature information, which leads to limited accuracy. In recent years, most scholars have started to combine multi-modal features to improve the accuracy of KIE models. The main methods are as follows:
+
+* (1) Grid-based methods. These methods mainly focus on the fusion of multi-modal information at the image level. Most texts are of character granularity, and the embedding of text and structure information is relatively simple, such as the Chargrid algorithm [1].
+
+* (2) Token-based methods. These methods follow NLP methods such as BERT, encoding position, vision and other feature information into a multi-modal model and pre-training it on large-scale datasets, so that in downstream tasks only a small amount of annotated data is required to obtain excellent results. Representative algorithms are LayoutLM [2], LayoutLMv2 [3], LayoutXLM [4], StrucTexT [5], etc.
+
+* (3) GCN-based methods. These methods try to learn the structural information between images and characters, so as to solve the problem of extracting open-set information (templates not seen in the training set), such as GCN [6], SDMGR [7] and other algorithms.
+
+* (4) End-to-end methods. These methods put the existing OCR text recognition and KIE information extraction tasks into a unified network for joint learning, so that the two tasks reinforce each other during training, such as TRIE [8].
+
+For a more detailed introduction to these algorithms, please refer to Chapter 6 of [Diving into OCR](https://aistudio.baidu.com/aistudio/education/group/info/25207).
+
+## 2. KIE Pipeline
+
+Token-based methods such as LayoutXLM are implemented in PaddleOCR. Moreover, in PP-StructureV2, we simplified the LayoutXLM model and proposed VI-LayoutXLM, in which the visual feature extraction module is removed for speed-up. A textline sorting strategy that conforms to the human reading order and the UDML knowledge distillation strategy are utilized for higher model accuracy.
+
+In non end-to-end KIE methods, KIE needs at least **2 steps**. Firstly, the OCR model is used to extract the text and its position. Secondly, the KIE model is used to extract the key information according to the image, text position and text content.
+
+### 2.1 Train OCR Models
+
+#### 2.1.1 Text Detection
+
+##### (1) Data
+
+Most of the models provided in PaddleOCR are general models. 
In the process of text detection, adjacent text lines are generally distinguished by the distance between their positions. As shown in the figure above, when the PP-OCRv3 general English detection model is used for text detection, it is easy to detect the two fields representing different properties as one. Therefore, it is suggested to first finetune a detection model for your own scenario when working on a KIE task.
+
+During data annotation, the different key information fields need to be separated. Otherwise, the difficulty of the subsequent KIE task will increase.
+
+For downstream tasks, generally speaking, `200~300` training images can guarantee the basic training effect. If there is not too much prior knowledge, **`200~300`** images can be labeled first for subsequent text detection model training.
+
+##### (2) Model
+
+In terms of model selection, the PP-OCRv3 detection model is recommended. For more information about the training methods of the detection model, please refer to: [Text detection tutorial](../../doc/doc_en/detection_en.md) and [PP-OCRv3 detection model tutorial](../../doc/doc_ch/PPOCRv3_det_train.md).
+
+#### 2.1.2 Text recognition
+
+Compared with natural scenes, text recognition in document images is generally easier (the background is not too complex), so **it is suggested to** first try the PP-OCRv3 general text recognition model provided in PaddleOCR ([PP-OCRv3 model list](../../doc/doc_en/models_list_en.md)).
+
+##### (1) Data
+
+However, there are also some challenges in certain document scenarios, such as rare words in ID card scenarios and special fonts in invoice and other scenarios. These problems increase the difficulty of text recognition. In this case, if you want to ensure or further improve the model accuracy, it is recommended to load the PP-OCRv3 model and finetune it on a text recognition dataset of the specific document scenario.
+
+In the process of model finetuning, it is recommended to prepare at least `5000` vertical-scene text recognition images to ensure the basic fine-tuning effect. If you want to improve the accuracy and generalization ability of the model, you can synthesize more text recognition images similar to the scene, collect general real text recognition data from public datasets, and add them to the text recognition training process. In the training process, it is suggested that the ratio of real data, synthetic data and general data in each epoch should be around `1:1:1`, which can be controlled by setting the sampling ratio of different data sources. If there are 3 training label files, containing 10k, 20k and 50k pieces of data respectively, the data can be set in the configuration file as follows:
+
+```yaml linenums="1"
+Train:
+  dataset:
+    name: SimpleDataSet
+    data_dir: ./train_data/
+    label_file_list:
+    - ./train_data/train_list_10k.txt
+    - ./train_data/train_list_20k.txt
+    - ./train_data/train_list_50k.txt
+    ratio_list: [1.0, 0.5, 0.2]
+    ...
+```
+
+##### (2) Model
+
+In terms of model selection, the PP-OCRv3 recognition model is recommended. For more information about the training methods of the recognition model, please refer to: [Text recognition tutorial](../../doc/doc_en/recognition_en.md) and [PP-OCRv3 model list](../../doc/doc_en/models_list_en.md).
+
+### 2.2 Train KIE Models
+
+There are two main methods to extract the key information from the recognized texts.
+
+(1) Directly use the SER model to obtain the key information category. 
For example, in the ID card scenario, we mark "name" and "Geoff Sample" as "name_key" and "name_value", respectively. The **text field** finally identified as the category "name_value" is the key information we need.
+
+(2) Jointly use the SER and RE models. In this case, we first use the SER model to obtain all questions (keys) and answers (values) in the image text, and then use the RE model to match the keys and values and find their relationships, so as to complete the extraction of key information.
+
+#### 2.2.1 SER
+
+Take the ID card scenario as an example. The key information generally includes `name`, `DOB`, etc. We can directly mark the corresponding fields as specific categories, as shown in the following figure.
+
+ +
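+To make the expected labeling more concrete, the snippet below sketches what one line of a SER label file can look like, assuming the XFUND-style `image_path\t<json list>` format used by PaddleOCR's KIE examples (one line per image). The image name, categories and coordinates are hypothetical and only illustrate the structure; the exact fields should follow the PPOCRLabel export.
+
+```python linenums="1"
+import json
+import os
+
+# One image corresponds to one line in the label file:
+#   image_path \t JSON list of annotated text lines
+# The values below are hypothetical and only illustrate the structure.
+annotations = [
+    {"transcription": "Name", "label": "name_key",
+     "points": [[60, 111], [166, 111], [166, 141], [60, 141]]},
+    {"transcription": "Geoff Sample", "label": "name_value",
+     "points": [[184, 111], [468, 111], [468, 141], [184, 141]]},
+    {"transcription": "Sample Text", "label": "other",
+     "points": [[60, 170], [266, 170], [266, 200], [60, 200]]},
+]
+
+os.makedirs("train_data", exist_ok=True)
+with open("train_data/ser_train.txt", "w", encoding="utf-8") as f:
+    f.write("images/id_card_demo.jpg" + "\t" + json.dumps(annotations, ensure_ascii=False) + "\n")
+```
+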
+
+**Note:**
+
+* In the labeling process, text content that is not key information for KIE should be labeled as `other`, which is equivalent to background information. For example, in the ID card scenario, if we do not pay attention to the `DOB` information, we can mark the categories of `DOB` and `Area manager` as `other`.
+* In the annotation process, it is required to annotate the position of the **textline** rather than single characters.
+
+In terms of data, generally speaking, for relatively fixed scenes, **50** training images can achieve acceptable effects. You can refer to [PPOCRLabel](https://github.com/PFCCLab/PPOCRLabel/blob/main/README.md) to finish the labeling process.
+
+In terms of model, it is recommended to use the VI-LayoutXLM model proposed in PP-StructureV2. It is improved based on the LayoutXLM model, removing the visual feature extraction module and further improving the model inference speed without a significant reduction in model accuracy. For more tutorials, please refer to [VI-LayoutXLM introduction](../../doc/doc_en/algorithm_kie_vi_layoutxlm_en.md) and [KIE tutorial](../../doc/doc_en/kie_en.md).
+
+#### 2.2.2 SER + RE
+
+The SER model is mainly used to identify all keys and values in the document image, and the RE model is mainly used to match all keys with their values.
+
+Taking the ID card scenario as an example, the key information generally includes `name`, `DOB`, etc. In the SER stage, we need to identify all questions (keys) and answers (values). The demo annotation is as follows: all keys can be annotated as `question`, and all values can be annotated as `answer`.
+
+![](./images/184526785-c3d2d310-cd57-4d31-b933-912716b29856.jpg)
+
+In the RE stage, the ID and connection information of each field need to be marked, as shown in the following figure.
+
+![](./images/184528728-626f77eb-fd9f-4709-a7dc-5411cc417dab.jpg)
+
+For each textline, you need to add 'ID' and 'linking' field information. The 'ID' records the unique identifier of the textline; different text lines in the same image cannot share the same ID. The 'linking' is a list that records the connection information between different texts. If the ID of the field "name" is 0 and the ID of the field "Geoff Sample" is 1, then they both carry the 'linking' mark [[0, 1]], indicating that the fields with `id=0` and `id=1` form a key-value pair (the fields such as DOB and Expires are similar, and will not be repeated here).
+
+**Note:**
+
+During annotation, if the value spans multiple text lines, an extra key-value pair can be added in linking, such as `[[0, 1], [0, 2]]`.
+
+In terms of data, generally speaking, for relatively fixed scenes, about **50** training images can achieve acceptable effects.
+
+In terms of model, it is recommended to use the VI-LayoutXLM model proposed in PP-StructureV2. It is improved based on the LayoutXLM model, removing the visual feature extraction module and further improving the model inference speed without a significant reduction in model accuracy. For more tutorials, please refer to [VI-LayoutXLM introduction](../../doc/doc_en/algorithm_kie_vi_layoutxlm_en.md) and [KIE tutorial](../../doc/doc_en/kie_en.md).
+
+## 3. Reference
+
+[1] Katti A R, Reisswig C, Guder C, et al. Chargrid: Towards understanding 2d documents[J]. arXiv preprint arXiv:1809.08799, 2018.
+
+[2] Xu Y, Li M, Cui L, et al. Layoutlm: Pre-training of text and layout for document image understanding[C]//Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 
2020: 1192-1200.
+
+[3] Xu Y, Xu Y, Lv T, et al. LayoutLMv2: Multi-modal pre-training for visually-rich document understanding[J]. arXiv preprint arXiv:2012.14740, 2020.
+
+[4] Xu Y, Lv T, Cui L, et al. Layoutxlm: Multimodal pre-training for multilingual visually-rich document understanding[J]. arXiv preprint arXiv:2104.08836, 2021.
+
+[5] Li Y, Qian Y, Yu Y, et al. StrucTexT: Structured Text Understanding with Multi-Modal Transformers[C]//Proceedings of the 29th ACM International Conference on Multimedia. 2021: 1912-1920.
+
+[6] Liu X, Gao F, Zhang Q, et al. Graph convolution for multimodal information extraction from visually rich documents[J]. arXiv preprint arXiv:1903.11279, 2019.
+
+[7] Sun H, Kuang Z, Yue X, et al. Spatial Dual-Modality Graph Reasoning for Key Information Extraction[J]. arXiv preprint arXiv:2103.14470, 2021.
+
+[8] Zhang P, Xu Y, Cheng Z, et al. Trie: End-to-end text reading and information extraction for document understanding[C]//Proceedings of the 28th ACM International Conference on Multimedia. 2020: 1413-1422.
diff --git a/docs/ppstructure/blog/how_to_do_kie.md b/docs/ppstructure/blog/how_to_do_kie.md
new file mode 100644
index 0000000000..51ff3a7337
--- /dev/null
+++ b/docs/ppstructure/blog/how_to_do_kie.md
@@ -0,0 +1,146 @@
+---
+typora-copy-images-to: images
+comments: true
+---
+
+# 怎样完成基于图像数据的信息抽取任务
+
+## 1. 简介
+
+### 1.1 背景
+
+关键信息抽取 (Key Information Extraction, KIE)指的是从文本或者图像中,抽取出关键的信息。针对文档图像的关键信息抽取任务作为OCR的下游任务,存在非常多的实际应用场景,如表单识别、车票信息抽取、身份证信息抽取等。然而,使用人力从这些文档图像中提取或者收集关键信息耗时费力,怎样自动化融合图像中的视觉、布局、文字等特征并完成关键信息抽取是一个价值与挑战并存的问题。
+
+对于特定场景的文档图像,其中的关键信息位置、版式等较为固定,因此在研究早期有很多基于模板匹配的方法进行关键信息的抽取,考虑到其流程较为简单,该方法仍然被广泛应用在目前的很多场景中。但是这种基于模板匹配的方法在应用到不同的场景中时,需要耗费大量精力去调整与适配模板,迁移成本较高。
+
+文档图像中的KIE一般包含2个子任务,示意图如下图所示。
+
+* (1)SER: 语义实体识别 (Semantic Entity Recognition),对每一个检测到的文本进行分类,如将其分为姓名、身份证。如下图中的黑色框和红色框。
+* (2)RE: 关系抽取 (Relation Extraction),对每一个检测到的文本进行分类,如将其分为问题 (key) 和答案 (value) 。然后对每一个问题找到对应的答案,相当于完成key-value的匹配过程。如下图中的红色框和黑色框分别代表问题和答案,黄色线代表问题和答案之间的对应关系。
+
+![](./images/184588654-d87f54f3-13ab-42c4-afc0-da79bead3f14.png)
+
+### 1.2 基于深度学习的主流方法
+
+一般的KIE方法基于命名实体识别(Named Entity Recognition,NER)来展开研究,但是此类方法仅使用了文本信息而忽略了位置与视觉特征信息,因此精度受限。近几年大多数学者开始融合多个模态的输入信息,进行特征融合,并对多模态信息进行处理,从而提升KIE的精度。主要方法有以下几种
+
+* (1)基于Grid的方法:此类方法主要关注图像层面多模态信息的融合,文本大多为字符粒度,对文本与结构信息的嵌入方式较为简单,如Chargrid[1]等算法。
+* (2)基于Token的方法:此类方法参考NLP中的BERT等方法,将位置、视觉等特征信息共同编码到多模态模型中,并且在大规模数据集上进行预训练,从而在下游任务中,仅需要少量的标注数据便可以获得很好的效果。如LayoutLM[2], LayoutLMv2[3], LayoutXLM[4], StrucText[5]等算法。
+* (3)基于GCN的方法:此类方法尝试学习图像、文字之间的结构信息,从而可以解决开集信息抽取的问题(训练集中没有见过的模板),如GCN[6]、SDMGR[7]等算法。
+* (4)基于End-to-end的方法:此类方法将现有的OCR文字识别以及KIE信息抽取2个任务放在一个统一的网络中进行共同学习,并在学习过程中相互加强。如Trie[8]等算法。
+
+更多关于该系列算法的详细介绍,请参考"动手学OCR·十讲"课程的课节六部分:[文档分析理论与实践](https://aistudio.baidu.com/aistudio/education/group/info/25207)。
+
+## 2. 
关键信息抽取任务流程 + +PaddleOCR中实现了LayoutXLM等算法(基于Token),同时,在PP-StructureV2中,对LayoutXLM多模态预训练模型的网络结构进行简化,去除了其中的Visual backbone部分,设计了视觉无关的VI-LayoutXLM模型,同时引入符合人类阅读顺序的排序逻辑以及UDML知识蒸馏策略,最终同时提升了关键信息抽取模型的精度与推理速度。 + +下面介绍怎样基于PaddleOCR完成关键信息抽取任务。 + +在非End-to-end的KIE方法中,完成关键信息抽取,至少需要**2个步骤**:首先使用OCR模型,完成文字位置与内容的提取,然后使用KIE模型,根据图像、文字位置以及文字内容,提取出其中的关键信息。 + +### 2.1 训练OCR模型 + +#### 2.1.1 文本检测 + +##### (1)数据 + +PaddleOCR中提供的模型大多数为通用模型,在进行文本检测的过程中,相邻文本行的检测一般是根据位置的远近进行区分,如上图,使用PP-OCRv3通用中英文检测模型进行文本检测时,容易将”民族“与“汉”这2个代表不同的字段检测到一起,从而增加后续KIE任务的难度。因此建议在做KIE任务的过程中,首先训练一个针对该文档数据集的检测模型。 + +在数据标注时,关键信息的标注需要隔开,比上图中的 “民族汉” 3个字相隔较近,此时需要将”民族“与”汉“标注为2个文本检测框,否则会增加后续KIE任务的难度。 + +对于下游任务,一般来说,`200~300`张的文本训练数据即可保证基本的训练效果,如果没有太多的先验知识,可以先标注 **`200~300`** 张图片,进行后续文本检测模型的训练。 + +##### (2)模型 + +在模型选择方面,推荐使用PP-OCRv3_det,关于更多关于检测模型的训练方法介绍,请参考:[OCR文本检测模型训练教程](../../ppocr/model_train/detection.md)。 + +#### 2.1.2 文本识别 + +相对自然场景,文档图像中的文本内容识别难度一般相对较低(背景相对不太复杂),因此**优先建议**尝试PaddleOCR中提供的PP-OCRv3通用文本识别模型([PP-OCRv3模型库链接](../../ppocr/model_list.md))。 + +##### (1)数据 + +然而,在部分文档场景中也会存在一些挑战,如身份证场景中存在着罕见字,在发票等场景中的字体比较特殊,这些问题都会增加文本识别的难度,此时如果希望保证或者进一步提升模型的精度,建议基于特定文档场景的文本识别数据集,加载PP-OCRv3模型进行微调。 + +在模型微调的过程中,建议准备至少`5000`张垂类场景的文本识别图像,可以保证基本的模型微调效果。如果希望提升模型的精度与泛化能力,可以合成更多与该场景类似的文本识别数据,从公开数据集中收集通用真实文本识别数据,一并添加到该场景的文本识别训练任务过程中。在训练过程中,建议每个epoch的真实垂类数据、合成数据、通用数据比例在`1:1:1`左右,这可以通过设置不同数据源的采样比例进行控制。如有3个训练文本文件,分别包含1W、2W、5W条数据,那么可以在配置文件中设置数据如下: + +```yaml linenums="1" linenums="1" +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: + - ./train_data/train_list_1W.txt + - ./train_data/train_list_2W.txt + - ./train_data/train_list_5W.txt + ratio_list: [1.0, 0.5, 0.2] + ... +``` + +##### (2)模型 + +在模型选择方面,推荐使用通用中英文文本识别模型PP-OCRv3_rec,关于更多关于文本识别模型的训练方法介绍,请参考:[OCR文本识别模型训练教程](../../ppocr/model_train/recognition.md)。 + +### 2.2 训练KIE模型 + +对于识别得到的文字进行关键信息抽取,有2种主要的方法。 + +(1)直接使用SER,获取关键信息的类别:如身份证场景中,将“姓名“与”张三“分别标记为`name_key`与`name_value`。最终识别得到的类别为`name_value`对应的**文本字段**即为我们所需要的关键信息。 + +(2)联合SER与RE进行使用:这种方法中,首先使用SER,获取图像文字内容中所有的key与value,然后使用RE方法,对所有的key与value进行配对,找到映射关系,从而完成关键信息的抽取。 + +#### 2.2.1 SER + +以身份证场景为例, 关键信息一般包含`姓名`、`性别`、`民族`等,我们直接将对应的字段标注为特定的类别即可,如下图所示。 + +![](./images/184526682-8b810397-5a93-4395-93da-37b8b8494c41.png) + +**注意:** + +* 标注过程中,对于无关于KIE关键信息的文本内容,均需要将其标注为`other`类别,相当于背景信息。如在身份证场景中,如果我们不关注性别信息,那么可以将“性别”与“男”这2个字段的类别均标注为`other`。 +* 标注过程中,需要以**文本行**为单位进行标注,无需标注单个字符的位置信息。 + +数据量方面,一般来说,对于比较固定的场景,**50张**左右的训练图片即可达到可以接受的效果,可以使用[PPOCRLabel](https://github.com/PFCCLab/PPOCRLabel/blob/main/README_ch.md)完成KIE的标注过程。 + +模型方面,推荐使用PP-StructureV2中提出的VI-LayoutXLM模型,它基于LayoutXLM模型进行改进,去除其中的视觉特征提取模块,在精度基本无损的情况下,进一步提升了模型推理速度。更多教程请参考:[VI-LayoutXLM算法介绍](../../algorithm/kie/algorithm_kie_layoutxlm.md)与[KIE关键信息抽取使用教程](../model_train/train_kie.md)。 + +#### 2.2.2 SER + RE + +该过程主要包含SER与RE 2个过程。SER阶段主要用于识别出文档图像中的所有key与value,RE阶段主要用于对所有的key与value进行匹配。 + +以身份证场景为例, 关键信息一般包含`姓名`、`性别`、`民族`等关键信息,在SER阶段,我们需要识别所有的question (key) 与answer (value) 。标注如下所示。每个字段的类别信息(`label`字段)可以是question、answer或者other(与待抽取的关键信息无关的字段) + +![](./images/184526785-c3d2d310-cd57-4d31-b933-912716b29856.jpg) + +在RE阶段,需要标注每个字段的的id与连接信息,如下图所示。 + +![](./images/184528728-626f77eb-fd9f-4709-a7dc-5411cc417dab.jpg) + +每个文本行字段中,需要添加`id`与`linking`字段信息,`id`记录该文本行的唯一标识,同一张图片中的不同文本内容不能重复,`linking`是一个列表,记录了不同文本之间的连接信息。如字段“出生”的id为0,字段“1996年1月11日”的id为1,那么它们均有[[0, 1]]的`linking`标注,表示该id=0与id=1的字段构成key-value的关系(姓名、性别等字段类似,此处不再一一赘述)。 + +**注意:** + +* 标注过程中,如果value是多个字符,那么linking中可以新增一个key-value对,如`[[0, 1], [0, 2]]` + +数据量方面,一般来说,对于比较固定的场景,**50张**左右的训练图片即可达到可以接受的效果,可以使用PPOCRLabel完成KIE的标注过程。 + 
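+为了更直观地说明上文提到的`id`与`linking`字段,下面给出标注文件中一行内容的简化示例(这里假设采用PaddleOCR KIE示例中XFUND风格的"图片路径\t标注JSON"格式,其中的图片名、坐标与文字内容均为示意用的假设值,实际字段请以PPOCRLabel导出的结果为准):
+
+```python linenums="1"
+import json
+import os
+
+# 一张图片对应标注文件中的一行:图片路径 \t 文本行标注列表(JSON)
+# 下面的图片名、坐标与文字内容均为示意用的假设值
+annotations = [
+    {"transcription": "出生", "label": "question", "id": 0, "linking": [[0, 1]],
+     "points": [[60, 111], [130, 111], [130, 141], [60, 141]]},
+    {"transcription": "1996年1月11日", "label": "answer", "id": 1, "linking": [[0, 1]],
+     "points": [[150, 111], [370, 111], [370, 141], [150, 141]]},
+]
+
+os.makedirs("train_data", exist_ok=True)
+with open("train_data/re_train.txt", "w", encoding="utf-8") as f:
+    f.write("images/id_card_demo.jpg" + "\t" + json.dumps(annotations, ensure_ascii=False) + "\n")
+```
+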
+模型方面,推荐使用PP-StructureV2中提出的VI-LayoutXLM模型,它基于LayoutXLM模型进行改进,去除其中的视觉特征提取模块,在精度基本无损的情况下,进一步提升了模型推理速度。更多教程请参考:[VI-LayoutXLM算法介绍](../../algorithm/kie/algorithm_kie_layoutxlm.md)与[KIE关键信息抽取使用教程](../model_train/train_kie.md)。 + +## 3. 参考文献 + +[1] Katti A R, Reisswig C, Guder C, et al. Chargrid: Towards understanding 2d documents[J]. arXiv preprint arXiv:1809.08799, 2018. + +[2] Xu Y, Li M, Cui L, et al. Layoutlm: Pre-training of text and layout for document image understanding[C]//Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 2020: 1192-1200. + +[3] Xu Y, Xu Y, Lv T, et al. LayoutLMv2: Multi-modal pre-training for visually-rich document understanding[J]. arXiv preprint arXiv:2012.14740, 2020. + +[4]: Xu Y, Lv T, Cui L, et al. Layoutxlm: Multimodal pre-training for multilingual visually-rich document understanding[J]. arXiv preprint arXiv:2104.08836, 2021. + +[5] Li Y, Qian Y, Yu Y, et al. StrucTexT: Structured Text Understanding with Multi-Modal Transformers[C]//Proceedings of the 29th ACM International Conference on Multimedia. 2021: 1912-1920. + +[6] Liu X, Gao F, Zhang Q, et al. Graph convolution for multimodal information extraction from visually rich documents[J]. arXiv preprint arXiv:1903.11279, 2019. + +[7] Sun H, Kuang Z, Yue X, et al. Spatial Dual-Modality Graph Reasoning for Key Information Extraction[J]. arXiv preprint arXiv:2103.14470, 2021. + +[8] Zhang P, Xu Y, Cheng Z, et al. Trie: End-to-end text reading and information extraction for document understanding[C]//Proceedings of the 28th ACM International Conference on Multimedia. 2020: 1413-1422. diff --git a/docs/ppstructure/blog/images/184526682-8b810397-5a93-4395-93da-37b8b8494c41.png b/docs/ppstructure/blog/images/184526682-8b810397-5a93-4395-93da-37b8b8494c41.png new file mode 100644 index 0000000000..dfea9eacbb Binary files /dev/null and b/docs/ppstructure/blog/images/184526682-8b810397-5a93-4395-93da-37b8b8494c41.png differ diff --git a/docs/ppstructure/blog/images/184526785-c3d2d310-cd57-4d31-b933-912716b29856.jpg b/docs/ppstructure/blog/images/184526785-c3d2d310-cd57-4d31-b933-912716b29856.jpg new file mode 100644 index 0000000000..c2ef8f5226 Binary files /dev/null and b/docs/ppstructure/blog/images/184526785-c3d2d310-cd57-4d31-b933-912716b29856.jpg differ diff --git a/docs/ppstructure/blog/images/184528728-626f77eb-fd9f-4709-a7dc-5411cc417dab.jpg b/docs/ppstructure/blog/images/184528728-626f77eb-fd9f-4709-a7dc-5411cc417dab.jpg new file mode 100644 index 0000000000..dcd4e52bdc Binary files /dev/null and b/docs/ppstructure/blog/images/184528728-626f77eb-fd9f-4709-a7dc-5411cc417dab.jpg differ diff --git a/docs/ppstructure/blog/images/184588654-d87f54f3-13ab-42c4-afc0-da79bead3f14.png b/docs/ppstructure/blog/images/184588654-d87f54f3-13ab-42c4-afc0-da79bead3f14.png new file mode 100644 index 0000000000..845aeb3ab2 Binary files /dev/null and b/docs/ppstructure/blog/images/184588654-d87f54f3-13ab-42c4-afc0-da79bead3f14.png differ diff --git a/docs/ppstructure/blog/images/3c200538-f2e6-4d79-847a-4c4587efa9f0.jpeg b/docs/ppstructure/blog/images/3c200538-f2e6-4d79-847a-4c4587efa9f0.jpeg new file mode 100644 index 0000000000..e19f99103a Binary files /dev/null and b/docs/ppstructure/blog/images/3c200538-f2e6-4d79-847a-4c4587efa9f0.jpeg differ diff --git a/docs/ppstructure/blog/images/799450d4-d2c5-4b61-b490-e160dc0f515c.jpeg b/docs/ppstructure/blog/images/799450d4-d2c5-4b61-b490-e160dc0f515c.jpeg new file mode 100644 index 0000000000..cf51f42c7a Binary files /dev/null and 
b/docs/ppstructure/blog/images/799450d4-d2c5-4b61-b490-e160dc0f515c.jpeg differ
diff --git a/docs/ppstructure/blog/images/d0858341-a889-483c-8373-5ecaa57f3b20.png b/docs/ppstructure/blog/images/d0858341-a889-483c-8373-5ecaa57f3b20.png
new file mode 100644
index 0000000000..a1ea4f87a9
Binary files /dev/null and b/docs/ppstructure/blog/images/d0858341-a889-483c-8373-5ecaa57f3b20.png differ
diff --git a/docs/ppstructure/blog/return_word_pos.en.md b/docs/ppstructure/blog/return_word_pos.en.md
new file mode 100644
index 0000000000..7b26541e61
--- /dev/null
+++ b/docs/ppstructure/blog/return_word_pos.en.md
@@ -0,0 +1,100 @@
+---
+typora-copy-images-to: images
+comments: true
+---
+
+# Return recognition position
+
+For horizontally laid out documents, the recognition model returns not only the recognized content but also the position of each word.
+
+## English document recovery
+
+### Download the inference model first
+
+```bash linenums="1"
+cd PaddleOCR/ppstructure
+
+## download model
+mkdir inference && cd inference
+## Download the detection model of the ultra-lightweight English PP-OCRv3 model and unzip it
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar
+## Download the recognition model of the ultra-lightweight English PP-OCRv3 model and unzip it
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar
+## Download the ultra-lightweight English table recognition model and unzip it
+wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar
+tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar
+## Download the layout model of the publaynet dataset and unzip it
+wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar
+tar xf picodet_lcnet_x1_0_fgd_layout_infer.tar
+cd .. 
+```
+
+### Then run inference with the following command in the /ppstructure/ directory
+
+```bash linenums="1"
+python predict_system.py \
+--image_dir=./docs/table/1.png \
+--det_model_dir=inference/en_PP-OCRv3_det_infer \
+--rec_model_dir=inference/en_PP-OCRv3_rec_infer \
+--rec_char_dict_path=../ppocr/utils/en_dict.txt \
+--table_model_dir=inference/en_ppstructure_mobile_v2.0_SLANet_infer \
+--table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \
+--layout_model_dir=inference/picodet_lcnet_x1_0_fgd_layout_infer \
+--layout_dict_path=../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt \
+--vis_font_path=../doc/fonts/simfang.ttf \
+--recovery=True \
+--output=../output/ \
+--return_word_box=True
+```
+
+### View the visualization of the inference results under `../output/structure/1/show_0.jpg`, as shown below
+
+![show_0_mdf_v2](./images/799450d4-d2c5-4b61-b490-e160dc0f515c.jpeg)
+
+## Chinese document recovery
+
+### Download the inference model first
+
+```bash linenums="1"
+cd PaddleOCR/ppstructure
+
+## download model
+cd inference
+## Download the detection model of the ultra-lightweight Chinese PP-OCRv3 model and unzip it
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar
+## Download the recognition model of the ultra-lightweight Chinese PP-OCRv3 model and unzip it
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar
+## Download the ultra-lightweight Chinese table recognition model and unzip it
+wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar
+tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar
+## Download the layout model of the CDLA dataset and unzip it
+wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar
+tar xf picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar
+cd .. 
+```
+
+### Upload the following test image "2.png" to the directory ./docs/table/
+
+![2](./images/d0858341-a889-483c-8373-5ecaa57f3b20.png)
+
+### Then run inference with the following command in the /ppstructure/ directory
+
+```bash linenums="1"
+python predict_system.py \
+--image_dir=./docs/table/2.png \
+--det_model_dir=inference/ch_PP-OCRv3_det_infer \
+--rec_model_dir=inference/ch_PP-OCRv3_rec_infer \
+--rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \
+--table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \
+--table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \
+--layout_model_dir=inference/picodet_lcnet_x1_0_fgd_layout_cdla_infer \
+--layout_dict_path=../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt \
+--vis_font_path=../doc/fonts/chinese_cht.ttf \
+--recovery=True \
+--output=../output/ \
+--return_word_box=True
+```
+
+### View the visualization of the inference results under `../output/structure/2/show_0.jpg`, as shown below
+
+![show_1_mdf_v2](./images/3c200538-f2e6-4d79-847a-4c4587efa9f0.jpeg)
diff --git a/docs/ppstructure/blog/return_word_pos.md b/docs/ppstructure/blog/return_word_pos.md
new file mode 100644
index 0000000000..30c962ad0c
--- /dev/null
+++ b/docs/ppstructure/blog/return_word_pos.md
@@ -0,0 +1,100 @@
+---
+typora-copy-images-to: images
+comments: true
+---
+
+# 返回识别位置
+
+针对横排的文档,识别模型不仅返回识别的内容,还返回每个文字的位置。
+
+## 英文文档恢复
+
+### 先下载推理模型
+
+```bash linenums="1"
+cd PaddleOCR/ppstructure
+
+## download model
+mkdir inference && cd inference
+## Download the detection model of the ultra-lightweight English PP-OCRv3 model and unzip it
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar
+## Download the recognition model of the ultra-lightweight English PP-OCRv3 model and unzip it
+wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar
+## Download the ultra-lightweight English table recognition model and unzip it
+wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar
+tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar
+## Download the layout model of the publaynet dataset and unzip it
+wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar
+tar xf picodet_lcnet_x1_0_fgd_layout_infer.tar
+cd .. 
```

### 然后在/ppstructure/目录下使用下面的指令推理

```bash linenums="1"
python predict_system.py \
    --image_dir=./docs/table/1.png \
    --det_model_dir=inference/en_PP-OCRv3_det_infer \
    --rec_model_dir=inference/en_PP-OCRv3_rec_infer \
    --rec_char_dict_path=../ppocr/utils/en_dict.txt \
    --table_model_dir=inference/en_ppstructure_mobile_v2.0_SLANet_infer \
    --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \
    --layout_model_dir=inference/picodet_lcnet_x1_0_fgd_layout_infer \
    --layout_dict_path=../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt \
    --vis_font_path=../doc/fonts/simfang.ttf \
    --recovery=True \
    --output=../output/ \
    --return_word_box=True
```

### 在`../output/structure/1/show_0.jpg`下查看推理结果的可视化,如下图所示

![show_0_mdf_v2](./images/799450d4-d2c5-4b61-b490-e160dc0f515c.jpeg)

## 针对中文文档恢复

### 先下载推理模型

```bash linenums="1"
cd PaddleOCR/ppstructure

## download model
cd inference
## Download the detection model of the ultra-lightweight Chinese PP-OCRv3 model and unzip it
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar
## Download the recognition model of the ultra-lightweight Chinese PP-OCRv3 model and unzip it
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar
## Download the ultra-lightweight Chinese table recognition model and unzip it
wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar
tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar
## Download the layout model trained on the CDLA dataset and unzip it
wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar
tar xf picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar
cd ..
+``` + +### 上传下面的测试图片 "2.png" 至目录 ./docs/table/ 中 + +![2](./images/d0858341-a889-483c-8373-5ecaa57f3b20.png) + +### 然后在/ppstructure/目录下使用下面的指令推理 + +```bash linenums="1" +python predict_system.py \ + --image_dir=./docs/table/2.png \ + --det_model_dir=inference/ch_PP-OCRv3_det_infer \ + --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \ + --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ + --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \ + --layout_model_dir=inference/picodet_lcnet_x1_0_fgd_layout_cdla_infer \ + --layout_dict_path=../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt \ + --vis_font_path=../doc/fonts/chinese_cht.ttf \ + --recovery=True \ + --output=../output/ \ + --return_word_box=True +``` + +### 在`../output/structure/2/show_0.jpg`下查看推理结果的可视化,如下图所示 + +![show_1_mdf_v2](./images/3c200538-f2e6-4d79-847a-4c4587efa9f0.jpeg) diff --git a/docs/ppstructure/images/185310636-6ce02f7c-790d-479f-b163-ea97a5a04808-20240705094001639.jpg b/docs/ppstructure/images/185310636-6ce02f7c-790d-479f-b163-ea97a5a04808-20240705094001639.jpg new file mode 100644 index 0000000000..6a5fd84c52 Binary files /dev/null and b/docs/ppstructure/images/185310636-6ce02f7c-790d-479f-b163-ea97a5a04808-20240705094001639.jpg differ diff --git a/docs/ppstructure/images/185393805-c67ff571-cf7e-4217-a4b0-8b396c4f22bb-20240705094037073.jpg b/docs/ppstructure/images/185393805-c67ff571-cf7e-4217-a4b0-8b396c4f22bb-20240705094037073.jpg new file mode 100644 index 0000000000..d406a52da8 Binary files /dev/null and b/docs/ppstructure/images/185393805-c67ff571-cf7e-4217-a4b0-8b396c4f22bb-20240705094037073.jpg differ diff --git a/docs/ppstructure/images/185539141-68e71c75-5cf7-4529-b2ca-219d29fa5f68-20240705093932704.jpg b/docs/ppstructure/images/185539141-68e71c75-5cf7-4529-b2ca-219d29fa5f68-20240705093932704.jpg new file mode 100644 index 0000000000..42930f0e8a Binary files /dev/null and b/docs/ppstructure/images/185539141-68e71c75-5cf7-4529-b2ca-219d29fa5f68-20240705093932704.jpg differ diff --git a/docs/ppstructure/images/185539517-ccf2372a-f026-4a7c-ad28-c741c770f60a-20240705094013236.png b/docs/ppstructure/images/185539517-ccf2372a-f026-4a7c-ad28-c741c770f60a-20240705094013236.png new file mode 100644 index 0000000000..cd5bc50421 Binary files /dev/null and b/docs/ppstructure/images/185539517-ccf2372a-f026-4a7c-ad28-c741c770f60a-20240705094013236.png differ diff --git a/docs/ppstructure/images/185540080-0431e006-9235-4b6d-b63d-0b3c6e1de48f-20240705094043151.jpg b/docs/ppstructure/images/185540080-0431e006-9235-4b6d-b63d-0b3c6e1de48f-20240705094043151.jpg new file mode 100644 index 0000000000..bdef2ed23b Binary files /dev/null and b/docs/ppstructure/images/185540080-0431e006-9235-4b6d-b63d-0b3c6e1de48f-20240705094043151.jpg differ diff --git a/docs/ppstructure/images/186094813-3a8e16cc-42e5-4982-b9f4-0134dfb5688d.png b/docs/ppstructure/images/186094813-3a8e16cc-42e5-4982-b9f4-0134dfb5688d.png new file mode 100644 index 0000000000..44845fb5ce Binary files /dev/null and b/docs/ppstructure/images/186094813-3a8e16cc-42e5-4982-b9f4-0134dfb5688d.png differ diff --git a/docs/ppstructure/images/186095641-5843b4da-34d7-4c1c-943a-b1036a859fe3.png b/docs/ppstructure/images/186095641-5843b4da-34d7-4c1c-943a-b1036a859fe3.png new file mode 100644 index 0000000000..e47dbbb9c1 Binary files /dev/null and b/docs/ppstructure/images/186095641-5843b4da-34d7-4c1c-943a-b1036a859fe3.png differ diff --git 
a/docs/ppstructure/images/186095702-9acef674-12af-4d09-97fc-abf4ab32600e.png b/docs/ppstructure/images/186095702-9acef674-12af-4d09-97fc-abf4ab32600e.png new file mode 100644 index 0000000000..014ec50659 Binary files /dev/null and b/docs/ppstructure/images/186095702-9acef674-12af-4d09-97fc-abf4ab32600e.png differ diff --git a/docs/ppstructure/images/195265734-6f4b5a7f-59b1-4fcc-af6d-89afc9bd51e1-20240705140834325.jpg b/docs/ppstructure/images/195265734-6f4b5a7f-59b1-4fcc-af6d-89afc9bd51e1-20240705140834325.jpg new file mode 100644 index 0000000000..3bb2ddde11 Binary files /dev/null and b/docs/ppstructure/images/195265734-6f4b5a7f-59b1-4fcc-af6d-89afc9bd51e1-20240705140834325.jpg differ diff --git a/docs/ppstructure/images/195265734-6f4b5a7f-59b1-4fcc-af6d-89afc9bd51e1.jpg b/docs/ppstructure/images/195265734-6f4b5a7f-59b1-4fcc-af6d-89afc9bd51e1.jpg new file mode 100644 index 0000000000..3bb2ddde11 Binary files /dev/null and b/docs/ppstructure/images/195265734-6f4b5a7f-59b1-4fcc-af6d-89afc9bd51e1.jpg differ diff --git a/docs/ppstructure/images/197464552-69de557f-edff-4c7f-acbf-069df1ba097f.png b/docs/ppstructure/images/197464552-69de557f-edff-4c7f-acbf-069df1ba097f.png new file mode 100644 index 0000000000..7b4c271cf2 Binary files /dev/null and b/docs/ppstructure/images/197464552-69de557f-edff-4c7f-acbf-069df1ba097f.png differ diff --git a/docs/ppstructure/images/799450d4-d2c5-4b61-b490-e160dc0f515c.jpeg b/docs/ppstructure/images/799450d4-d2c5-4b61-b490-e160dc0f515c.jpeg new file mode 100644 index 0000000000..0a513420b9 Binary files /dev/null and b/docs/ppstructure/images/799450d4-d2c5-4b61-b490-e160dc0f515c.jpeg differ diff --git a/docs/ppstructure/images/ppstructure.gif b/docs/ppstructure/images/ppstructure.gif new file mode 100644 index 0000000000..bff836e3ea Binary files /dev/null and b/docs/ppstructure/images/ppstructure.gif differ diff --git a/docs/ppstructure/images/recovery.jpg b/docs/ppstructure/images/recovery.jpg new file mode 100644 index 0000000000..a3817ab70e Binary files /dev/null and b/docs/ppstructure/images/recovery.jpg differ diff --git a/docs/ppstructure/infer_deploy/cpp_infer.en.md b/docs/ppstructure/infer_deploy/cpp_infer.en.md new file mode 100644 index 0000000000..4ca649b0ab --- /dev/null +++ b/docs/ppstructure/infer_deploy/cpp_infer.en.md @@ -0,0 +1,433 @@ +--- +comments: true +--- + +# Server-side C++ Inference + +This chapter introduces the C++ deployment steps of the PaddleOCR model. C++ is better than Python in terms of performance. Therefore, in CPU and GPU deployment scenarios, C++ deployment is mostly used. +This section will introduce how to configure the C++ environment and deploy PaddleOCR in Linux (CPU\GPU) environment. For Windows deployment please refer to [Windows](../../ppocr/infer_deploy/windows_vs2019_build.en.md) compilation guidelines. + +## 1. Prepare the Environment + +### 1.1 Environment + +- Linux, docker is recommended. +- Windows. + +### 1.2 Compile OpenCV + +- First of all, you need to download the source code compiled package in the Linux environment from the OpenCV official website. Taking OpenCV 3.4.7 as an example, the download command is as follows. + +```bash linenums="1" +cd deploy/cpp_infer +wget https://paddleocr.bj.bcebos.com/libs/opencv/opencv-3.4.7.tar.gz +tar -xf opencv-3.4.7.tar.gz +``` + +Finally, you will see the folder of `opencv-3.4.7/` in the current directory. + +- Compile OpenCV, the OpenCV source path (`root_path`) and installation path (`install_path`) should be set by yourself. 
Enter the OpenCV source code path and compile it in the following way. + +```bash linenums="1" +root_path=your_opencv_root_path +install_path=${root_path}/opencv3 + +rm -rf build +mkdir build +cd build + +cmake .. \ + -DCMAKE_INSTALL_PREFIX=${install_path} \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_SHARED_LIBS=OFF \ + -DWITH_IPP=OFF \ + -DBUILD_IPP_IW=OFF \ + -DWITH_LAPACK=OFF \ + -DWITH_EIGEN=OFF \ + -DCMAKE_INSTALL_LIBDIR=lib64 \ + -DWITH_ZLIB=ON \ + -DBUILD_ZLIB=ON \ + -DWITH_JPEG=ON \ + -DBUILD_JPEG=ON \ + -DWITH_PNG=ON \ + -DBUILD_PNG=ON \ + -DWITH_TIFF=ON \ + -DBUILD_TIFF=ON + +make -j +make install +``` + +In the above commands, `root_path` is the downloaded OpenCV source code path, and `install_path` is the installation path of OpenCV. After `make install` is completed, the OpenCV header file and library file will be generated in this folder for later OCR source code compilation. + +The final file structure under the OpenCV installation path is as follows. + +``` +opencv3/ +|-- bin +|-- include +|-- lib +|-- lib64 +|-- share +``` + +### 1.3 Compile or Download or the Paddle Inference Library + +- There are 2 ways to obtain the Paddle inference library, described in detail below. + +#### 1.3.1 Direct download and installation + +[Paddle inference library official website](https://www.paddlepaddle.org.cn/inference/master/guides/install/download_lib.html#linux). You can review and select the appropriate version of the inference library on the official website. + +- After downloading, use the following command to extract files. + +```bash linenums="1" +tar -xf paddle_inference.tgz +``` + +Finally you will see the folder of `paddle_inference/` in the current path. + +#### 1.3.2 Compile the inference source code + +- If you want to get the latest Paddle inference library features, you can download the latest code from Paddle GitHub repository and compile the inference library from the source code. It is recommended to download the inference library with paddle version greater than or equal to 2.0.1. + +- You can refer to [Paddle inference library](https://www.paddlepaddle.org.cn/documentation/docs/en/advanced_guide/inference_deployment/inference/build_and_install_lib_en.html) to get the Paddle source code from GitHub, and then compile To generate the latest inference library. The method of using git to access the code is as follows. + +```bash linenums="1" +git clone https://github.com/PaddlePaddle/Paddle.git +git checkout develop +``` + +- Enter the Paddle directory and run the following commands to compile the paddle inference library. + +```bash linenums="1" +rm -rf build +mkdir build +cd build + +cmake .. \ + -DWITH_CONTRIB=OFF \ + -DWITH_MKL=ON \ + -DWITH_MKLDNN=ON \ + -DWITH_TESTING=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_INFERENCE_API_TEST=OFF \ + -DON_INFER=ON \ + -DWITH_PYTHON=ON +make -j +make inference_lib_dist +``` + +For more compilation parameter options, please refer to the [document](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.0/guides/05_inference_deployment/inference/build_and_install_lib_cn.html#congyuanmabianyi). + +- After the compilation process, you can see the following files in the folder of `build/paddle_inference_install_dir/`. + +```text linenums="1" +build/paddle_inference_install_dir/ +|-- CMakeCache.txt +|-- paddle +|-- third_party +|-- version.txt +``` + +`paddle` is the Paddle library required for C++ prediction later, and `version.txt` contains the version information of the current inference library. + +## 2. 
Compile and Run the Demo + +### 2.1 Export the inference model + +- You can refer to [Model inference](./python_infer.en.md) and export the inference model. After the model is exported, assuming it is placed in the `inference` directory, the directory structure is as follows. + +```text linenums="1" +inference/ +|-- det_db +| |--inference.pdiparams +| |--inference.pdmodel +|-- rec_rcnn +| |--inference.pdiparams +| |--inference.pdmodel +|-- cls +| |--inference.pdiparams +| |--inference.pdmodel +|-- table +| |--inference.pdiparams +| |--inference.pdmodel +|-- layout +| |--inference.pdiparams +| |--inference.pdmodel +``` + +### 2.2 Compile PaddleOCR C++ inference demo + +- The compilation commands are as follows. The addresses of Paddle C++ inference library, opencv and other Dependencies need to be replaced with the actual addresses on your own machines. + +```bash linenums="1" +sh tools/build.sh +``` + +Specifically, you should modify the paths in `tools/build.sh`. The related content is as follows. + +```bash linenums="1" +OPENCV_DIR=your_opencv_dir +LIB_DIR=your_paddle_inference_dir +CUDA_LIB_DIR=your_cuda_lib_dir +CUDNN_LIB_DIR=your_cudnn_lib_dir +``` + +`OPENCV_DIR` is the OpenCV installation path; `LIB_DIR` is the download (`paddle_inference` folder) +or the generated Paddle inference library path (`build/paddle_inference_install_dir` folder); +`CUDA_LIB_DIR` is the CUDA library file path, in docker; it is `/usr/local/cuda/lib64`; `CUDNN_LIB_DIR` is the cuDNN library file path, in docker it is `/usr/lib/x86_64-linux-gnu/`. + +- After the compilation is completed, an executable file named `ppocr` will be generated in the `build` folder. + +### 2.3 Run the demo + +Execute the built executable file: + +```bash linenums="1" +./build/ppocr [--param1] [--param2] [...] +``` + +**Note**:ppocr uses the `PP-OCRv3` model by default, and the input shape used by the recognition model is `3, 48, 320`, if you want to use the old version model, you should add the parameter `--rec_img_h=32`. + +Specifically, + +#### 1. det+cls+rec + +```bash linenums="1" +./build/ppocr --det_model_dir=inference/det_db \ + --rec_model_dir=inference/rec_rcnn \ + --cls_model_dir=inference/cls \ + --image_dir=../../doc/imgs/12.jpg \ + --use_angle_cls=true \ + --det=true \ + --rec=true \ + --cls=true \ +``` + +##### 2. det+rec + +```bash linenums="1" +./build/ppocr --det_model_dir=inference/det_db \ + --rec_model_dir=inference/rec_rcnn \ + --image_dir=../../doc/imgs/12.jpg \ + --use_angle_cls=false \ + --det=true \ + --rec=true \ + --cls=false \ +``` + +##### 3. det + +```bash linenums="1" +./build/ppocr --det_model_dir=inference/det_db \ + --image_dir=../../doc/imgs/12.jpg \ + --det=true \ + --rec=false +``` + +##### 4. cls+rec + +```bash linenums="1" +./build/ppocr --rec_model_dir=inference/rec_rcnn \ + --cls_model_dir=inference/cls \ + --image_dir=../../doc/imgs_words/ch/word_1.jpg \ + --use_angle_cls=true \ + --det=false \ + --rec=true \ + --cls=true \ +``` + +##### 5. rec + +```bash linenums="1" +./build/ppocr --rec_model_dir=inference/rec_rcnn \ + --image_dir=../../doc/imgs_words/ch/word_1.jpg \ + --use_angle_cls=false \ + --det=false \ + --rec=true \ + --cls=false \ +``` + +##### 6. cls + +```bash linenums="1" +./build/ppocr --cls_model_dir=inference/cls \ + --cls_model_dir=inference/cls \ + --image_dir=../../doc/imgs_words/ch/word_1.jpg \ + --use_angle_cls=true \ + --det=false \ + --rec=false \ + --cls=true \ +``` + +##### 7. 
layout+table + +```bash linenums="1" +./build/ppocr --det_model_dir=inference/det_db \ + --rec_model_dir=inference/rec_rcnn \ + --table_model_dir=inference/table \ + --image_dir=../../ppstructure/docs/table/table.jpg \ + --layout_model_dir=inference/layout \ + --type=structure \ + --table=true \ + --layout=true +``` + +##### 8. layout + +```bash linenums="1" +./build/ppocr --layout_model_dir=inference/layout \ + --image_dir=../../ppstructure/docs/table/1.png \ + --type=structure \ + --table=false \ + --layout=true \ + --det=false \ + --rec=false +``` + +##### 9. table + +```bash linenums="1" +./build/ppocr --det_model_dir=inference/det_db \ + --rec_model_dir=inference/rec_rcnn \ + --table_model_dir=inference/table \ + --image_dir=../../ppstructure/docs/table/table.jpg \ + --type=structure \ + --table=true +``` + +More parameters are as follows, + +- Common parameters + +|parameter|data type|default|meaning| +| --- | --- | --- | --- | +|use_gpu|bool|false|Whether to use GPU| +|gpu_id|int|0|GPU id when use_gpu is true| +|gpu_mem|int|4000|GPU memory requested| +|cpu_math_library_num_threads|int|10|Number of threads when using CPU inference. When machine cores is enough, the large the value, the faster the inference speed| +|enable_mkldnn|bool|true|Whether to use mkdlnn library| +|output|str|./output|Path where visualization results are saved| + +- forward + +|parameter|data type|default|meaning| +| :---: | :---: | :---: | :---: | +|det|bool|true|Whether to perform text detection in the forward direction| +|rec|bool|true|Whether to perform text recognition in the forward direction| +|cls|bool|false|Whether to perform text direction classification in the forward direction| + +- Detection related parameters + +|parameter|data type|default|meaning| +| --- | --- | --- | --- | +|det_model_dir|string|-|Address of detection inference model| +|max_side_len|int|960|Limit the maximum image height and width to 960| +|det_db_thresh|float|0.3|Used to filter the binarized image of DB prediction, setting 0.-0.3 has no obvious effect on the result| +|det_db_box_thresh|float|0.5|DB post-processing filter box threshold, if there is a missing box detected, it can be reduced as appropriate| +|det_db_unclip_ratio|float|1.6|Indicates the compactness of the text box, the smaller the value, the closer the text box to the text| +|det_db_score_mode|string|slow| slow: use polygon box to calculate bbox score, fast: use rectangle box to calculate. 
Use rectangular box to calculate faster, and polygonal box more accurate for curved text area.| +|visualize|bool|true|Whether to visualize the results,when it is set as true, the prediction results will be saved in the folder specified by the `output` field on an image with the same name as the input image.| + +- Classifier related parameters + +|parameter|data type|default|meaning| +| --- | --- | --- | --- | +|use_angle_cls|bool|false|Whether to use the direction classifier| +|cls_model_dir|string|-|Address of direction classifier inference model| +|cls_thresh|float|0.9|Score threshold of the direction classifier| +|cls_batch_num|int|1|batch size of classifier| + +- Recognition related parameters + +|parameter|data type|default|meaning| +| --- | --- | --- | --- | +|rec_model_dir|string|-|Address of recognition inference model| +|rec_char_dict_path|string|../../ppocr/utils/ppocr_keys_v1.txt|dictionary file| +|rec_batch_num|int|6|batch size of recognition| +|rec_img_h|int|48|image height of recognition| +|rec_img_w|int|320|image width of recognition| + +- Layout related parameters + +|parameter|data type|default|meaning| +| :---: | :---: | :---: | :---: | +|layout_model_dir|string|-| Address of layout inference model| +|layout_dict_path|string|../../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt|dictionary file| +|layout_score_threshold|float|0.5|Threshold of score.| +|layout_nms_threshold|float|0.5|Threshold of nms.| + +- Table recognition related parameters + +|parameter|data type|default|meaning| +| :---: | :---: | :---: | :---: | +|table_model_dir|string|-|Address of table recognition inference model| +|table_char_dict_path|string|../../ppocr/utils/dict/table_structure_dict.txt|dictionary file| +|table_max_len|int|488|The size of the long side of the input image of the table recognition model, the final input image size of the network is(table_max_len,table_max_len)| +|merge_no_span_structure|bool|true|Whether to merge and to
| Methods | Ext | R | P | F | FPS |
| --- | --- | --- | --- | --- | --- |
| TextSnake [18] | Syn | 85.3 | 67.9 | 75.6 | |
| CSE [17] | MiLT | 76.1 | 78.7 | 77.4 | 0.38 |
| LOMO[40] | Syn | 76.5 | 85.7 | 80.8 | 4.4 |
| ATRR[35] | Sy- | 80.2 | 80.1 | 80.1 | - |
| SegLink++ [28] | Syn | 79.8 | 82.8 | 81.3 | - |
| TextField [37] | Syn | 79.8 | 83.0 | 81.4 | 6.0 |
| MSR[38] | Syn | 79.0 | 84.1 | 81.5 | 4.3 |
| PSENet-1s [33] | MLT | 79.7 | 84.8 | 82.2 | 3.9 |
| DB [12] | Syn | 80.2 | 86.9 | 83.4 | 22.0 |
| CRAFT [2] | Syn | 81.1 | 86.0 | 83.5 | - |
| TextDragon [5] | MLT+ | 82.8 | 84.5 | 83.6 | |
| PAN [34] | Syn | 81.2 | 86.4 | 83.7 | 39.8 |
| ContourNet [36] | | 84.1 | 83.7 | 83.9 | 4.5 |
| DRRG [41] | MLT | 83.02 | 85.93 | 84.45 | - |
| TextPerception[23] | Syn | 81.9 | 87.5 | 84.6 | |
| Ours | Syn | 80.57 | 87.66 | 83.97 | 12.08 |
| Ours | | 81.45 | 87.81 | 84.51 | 12.15 |
| Ours | MLT | 83.60 | 86.45 | 85.00 | 12.21 |
+The table visualized image saved in ./output//6_1.png +7 type: table, region: [462,359,820,657], score: 0.953917, res:
| Methods | R | P | F | FPS |
| --- | --- | --- | --- | --- |
| SegLink [26] | 70.0 | 86.0 | 77.0 | 8.9 |
| PixelLink [4] | 73.2 | 83.0 | 77.8 | - |
| TextSnake [18] | 73.9 | 83.2 | 78.3 | 1.1 |
| TextField [37] | 75.9 | 87.4 | 81.3 | 5.2 |
| MSR[38] | 76.7 | 87.4 | 81.7 | - |
| FTSN[3] | 77.1 | 87.6 | 82.0 | : |
| LSE[30] | 81.7 | 84.2 | 82.9 | |
| CRAFT [2] | 78.2 | 88.2 | 82.9 | 8.6 |
| MCN [16] | 79 | 88 | 83 | - |
| ATRR[35] | 82.1 | 85.2 | 83.6 | - |
| PAN [34] | 83.8 | 84.4 | 84.1 | 30.2 |
| DB[12] | 79.2 | 91.5 | 84.9 | 32.0 |
| DRRG [41] | 82.30 | 88.05 | 85.08 | - |
| Ours (SynText) | 80.68 | 85.40 | 82.97 | 12.68 |
| Ours (MLT-17) | 84.54 | 86.62 | 85.57 | 12.31 |
+The table visualized image saved in ./output//7_1.png +8 type: figure, region: [14,3,836,310], score: 0.969443, res: count of ocr result is : 26 +********** print ocr result ********** +0 det boxes: [[506,14],[539,15],[539,22],[506,21]] rec text: E rec score: 0.318073 +... +25 det boxes: [[680,290],[759,288],[759,303],[680,305]] rec text: (d) CTW1500 rec score: 0.95911 +********** end print ocr result ********** +``` + +## 3. FAQ + + 1. Encountered the error `unable to access 'https://github.com/LDOUBLEV/AutoLog.git/': gnutls_handshake() failed: The TLS connection was non-properly terminated.`, change the github address in `deploy/cpp_infer/external-cmake/auto-log.cmake` to the address. diff --git a/docs/ppstructure/infer_deploy/cpp_infer.md b/docs/ppstructure/infer_deploy/cpp_infer.md new file mode 100644 index 0000000000..bbf54279a0 --- /dev/null +++ b/docs/ppstructure/infer_deploy/cpp_infer.md @@ -0,0 +1,443 @@ +--- +comments: true +--- + +# 服务器端C++预测 + +本章节介绍PaddleOCR 模型的C++部署方法。C++在性能计算上优于Python,因此,在大多数CPU、GPU部署场景,多采用C++的部署方式,本节将介绍如何在Linux\Windows (CPU\GPU)环境下配置C++环境并完成PaddleOCR模型部署。 + +## 1. 准备环境 + +### 1.1 运行准备 + +- Linux环境,推荐使用docker。 +- Windows环境。 + +- 该文档主要介绍基于Linux环境的PaddleOCR C++预测流程,如果需要在Windows下基于预测库进行C++预测,具体编译方法请参考[Windows下编译教程](../../ppocr/infer_deploy/windows_vs2019_build.md) + +### 1.2 编译opencv库 + +- 首先需要从opencv官网上下载在Linux环境下源码编译的包,以opencv3.4.7为例,下载命令如下: + +```bash linenums="1" +cd deploy/cpp_infer +wget https://paddleocr.bj.bcebos.com/libs/opencv/opencv-3.4.7.tar.gz +tar -xf opencv-3.4.7.tar.gz +``` + +最终可以在当前目录下看到`opencv-3.4.7/`的文件夹。 + +- 编译opencv,设置opencv源码路径(`root_path`)以及安装路径(`install_path`)。进入opencv源码路径下,按照下面的方式进行编译。 + +```bash linenums="1" +root_path="your_opencv_root_path" +install_path=${root_path}/opencv3 +build_dir=${root_path}/build + +rm -rf ${build_dir} +mkdir ${build_dir} +cd ${build_dir} + +cmake .. \ + -DCMAKE_INSTALL_PREFIX=${install_path} \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_SHARED_LIBS=OFF \ + -DWITH_IPP=OFF \ + -DBUILD_IPP_IW=OFF \ + -DWITH_LAPACK=OFF \ + -DWITH_EIGEN=OFF \ + -DCMAKE_INSTALL_LIBDIR=lib64 \ + -DWITH_ZLIB=ON \ + -DBUILD_ZLIB=ON \ + -DWITH_JPEG=ON \ + -DBUILD_JPEG=ON \ + -DWITH_PNG=ON \ + -DBUILD_PNG=ON \ + -DWITH_TIFF=ON \ + -DBUILD_TIFF=ON + +make -j +make install +``` + +也可以直接修改`tools/build_opencv.sh`的内容,然后直接运行下面的命令进行编译。 + +```bash linenums="1" +sh tools/build_opencv.sh +``` + +其中`root_path`为下载的opencv源码路径,`install_path`为opencv的安装路径,`make install`完成之后,会在该文件夹下生成opencv头文件和库文件,用于后面的OCR代码编译。 + +最终在安装路径下的文件结构如下所示。 + +```text linenums="1" +opencv3/ +|-- bin +|-- include +|-- lib +|-- lib64 +|-- share +``` + +### 1.3 下载或者编译Paddle预测库 + +可以选择直接下载安装或者从源码编译,下文分别进行具体说明。 + +#### 1.3.1 直接下载安装 + +[Paddle预测库官网](https://www.paddlepaddle.org.cn/inference/master/guides/install/download_lib.html#linux) 上提供了不同cuda版本的Linux预测库,可以在官网查看并选择合适的预测库版本(*建议选择paddle版本>=2.0.1版本的预测库* )。 + +下载之后解压: + +```bash linenums="1" +tar -xf paddle_inference.tgz +``` + +最终会在当前的文件夹中生成`paddle_inference/`的子文件夹。 + +#### 1.3.2 预测库源码编译 + +如果希望获取最新预测库特性,可以从github上克隆最新Paddle代码进行编译,生成最新的预测库。 + +- 使用git获取代码: + +```bash linenums="1" +git clone https://github.com/PaddlePaddle/Paddle.git +git checkout develop +``` + +- 进入Paddle目录,进行编译: + +```bash linenums="1" +rm -rf build +mkdir build +cd build + +cmake .. 
\ + -DWITH_CONTRIB=OFF \ + -DWITH_MKL=ON \ + -DWITH_MKLDNN=ON \ + -DWITH_TESTING=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_INFERENCE_API_TEST=OFF \ + -DON_INFER=ON \ + -DWITH_PYTHON=ON +make -j +make inference_lib_dist +``` + +更多编译参数选项介绍可以参考[Paddle预测库编译文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.0/guides/05_inference_deployment/inference/build_and_install_lib_cn.html#congyuanmabianyi)。 + +- 编译完成之后,可以在`build/paddle_inference_install_dir/`文件下看到生成了以下文件及文件夹。 + +``` +build/paddle_inference_install_dir/ +|-- CMakeCache.txt +|-- paddle +|-- third_party +|-- version.txt +``` + +其中`paddle`就是C++预测所需的Paddle库,`version.txt`中包含当前预测库的版本信息。 + +## 2. 开始运行 + +### 2.1 准备模型 + +直接下载PaddleOCR提供的推理模型,或者参考[模型预测章节](../../ppocr/infer_deploy/python_infer.md),将训练好的模型导出为推理模型。模型导出之后,假设放在`inference`目录下,则目录结构如下: + +```text linenums="1" +inference/ +|-- det_db +| |--inference.pdiparams +| |--inference.pdmodel +|-- rec_rcnn +| |--inference.pdiparams +| |--inference.pdmodel +|-- cls +| |--inference.pdiparams +| |--inference.pdmodel +|-- table +| |--inference.pdiparams +| |--inference.pdmodel +|-- layout +| |--inference.pdiparams +| |--inference.pdmodel +``` + +### 2.2 编译PaddleOCR C++预测demo + +编译命令如下,其中Paddle C++预测库、opencv等其他依赖库的地址需要换成自己机器上的实际地址。 + +```bash linenums="1" +sh tools/build.sh +``` + +具体的,需要修改`tools/build.sh`中环境路径,相关内容如下: + +```bash linenums="1" +OPENCV_DIR=your_opencv_dir +LIB_DIR=your_paddle_inference_dir +CUDA_LIB_DIR=your_cuda_lib_dir +CUDNN_LIB_DIR=/your_cudnn_lib_dir +``` + +其中,`OPENCV_DIR`为opencv编译安装的地址;`LIB_DIR`为下载(`paddle_inference`文件夹)或者编译生成的Paddle预测库地址(`build/paddle_inference_install_dir`文件夹);`CUDA_LIB_DIR`为cuda库文件地址,在docker中为`/usr/local/cuda/lib64`;`CUDNN_LIB_DIR`为cudnn库文件地址,在docker中为`/usr/lib/x86_64-linux-gnu/`。**注意:以上路径都写绝对路径,不要写相对路径。** + +编译完成之后,会在`build`文件夹下生成一个名为`ppocr`的可执行文件。 + +### 2.3 运行demo + +本demo支持系统串联调用,也支持单个功能的调用,如,只使用检测或识别功能。 + +**注意** ppocr默认使用`PP-OCRv3`模型,识别模型使用的输入shape为`3,48,320`, 如需使用旧版本的PP-OCR模型,则需要设置参数`--rec_img_h=32`。 + +运行方式: + +```bash linenums="1" +./build/ppocr [--param1] [--param2] [...] +``` + +具体命令如下: + +#### 1. 检测+分类+识别 + +```bash linenums="1" +./build/ppocr --det_model_dir=inference/det_db \ + --rec_model_dir=inference/rec_rcnn \ + --cls_model_dir=inference/cls \ + --image_dir=../../doc/imgs/12.jpg \ + --use_angle_cls=true \ + --det=true \ + --rec=true \ + --cls=true \ +``` + +##### 2. 检测+识别 + +```bash linenums="1" +./build/ppocr --det_model_dir=inference/det_db \ + --rec_model_dir=inference/rec_rcnn \ + --image_dir=../../doc/imgs/12.jpg \ + --use_angle_cls=false \ + --det=true \ + --rec=true \ + --cls=false \ +``` + +##### 3. 检测 + +```bash linenums="1" +./build/ppocr --det_model_dir=inference/det_db \ + --image_dir=../../doc/imgs/12.jpg \ + --det=true \ + --rec=false +``` + +##### 4. 分类+识别 + +```bash linenums="1" +./build/ppocr --rec_model_dir=inference/rec_rcnn \ + --cls_model_dir=inference/cls \ + --image_dir=../../doc/imgs_words/ch/word_1.jpg \ + --use_angle_cls=true \ + --det=false \ + --rec=true \ + --cls=true \ +``` + +##### 5. 识别 + +```bash linenums="1" +./build/ppocr --rec_model_dir=inference/rec_rcnn \ + --image_dir=../../doc/imgs_words/ch/word_1.jpg \ + --use_angle_cls=false \ + --det=false \ + --rec=true \ + --cls=false \ +``` + +##### 6. 分类 + +```bash linenums="1" +./build/ppocr --cls_model_dir=inference/cls \ + --cls_model_dir=inference/cls \ + --image_dir=../../doc/imgs_words/ch/word_1.jpg \ + --use_angle_cls=true \ + --det=false \ + --rec=false \ + --cls=true \ +``` + +##### 7. 
版面分析+表格识别 + +```bash linenums="1" +./build/ppocr --det_model_dir=inference/det_db \ + --rec_model_dir=inference/rec_rcnn \ + --table_model_dir=inference/table \ + --image_dir=../../ppstructure/docs/table/table.jpg \ + --layout_model_dir=inference/layout \ + --type=structure \ + --table=true \ + --layout=true +``` + +##### 8. 版面分析 + +```bash linenums="1" +./build/ppocr --layout_model_dir=inference/layout \ + --image_dir=../../ppstructure/docs/table/1.png \ + --type=structure \ + --table=false \ + --layout=true \ + --det=false \ + --rec=false +``` + +##### 9. 表格识别 + +```bash linenums="1" +./build/ppocr --det_model_dir=inference/det_db \ + --rec_model_dir=inference/rec_rcnn \ + --table_model_dir=inference/table \ + --image_dir=../../ppstructure/docs/table/table.jpg \ + --type=structure \ + --table=true +``` + +更多支持的可调节参数解释如下: + +- 通用参数 + +| 参数名称 | 类型 | 默认参数 | 意义 | +| :--------------------------: | :---: | :------: | :---------------------------------------------------------------: | +| use_gpu | bool | false | 是否使用GPU | +| gpu_id | int | 0 | GPU id,使用GPU时有效 | +| gpu_mem | int | 4000 | 申请的GPU内存 | +| cpu_math_library_num_threads | int | 10 | CPU预测时的线程数,在机器核数充足的情况下,该值越大,预测速度越快 | +| enable_mkldnn | bool | true | 是否使用mkldnn库 | +| output | str | ./output | 可视化结果保存的路径 | + +- 前向相关 + +| 参数名称 | 类型 | 默认参数 | 意义 | +| :------: | :---: | :------: | :----------------------: | +| det | bool | true | 前向是否执行文字检测 | +| rec | bool | true | 前向是否执行文字识别 | +| cls | bool | false | 前向是否执行文字方向分类 | + +- 检测模型相关 + +| 参数名称 | 类型 | 默认参数 | 意义 | +| :-----------------: | :----: | :------: | :----------------------------------------------------------------------------------------------------------: | +| det_model_dir | string | - | 检测模型inference model地址 | +| max_side_len | int | 960 | 输入图像长宽大于960时,等比例缩放图像,使得图像最长边为960 | +| det_db_thresh | float | 0.3 | 用于过滤DB预测的二值化图像,设置为0.-0.3对结果影响不明显 | +| det_db_box_thresh | float | 0.5 | DB后处理过滤box的阈值,如果检测存在漏框情况,可酌情减小 | +| det_db_unclip_ratio | float | 1.6 | 表示文本框的紧致程度,越小则文本框更靠近文本 | +| det_db_score_mode | string | slow | slow:使用多边形框计算bbox score,fast:使用矩形框计算。矩形框计算速度更快,多边形框对弯曲文本区域计算更准确。 | +| visualize | bool | true | 是否对结果进行可视化,为1时,预测结果会保存在`output`字段指定的文件夹下和输入图像同名的图像上。 | + +- 方向分类器相关 + +| 参数名称 | 类型 | 默认参数 | 意义 | +| :-----------: | :----: | :------: | :---------------------------: | +| use_angle_cls | bool | false | 是否使用方向分类器 | +| cls_model_dir | string | - | 方向分类器inference model地址 | +| cls_thresh | float | 0.9 | 方向分类器的得分阈值 | +| cls_batch_num | int | 1 | 方向分类器batchsize | + +- 文字识别模型相关 + +| 参数名称 | 类型 | 默认参数 | 意义 | +| :----------------: | :----: | :---------------------------------: | :-----------------------------: | +| rec_model_dir | string | - | 文字识别模型inference model地址 | +| rec_char_dict_path | string | ../../ppocr/utils/ppocr_keys_v1.txt | 字典文件 | +| rec_batch_num | int | 6 | 文字识别模型batchsize | +| rec_img_h | int | 48 | 文字识别模型输入图像高度 | +| rec_img_w | int | 320 | 文字识别模型输入图像宽度 | + +- 版面分析模型相关 + +| 参数名称 | 类型 | 默认参数 | 意义 | +| :--------------------: | :----: | :----------------------------------------------------------: | :-----------------------------: | +| layout_model_dir | string | - | 版面分析模型inference model地址 | +| layout_dict_path | string | ../../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt | 字典文件 | +| layout_score_threshold | float | 0.5 | 检测框的分数阈值 | +| layout_nms_threshold | float | 0.5 | nms的阈值 | + +- 表格识别模型相关 + +| 参数名称 | 类型 | 默认参数 | 意义 | +| :---------------------: | :----: | :------------------------------------------------: | 
:----------------------------------------------------------------------------------: | +| table_model_dir | string | - | 表格识别模型inference model地址 | +| table_char_dict_path | string | ../../ppocr/utils/dict/table_structure_dict_ch.txt | 字典文件 | +| table_max_len | int | 488 | 表格识别模型输入图像长边大小,最终网络输入图像大小为(table_max_len,table_max_len) | +| merge_no_span_structure | bool | true | 是否合并 和 为 | + +- PaddleOCR也支持多语言的预测,更多支持的语言和模型可以参考[识别文档](../../ppocr/blog/multi_languages.md)中的多语言字典与模型部分,如果希望进行多语言预测,只需将修改`rec_char_dict_path`(字典文件路径)以及`rec_model_dir`(inference模型路径)字段即可。 + +最终屏幕上会输出检测结果如下: + +- ocr + +```bash linenums="1" +predict img: ../../doc/imgs/12.jpg +../../doc/imgs/12.jpg +0 det boxes: [[74,553],[427,542],[428,571],[75,582]] rec text: 打浦路252935号 rec score: 0.947724 +1 det boxes: [[23,507],[513,488],[515,529],[24,548]] rec text: 绿洲仕格维花园公寓 rec score: 0.993728 +2 det boxes: [[187,456],[399,448],[400,480],[188,488]] rec text: 打浦路15号 rec score: 0.964994 +3 det boxes: [[42,413],[483,391],[484,428],[43,450]] rec text: 上海斯格威铂尔大酒店 rec score: 0.980086 +The detection visualized image saved in ./output//12.jpg +``` + +- layout+table + +```bash linenums="1" +predict img: ../../ppstructure/docs/table/1.png +0 type: text, region: [12,729,410,848], score: 0.781044, res: count of ocr result is : 7 +********** print ocr result ********** +0 det boxes: [[4,1],[79,1],[79,12],[4,12]] rec text: CTW1500. rec score: 0.769472 +... +6 det boxes: [[4,99],[391,99],[391,112],[4,112]] rec text: sate-of-the-artmethods[12.34.36l.ourapproachachieves rec score: 0.90414 +********** end print ocr result ********** +1 type: text, region: [69,342,342,359], score: 0.703666, res: count of ocr result is : 1 +********** print ocr result ********** +0 det boxes: [[8,2],[269,2],[269,13],[8,13]] rec text: Table6.Experimentalresults on CTW-1500 rec score: 0.890454 +********** end print ocr result ********** +2 type: text, region: [70,316,706,332], score: 0.659738, res: count of ocr result is : 2 +********** print ocr result ********** +0 det boxes: [[373,2],[630,2],[630,11],[373,11]] rec text: oroposals.andthegreencontoursarefinal rec score: 0.919729 +1 det boxes: [[8,3],[357,3],[357,11],[8,11]] rec text: Visualexperimentalresultshebluecontoursareboundar rec score: 0.915963 +********** end print ocr result ********** +3 type: text, region: [489,342,789,359], score: 0.630538, res: count of ocr result is : 1 +********** print ocr result ********** +0 det boxes: [[8,2],[294,2],[294,14],[8,14]] rec text: Table7.Experimentalresults onMSRA-TD500 rec score: 0.942251 +********** end print ocr result ********** +4 type: text, region: [444,751,841,848], score: 0.607345, res: count of ocr result is : 5 +********** print ocr result ********** +0 det boxes: [[19,3],[389,3],[389,17],[19,17]] rec text: Inthispaper,weproposeanovel adaptivebound rec score: 0.941031 +1 det boxes: [[4,22],[390,22],[390,36],[4,36]] rec text: aryproposalnetworkforarbitraryshapetextdetection rec score: 0.960172 +2 det boxes: [[4,42],[392,42],[392,56],[4,56]] rec text: whichadoptanboundaryproposalmodeltogeneratecoarse rec score: 0.934647 +3 det boxes: [[4,61],[389,61],[389,75],[4,75]] rec text: ooundaryproposals,andthenadoptanadaptiveboundary rec score: 0.946296 +4 det boxes: [[5,80],[387,80],[387,93],[5,93]] rec text: leformationmodelcombinedwithGCNandRNNtoper rec score: 0.952401 +********** end print ocr result ********** +5 type: title, region: [444,705,564,724], score: 0.785429, res: count of ocr result is : 1 +********** print ocr result ********** +0 det boxes: 
[[6,2],[113,2],[113,14],[6,14]] rec text: 5.Conclusion rec score: 0.856903 +********** end print ocr result ********** +6 type: table, region: [14,360,402,711], score: 0.963643, res:
| Methods | Ext | R | P | F | FPS |
| --- | --- | --- | --- | --- | --- |
| TextSnake [18] | Syn | 85.3 | 67.9 | 75.6 | |
| CSE [17] | MiLT | 76.1 | 78.7 | 77.4 | 0.38 |
| LOMO[40] | Syn | 76.5 | 85.7 | 80.8 | 4.4 |
| ATRR[35] | Sy- | 80.2 | 80.1 | 80.1 | - |
| SegLink++ [28] | Syn | 79.8 | 82.8 | 81.3 | - |
| TextField [37] | Syn | 79.8 | 83.0 | 81.4 | 6.0 |
| MSR[38] | Syn | 79.0 | 84.1 | 81.5 | 4.3 |
| PSENet-1s [33] | MLT | 79.7 | 84.8 | 82.2 | 3.9 |
| DB [12] | Syn | 80.2 | 86.9 | 83.4 | 22.0 |
| CRAFT [2] | Syn | 81.1 | 86.0 | 83.5 | - |
| TextDragon [5] | MLT+ | 82.8 | 84.5 | 83.6 | |
| PAN [34] | Syn | 81.2 | 86.4 | 83.7 | 39.8 |
| ContourNet [36] | | 84.1 | 83.7 | 83.9 | 4.5 |
| DRRG [41] | MLT | 83.02 | 85.93 | 84.45 | - |
| TextPerception[23] | Syn | 81.9 | 87.5 | 84.6 | |
| Ours | Syn | 80.57 | 87.66 | 83.97 | 12.08 |
| Ours | | 81.45 | 87.81 | 84.51 | 12.15 |
| Ours | MLT | 83.60 | 86.45 | 85.00 | 12.21 |
+The table visualized image saved in ./output//6_1.png +7 type: table, region: [462,359,820,657], score: 0.953917, res:
| Methods | R | P | F | FPS |
| --- | --- | --- | --- | --- |
| SegLink [26] | 70.0 | 86.0 | 77.0 | 8.9 |
| PixelLink [4] | 73.2 | 83.0 | 77.8 | - |
| TextSnake [18] | 73.9 | 83.2 | 78.3 | 1.1 |
| TextField [37] | 75.9 | 87.4 | 81.3 | 5.2 |
| MSR[38] | 76.7 | 87.4 | 81.7 | - |
| FTSN[3] | 77.1 | 87.6 | 82.0 | : |
| LSE[30] | 81.7 | 84.2 | 82.9 | |
| CRAFT [2] | 78.2 | 88.2 | 82.9 | 8.6 |
| MCN [16] | 79 | 88 | 83 | - |
| ATRR[35] | 82.1 | 85.2 | 83.6 | - |
| PAN [34] | 83.8 | 84.4 | 84.1 | 30.2 |
| DB[12] | 79.2 | 91.5 | 84.9 | 32.0 |
| DRRG [41] | 82.30 | 88.05 | 85.08 | - |
| Ours (SynText) | 80.68 | 85.40 | 82.97 | 12.68 |
| Ours (MLT-17) | 84.54 | 86.62 | 85.57 | 12.31 |
+The table visualized image saved in ./output//7_1.png +8 type: figure, region: [14,3,836,310], score: 0.969443, res: count of ocr result is : 26 +********** print ocr result ********** +0 det boxes: [[506,14],[539,15],[539,22],[506,21]] rec text: E rec score: 0.318073 +... +25 det boxes: [[680,290],[759,288],[759,303],[680,305]] rec text: (d) CTW1500 rec score: 0.95911 +********** end print ocr result ********** +``` + +## 3. FAQ + + 1. 遇到报错 `unable to access 'https://github.com/LDOUBLEV/AutoLog.git/': gnutls_handshake() failed: The TLS connection was non-properly terminated.`, 将 `deploy/cpp_infer/external-cmake/auto-log.cmake` 中的github地址改为 地址即可。 diff --git a/docs/ppstructure/infer_deploy/images/deployment-20240704135743247.png b/docs/ppstructure/infer_deploy/images/deployment-20240704135743247.png new file mode 100644 index 0000000000..afd3cf5110 Binary files /dev/null and b/docs/ppstructure/infer_deploy/images/deployment-20240704135743247.png differ diff --git a/docs/ppstructure/infer_deploy/index.en.md b/docs/ppstructure/infer_deploy/index.en.md new file mode 100644 index 0000000000..dc07190ef2 --- /dev/null +++ b/docs/ppstructure/infer_deploy/index.en.md @@ -0,0 +1,23 @@ +--- +comments: true +--- + +# PP-OCR Deployment + +## Paddle Deployment Introduction + +Paddle provides a variety of deployment schemes to meet the deployment requirements of different scenarios. Please choose according to the actual situation: + +![img](./images/deployment-20240704135743247.png) + +PP-OCR has supported muti deployment schemes. Click the link to get the specific tutorial. + +- [Python Inference](./python_infer.en.md) +- [C++ Inference](./cpp_infer.en.md) +- [Serving (Python/C++)](./paddle_server.en.md) +- [Paddle-Lite (ARM CPU/OpenCL ARM GPU)](../../ppocr/infer_deploy/lite.en.md) +- [Paddle.js](../../ppocr/infer_deploy/paddle_js.en.md) +- [Jetson Inference](../../ppocr/infer_deploy/Jetson_infer.en.md) +- [Paddle2ONNX](../../ppocr/infer_deploy/paddle2onnx.en.md) + +If you need the deployment tutorial of academic algorithm models other than PP-OCR, please directly enter the main page of corresponding algorithms, [entrance](../../algorithm/overview.en.md)。 diff --git a/docs/ppstructure/infer_deploy/index.md b/docs/ppstructure/infer_deploy/index.md new file mode 100644 index 0000000000..69f3969905 --- /dev/null +++ b/docs/ppstructure/infer_deploy/index.md @@ -0,0 +1,26 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# PP-OCR 模型推理部署 + +## Paddle 推理部署方式简介 + +飞桨提供多种部署方案,以满足不同场景的部署需求,请根据实际情况进行选择: + +![img](./images/deployment-20240704135743247.png) + +## PP-OCR 推理部署 + +PP-OCR模型已打通多种场景部署方案,点击链接获取具体的使用教程。 + +- [Python 推理](./python_infer.md) +- [C++ 推理](./cpp_infer.md) +- [Serving 服务化部署(Python/C++)](./paddle_server.md) +- [Paddle-Lite 端侧部署(ARM CPU/OpenCL ARM GPU)](../../ppocr/infer_deploy/lite.md) +- [Paddle.js 部署](../../ppocr/infer_deploy/paddle_js.md) +- [Jetson 推理](../../ppocr/infer_deploy/Jetson_infer.md) +- [Paddle2ONNX 推理](../../ppocr/infer_deploy/paddle2onnx.md) + +需要PP-OCR以外的学术算法模型的推理部署,请直接进入相应算法主页面,[入口](../../algorithm/overview.md)。 diff --git a/docs/ppstructure/infer_deploy/paddle_server.en.md b/docs/ppstructure/infer_deploy/paddle_server.en.md new file mode 100755 index 0000000000..1511d3c245 --- /dev/null +++ b/docs/ppstructure/infer_deploy/paddle_server.en.md @@ -0,0 +1,273 @@ +--- +comments: true +--- + +PaddleOCR provides 2 service deployment methods: + +- Based on **PaddleHub Serving**: Code path is `./deploy/hubserving`. Please follow this tutorial. 
+- Based on **PaddleServing**: Code path is `./deploy/pdserving`. Please refer to the [tutorial](../../ppocr/infer_deploy/paddle_server.en.md) for usage. + +# Service deployment based on PaddleHub Serving + +The hubserving service deployment directory includes seven service packages: text detection, text angle class, text recognition, text detection+text angle class+text recognition three-stage series connection, layout analysis, table recognition, and PP-Structure. Please select the corresponding service package to install and start the service according to your needs. The directory is as follows: + +```text linenums="1" +deploy/hubserving/ + └─ ocr_det text detection module service package + └─ ocr_cls text angle class module service package + └─ ocr_rec text recognition module service package + └─ ocr_system text detection+text angle class+text recognition three-stage series connection service package + └─ structure_layout layout analysis service package + └─ structure_table table recognition service package + └─ structure_system PP-Structure service package + └─ kie_ser KIE(SER) service package + └─ kie_ser_re KIE(SER+RE) service package +``` + +Each service pack contains 3 files. Take the 2-stage series connection service package as an example, the directory is as follows: + +```text linenums="1" +deploy/hubserving/ocr_system/ + └─ __init__.py Empty file, required + └─ config.json Configuration file, optional, passed in as a parameter when using configuration to start the service + └─ module.py Main module file, required, contains the complete logic of the service + └─ params.py Parameter file, required, including parameters such as model path, pre and post-processing parameters +``` + +## 1. Update + +- 2022.10.09 add KIE services. +- 2022.08.23 add layout analysis services. +- 2022.03.30 add PP-Structure and table recognition services. +- 2022.05.05 add PP-OCRv3 text detection and recognition services. + +## 2. Quick start service + +The following steps take the 2-stage series service as an example. If only the detection service or recognition service is needed, replace the corresponding file path. + +### 2.1 Install PaddleHub + +```bash linenums="1" +pip3 install paddlehub==2.1.0 --upgrade +``` + +### 2.2 Download inference model + +Before installing the service module, you need to prepare the inference model and put it in the correct path. By default, the PP-OCRv3 models are used, and the default model path is: + +| Model | Path | +| ------- | - | +| text detection model | ./inference/ch_PP-OCRv3_det_infer/ | +| text recognition model | ./inference/ch_PP-OCRv3_rec_infer/ | +| text angle classifier | ./inference/ch_ppocr_mobile_v2.0_cls_infer/ | +| layout parse model | ./inference/picodet_lcnet_x1_0_fgd_layout_infer/ | +| tanle recognition | ./inference/ch_ppstructure_mobile_v2.0_SLANet_infer/ | +| KIE(SER) | ./inference/ser_vi_layoutxlm_xfund_infer/ | +| KIE(SER+RE) | ./inference/re_vi_layoutxlm_xfund_infer/ | + +**The model path can be found and modified in `params.py`.** +More models provided by PaddleOCR can be obtained from the [model library](../../ppocr/model_list.en.md). You can also use models trained by yourself. + +### 2.3 Install Service Module + +PaddleOCR provides 5 kinds of service modules, install the required modules according to your needs. 
+ +- On the Linux platform(replace `/` with `\` if using Windows), the examples are as the following table: + +| Service model | Command | +| text detection | `hub install deploy/hubserving/ocr_det` | +| text angle class: | `hub install deploy/hubserving/ocr_cls` | +| text recognition: | `hub install deploy/hubserving/ocr_rec` | +| 2-stage series: | `hub install deploy/hubserving/ocr_system` | +| table recognition | `hub install deploy/hubserving/structure_table` | +| PP-Structure | `hub install deploy/hubserving/structure_system` | +| KIE(SER) | `hub install deploy/hubserving/kie_ser` | +| KIE(SER+RE) | `hub install deploy/hubserving/kie_ser_re` | + +### 2.4 Start service + +#### 2.4.1 Start with command line parameters (CPU only) + +**start command:** + +```bash linenums="1" +hub serving start --modules Module1==Version1, Module2==Version2, ... \ + --port 8866 \ + --use_multiprocess \ + --workers \ +``` + +**Parameters:** +|parameters|usage| +|---|---| +|`--modules`/`-m`|PaddleHub Serving pre-installed model, listed in the form of multiple Module==Version key-value pairs
**When Version is not specified, the latest version is selected by default**| +|`--port`/`-p`|Service port, default is 8866| +|`--use_multiprocess`|Enable concurrent mode, by default using the single-process mode, this mode is recommended for multi-core CPU machines
**Windows operating system only supports single-process mode**| +|`--workers`|The number of concurrent tasks specified in concurrent mode, the default is `2*cpu_count-1`, where `cpu_count` is the number of CPU cores| + +For example, start the 2-stage series service: + +```bash linenums="1" +hub serving start -m ocr_system +``` + +This completes the deployment of a service API, using the default port number 8866. + +#### 2.4.2 Start with configuration file(CPU and GPU) + +**start command:** + +```bash linenums="1" +hub serving start --config/-c config.json +``` + +In which the format of `config.json` is as follows: + +```json +{ + "modules_info": { + "ocr_system": { + "init_args": { + "version": "1.0.0", + "use_gpu": true + }, + "predict_args": { + } + } + }, + "port": 8868, + "use_multiprocess": false, + "workers": 2 +} +``` + +- The configurable parameters in `init_args` are consistent with the `_initialize` function interface in `module.py`. + + **When `use_gpu` is `true`, it means that the GPU is used to start the service**. +- The configurable parameters in `predict_args` are consistent with the `predict` function interface in `module.py`. + + **Note:** + - When using the configuration file to start the service, other parameters will be ignored. + - If you use GPU prediction (that is, `use_gpu` is set to `true`), you need to set the environment variable CUDA_VISIBLE_DEVICES before starting the service, such as: + + ```bash linenums="1" + export CUDA_VISIBLE_DEVICES=0 + ``` + + - **`use_gpu` and `use_multiprocess` cannot be `true` at the same time.** + +For example, use GPU card No. 3 to start the 2-stage series service: + +```bash linenums="1" +export CUDA_VISIBLE_DEVICES=3 +hub serving start -c deploy/hubserving/ocr_system/config.json +``` + +## 3. Send prediction requests + +After the service starts, you can use the following command to send a prediction request to obtain the prediction result: + +```bash linenums="1" +python tools/test_hubserving.py --server_url=server_url --image_dir=image_path +``` + +Two parameters need to be passed to the script: + +- **server_url**:service address, the format of which is + `http://[ip_address]:[port]/predict/[module_name]` + + For example, if using the configuration file to start the text angle classification, text detection, text recognition, detection+classification+recognition 3 stages, table recognition and PP-Structure service, + + also modified the port for each service, then the `server_url` to send the request will be: + + ```text linenums="1" + http://127.0.0.1:8865/predict/ocr_det + http://127.0.0.1:8866/predict/ocr_cls + http://127.0.0.1:8867/predict/ocr_rec + http://127.0.0.1:8868/predict/ocr_system + http://127.0.0.1:8869/predict/structure_table + http://127.0.0.1:8870/predict/structure_system + http://127.0.0.1:8870/predict/structure_layout + http://127.0.0.1:8871/predict/kie_ser + http://127.0.0.1:8872/predict/kie_ser_re + ``` + +- **image_dir**:Test image path, which can be a single image path or an image directory path +- **visualize**:Whether to visualize the results, the default value is False +- **output**:The folder to save the Visualization result, the default value is `./hubserving_result` + +Example: + +```bash linenums="1" +python tools/test_hubserving.py --server_url=http://127.0.0.1:8868/predict/ocr_system --image_dir=./doc/imgs/ --visualize=false` +``` + +## 4. Returned result format + +The returned result is a list. Each item in the list is a dictionary which may contain three fields. 
The information is as follows: + +|field name|data type|description| +|----|----|----| +|angle|str|angle| +|text|str|text content| +|confidence|float|text recognition confidence| +|text_region|list|text location coordinates| +|html|str|table HTML string| +|regions|list|The result of layout analysis + table recognition + OCR, each item is a list
including `bbox` indicating area coordinates, `type` of area type and `res` of area results| +|layout|list|The result of layout analysis, each item is a dict, including `bbox` indicating area coordinates, `label` of area type| + +The fields returned by different modules are different. For example, the results returned by the text recognition service module do not contain `text_region`, detailed table is as follows: + +|field name/module name |ocr_det |ocr_cls |ocr_rec |ocr_system |structure_table |structure_system |structure_layout |kie_ser |kie_re | +|--- |--- |--- |--- |--- |--- |--- |--- |--- |--- | +|angle | |✔ | |✔ | | | | +|text | | |✔ |✔ | |✔ | |✔ |✔ | +|confidence | |✔ |✔ |✔ | |✔ | |✔ |✔ | +|text_region |✔ | | |✔ | |✔ | |✔ |✔ | +|html | | | | |✔ |✔ | | | | +|regions | | | | |✔ |✔ | | | | +|layout | | | | | | |✔ | | | +|ser_res | | | | | | | |✔ | | +|re_res | | | | | | | | |✔ | + +**Note:** If you need to add, delete or modify the returned fields, you can modify the file `module.py` of the corresponding module. For the complete process, refer to the user-defined modification service module in the next section. + +## 5. User-defined service module modification + +If you need to modify the service logic, the following steps are generally required (take the modification of `deploy/hubserving/ocr_system` for example): + +1. Stop service: + +```bash linenums="1" +hub serving stop --port/-p XXXX +``` + +2. Modify the code in the corresponding files under `deploy/hubserving/ocr_system`, such as `module.py` and `params.py`, to your actual needs. + + For example, if you need to replace the model used by the deployed service, you need to modify model path parameters `det_model_dir` and `rec_model_dir` in `params.py`. If you want to turn off the text direction classifier, set the parameter `use_angle_cls` to `False`. + + Of course, other related parameters may need to be modified at the same time. Please modify and debug according to the actual situation. + + **It is suggested to run `module.py` directly for debugging after modification before starting the service test.** + + **Note** The image input shape used by the PPOCR-v3 recognition model is `3, 48, 320`, so you need to modify `cfg.rec_image_shape = "3, 48, 320"` in `params.py`, if you do not use the PPOCR-v3 recognition model, then there is no need to modify this parameter. +3. (Optional) If you want to rename the module, the following lines should be modified: + - [`ocr_system` within `from deploy.hubserving.ocr_system.params import read_params`](https://github.com/PaddlePaddle/PaddleOCR/blob/a923f35de57b5e378f8dd16e54d0a3e4f51267fd/deploy/hubserving/ocr_system/module.py#L35) + - [`ocr_system` within `name="ocr_system",`](https://github.com/PaddlePaddle/PaddleOCR/blob/a923f35de57b5e378f8dd16e54d0a3e4f51267fd/deploy/hubserving/ocr_system/module.py#L39) +4. (Optional) It may require you to delete the directory `__pycache__` to force flush build cache of CPython: + + ```bash linenums="1" + find deploy/hubserving/ocr_system -name '__pycache__' -exec rm -r {} \; + ``` + +5. Install modified service module: + + ```bash linenums="1" + hub install deploy/hubserving/ocr_system/ + ``` + +6. 
Restart service: + + ```bash linenums="1" + hub serving start -m ocr_system + ``` diff --git a/docs/ppstructure/infer_deploy/paddle_server.md b/docs/ppstructure/infer_deploy/paddle_server.md new file mode 100644 index 0000000000..6e25591917 --- /dev/null +++ b/docs/ppstructure/infer_deploy/paddle_server.md @@ -0,0 +1,279 @@ +--- +comments: true +--- + +PaddleOCR提供2种服务部署方式: + +- 基于PaddleHub Serving的部署:代码路径为`./deploy/hubserving`,按照本教程使用; +- 基于PaddleServing的部署:代码路径为`./deploy/pdserving`,使用方法参考[文档](../../ppocr/infer_deploy/paddle_server.md)。 + +# 基于PaddleHub Serving的服务部署 + +hubserving服务部署目录下包括文本检测、文本方向分类,文本识别、文本检测+文本方向分类+文本识别3阶段串联,版面分析、表格识别和PP-Structure七种服务包,请根据需求选择相应的服务包进行安装和启动。目录结构如下: + +```text linenums="1" +deploy/hubserving/ + └─ ocr_cls 文本方向分类模块服务包 + └─ ocr_det 文本检测模块服务包 + └─ ocr_rec 文本识别模块服务包 + └─ ocr_system 文本检测+文本方向分类+文本识别串联服务包 + └─ structure_layout 版面分析服务包 + └─ structure_table 表格识别服务包 + └─ structure_system PP-Structure服务包 + └─ kie_ser 关键信息抽取-SER服务包 + └─ kie_ser_re 关键信息抽取-SER+RE服务包 +``` + +每个服务包下包含3个文件。以2阶段串联服务包为例,目录如下: + +```text linenums="1" +deploy/hubserving/ocr_system/ + └─ __init__.py 空文件,必选 + └─ config.json 配置文件,可选,使用配置启动服务时作为参数传入 + └─ module.py 主模块,必选,包含服务的完整逻辑 + └─ params.py 参数文件,必选,包含模型路径、前后处理参数等参数 +``` + +## 1. 近期更新 + +- 2022.10.09 新增关键信息抽取服务。 +- 2022.08.23 新增版面分析服务。 +- 2022.05.05 新增PP-OCRv3检测和识别模型。 +- 2022.03.30 新增PP-Structure和表格识别两种服务。 + +## 2. 快速启动服务 + +以下步骤以检测+识别2阶段串联服务为例,如果只需要检测服务或识别服务,替换相应文件路径即可。 + +### 2.1 安装PaddleHub + +paddlehub 需要 python>3.6.2 + +```bash linenums="1" +pip3 install paddlehub==2.1.0 --upgrade -i https://mirror.baidu.com/pypi/simple +``` + +### 2.2 下载推理模型 + +安装服务模块前,需要准备推理模型并放到正确路径。默认使用的是PP-OCRv3模型,默认模型路径为: + +| 模型 | 路径 | +| ------------------- | ------------------------------------------------------ | +| 检测模型 | `./inference/ch_PP-OCRv3_det_infer/` | +| 识别模型 | `./inference/ch_PP-OCRv3_rec_infer/` | +| 方向分类器 | `./inference/ch_ppocr_mobile_v2.0_cls_infer/` | +| 版面分析模型 | `./inference/picodet_lcnet_x1_0_fgd_layout_infer/` | +| 表格结构识别模型 | `./inference/ch_ppstructure_mobile_v2.0_SLANet_infer/` | +| 关键信息抽取SER模型 | `./inference/ser_vi_layoutxlm_xfund_infer/` | +| 关键信息抽取RE模型 | `./inference/re_vi_layoutxlm_xfund_infer/` | + +**模型路径可在`params.py`中查看和修改。** + +更多模型可以从PaddleOCR提供的模型库[PP-OCR](../../ppocr/model_list.md)和[PP-Structure](../models_list.md)下载,也可以替换成自己训练转换好的模型。 + +### 2.3 安装服务模块 + +PaddleOCR提供5种服务模块,根据需要安装所需模块。 + +在Linux环境(Windows环境请将`/`替换为`\`)下,安装模块命令如下表: + +| 服务模块 | 命令 | +| ------------------ | ------------------------------------------------ | +| 检测 | `hub install deploy/hubserving/ocr_det` | +| 分类 | `hub install deploy/hubserving/ocr_cls` | +| 识别 | `hub install deploy/hubserving/ocr_rec` | +| 检测+识别串联 | `hub install deploy/hubserving/ocr_system` | +| 表格识别 | `hub install deploy/hubserving/structure_table` | +| PP-Structure | `hub install deploy/hubserving/structure_system` | +| 版面分析 | `hub install deploy/hubserving/structure_layout` | +| 关键信息抽取SER | `hub install deploy/hubserving/kie_ser` | +| 关键信息抽取SER+RE | `hub install deploy/hubserving/kie_ser_re` | + +### 2.4 启动服务 + +#### 2.4.1. 命令行命令启动(仅支持CPU) + +**启动命令:** + +```bash linenums="1" +hub serving start --modules Module1==Version1, Module2==Version2, ... \ + --port 8866 \ + --use_multiprocess \ + --workers \ +``` + +**参数:** + +| 参数 | 用途 | +| ----- | ---- | +| `--modules`/`-m` | PaddleHub Serving预安装模型,以多个Module==Version键值对的形式列出
**当不指定Version时,默认选择最新版本** | +| `--port`/`-p` | 服务端口,默认为8866 | +| `--use_multiprocess` | 是否启用并发方式,默认为单进程方式,推荐多核CPU机器使用此方式
**Windows操作系统只支持单进程方式** | +| `--workers` | 在并发方式下指定的并发任务数,默认为`2*cpu_count-1`,其中`cpu_count`为CPU核数 | + +如启动串联服务: + +```bash linenums="1" +hub serving start -m ocr_system +``` + +这样就完成了一个服务化API的部署,使用默认端口号8866。 + +#### 2.4.2 配置文件启动(支持CPU、GPU) + +**启动命令:** + +```bash linenums="1" +hub serving start -c config.json +``` + +其中,`config.json`格式如下: + +```json +{ + "modules_info": { + "ocr_system": { + "init_args": { + "version": "1.0.0", + "use_gpu": true + }, + "predict_args": { + } + } + }, + "port": 8868, + "use_multiprocess": false, + "workers": 2 +} +``` + +- `init_args`中的可配参数与`module.py`中的`_initialize`函数接口一致。 + + **当`use_gpu`为`true`时,表示使用GPU启动服务。** +- `predict_args`中的可配参数与`module.py`中的`predict`函数接口一致。 + +**注意:** + +- 使用配置文件启动服务时,其他参数会被忽略。 +- 如果使用GPU预测(即,`use_gpu`置为`true`),则需要在启动服务之前,设置CUDA_VISIBLE_DEVICES环境变量,如: + + ```bash linenums="1" + export CUDA_VISIBLE_DEVICES=0 + ``` + +- **`use_gpu`不可与`use_multiprocess`同时为`true`**。 + +如,使用GPU 3号卡启动串联服务: + +```bash linenums="1" +export CUDA_VISIBLE_DEVICES=3 +hub serving start -c deploy/hubserving/ocr_system/config.json +``` + +## 3. 发送预测请求 + +配置好服务端,可使用以下命令发送预测请求,获取预测结果: + +```bash linenums="1" +python tools/test_hubserving.py --server_url=server_url --image_dir=image_path +``` + +需要给脚本传递2个参数: + +- `server_url`:服务地址,格式为`http://[ip_address]:[port]/predict/[module_name]` + + 例如,如果使用配置文件启动分类,检测、识别,检测+分类+识别3阶段,表格识别和PP-Structure服务 + + 并为每个服务修改了port,那么发送请求的url将分别是: + + ```text linenums="1" + http://127.0.0.1:8865/predict/ocr_det + http://127.0.0.1:8866/predict/ocr_cls + http://127.0.0.1:8867/predict/ocr_rec + http://127.0.0.1:8868/predict/ocr_system + http://127.0.0.1:8869/predict/structure_table + http://127.0.0.1:8870/predict/structure_system + http://127.0.0.1:8870/predict/structure_layout + http://127.0.0.1:8871/predict/kie_ser + http://127.0.0.1:8872/predict/kie_ser_re + ``` + +- `image_dir`:测试图像路径,可以是单张图片路径,也可以是图像集合目录路径 +- `visualize`:是否可视化结果,默认为False +- `output`:可视化结果保存路径,默认为`./hubserving_result` + +访问示例: + +```bash linenums="1" +python tools/test_hubserving.py --server_url=http://127.0.0.1:8868/predict/ocr_system --image_dir=./doc/imgs/ --visualize=false +``` + +## 4. 返回结果格式说明 + +返回结果为列表(list),列表中的每一项为词典(dict),词典一共可能包含3种字段,信息如下: + +| 字段名称 | 数据类型 | 意义 | +| ----------- | -------- | ----- | +| angle | str | 文本角度 | +| text | str | 文本内容 | +| confidence | float | 文本识别置信度或文本角度分类置信度 | +| text_region | list | 文本位置坐标 | +| html | str | 表格的html字符串 | +| regions | list | 版面分析+表格识别+OCR的结果,每一项为一个list
包含表示区域坐标的`bbox`,区域类型的`type`和区域结果的`res`三个字段 | +| layout | list | 版面分析的结果,每一项一个dict,包含版面区域坐标的`bbox`,区域类型的`label` | + +不同模块返回的字段不同,如,文本识别服务模块返回结果不含`text_region`字段,具体信息如下: + +| 字段名/模块名 | ocr_det | ocr_cls | ocr_rec | ocr_system | structure_table | structure_system | structure_layout | kie_ser | kie_re | +| ------------- | ------- | ------- | ------- | ---------- | --------------- | ---------------- | ---------------- | ------- | ------ | +| angle | | ✔ | | ✔ | | | | +| text | | | ✔ | ✔ | | ✔ | | ✔ | ✔ | +| confidence | | ✔ | ✔ | ✔ | | ✔ | | ✔ | ✔ | +| text_region | ✔ | | | ✔ | | ✔ | | ✔ | ✔ | +| html | | | | | ✔ | ✔ | | | | +| regions | | | | | ✔ | ✔ | | | | +| layout | | | | | | | ✔ | | | +| ser_res | | | | | | | | ✔ | | +| re_res | | | | | | | | | ✔ | + +**说明:** 如果需要增加、删除、修改返回字段,可在相应模块的`module.py`文件中进行修改,完整流程参考下一节自定义修改服务模块。 + +## 5. 自定义修改服务模块 + +如果需要修改服务逻辑,一般需要操作以下步骤(以修改`deploy/hubserving/ocr_system`为例): + +1. 停止服务: + + ```bash linenums="1" + hub serving stop --port/-p XXXX + ``` + +2. 到`deploy/hubserving/ocr_system`下的`module.py`和`params.py`等文件中根据实际需求修改代码。 + + 例如,如果需要替换部署服务所用模型,则需要到`params.py`中修改模型路径参数`det_model_dir`和`rec_model_dir`,如果需要关闭文本方向分类器,则将参数`use_angle_cls`置为`False` + + 当然,同时可能还需要修改其他相关参数,请根据实际情况修改调试。 + + **强烈建议修改后先直接运行`module.py`调试,能正确运行预测后再启动服务测试。** + + **注意:** PPOCR-v3识别模型使用的图片输入shape为`3,48,320`,因此需要修改`params.py`中的`cfg.rec_image_shape = "3, 48, 320"`,如果不使用PPOCR-v3识别模型,则无需修改该参数。 +3. (可选)如果想要重命名模块需要更改`module.py`文件中的以下行: + - [`from deploy.hubserving.ocr_system.params import read_params`中的`ocr_system`](https://github.com/PaddlePaddle/PaddleOCR/blob/a923f35de57b5e378f8dd16e54d0a3e4f51267fd/deploy/hubserving/ocr_system/module.py#L35) + - [`name="ocr_system",`中的`ocr_system`](https://github.com/PaddlePaddle/PaddleOCR/blob/a923f35de57b5e378f8dd16e54d0a3e4f51267fd/deploy/hubserving/ocr_system/module.py#L39) +4. (可选)可能需要删除`__pycache__`目录以强制刷新CPython缓存: + + ```bash linenums="1" + find deploy/hubserving/ocr_system -name '__pycache__' -exec rm -r {} \; + ``` + +5. 安装修改后的新服务包: + + ```bash linenums="1" + hub install deploy/hubserving/ocr_system + ``` + +6. 重新启动服务: + + ```bash linenums="1" + hub serving start -m ocr_system + ``` diff --git a/docs/ppstructure/infer_deploy/python_infer.en.md b/docs/ppstructure/infer_deploy/python_infer.en.md new file mode 100644 index 0000000000..d40503852f --- /dev/null +++ b/docs/ppstructure/infer_deploy/python_infer.en.md @@ -0,0 +1,116 @@ +--- +comments: true +--- + +# Python Inference + +## 1. Layout Structured Analysis + +Go to the `ppstructure` directory + +```bash linenums="1" +cd ppstructure + +# download model +mkdir inference && cd inference +# Download the PP-StructureV2 layout analysis model and unzip it +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout_infer.tar && tar xf picodet_lcnet_x1_0_layout_infer.tar +# Download the PP-OCRv3 text detection model and unzip it +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar +# Download the PP-OCRv3 text recognition model and unzip it +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar +# Download the PP-StructureV2 form recognition model and unzip it +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar +cd .. 
+``` + +### 1.1 layout analysis + table recognition + +```bash linenums="1" +python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv3_det_infer \ + --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \ + --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \ + --layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \ + --image_dir=./docs/table/1.png \ + --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \ + --output=../output \ + --vis_font_path=../doc/fonts/simfang.ttf +``` + +After the operation is completed, each image will have a directory with the same name in the `structure` directory under the directory specified by the `output` field. Each table in the image will be stored as an excel, and the picture area will be cropped and saved. The filename of excel and picture is their coordinates in the image. Detailed results are stored in the `res.txt` file. + +### 1.2 layout analysis + +```bash linenums="1" +python3 predict_system.py --layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \ + --image_dir=./docs/table/1.png \ + --output=../output \ + --table=false \ + --ocr=false +``` + +After the operation is completed, each image will have a directory with the same name in the `structure` directory under the directory specified by the `output` field. Each picture in image will be cropped and saved. The filename of picture area is their coordinates in the image. Layout analysis results will be stored in the `res.txt` file + +### 1.3 table recognition + +```bash linenums="1" +python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv3_det_infer \ + --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \ + --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \ + --image_dir=./docs/table/table.jpg \ + --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \ + --output=../output \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --layout=false +``` + +After the operation is completed, each image will have a directory with the same name in the `structure` directory under the directory specified by the `output` field. Each table in the image will be stored as an excel. The filename of excel is their coordinates in the image. + +## 2. Key Information Extraction + +### 2.1 SER + +```bash linenums="1" +cd ppstructure + +mkdir inference && cd inference +# download model +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar +cd .. +python3 predict_system.py \ + --kie_algorithm=LayoutXLM \ + --ser_model_dir=./inference/ser_vi_layoutxlm_xfund_infer \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../ppocr/utils/dict/kie_dict/xfund_class_list.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" \ + --mode=kie +``` + +After the operation is completed, each image will store the visualized image in the `kie` directory under the directory specified by the `output` field, and the image name is the same as the input image name. 
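+
+If the SER prediction needs to run as part of a larger script rather than from a shell, the command above can also be launched programmatically. The snippet below is only an illustrative sketch (it is not part of the PaddleOCR toolkit): it reuses the model, dictionary and image paths from the example above, assumes it is executed from the `ppstructure` directory, and writes results to `../output`.
+
+```python linenums="1"
+import subprocess
+from pathlib import Path
+
+# These paths mirror the command-line example above; adapt them to your setup.
+OUTPUT_DIR = Path("../output")
+
+cmd = [
+    "python3", "predict_system.py",
+    "--kie_algorithm=LayoutXLM",
+    "--ser_model_dir=./inference/ser_vi_layoutxlm_xfund_infer",
+    "--image_dir=./docs/kie/input/zh_val_42.jpg",
+    "--ser_dict_path=../ppocr/utils/dict/kie_dict/xfund_class_list.txt",
+    "--vis_font_path=../doc/fonts/simfang.ttf",
+    "--ocr_order_method=tb-yx",
+    "--mode=kie",
+    f"--output={OUTPUT_DIR}",
+]
+
+# Run the SER prediction; raise if the command fails.
+subprocess.run(cmd, check=True)
+
+# Visualized results are written to <output>/kie/, one file per input image.
+for vis in sorted((OUTPUT_DIR / "kie").glob("*")):
+    print("visualization:", vis)
+```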
+ +### 2.2 RE+SER + +```bash linenums="1" +cd ppstructure + +mkdir inference && cd inference +# download model +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_infer.tar && tar -xf re_vi_layoutxlm_xfund_infer.tar +cd .. + +python3 predict_system.py \ + --kie_algorithm=LayoutXLM \ + --re_model_dir=./inference/re_vi_layoutxlm_xfund_infer \ + --ser_model_dir=./inference/ser_vi_layoutxlm_xfund_infer \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../ppocr/utils/dict/kie_dict/xfund_class_list.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" \ + --mode=kie +``` + +After the operation is completed, each image will have a directory with the same name in the `kie` directory under the directory specified by the `output` field, where the visual images and prediction results are stored. diff --git a/docs/ppstructure/infer_deploy/python_infer.md b/docs/ppstructure/infer_deploy/python_infer.md new file mode 100644 index 0000000000..0db980303c --- /dev/null +++ b/docs/ppstructure/infer_deploy/python_infer.md @@ -0,0 +1,119 @@ +--- +comments: true +--- + +# 基于Python预测引擎推理 + +## 1. 版面信息抽取 + +进入`ppstructure`目录 + +```bash linenums="1" +cd ppstructure +``` + +下载模型 + +```bash linenums="1" +mkdir inference && cd inference +# 下载PP-StructureV2版面分析模型并解压 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout_infer.tar && tar xf picodet_lcnet_x1_0_layout_infer.tar +# 下载PP-OCRv3文本检测模型并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar +# 下载PP-OCRv3文本识别模型并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar +# 下载PP-StructureV2表格识别模型并解压 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar +cd .. 
+``` + +### 1.1 版面分析+表格识别 + +```bash linenums="1" +python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv3_det_infer \ + --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \ + --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \ + --layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \ + --image_dir=./docs/table/1.png \ + --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \ + --output=../output \ + --vis_font_path=../doc/fonts/simfang.ttf +``` + +运行完成后,每张图片会在`output`字段指定的目录下的`structure`目录下有一个同名目录,图片里的每个表格会存储为一个excel,图片区域会被裁剪之后保存下来,excel文件和图片名为表格在图片里的坐标。详细的结果会存储在`res.txt`文件中。 + +### 1.2 版面分析 + +```bash linenums="1" +python3 predict_system.py --layout_model_dir=inference/picodet_lcnet_x1_0_layout_infer \ + --image_dir=./docs/table/1.png \ + --output=../output \ + --table=false \ + --ocr=false +``` + +运行完成后,每张图片会在`output`字段指定的目录下的`structure`目录下有一个同名目录,图片区域会被裁剪之后保存下来,图片名为表格在图片里的坐标。版面分析结果会存储在`res.txt`文件中。 + +### 1.3 表格识别 + +```bash linenums="1" +python3 predict_system.py --det_model_dir=inference/ch_PP-OCRv3_det_infer \ + --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \ + --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \ + --image_dir=./docs/table/table.jpg \ + --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \ + --output=../output \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --layout=false +``` + +运行完成后,每张图片会在`output`字段指定的目录下的`structure`目录下有一个同名目录,表格会存储为一个excel,excel文件名为`[0,0,img_h,img_w]`。 + +## 2. 关键信息抽取 + +### 2.1 SER + +```bash linenums="1" +cd ppstructure + +mkdir inference && cd inference +# 下载SER XFUND 模型并解压 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar +cd .. +python3 predict_system.py \ + --kie_algorithm=LayoutXLM \ + --ser_model_dir=./inference/ser_vi_layoutxlm_xfund_infer \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../ppocr/utils/dict/kie_dict/xfund_class_list.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" \ + --mode=kie +``` + +运行完成后,每张图片会在`output`字段指定的目录下的`kie`目录下存放可视化之后的图片,图片名和输入图片名一致。 + +### 2.2 RE+SER + +```bash linenums="1" +cd ppstructure + +mkdir inference && cd inference +# 下载RE SER XFUND 模型并解压 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_infer.tar && tar -xf re_vi_layoutxlm_xfund_infer.tar +cd .. 
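+# 下载完成后,运行下面的命令同时加载SER与RE推理模型,端到端完成关键信息抽取(--mode=kie)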
+ +python3 predict_system.py \ + --kie_algorithm=LayoutXLM \ + --re_model_dir=./inference/re_vi_layoutxlm_xfund_infer \ + --ser_model_dir=./inference/ser_vi_layoutxlm_xfund_infer \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../ppocr/utils/dict/kie_dict/xfund_class_list.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" \ + --mode=kie +``` + +运行完成后,每张图片会在`output`字段指定的目录下的`kie`目录下有一个同名目录,目录中存放可视化图片和预测结果。 diff --git a/docs/ppstructure/model_train/images/185310636-6ce02f7c-790d-479f-b163-ea97a5a04808.jpg b/docs/ppstructure/model_train/images/185310636-6ce02f7c-790d-479f-b163-ea97a5a04808.jpg new file mode 100644 index 0000000000..6a5fd84c52 Binary files /dev/null and b/docs/ppstructure/model_train/images/185310636-6ce02f7c-790d-479f-b163-ea97a5a04808.jpg differ diff --git a/docs/ppstructure/model_train/images/185393805-c67ff571-cf7e-4217-a4b0-8b396c4f22bb.jpg b/docs/ppstructure/model_train/images/185393805-c67ff571-cf7e-4217-a4b0-8b396c4f22bb.jpg new file mode 100644 index 0000000000..d406a52da8 Binary files /dev/null and b/docs/ppstructure/model_train/images/185393805-c67ff571-cf7e-4217-a4b0-8b396c4f22bb.jpg differ diff --git a/docs/ppstructure/model_train/images/185539141-68e71c75-5cf7-4529-b2ca-219d29fa5f68.jpg b/docs/ppstructure/model_train/images/185539141-68e71c75-5cf7-4529-b2ca-219d29fa5f68.jpg new file mode 100644 index 0000000000..42930f0e8a Binary files /dev/null and b/docs/ppstructure/model_train/images/185539141-68e71c75-5cf7-4529-b2ca-219d29fa5f68.jpg differ diff --git a/docs/ppstructure/model_train/images/185539517-ccf2372a-f026-4a7c-ad28-c741c770f60a.png b/docs/ppstructure/model_train/images/185539517-ccf2372a-f026-4a7c-ad28-c741c770f60a.png new file mode 100644 index 0000000000..cd5bc50421 Binary files /dev/null and b/docs/ppstructure/model_train/images/185539517-ccf2372a-f026-4a7c-ad28-c741c770f60a.png differ diff --git a/docs/ppstructure/model_train/images/185539735-37b5c2ef-629d-43fe-9abb-44bb717ef7ee.jpg b/docs/ppstructure/model_train/images/185539735-37b5c2ef-629d-43fe-9abb-44bb717ef7ee.jpg new file mode 100644 index 0000000000..4abdc096d4 Binary files /dev/null and b/docs/ppstructure/model_train/images/185539735-37b5c2ef-629d-43fe-9abb-44bb717ef7ee.jpg differ diff --git a/docs/ppstructure/model_train/images/185540080-0431e006-9235-4b6d-b63d-0b3c6e1de48f.jpg b/docs/ppstructure/model_train/images/185540080-0431e006-9235-4b6d-b63d-0b3c6e1de48f.jpg new file mode 100644 index 0000000000..bdef2ed23b Binary files /dev/null and b/docs/ppstructure/model_train/images/185540080-0431e006-9235-4b6d-b63d-0b3c6e1de48f.jpg differ diff --git a/docs/ppstructure/model_train/images/185540291-f64e5daf-6d42-4e7c-bbbb-471e3fac4fcc.png b/docs/ppstructure/model_train/images/185540291-f64e5daf-6d42-4e7c-bbbb-471e3fac4fcc.png new file mode 100644 index 0000000000..b5f45779fa Binary files /dev/null and b/docs/ppstructure/model_train/images/185540291-f64e5daf-6d42-4e7c-bbbb-471e3fac4fcc.png differ diff --git a/docs/ppstructure/model_train/images/195319840-68fc60ec-ea66-4095-b734-0ec115860341.png b/docs/ppstructure/model_train/images/195319840-68fc60ec-ea66-4095-b734-0ec115860341.png new file mode 100644 index 0000000000..d5b30f15a9 Binary files /dev/null and b/docs/ppstructure/model_train/images/195319840-68fc60ec-ea66-4095-b734-0ec115860341.png differ diff --git a/docs/ppstructure/model_train/images/layout.jpg b/docs/ppstructure/model_train/images/layout.jpg new file mode 100644 index 0000000000..b210054ca0 Binary files /dev/null and 
b/docs/ppstructure/model_train/images/layout.jpg differ diff --git a/docs/ppstructure/model_train/images/layout_res.jpg b/docs/ppstructure/model_train/images/layout_res.jpg new file mode 100644 index 0000000000..93b3a8bef3 Binary files /dev/null and b/docs/ppstructure/model_train/images/layout_res.jpg differ diff --git a/docs/ppstructure/model_train/images/recovery-20240708091126891.jpg b/docs/ppstructure/model_train/images/recovery-20240708091126891.jpg new file mode 100644 index 0000000000..a3817ab70e Binary files /dev/null and b/docs/ppstructure/model_train/images/recovery-20240708091126891.jpg differ diff --git a/docs/ppstructure/model_train/images/recovery_ch.jpg b/docs/ppstructure/model_train/images/recovery_ch.jpg new file mode 100644 index 0000000000..df5a5063f0 Binary files /dev/null and b/docs/ppstructure/model_train/images/recovery_ch.jpg differ diff --git a/docs/ppstructure/model_train/images/table_ch_result1.jpg b/docs/ppstructure/model_train/images/table_ch_result1.jpg new file mode 100644 index 0000000000..c75eee40f6 Binary files /dev/null and b/docs/ppstructure/model_train/images/table_ch_result1.jpg differ diff --git a/docs/ppstructure/model_train/images/table_ch_result2.png b/docs/ppstructure/model_train/images/table_ch_result2.png new file mode 100644 index 0000000000..426de79eac Binary files /dev/null and b/docs/ppstructure/model_train/images/table_ch_result2.png differ diff --git a/docs/ppstructure/model_train/images/table_ch_result3.jpg b/docs/ppstructure/model_train/images/table_ch_result3.jpg new file mode 100644 index 0000000000..bdd92aa6ee Binary files /dev/null and b/docs/ppstructure/model_train/images/table_ch_result3.jpg differ diff --git a/docs/ppstructure/model_train/images/tableocr_pipeline.jpg b/docs/ppstructure/model_train/images/tableocr_pipeline.jpg new file mode 100644 index 0000000000..da868791b1 Binary files /dev/null and b/docs/ppstructure/model_train/images/tableocr_pipeline.jpg differ diff --git a/docs/ppstructure/model_train/images/zh_val_42_ser.jpg b/docs/ppstructure/model_train/images/zh_val_42_ser.jpg new file mode 100644 index 0000000000..7b05e8436b Binary files /dev/null and b/docs/ppstructure/model_train/images/zh_val_42_ser.jpg differ diff --git a/docs/ppstructure/model_train/recovery_to_doc.en.md b/docs/ppstructure/model_train/recovery_to_doc.en.md new file mode 100644 index 0000000000..bddcda7325 --- /dev/null +++ b/docs/ppstructure/model_train/recovery_to_doc.en.md @@ -0,0 +1,184 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# Layout Recovery + +## 1. Introduction + +The layout recovery module is used to restore the image or pdf to an +editable Word file consistent with the original image layout. + +Two layout recovery methods are provided, you can choose by PDF format: + +- **Standard PDF parse(the input is standard PDF)**: Python based PDF to word library [pdf2docx](https://github.com/dothinking/pdf2docx) is optimized, the method extracts data from PDF with PyMuPDF, then parse layout with rule, finally, generate docx with python-docx. + +- **Image format PDF parse(the input can be standard PDF or image format PDF)**: Layout recovery combines [layout analysis](./train_layout.en.md)、[table recognition](./train_table.en.md) to better recover images, tables, titles, etc. supports input files in PDF and document image formats in Chinese and English. 
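+
+Which of the two methods fits a given file depends mainly on whether the PDF carries an extractable text layer. The snippet below is only an illustrative sketch (it is not part of PaddleOCR): it uses PyMuPDF, which the standard PDF parse already relies on, to make a rough check before choosing a method. The file name `sample.pdf` and the `min_chars` threshold are placeholders.
+
+```python linenums="1"
+import fitz  # PyMuPDF, already required by the standard PDF parse path
+
+
+def has_text_layer(pdf_path: str, min_chars: int = 50) -> bool:
+    """Rough heuristic: a 'standard' PDF exposes selectable text on most pages."""
+    doc = fitz.open(pdf_path)
+    try:
+        pages_with_text = sum(
+            1 for page in doc if len(page.get_text("text").strip()) >= min_chars
+        )
+        return pages_with_text >= max(1, len(doc) // 2)
+    finally:
+        doc.close()
+
+
+if __name__ == "__main__":
+    pdf_file = "sample.pdf"  # placeholder path
+    if has_text_layer(pdf_file):
+        print("Standard PDF parse (use_pdf2docx_api=true) is likely the better fit.")
+    else:
+        print("Image format PDF parse (layout analysis + OCR) is likely the better fit.")
+```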
+ +The input formats and application scenarios of the two methods are as follows: + +| method | input formats | application scenarios/problem | +| :-----: | :----------: | :----------------------------------------------------------: | +| Standard PDF parse | pdf | Advantages: Better recovery for non-paper documents, each page remains on the same page after restoration
Disadvantages: English characters in some Chinese documents may come out garbled, some content still overflows the current page, whole pages can be restored as tables, and some pictures are not recovered well | +| Image format PDF parse | pdf, picture | Advantages: Better suited to recovering the body content of academic papers, with stronger OCR recognition for Chinese and English documents<br>
Disadvantages: Currently, the recovery is based on rules, the effect of content typesetting (spacing, fonts, etc.) need to be further improved, and the effect of layout recovery depends on layout analysis | + +The following figure shows the effect of restoring the layout of documents by using PDF parse: + +![img](./images/195319840-68fc60ec-ea66-4095-b734-0ec115860341.png) + +The following figures show the effect of restoring the layout of English and Chinese documents by using OCR technique: + +![img](./images/recovery-20240708091126891.jpg) + +![img](./images/recovery_ch.jpg) + +## 2. Install + +### 2.1 Install PaddlePaddle + +```bash linenums="1" +python3 -m pip install --upgrade pip + +# If you have cuda9 or cuda10 installed on your machine, please run the following command to install +python3 -m pip install "paddlepaddle-gpu" -i https://mirror.baidu.com/pypi/simple + +# CPU installation +python3 -m pip install "paddlepaddle" -i https://mirror.baidu.com/pypi/simple +```` + +For more requirements, please refer to the instructions in [Installation Documentation](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/install/pip/macos-pip_en.html). + +### 2.2 Install PaddleOCR + +- **(1) Download source code** + +```bash linenums="1" +[Recommended] git clone https://github.com/PaddlePaddle/PaddleOCR + +# If the pull cannot be successful due to network problems, you can also choose to use the hosting on the code cloud: +git clone https://gitee.com/paddlepaddle/PaddleOCR + +# Note: Code cloud hosting code may not be able to synchronize the update of this github project in real time, there is a delay of 3 to 5 days, please use the recommended method first. +```` + +- **(2) Install recovery `requirements`** + +The layout restoration is exported as docx files, so python-docx API need to be installed, and PyMuPDF api([requires Python >= 3.7](https://pypi.org/project/PyMuPDF/)) need to be installed to process the input files in pdf format. + +Install all the libraries by running the following command: + +```bash linenums="1" +python3 -m pip install -r ppstructure/recovery/requirements.txt +```` + + And if using pdf parse method, we need to install pdf2docx api. + +```bash linenums="1" +wget https://paddleocr.bj.bcebos.com/whl/pdf2docx-0.0.0-py3-none-any.whl +pip3 install pdf2docx-0.0.0-py3-none-any.whl +``` + +## 3. Quick Start using standard PDF parse + +`use_pdf2docx_api` use PDF parse for layout recovery, The whl package is also provided for quick use, follow the above code, for more infomation please refer to [quickstart](../quick_start.en.md) for details. + +```bash linenums="1" +# install paddleocr +pip3 install "paddleocr>=2.6" +paddleocr --image_dir=ppstructure/recovery/UnrealText.pdf --type=structure --recovery=true --use_pdf2docx_api=true +``` + +Command line: + +```bash linenums="1" +python3 predict_system.py \ + --image_dir=ppstructure/recovery/UnrealText.pdf \ + --recovery=True \ + --use_pdf2docx_api=True \ + --output=../output/ +``` + +## 4. Quick Start using image format PDF parse + +Through layout analysis, we divided the image/PDF documents into regions, located the key regions, such as text, table, picture, etc., and recorded the location, category, and regional pixel value information of each region. 
Different regions are processed separately, where: + +- OCR detection and recognition is performed in the text area, and the coordinates of the OCR detection box and the text content information are added on the basis of the previous information + +- The table area identifies tables and records html and text information of tables +- Save the image directly + +We can restore the test picture through the layout information, OCR detection and recognition structure, table information, and saved pictures. + +The whl package is also provided for quick use, follow the above code, for more infomation please refer to [quickstart](../quick_start.en.md) for details. + +```bash linenums="1" +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true --lang='en' +``` + +### 4.1 Download models + +If input is English document, download English models: + +```bash linenums="1" +cd PaddleOCR/ppstructure + +# download model +mkdir inference && cd inference +# Download the detection model of the ultra-lightweight English PP-OCRv3 model and unzip it +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar +# Download the recognition model of the ultra-lightweight English PP-OCRv3 model and unzip it +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar +# Download the ultra-lightweight English table inch model and unzip it +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar +tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar +# Download the layout model of publaynet dataset and unzip it +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar +tar xf picodet_lcnet_x1_0_fgd_layout_infer.tar +cd .. +``` + +If input is Chinese document,download Chinese models: +[Chinese and English ultra-lightweight PP-OCRv3 model](../../ppocr/model_list.md) + +### 4.2 Layout recovery + +```bash linenums="1" +python3 predict_system.py \ + --image_dir=./docs/table/1.png \ + --det_model_dir=inference/en_PP-OCRv3_det_infer \ + --rec_model_dir=inference/en_PP-OCRv3_rec_infer \ + --rec_char_dict_path=../ppocr/utils/en_dict.txt \ + --table_model_dir=inference/en_ppstructure_mobile_v2.0_SLANet_infer \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --layout_model_dir=inference/picodet_lcnet_x1_0_fgd_layout_infer \ + --layout_dict_path=../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --recovery=True \ + --output=../output/ +``` + +After running, the docx of each picture will be saved in the directory specified by the output field + +Field: + +- image_dir:test file, can be picture, picture directory, pdf file, pdf file directory +- det_model_dir:OCR detection model path +- rec_model_dir:OCR recognition model path +- rec_char_dict_path:OCR recognition dict path. If the Chinese model is used, change to "../ppocr/utils/ppocr_keys_v1.txt". And if you trained the model on your own dataset, change to the trained dictionary +- table_model_dir:tabel recognition model path +- table_char_dict_path:tabel recognition dict path. If the Chinese model is used, no need to change +- layout_model_dir:layout analysis model path +- layout_dict_path:layout analysis dict path. 
If the Chinese model is used, change to "../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt" +- recovery:whether to enable layout of recovery, default False +- output:save the recovery result path + +## 5. More + +For training, evaluation and inference tutorial for text detection models, please refer to [text detection doc](../../ppocr/model_train/detection.en.md). + +For training, evaluation and inference tutorial for text recognition models, please refer to [text recognition doc](../../ppocr/model_train/recognition.en.md). + +For training, evaluation and inference tutorial for layout analysis models, please refer to [layout analysis doc](./train_layout.en.md) + +For training, evaluation and inference tutorial for table recognition models, please refer to [table recognition doc](./train_table.en.md) diff --git a/docs/ppstructure/model_train/recovery_to_doc.md b/docs/ppstructure/model_train/recovery_to_doc.md new file mode 100644 index 0000000000..d511a5843c --- /dev/null +++ b/docs/ppstructure/model_train/recovery_to_doc.md @@ -0,0 +1,191 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# 版面恢复 + +## 1. 简介 + +版面恢复就是将输入的图片、pdf内容仍然像原文档那样排列着,段落不变、顺序不变的输出到word文档中等。 + +提供了2种版面恢复方法,可根据输入PDF的格式进行选择: + +- **标准PDF解析(输入须为标准PDF)**:基于Python的pdf转word库[pdf2docx](https://github.com/dothinking/pdf2docx)进行优化,该方法通过PyMuPDF获取页面元素,然后利用规则解析章节、段落、表格等布局及样式,最后通过python-docx将解析的内容元素重建到word文档中。 +- **图片格式PDF解析(输入可为标准PDF或图片格式PDF)**:结合[版面分析](./train_layout.md)、[表格识别](./train_table.md)技术,从而更好地恢复图片、表格、标题等内容,支持中、英文pdf文档、文档图片格式的输入文件。 + +2种方法输入格式、适用场景如下: + +| 方法 | 支持输入文件 | 适用场景/存在问题 | +| :-------------: | :----------: | :----------------------------------------------------------: | +| 标准PDF解析 | pdf | 优点:非论文文档恢复效果更优、每一页内容恢复后仍在同一页
缺点:有些中文文档中的英文乱码、仍存在内容超出当前页面的情况、整页内容恢复为表格格式、部分图片恢复效果不佳 | +| 图片格式PDF解析 | pdf、图片 | 优点:更适合论文文档正文内容的恢复、中英文文档OCR识别效果好
缺点:目前内容恢复基于规则,内容排版效果(间距、字体等)待进一步提升、版面恢复效果依赖于版面分析效果 | + +下图展示了通过PDF解析版面恢复效果: + +![img](./images/195319840-68fc60ec-ea66-4095-b734-0ec115860341.png) + +下图分别展示了通过OCR技术,英文文档和中文文档版面恢复的效果: + +![img](./images/recovery-20240708091126891.jpg) + +![img](./images/recovery_ch.jpg) + +## 2. 安装 + +### 2.1 安装PaddlePaddle + +```bash linenums="1" +python3 -m pip install --upgrade pip + +# 您的机器安装的是CUDA9或CUDA10,请运行以下命令安装 +python3 -m pip install "paddlepaddle-gpu" -i https://mirror.baidu.com/pypi/simple + +# 您的机器是CPU,请运行以下命令安装 +python3 -m pip install "paddlepaddle" -i https://mirror.baidu.com/pypi/simple +``` + +更多需求,请参照[安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。 + +### 2.2 安装PaddleOCR + +- **(1)下载版面恢复源码** + +```bash linenums="1" +【推荐】git clone https://github.com/PaddlePaddle/PaddleOCR + +# 如果因为网络问题无法pull成功,也可选择使用码云上的托管: +git clone https://gitee.com/paddlepaddle/PaddleOCR + +# 注:码云托管代码可能无法实时同步本github项目更新,存在3~5天延时,请优先使用推荐方式。 +``` + +- **(2)安装recovery的`requirements`** + +版面恢复导出为docx文件,所以需要安装Python处理word文档的python-docx API,同时处理pdf格式的输入文件,需要安装PyMuPDF API([要求Python >= 3.7](https://pypi.org/project/PyMuPDF/))。 + +通过如下命令安装全部库: + +```bash linenums="1" +python3 -m pip install -r ppstructure/recovery/requirements.txt +``` + +使用pdf2docx库解析的方式恢复文档需要安装优化的pdf2docx。 + +```bash linenums="1" +wget https://paddleocr.bj.bcebos.com/whl/pdf2docx-0.0.0-py3-none-any.whl +pip3 install pdf2docx-0.0.0-py3-none-any.whl +``` + +## 3.使用标准PDF解析进行版面恢复 + +`use_pdf2docx_api`表示使用PDF解析的方式进行版面恢复,通过whl包的形式方便快速使用,代码如下,更多信息详见 [quickstart](../quick_start.md)。 + +```bash linenums="1" +# 安装 paddleocr,推荐使用2.6版本 +pip3 install "paddleocr>=2.6" +paddleocr --image_dir=ppstructure/recovery/UnrealText.pdf --type=structure --recovery=true --use_pdf2docx_api=true +``` + +通过命令行的方式: + +```bash linenums="1" +python3 predict_system.py \ + --image_dir=ppstructure/recovery/UnrealText.pdf \ + --recovery=True \ + --use_pdf2docx_api=True \ + --output=../output/ +``` + +## 4.使用图片格式PDF解析进行版面恢复 + +我们通过版面分析对图片/pdf形式的文档进行区域划分,定位其中的关键区域,如文字、表格、图片等,记录每个区域的位置、类别、区域像素值信息。对不同的区域分别处理,其中: + +- 文字区域直接进行OCR检测和识别,在之前信息基础上增加OCR检测框坐标和文本内容信息 +- 表格区域进行表格识别,记录表格html和文字信息 +- 图片直接保存 + +我们通过版面信息、OCR检测和识别结构、表格信息、保存的图片,对测试图片进行恢复即可。 + +提供如下代码实现版面恢复,也提供了whl包的形式方便快速使用,代码如下,更多信息详见 [quickstart](../quick_start.md)。 + +```bash linenums="1" +# 安装 paddleocr,推荐使用2.6版本 +pip3 install "paddleocr>=2.6" +# 中文测试图 +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true +# 英文测试图 +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true --lang='en' +# pdf测试文件 +paddleocr --image_dir=ppstructure/recovery/UnrealText.pdf --type=structure --recovery=true --lang='en' +``` + +### 4.1 下载模型 + +如果输入为英文文档类型,下载OCR检测和识别、版面分析、表格识别的英文模型 + +```bash linenums="1" +cd PaddleOCR/ppstructure + +# 下载模型 +mkdir inference && cd inference +# 下载英文超轻量PP-OCRv3检测模型并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar +# 下载英文超轻量PP-OCRv3识别模型并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar +# 下载英文表格识别模型并解压 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar +tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar +# 下载英文版面分析模型 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar +tar xf picodet_lcnet_x1_0_fgd_layout_infer.tar +cd .. 
+``` + +如果输入为中文文档类型,在下述链接中下载中文模型即可: + +[PP-OCRv3中英文超轻量文本检测和识别模型](../../ppocr/model_list.md) + +### 4.2 版面恢复 + +使用下载的模型恢复给定文档的版面,以英文模型为例,执行如下命令: + +```bash linenums="1" +python3 predict_system.py \ + --image_dir=./docs/table/1.png \ + --det_model_dir=inference/en_PP-OCRv3_det_infer \ + --rec_model_dir=inference/en_PP-OCRv3_rec_infer \ + --rec_char_dict_path=../ppocr/utils/en_dict.txt \ + --table_model_dir=inference/en_ppstructure_mobile_v2.0_SLANet_infer \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --layout_model_dir=inference/picodet_lcnet_x1_0_fgd_layout_infer \ + --layout_dict_path=../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --recovery=True \ + --output=../output/ +``` + +运行完成后,恢复版面的docx文档会保存到`output`字段指定的目录下 + +字段含义: + +- image_dir:测试文件,可以是图片、图片目录、pdf文件、pdf文件目录 +- det_model_dir:OCR检测模型路径 +- rec_model_dir:OCR识别模型路径 +- rec_char_dict_path:OCR识别字典,如果更换为中文模型,需要更改为"../ppocr/utils/ppocr_keys_v1.txt",如果您在自己的数据集上训练的模型,则更改为训练的字典的文件 +- table_model_dir:表格识别模型路径 +- table_char_dict_path:表格识别字典,如果更换为中文模型,不需要更换字典 +- layout_model_dir:版面分析模型路径 +- layout_dict_path:版面分析字典,如果更换为中文模型,需要更改为"../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt" +- recovery:是否进行版面恢复,默认False +- output:版面恢复结果保存路径 + +## 5. 更多 + +关于OCR检测模型的训练评估与推理,请参考:[文本检测教程](../../ppocr/model_train/detection.md) + +关于OCR识别模型的训练评估与推理,请参考:[文本识别教程](../../ppocr/model_train/recognition.md) + +关于版面分析模型的训练评估与推理,请参考:[版面分析教程](./train_layout.md) + +关于表格识别模型的训练评估与推理,请参考:[表格识别教程](./train_table.md) diff --git a/docs/ppstructure/model_train/train_kie.en.md b/docs/ppstructure/model_train/train_kie.en.md new file mode 100644 index 0000000000..6453ea3791 --- /dev/null +++ b/docs/ppstructure/model_train/train_kie.en.md @@ -0,0 +1,243 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# Key Information Extraction (KIE) + +## 1. Introduction + +Key information extraction (KIE) refers to extracting key information from text or images. As downstream task of OCR, the key information extraction task of document image has many practical application scenarios, such as form recognition, ticket information extraction, ID card information extraction, etc. + +PP-Structure conducts research based on the LayoutXLM multi-modal, and proposes the VI-LayoutXLM, which gets rid of visual features when finetuning the downstream tasks. An textline sorting method is also utilized to fit in reading order. What's more, UDML knowledge distillation is used for higher accuracy. Finally, the accuracy and inference speed of VI-LayoutXLM surpass those of LayoutXLM. + +The main features of the key information extraction module in PP-Structure are as follows. + +- Integrate multi-modal methods such as [LayoutXLM](https://arxiv.org/pdf/2104.08836.pdf), VI-LayoutXLM, and PP-OCR inference engine. +- Supports Semantic Entity Recognition (SER) and Relation Extraction (RE) tasks based on multimodal methods. Based on the SER task, the text recognition and classification in the image can be completed; based on the RE task, the relationship extraction of the text content in the image can be completed, such as judging the problem pair (pair). +- Supports custom training for SER tasks and RE tasks. +- Supports end-to-end system prediction and evaluation of OCR+SER. +- Supports end-to-end system prediction of OCR+SER+RE. +- Support SER model export and inference using PaddleInference. + +## 2. 
Performance + +We evaluate the methods on the Chinese dataset of [XFUND](https://github.com/doc-analysis/XFUND), and the performance is as follows + +|Model | Backbone | Task | Config file | Hmean | Inference time (ms) | Download link| +| --- | --- | --- | --- | --- | --- | --- | +|VI-LayoutXLM| VI-LayoutXLM-base | SER | [ser_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml)|**93.19%**| 15.49|[trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar)| +|LayoutXLM| LayoutXLM-base | SER | [ser_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml)|90.38%| 19.49 | [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar)| +|VI-LayoutXLM| VI-LayoutXLM-base | RE | [re_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh_udml.yml)|**83.92%**| 15.49|[trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar)| +|LayoutXLM| LayoutXLM-base | RE | [re_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutxlm_xfund_zh.yml)|74.83%| 19.49|[trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar)| + +- Note:Inference environment:V100 GPU + cuda10.2 + cudnn8.1.1 + TensorRT 7.2.3.4,tested using fp16. + +For more KIE models in PaddleOCR, please refer to [KIE model zoo](../../algorithm/overview.en.md). + +## 3. Visualization + +There are two main solutions to the key information extraction task based on VI-LayoutXLM series model. + +(1) Text detection + text recognition + semantic entity recognition (SER) + +(2) Text detection + text recognition + semantic entity recognition (SER) + relationship extraction (RE) + +The following images are demo results of the SER and RE models. For more detailed introduction to the above solutions, please refer to [KIE Guide](../blog/how_to_do_kie.en.md). + +### 3.1 SER + +Demo results for SER task are as follows. + +![img](./images/185539141-68e71c75-5cf7-4529-b2ca-219d29fa5f68.jpg) + +![img](./images/185310636-6ce02f7c-790d-479f-b163-ea97a5a04808.jpg) + +![img](./images/185539517-ccf2372a-f026-4a7c-ad28-c741c770f60a.png) + +![img](./images/185539735-37b5c2ef-629d-43fe-9abb-44bb717ef7ee.jpg) + +**Note:** test pictures are from [xfund dataset](https://github.com/doc-analysis/XFUND), [invoice dataset](https://aistudio.baidu.com/aistudio/datasetdetail/165561) and a composite ID card dataset. + +Boxes of different colors in the image represent different categories. + +The invoice and application form images have three categories: `request`, `answer` and `header`. The `question` and `answer` can be used to extract the relationship. + +For the ID card image, the model can directly identify the key information such as `name`, `gender`, `nationality`, so that the subsequent relationship extraction process is not required, and the key information extraction task can be completed using only one model. + +### 3.2 RE + +Demo results for RE task are as follows. + +![img](./images/185393805-c67ff571-cf7e-4217-a4b0-8b396c4f22bb.jpg) + +![img](./images/185540080-0431e006-9235-4b6d-b63d-0b3c6e1de48f.jpg) + +![img](./images/185540291-f64e5daf-6d42-4e7c-bbbb-471e3fac4fcc.png) + +Red boxes are questions, blue boxes are answers. The green lines means the two connected objects are a pair. + +## 4. Usage + +### 4.1 Prepare for the environment + +Use the following command to install KIE dependencies. 
+ +```bash linenums="1" +git clone https://github.com/PaddlePaddle/PaddleOCR.git +cd PaddleOCR +pip install -r requirements.txt +pip install -r ppstructure/kie/requirements.txt +# 安装PaddleOCR引擎用于预测 +pip install paddleocr -U +``` + +The visualized results of SER are saved in the `./output` folder by default. Examples of results are as follows. + +![img](https://github.com/PaddlePaddle/PaddleOCR/raw/main/ppstructure/docs/kie/result_ser/zh_val_42_ser.jpg) + +### 4.2 Quick start + +Here we use XFUND dataset to quickly experience the SER model and RE model. + +#### 4.2.1 Prepare for the dataset + +```bash linenums="1" +mkdir train_data +cd train_data +# download and uncompress the dataset +wget https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar && tar -xf XFUND.tar +cd .. +``` + +#### 4.2.2 Predict images using the trained model + +Use the following command to download the models. + +```bash linenums="1" +mkdir pretrained_model +cd pretrained_model +# download and uncompress the SER trained model +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar && tar -xf ser_vi_layoutxlm_xfund_pretrained.tar + +# download and uncompress the RE trained model +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar && tar -xf re_vi_layoutxlm_xfund_pretrained.tar +``` + +If you want to use OCR engine to obtain end-to-end prediction results, you can use the following command to predict. + +```bash linenums="1" +# just predict using SER trained model +python3 tools/infer_kie_token_ser.py \ + -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy \ + Global.infer_img=./ppstructure/docs/kie/input/zh_val_42.jpg + +# predict using SER and RE trained model at the same time +python3 ./tools/infer_kie_token_ser_re.py \ + -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrained_model/re_vi_layoutxlm_xfund_pretrained/best_accuracy \ + Global.infer_img=./train_data/XFUND/zh_val/image/zh_val_42.jpg \ + -c_ser configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o_ser Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy +``` + +The visual result images and the predicted text file will be saved in the `Global.save_res_path` directory. + +If you want to use a custom ocr model, you can set it through the following fields + +- `Global.kie_det_model_dir`: the detection inference model path +- `Global.kie_rec_model_dir`: the recognition inference model path + +If you want to load the text detection and recognition results collected before, you can use the following command to predict. 
+ +```bash linenums="1" +# just predict using SER trained model +python3 tools/infer_kie_token_ser.py \ + -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy \ + Global.infer_img=./train_data/XFUND/zh_val/val.json \ + Global.infer_mode=False + +# predict using SER and RE trained model at the same time +python3 ./tools/infer_kie_token_ser_re.py \ + -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrained_model/re_vi_layoutxlm_xfund_pretrained/best_accuracy \ + Global.infer_img=./train_data/XFUND/zh_val/val.json \ + Global.infer_mode=False \ + -c_ser configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o_ser Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy +``` + +#### 4.2.3 Inference using PaddleInference + +Firstly, download the inference SER inference model. + +```bash linenums="1" +mkdir inference +cd inference +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_infer.tar && tar -xf re_vi_layoutxlm_xfund_infer.tar +cd .. +``` + +- SER + +Use the following command for inference. + +```bash linenums="1" +cd ppstructure +python3 kie/predict_kie_token_ser.py \ + --kie_algorithm=LayoutXLM \ + --ser_model_dir=../inference/ser_vi_layoutxlm_xfund_infer \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" +``` + +The visual results and text file will be saved in directory `output`. + +- RE + +Use the following command for inference. + +```bash linenums="1" +cd ppstructure +python3 kie/predict_kie_token_ser_re.py \ + --kie_algorithm=LayoutXLM \ + --re_model_dir=../inference/re_vi_layoutxlm_xfund_infer \ + --ser_model_dir=../inference/ser_vi_layoutxlm_xfund_infer \ + --use_visual_backbone=False \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" +``` + +The visual results and text file will be saved in directory `output`. + +If you want to use a custom ocr model, you can set it through the following fields + +- `--det_model_dir`: the detection inference model path +- `--rec_model_dir`: the recognition inference model path + +### 4.3 More + +For training, evaluation and inference tutorial for KIE models, please refer to [KIE doc](../model_train/train_kie.en.md). + +For training, evaluation and inference tutorial for text detection models, please refer to [text detection doc](../../ppocr/model_train/detection.en.md). + +For training, evaluation and inference tutorial for text recognition models, please refer to [text recognition doc](../../ppocr/model_train/recognition.en.md). + +To complete the key information extraction task in your own scenario from data preparation to model selection, please refer to: [Guide to End-to-end KIE](../blog/how_to_do_kie.en.md)。 + +## 5. Reference + +- LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding, +- microsoft/unilm/layoutxlm, +- XFUND dataset, + +## 6. 
License + +The content of this project itself is licensed under the [Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/) diff --git a/docs/ppstructure/model_train/train_kie.md b/docs/ppstructure/model_train/train_kie.md new file mode 100644 index 0000000000..206c26df1b --- /dev/null +++ b/docs/ppstructure/model_train/train_kie.md @@ -0,0 +1,238 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# 关键信息抽取 + +## 1. 简介 + +关键信息抽取 (Key Information Extraction, KIE)指的是是从文本或者图像中,抽取出关键的信息。针对文档图像的关键信息抽取任务作为OCR的下游任务,存在非常多的实际应用场景,如表单识别、车票信息抽取、身份证信息抽取等。 + +PP-Structure 基于 LayoutXLM 文档多模态系列方法进行研究与优化,设计了视觉特征无关的多模态模型结构VI-LayoutXLM,同时引入符合阅读顺序的文本行排序方法以及UDML联合互学习蒸馏方法,最终在精度与速度均超越LayoutXLM。 + +PP-Structure中关键信息抽取模块的主要特性如下: + +- 集成[LayoutXLM](https://arxiv.org/pdf/2104.08836.pdf)、VI-LayoutXLM等多模态模型以及PP-OCR预测引擎。 +- 支持基于多模态方法的语义实体识别 (Semantic Entity Recognition, SER) 以及关系抽取 (Relation Extraction, RE) 任务。基于 SER 任务,可以完成对图像中的文本识别与分类;基于 RE 任务,可以完成对图象中的文本内容的关系提取,如判断问题对(pair)。 +- 支持SER任务和RE任务的自定义训练。 +- 支持OCR+SER的端到端系统预测与评估。 +- 支持OCR+SER+RE的端到端系统预测。 +- 支持SER模型的动转静导出与基于PaddleInfernece的模型推理。 + +## 2. 精度与性能 + +我们在 [XFUND](https://github.com/doc-analysis/XFUND) 的中文数据集上对算法进行了评估,SER与RE上的任务性能如下 + +| 模型 | 骨干网络 | 任务 | 配置文件 | hmean | 预测耗时(ms) | 下载链接 | +| ------------ | ----------------- | ---- | ------- | ---------- | ------ | ------ | +| VI-LayoutXLM | VI-LayoutXLM-base | SER | [ser_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml) | **93.19%** | 15.49 | [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar) | +| LayoutXLM | LayoutXLM-base | SER | [ser_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml) | 90.38% | 19.49 | [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar) | +| VI-LayoutXLM | VI-LayoutXLM-base | RE | [re_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh_udml.yml) | **83.92%** | 15.49 | [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar) | +| LayoutXLM | LayoutXLM-base | RE | [re_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutxlm_xfund_zh.yml) | 74.83% | 19.49 | [训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar) | + +- 注:预测耗时测试条件:V100 GPU + cuda10.2 + cudnn8.1.1 + TensorRT 7.2.3.4,使用FP16进行测试。 + +更多关于PaddleOCR中关键信息抽取模型的介绍,请参考[关键信息抽取模型库](../../algorithm/overview.md)。 + +## 3. 
效果演示 + +基于多模态模型的关键信息抽取任务有2种主要的解决方案。 + +(1)文本检测 + 文本识别 + 语义实体识别(SER) +(2)文本检测 + 文本识别 + 语义实体识别(SER) + 关系抽取(RE) + +下面给出SER与RE任务的示例效果,关于上述解决方案的详细介绍,请参考[关键信息抽取全流程指南](../blog/how_to_do_kie.md)。 + +### 3.1 SER + +对于SER任务,效果如下所示。 + +![img](./images/185539141-68e71c75-5cf7-4529-b2ca-219d29fa5f68.jpg) + +![img](./images/185310636-6ce02f7c-790d-479f-b163-ea97a5a04808.jpg) + +![img](./images/185539517-ccf2372a-f026-4a7c-ad28-c741c770f60a.png) + +![img](./images/185539735-37b5c2ef-629d-43fe-9abb-44bb717ef7ee.jpg) + +**注意:** 测试图片来源于[XFUND数据集](https://github.com/doc-analysis/XFUND)、[发票数据集](https://aistudio.baidu.com/aistudio/datasetdetail/165561)以及合成的身份证数据集。 + +图中不同颜色的框表示不同的类别。 + +图中的发票以及申请表图像,有`QUESTION`, `ANSWER`, `HEADER` 3种类别,识别的`QUESTION`, `ANSWER`可以用于后续的问题与答案的关系抽取。 + +图中的身份证图像,则直接识别出其中的`姓名`、`性别`、`民族`等关键信息,这样就无需后续的关系抽取过程,一个模型即可完成关键信息抽取。 + +### 3.2 RE + +对于RE任务,效果如下所示。 + +![img](./images/185393805-c67ff571-cf7e-4217-a4b0-8b396c4f22bb.jpg) + +![img](./images/185540080-0431e006-9235-4b6d-b63d-0b3c6e1de48f.jpg) + +![img](./images/185540291-f64e5daf-6d42-4e7c-bbbb-471e3fac4fcc.png) + +红色框是问题,蓝色框是答案。绿色线条表示连接的两端为一个key-value的pair。 + +## 4. 使用 + +### 4.1 准备环境 + +使用下面的命令安装运行SER与RE关键信息抽取的依赖。 + +```bash linenums="1" +git clone https://github.com/PaddlePaddle/PaddleOCR.git +cd PaddleOCR +pip install -r requirements.txt +pip install -r ppstructure/kie/requirements.txt +# 安装PaddleOCR引擎用于预测 +pip install paddleocr -U +``` + +### 4.2 快速开始 + +下面XFUND数据集,快速体验SER模型与RE模型。 + +#### 4.2.1 准备数据 + +```bash linenums="1" +mkdir train_data +cd train_data +# 下载与解压数据 +wget https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar && tar -xf XFUND.tar +cd .. +``` + +#### 4.2.2 基于动态图的预测 + +首先下载模型。 + +```bash linenums="1" +mkdir pretrained_model +cd pretrained_model +# 下载并解压SER预训练模型 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar && tar -xf ser_vi_layoutxlm_xfund_pretrained.tar + +# 下载并解压RE预训练模型 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar && tar -xf re_vi_layoutxlm_xfund_pretrained.tar +``` + +如果希望使用OCR引擎,获取端到端的预测结果,可以使用下面的命令进行预测。 + +```bash linenums="1" +# 仅预测SER模型 +python3 tools/infer_kie_token_ser.py \ + -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy \ + Global.infer_img=./ppstructure/docs/kie/input/zh_val_42.jpg + +# SER + RE模型串联 +python3 ./tools/infer_kie_token_ser_re.py \ + -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrained_model/re_vi_layoutxlm_xfund_pretrained/best_accuracy \ + Global.infer_img=./train_data/XFUND/zh_val/image/zh_val_42.jpg \ + -c_ser configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o_ser Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy +``` + +`Global.save_res_path`目录中会保存可视化的结果图像以及预测的文本文件。 + +如果想使用自定义OCR模型,可通过如下字段进行设置 + +- `Global.kie_det_model_dir`: 设置检测inference模型地址 +- `Global.kie_rec_model_dir`: 设置识别inference模型地址 + +如果希望加载标注好的文本检测与识别结果,仅预测可以使用下面的命令进行预测。 + +```bash linenums="1" +# 仅预测SER模型 +python3 tools/infer_kie_token_ser.py \ + -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy \ + Global.infer_img=./train_data/XFUND/zh_val/val.json \ + Global.infer_mode=False + +# SER + RE模型串联 +python3 ./tools/infer_kie_token_ser_re.py \ + -c 
configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrained_model/re_vi_layoutxlm_xfund_pretrained/best_accuracy \ + Global.infer_img=./train_data/XFUND/zh_val/val.json \ + Global.infer_mode=False \ + -c_ser configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o_ser Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy +``` + +#### 4.2.3 基于PaddleInference的预测 + +首先下载SER和RE的推理模型。 + +```bash linenums="1" +mkdir inference +cd inference +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_infer.tar && tar -xf re_vi_layoutxlm_xfund_infer.tar +cd .. +``` + +- SER + +执行下面的命令进行预测。 + +```bash linenums="1" +cd ppstructure +python3 kie/predict_kie_token_ser.py \ + --kie_algorithm=LayoutXLM \ + --ser_model_dir=../inference/ser_vi_layoutxlm_xfund_infer \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" +``` + +可视化结果保存在`output`目录下。 + +- RE + +执行下面的命令进行预测。 + +```bash linenums="1" +cd ppstructure +python3 kie/predict_kie_token_ser_re.py \ + --kie_algorithm=LayoutXLM \ + --re_model_dir=../inference/re_vi_layoutxlm_xfund_infer \ + --ser_model_dir=../inference/ser_vi_layoutxlm_xfund_infer \ + --use_visual_backbone=False \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" +``` + +可视化结果保存在`output`目录下。 + +如果想使用自定义OCR模型,可通过如下字段进行设置 + +- `--det_model_dir`: 设置检测inference模型地址 +- `--rec_model_dir`: 设置识别inference模型地址 + +### 4.3 更多 + +关于KIE模型的训练评估与推理,请参考:[关键信息抽取教程](./train_kie.md)。 + +关于文本检测模型的训练评估与推理,请参考:[文本检测教程](../../ppocr/model_train/detection.md)。 + +关于文本识别模型的训练评估与推理,请参考:[文本识别教程](../../ppocr/model_train/recognition.md)。 + +关于怎样在自己的场景中完成关键信息抽取任务,请参考:[关键信息抽取全流程指南](../blog/how_to_do_kie.md)。 + +## 5. 参考链接 + +- LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding, +- microsoft/unilm/layoutxlm, +- XFUND dataset, + +## 6. License + +The content of this project itself is licensed under the [Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/) diff --git a/docs/ppstructure/model_train/train_layout.en.md b/docs/ppstructure/model_train/train_layout.en.md new file mode 100644 index 0000000000..0af0a5b0f4 --- /dev/null +++ b/docs/ppstructure/model_train/train_layout.en.md @@ -0,0 +1,444 @@ +--- +comments: true +--- + +# Layout analysis + +## 1. Introduction + +Layout analysis refers to the regional division of documents in the form of pictures and the positioning of key areas, such as text, title, table, picture, etc. The layout analysis algorithm is based on the lightweight model PP-picodet of [PaddleDetection]( https://github.com/PaddlePaddle/PaddleDetection ), including English layout analysis, Chinese layout analysis and table layout analysis models. English layout analysis models can detect document layout elements such as text, title, table, figure, list. Chinese layout analysis models can detect document layout elements such as text, figure, figure caption, table, table caption, header, footer, reference, and equation. 
Table layout analysis models can detect table regions. + +![img](./images/layout.jpg) + +## 2. Quick start + +PP-Structure currently provides layout analysis models in Chinese, English and table documents. For the model link, see [models_list](../models_list.en.md). The whl package is also provided for quick use, see [quickstart](../quick_start.en.md) for details. + +## 3. Install + +### 3.1. Install PaddlePaddle + +- **(1) Install PaddlePaddle** + +```bash linenums="1" +python3 -m pip install --upgrade pip + +# GPU Install +python3 -m pip install "paddlepaddle-gpu>=2.3" -i https://mirror.baidu.com/pypi/simple + +# CPU Install +python3 -m pip install "paddlepaddle>=2.3" -i https://mirror.baidu.com/pypi/simple +``` + +For more requirements, please refer to the instructions in the [Install file](https://www.paddlepaddle.org.cn/install/quick)。 + +### 3.2. Install PaddleDetection + +- **(1)Download PaddleDetection Source code** + +```bash linenums="1" +git clone https://github.com/PaddlePaddle/PaddleDetection.git +``` + +- **(2)Install third-party libraries** + +```bash linenums="1" +cd PaddleDetection +python3 -m pip install -r requirements.txt +``` + +## 4. Data preparation + +If you want to experience the prediction process directly, you can skip data preparation and download the pre-training model. + +### 4.1. English data set + +Download document analysis data set [PubLayNet](https://developer.ibm.com/exchanges/data/all/publaynet/)(Dataset 96G),contains 5 classes:`{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}` + +``` +# Download data +wget https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/publaynet.tar.gz +# Decompress data +tar -xvf publaynet.tar.gz +``` + +Uncompressed **directory structure:** + +``` +|-publaynet + |- test + |- PMC1277013_00004.jpg + |- PMC1291385_00002.jpg + | ... + |- train.json + |- train + |- PMC1291385_00002.jpg + |- PMC1277013_00004.jpg + | ... + |- val.json + |- val + |- PMC538274_00004.jpg + |- PMC539300_00004.jpg + | ... +``` + +**data distribution:** + +| File or Folder | Description | num | +| :------------- | :------------- | ------- | +| `train/` | Training set pictures | 335,703 | +| `val/` | Verification set pictures | 11,245 | +| `test/` | Test set pictures | 11,405 | +| `train.json` | Training set annotation files | - | +| `val.json` | Validation set dimension files | - | + +**Data Annotation** + +The JSON file contains the annotations of all images, and the data is stored in a dictionary nested manner.Contains the following keys: + +- info,represents the dimension file info。 + +- licenses,represents the dimension file licenses。 + +- images,represents the list of image information in the annotation file,each element is the information of an image。The information of one of the images is as follows: + + ``` + { + 'file_name': 'PMC4055390_00006.jpg', # file_name + 'height': 601, # image height + 'width': 792, # image width + 'id': 341427 # image id + } + ``` + +- annotations, represents the list of annotation information of the target object in the annotation file,each element is the annotation information of a target object。The following is the annotation information of one of the target objects: + + ``` + { + + 'segmentation': # Segmentation annotation of objects + 'area': 60518.099043117836, # Area of object + 'iscrowd': 0, # iscrowd + 'image_id': 341427, # image id + 'bbox': [50.58, 490.86, 240.15, 252.16], # bbox [x1,y1,w,h] + 'category_id': 1, # category_id + 'id': 3322348 # image id + } + ``` + +### 4.2. 
More datasets + +We provide CDLA(Chinese layout analysis), TableBank(Table layout analysis)etc. data set download links,process to the JSON format of the above annotation file,that is, the training can be conducted in the same way。 + +| dataset | 简介 | +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [cTDaR2019_cTDaR](https://cndplab-founder.github.io/cTDaR2019/) | For form detection (TRACKA) and form identification (TRACKB).Image types include historical data sets (beginning with cTDaR_t0, such as CTDAR_T00872.jpg) and modern data sets (beginning with cTDaR_t1, CTDAR_T10482.jpg). | +| [IIIT-AR-13K](http://cvit.iiit.ac.in/usodi/iiitar13k.php) | Data sets constructed by manually annotating figures or pages from publicly available annual reports, containing 5 categories:table, figure, natural image, logo, and signature. | +| [TableBank](https://github.com/doc-analysis/TableBank) | For table detection and recognition of large datasets, including Word and Latex document formats | +| [CDLA](https://github.com/buptlihang/CDLA) | Chinese document layout analysis data set, for Chinese literature (paper) scenarios, including 10 categories:Text, Title, Figure, Figure caption, Table, Table caption, Header, Footer, Reference, Equation | +| [DocBank](https://github.com/doc-analysis/DocBank) | Large-scale dataset (500K document pages) constructed using weakly supervised methods for document layout analysis, containing 12 categories:Author, Caption, Date, Equation, Figure, Footer, List, Paragraph, Reference, Section, Table, Title | + +## 5. Start training + +Training scripts, evaluation scripts, and prediction scripts are provided, and the PubLayNet pre-training model is used as an example in this section. + +If you do not want training and directly experience the following process of model evaluation, prediction, motion to static, and inference, you can download the provided pre-trained model (PubLayNet dataset) and skip this part. + +```bash linenums="1" +mkdir pretrained_model +cd pretrained_model +# Download PubLayNet pre-training model(Direct experience model evaluates, predicts, and turns static) +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout.pdparams +# Download the PubLaynet inference model(Direct experience model reasoning) +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar +``` + +If the test image is Chinese, the pre-trained model of Chinese CDLA dataset can be downloaded to identify 10 types of document regions:Table, Figure, Figure caption, Table, Table caption, Header, Footer, Reference, Equation,Download the training model and inference model of Model 'picodet_lcnet_x1_0_fgd_layout_cdla' in [layout analysis model](../models_list.en.md)。If only the table area in the image is detected, you can download the pre-trained model of the table dataset, and download the training model and inference model of the 'picodet_LCnet_x1_0_FGd_layout_table' model in [Layout Analysis model](../models_list.en.md) + +### 5.1. Train + +Start training with the PaddleDetection [layout analysis profile](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.5/configs/picodet/legacy_model/application/layout_analysis) + +- Modify Profile + +If you want to train your own data set, you need to modify the data configuration and the number of categories in the configuration file. 
+ +Using 'configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml' as an example, the change is as follows: + +```yaml linenums="1" +metric: COCO +# Number of categories +num_classes: 5 + +TrainDataset: + !COCODataSet + # Modify to your own training data directory + image_dir: train + # Modify to your own training data label file + anno_path: train.json + # Modify to your own training data root directory + dataset_dir: /root/publaynet/ + data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] + +EvalDataset: + !COCODataSet + # Modify to your own validation data directory + image_dir: val + # Modify to your own validation data label file + anno_path: val.json + # Modify to your own validation data root + dataset_dir: /root/publaynet/ + +TestDataset: + !ImageFolder + # Modify to your own test data label file + anno_path: /root/publaynet/val.json +``` + +- Start training. During training, PP picodet pre training model will be downloaded by default. There is no need to download in advance. + +```bash linenums="1" +# GPU training supports single-card and multi-card training +# The training log is automatically saved to the log directory + +# Single card training +export CUDA_VISIBLE_DEVICES=0 +python3 tools/train.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --eval + +# Multi-card training, with the -- GPUS parameter specifying the card number +export CUDA_VISIBLE_DEVICES=0,1,2,3 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --eval +``` + +**Attention:**If the video memory is out during training, adjust Batch_size in TrainReader and base_LR in LearningRate. The published config is obtained by 8-card training. If the number of GPU cards is changed to 1, then the base_LR needs to be reduced by 8 times. + +After starting training normally, you will see the following log output: + +``` +[08/15 04:02:30] ppdet.utils.checkpoint INFO: Finish loading model weights: /root/.cache/paddle/weights/LCNet_x1_0_pretrained.pdparams +[08/15 04:02:46] ppdet.engine INFO: Epoch: [0] [ 0/1929] learning_rate: 0.040000 loss_vfl: 1.216707 loss_bbox: 1.142163 loss_dfl: 0.544196 loss: 2.903065 eta: 17 days, 13:50:26 batch_cost: 15.7452 data_cost: 2.9112 ips: 1.5243 images/s +[08/15 04:03:19] ppdet.engine INFO: Epoch: [0] [ 20/1929] learning_rate: 0.064000 loss_vfl: 1.180627 loss_bbox: 0.939552 loss_dfl: 0.442436 loss: 2.628206 eta: 2 days, 12:18:53 batch_cost: 1.5770 data_cost: 0.0008 ips: 15.2184 images/s +[08/15 04:03:47] ppdet.engine INFO: Epoch: [0] [ 40/1929] learning_rate: 0.088000 loss_vfl: 0.543321 loss_bbox: 1.071401 loss_dfl: 0.457817 loss: 2.057003 eta: 2 days, 0:07:03 batch_cost: 1.3190 data_cost: 0.0007 ips: 18.1954 images/s +[08/15 04:04:12] ppdet.engine INFO: Epoch: [0] [ 60/1929] learning_rate: 0.112000 loss_vfl: 0.630989 loss_bbox: 0.859183 loss_dfl: 0.384702 loss: 1.883143 eta: 1 day, 19:01:29 batch_cost: 1.2177 data_cost: 0.0006 ips: 19.7087 images/s +``` + +- `--eval` indicates that the best model is saved as `output/picodet_lcnet_x1_0_layout/best_accuracy` by default during the evaluation process 。 + +**Note that the configuration file for prediction / evaluation must be consistent with the training.** + +### 5.2. 
FGD Distillation Training + +PaddleDetection supports FGD-based [Focal and Global Knowledge Distillation for Detectors]( https://arxiv.org/abs/2111.11837v1) The training process of the target detection model of distillation, FGD distillation is divided into two parts `Focal` and `Global`. `Focal` Distillation separates the foreground and background of the image, allowing the student model to focus on the key pixels of the foreground and background features of the teacher model respectively;`Global`Distillation section reconstructs the relationships between different pixels and transfers them from the teacher to the student to compensate for the global information lost in `Focal`Distillation. + +Change the dataset and modify the data configuration and number of categories in the [TODO] configuration, referring to 4.1. Start training: + +```bash linenums="1" +# Single Card Training +export CUDA_VISIBLE_DEVICES=0 +python3 tools/train.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + --eval +``` + +- `-c`: Specify the model configuration file. +- `--slim_config`: Specify the compression policy profile. + +## 6. Model evaluation and prediction + +### 6.1. Indicator evaluation + + Model parameters in training are saved by default in `output/picodet_ Lcnet_ X1_ 0_ Under the layout` directory. When evaluating indicators, you need to set `weights` to point to the saved parameter file.Assessment datasets can be accessed via `configs/picodet/legacy_ Model/application/layout_ Analysis/picodet_ Lcnet_ X1_ 0_ Layout. Yml` . Modify `EvalDataset` : `img_dir`,`anno_ Path`and`dataset_dir` setting. + +```bash linenums="1" +# GPU evaluation, weights as weights to be measured +python3 tools/eval.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + -o weights=./output/picodet_lcnet_x1_0_layout/best_model +``` + +The following information will be printed out, such as mAP, AP0.5, etc. + +```python linenums="1" + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.935 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.979 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.956 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.404 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.782 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.969 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.539 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.938 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.949 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.495 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.818 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.978 +[08/15 07:07:09] ppdet.engine INFO: Total sample number: 11245, averge FPS: 24.405059207157436 +[08/15 07:07:09] ppdet.engine INFO: Best test bbox ap is 0.935. 
+``` + +If you use the provided pre-trained model or the FGD distillation-trained model for evaluation, replace the `weights` model path and execute the following command for evaluation: + +```bash linenums="1" +python3 tools/eval.py \ +    -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ +    --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ +    -o weights=output/picodet_lcnet_x2_5_layout/best_model +``` + +- `-c`: Specify the model configuration file. +- `--slim_config`: Specify the distillation policy profile. +- `-o weights`: Specify the model path trained by the distillation algorithm. + +### 6.2. Test Layout Analysis Results + +The configuration file used for prediction must be consistent with the one used for training. For example, suppose you completed the training process with `python3 tools/train.py -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml`. + +With the trained PaddleDetection model, you can use the following commands to make model predictions. + +```bash linenums="1" +python3 tools/infer.py \ +    -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ +    -o weights='output/picodet_lcnet_x1_0_layout/best_model.pdparams' \ +    --infer_img='docs/images/layout.jpg' \ +    --output_dir=output_dir/ \ +    --draw_threshold=0.5 +``` + +- `--infer_img`: Run inference on a single image; you can also use `--infer_dir` to run inference on all images in a directory. +- `--output_dir`: Specify the path to save the visualization results. +- `--draw_threshold`: Specify the NMS threshold for drawing the result box. + +If you use the provided pre-trained model or the FGD distillation-trained model for prediction, change the `weights` model path and execute the following command to make the prediction: + +```bash linenums="1" +python3 tools/infer.py \ +    -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ +    --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ +    -o weights='output/picodet_lcnet_x2_5_layout/best_model.pdparams' \ +    --infer_img='docs/images/layout.jpg' \ +    --output_dir=output_dir/ \ +    --draw_threshold=0.5 +``` + +## 7. Model Export and Inference + +### 7.1 Model Export + +The inference model (the model saved by `paddle.jit.save`) is generally a solidified model saved after the model training is completed, and is mostly used for prediction in deployment. + +The model saved during the training process is the checkpoints model, which only saves the parameters of the model and is mostly used to resume training. + +Compared with the checkpoints model, the inference model additionally saves the structural information of the model. Therefore, it is easier to deploy, because the model structure and model parameters are already solidified in the inference model files, and it is suitable for integration with actual systems.
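+
+Before walking through the export commands, it may help to see what the exported files are for. The snippet below is only a minimal sketch of how a solidified inference model can be loaded with the Paddle Inference Python API: the model directory is assumed to be the one produced by the export step described next, and the input names in the final comment are only what PicoDet-based detectors typically expose.
+
+```python linenums="1"
+from paddle.inference import Config, create_predictor
+
+# Assumed path: the directory written by tools/export_model.py in the steps below.
+model_dir = "output_inference/picodet_lcnet_x1_0_layout"
+
+# The solidified structure (model.pdmodel) and weights (model.pdiparams) are all that is needed.
+config = Config(model_dir + "/model.pdmodel", model_dir + "/model.pdiparams")
+config.disable_gpu()                # run on CPU; use config.enable_use_gpu(200, 0) for GPU 0 instead
+config.enable_memory_optim()
+
+predictor = create_predictor(config)
+print(predictor.get_input_names())  # typically ['image', 'scale_factor'] for PicoDet-based models
+```
+
+In practice you rarely call this API by hand for layout analysis: `deploy/python/infer.py` (used in section 7.2) wraps the same loading logic together with image preprocessing and post-processing.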
+ +Layout analysis model to inference model steps are as follows: + +```bash linenums="1" +python3 tools/export_model.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + -o weights=output/picodet_lcnet_x1_0_layout/best_model \ + --output_dir=output_inference/ +``` + +- If no post-export processing is required, specify:`-o export.benchmark=True`(If -o already exists, delete -o here) +- If you do not need to export NMS, specify:`-o export.nms=False` + +After successful conversion, there are three files in the directory: + +``` +output_inference/picodet_lcnet_x1_0_layout/ + ├── model.pdiparams # inference Parameter file for model + ├── model.pdiparams.info # inference Model parameter information, ignorable + └── model.pdmodel # inference Model Structure File for Model +``` + +If you change the `weights` model path using the provided pre-training model to the Inference model, or using the FGD distillation training model, the model to inference model steps are as follows: + +```bash linenums="1" +python3 tools/export_model.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + -o weights=./output/picodet_lcnet_x2_5_layout/best_model \ + --output_dir=output_inference/ +``` + +### 7.2 Model inference + +Replace model_with the provided inference training model for inference or the FGD distillation training `model_dir`Inference model path, execute the following commands for inference: + +```bash linenums="1" +python3 deploy/python/infer.py \ + --model_dir=output_inference/picodet_lcnet_x1_0_layout/ \ + --image_file=docs/images/layout.jpg \ + --device=CPU +``` + +- --device:Specify the GPU or CPU device + +When model inference is complete, you will see the following log output: + +``` +------------------------------------------ +----------- Model Configuration ----------- +Model Arch: PicoDet +Transform Order: +--transform op: Resize +--transform op: NormalizeImage +--transform op: Permute +--transform op: PadStride +-------------------------------------------- +class_id:0, confidence:0.9921, left_top:[20.18,35.66],right_bottom:[341.58,600.99] +class_id:0, confidence:0.9914, left_top:[19.77,611.42],right_bottom:[341.48,901.82] +class_id:0, confidence:0.9904, left_top:[369.36,375.10],right_bottom:[691.29,600.59] +class_id:0, confidence:0.9835, left_top:[369.60,608.60],right_bottom:[691.38,736.72] +class_id:0, confidence:0.9830, left_top:[369.58,805.38],right_bottom:[690.97,901.80] +class_id:0, confidence:0.9716, left_top:[383.68,271.44],right_bottom:[688.93,335.39] +class_id:0, confidence:0.9452, left_top:[370.82,34.48],right_bottom:[688.10,63.54] +class_id:1, confidence:0.8712, left_top:[370.84,771.03],right_bottom:[519.30,789.13] +class_id:3, confidence:0.9856, left_top:[371.28,67.85],right_bottom:[685.73,267.72] +save result to: output/layout.jpg +Test iter 0 +------------------ Inference Time Info ---------------------- +total_time(ms): 2196.0, img_num: 1 +average latency time(ms): 2196.00, QPS: 0.455373 +preprocess_time(ms): 2172.50, inference_time(ms): 11.90, postprocess_time(ms): 11.60 +``` + +- Model:model structure +- Transform Order:Preprocessing operation +- class_id, confidence, left_top, right_bottom:Indicates category id, confidence level, upper left coordinate, lower right coordinate, respectively +- save result to:Save path of visual layout analysis results, default save to ./output 
folder +- inference time info:Inference time, where preprocess_time represents the preprocessing time, Inference_time represents the model prediction time, and postprocess_time represents the post-processing time + +The result of visualization layout is shown in the following figure + +![img](./images/layout_res.jpg) + +## Citations + +```bibtex +@inproceedings{zhong2019publaynet, + title={PubLayNet: largest dataset ever for document layout analysis}, + author={Zhong, Xu and Tang, Jianbin and Yepes, Antonio Jimeno}, + booktitle={2019 International Conference on Document Analysis and Recognition (ICDAR)}, + year={2019}, + volume={}, + number={}, + pages={1015-1022}, + doi={10.1109/ICDAR.2019.00166}, + ISSN={1520-5363}, + month={Sep.}, + organization={IEEE} +} + +@inproceedings{yang2022focal, + title={Focal and global knowledge distillation for detectors}, + author={Yang, Zhendong and Li, Zhe and Jiang, Xiaohu and Gong, Yuan and Yuan, Zehuan and Zhao, Danpei and Yuan, Chun}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={4643--4652}, + year={2022} +} +``` diff --git a/docs/ppstructure/model_train/train_layout.md b/docs/ppstructure/model_train/train_layout.md new file mode 100644 index 0000000000..15207db619 --- /dev/null +++ b/docs/ppstructure/model_train/train_layout.md @@ -0,0 +1,441 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# 版面分析 + +## 1. 简介 + +版面分析指的是对图片形式的文档进行区域划分,定位其中的关键区域,如文字、标题、表格、图片等。版面分析算法基于[PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection)的轻量模型PP-PicoDet进行开发,包含英文、中文、表格版面分析3类模型。其中,英文模型支持Text、Title、Tale、Figure、List5类区域的检测,中文模型支持Text、Title、Figure、Figure caption、Table、Table caption、Header、Footer、Reference、Equation10类区域的检测,表格版面分析支持Table区域的检测,版面分析效果如下图所示: + +![img](./images/layout.jpg) + +## 2. 快速开始 + +PP-Structure目前提供了中文、英文、表格三类文档版面分析模型,模型链接见 [models_list](../models_list.md)。也提供了whl包的形式方便快速使用,详见 [quickstart](../quick_start.md)。 + +## 3. 安装 + +### 3.1. 安装PaddlePaddle + +- **(1) 安装PaddlePaddle** + +```bash linenums="1" +python3 -m pip install --upgrade pip + +# GPU安装 +python3 -m pip install "paddlepaddle-gpu>=2.3" -i https://mirror.baidu.com/pypi/simple + +# CPU安装 +python3 -m pip install "paddlepaddle>=2.3" -i https://mirror.baidu.com/pypi/simple +``` + +更多需求,请参照[安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。 + +### 3.2. 安装PaddleDetection + +- **(1)下载PaddleDetection源码** + +```bash linenums="1" +git clone https://github.com/PaddlePaddle/PaddleDetection.git +``` + +- **(2)安装其他依赖** + +```bash linenums="1" +cd PaddleDetection +python3 -m pip install -r requirements.txt +``` + +## 4. 数据准备 + +如果希望直接体验预测过程,可以跳过数据准备,下载我们提供的预训练模型。 + +### 4.1. 英文数据集 + +下载文档分析数据集[PubLayNet](https://developer.ibm.com/exchanges/data/all/publaynet/)(数据集96G),包含5个类:`{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}` + +``` +# 下载数据 +wget https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/publaynet.tar.gz +# 解压数据 +tar -xvf publaynet.tar.gz +``` + +解压之后的**目录结构:** + +``` +|-publaynet + |- test + |- PMC1277013_00004.jpg + |- PMC1291385_00002.jpg + | ... + |- train.json + |- train + |- PMC1291385_00002.jpg + |- PMC1277013_00004.jpg + | ... + |- val.json + |- val + |- PMC538274_00004.jpg + |- PMC539300_00004.jpg + | ... 
+``` + +**数据分布:** + +| File or Folder | Description | num | +| :------------- | :------------- | ------- | +| `train/` | 训练集图片 | 335,703 | +| `val/` | 验证集图片 | 11,245 | +| `test/` | 测试集图片 | 11,405 | +| `train.json` | 训练集标注文件 | - | +| `val.json` | 验证集标注文件 | - | + +**标注格式:** + +json文件包含所有图像的标注,数据以字典嵌套的方式存放,包含以下key: + +- info,表示标注文件info。 + +- licenses,表示标注文件licenses。 + +- images,表示标注文件中图像信息列表,每个元素是一张图像的信息。如下为其中一张图像的信息: + + ``` + { + 'file_name': 'PMC4055390_00006.jpg', # file_name + 'height': 601, # image height + 'width': 792, # image width + 'id': 341427 # image id + } + ``` + +- annotations,表示标注文件中目标物体的标注信息列表,每个元素是一个目标物体的标注信息。如下为其中一个目标物体的标注信息: + + ``` + { + + 'segmentation': # 物体的分割标注 + 'area': 60518.099043117836, # 物体的区域面积 + 'iscrowd': 0, # iscrowd + 'image_id': 341427, # image id + 'bbox': [50.58, 490.86, 240.15, 252.16], # bbox [x1,y1,w,h] + 'category_id': 1, # category_id + 'id': 3322348 # image id + } + ``` + +### 4.2. 更多数据集 + +我们提供了CDLA(中文版面分析)、TableBank(表格版面分析)等数据集的下连接,处理为上述标注文件json格式,即可以按相同方式进行训练。 + +| dataset | 简介 | +| --------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [cTDaR2019_cTDaR](https://cndplab-founder.github.io/cTDaR2019/) | 用于表格检测(TRACKA)和表格识别(TRACKB)。图片类型包含历史数据集(以cTDaR_t0开头,如cTDaR_t00872.jpg)和现代数据集(以cTDaR_t1开头,cTDaR_t10482.jpg)。 | +| [IIIT-AR-13K](http://cvit.iiit.ac.in/usodi/iiitar13k.php) | 手动注释公开的年度报告中的图形或页面而构建的数据集,包含5类:table, figure, natural image, logo, and signature | +| [CDLA](https://github.com/buptlihang/CDLA) | 中文文档版面分析数据集,面向中文文献类(论文)场景,包含10类:Text、Title、Figure、Figure caption、Table、Table caption、Header、Footer、Reference、Equation | +| [TableBank](https://github.com/doc-analysis/TableBank) | 用于表格检测和识别大型数据集,包含Word和Latex2种文档格式 | +| [DocBank](https://github.com/doc-analysis/DocBank) | 使用弱监督方法构建的大规模数据集(500K文档页面),用于文档布局分析,包含12类:Author、Caption、Date、Equation、Figure、Footer、List、Paragraph、Reference、Section、Table、Title | + +## 5. 开始训练 + +提供了训练脚本、评估脚本和预测脚本,本节将以PubLayNet预训练模型为例进行讲解。 + +如果不希望训练,直接体验后面的模型评估、预测、动转静、推理的流程,可以下载提供的预训练模型(PubLayNet数据集),并跳过5.1和5.2。 + +```bash linenums="1" +mkdir pretrained_model +cd pretrained_model +# 下载PubLayNet预训练模型(直接体验模型评估、预测、动转静) +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout.pdparams +# 下载PubLaynet推理模型(直接体验模型推理) +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar +``` + +如果测试图片为中文,可以下载中文CDLA数据集的预训练模型,识别10类文档区域:Table、Figure、Figure caption、Table、Table caption、Header、Footer、Reference、Equation,在[版面分析模型](../models_list.md)中下载`picodet_lcnet_x1_0_fgd_layout_cdla`模型的训练模型和推理模型。如果只检测图片中的表格区域,可以下载表格数据集的预训练模型,在[版面分析模型](../models_list.md)中下载`picodet_lcnet_x1_0_fgd_layout_table`模型的训练模型和推理模型。 + +### 5.1. 
启动训练 + +使用PaddleDetection[版面分析配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.5/configs/picodet/legacy_model/application/layout_analysis)启动训练 + +- 修改配置文件 + +如果你希望训练自己的数据集,需要修改配置文件中的数据配置、类别数。 + +以`configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml` 为例,修改的内容如下所示。 + +```yaml linenums="1" +metric: COCO +# 类别数 +num_classes: 5 + +TrainDataset: + !COCODataSet + # 修改为你自己的训练数据目录 + image_dir: train + # 修改为你自己的训练数据标签文件 + anno_path: train.json + # 修改为你自己的训练数据根目录 + dataset_dir: /root/publaynet/ + data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] + +EvalDataset: + !COCODataSet + # 修改为你自己的验证数据目录 + image_dir: val + # 修改为你自己的验证数据标签文件 + anno_path: val.json + # 修改为你自己的验证数据根目录 + dataset_dir: /root/publaynet/ + +TestDataset: + !ImageFolder + # 修改为你自己的测试数据标签文件 + anno_path: /root/publaynet/val.json +``` + +- 开始训练,在训练时,会默认下载PP-PicoDet预训练模型,这里无需预先下载。 + +```bash linenums="1" +# GPU训练 支持单卡,多卡训练 +# 训练日志会自动保存到 log 目录中 + +# 单卡训练 +export CUDA_VISIBLE_DEVICES=0 +python3 tools/train.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --eval + +# 多卡训练,通过--gpus参数指定卡号 +export CUDA_VISIBLE_DEVICES=0,1,2,3 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --eval +``` + +**注意:**如果训练时显存out memory,将TrainReader中batch_size调小,同时LearningRate中base_lr等比例减小。发布的config均由8卡训练得到,如果改变GPU卡数为1,那么base_lr需要减小8倍。 + +正常启动训练后,会看到以下log输出: + +```bash linenums="1" +[08/15 04:02:30] ppdet.utils.checkpoint INFO: Finish loading model weights: /root/.cache/paddle/weights/LCNet_x1_0_pretrained.pdparams +[08/15 04:02:46] ppdet.engine INFO: Epoch: [0] [ 0/1929] learning_rate: 0.040000 loss_vfl: 1.216707 loss_bbox: 1.142163 loss_dfl: 0.544196 loss: 2.903065 eta: 17 days, 13:50:26 batch_cost: 15.7452 data_cost: 2.9112 ips: 1.5243 images/s +[08/15 04:03:19] ppdet.engine INFO: Epoch: [0] [ 20/1929] learning_rate: 0.064000 loss_vfl: 1.180627 loss_bbox: 0.939552 loss_dfl: 0.442436 loss: 2.628206 eta: 2 days, 12:18:53 batch_cost: 1.5770 data_cost: 0.0008 ips: 15.2184 images/s +[08/15 04:03:47] ppdet.engine INFO: Epoch: [0] [ 40/1929] learning_rate: 0.088000 loss_vfl: 0.543321 loss_bbox: 1.071401 loss_dfl: 0.457817 loss: 2.057003 eta: 2 days, 0:07:03 batch_cost: 1.3190 data_cost: 0.0007 ips: 18.1954 images/s +[08/15 04:04:12] ppdet.engine INFO: Epoch: [0] [ 60/1929] learning_rate: 0.112000 loss_vfl: 0.630989 loss_bbox: 0.859183 loss_dfl: 0.384702 loss: 1.883143 eta: 1 day, 19:01:29 batch_cost: 1.2177 data_cost: 0.0006 ips: 19.7087 images/s +``` + +- `--eval`表示训练的同时,进行评估, 评估过程中默认将最佳模型,保存为 `output/picodet_lcnet_x1_0_layout/best_accuracy` 。 + +**注意,预测/评估时的配置文件请务必与训练一致。** + +### 5.2. FGD蒸馏训练 + +PaddleDetection支持了基于FGD([Focal and Global Knowledge Distillation for Detectors](https://arxiv.org/abs/2111.11837v1))蒸馏的目标检测模型训练过程,FGD蒸馏分为两个部分`Focal`和`Global`。`Focal`蒸馏分离图像的前景和背景,让学生模型分别关注教师模型的前景和背景部分特征的关键像素;`Global`蒸馏部分重建不同像素之间的关系并将其从教师转移到学生,以补偿`Focal`蒸馏中丢失的全局信息。 + +更换数据集,修改【TODO】配置中的数据配置、类别数,具体可以参考4.1。启动训练: + +```bash linenums="1" +# 单卡训练 +export CUDA_VISIBLE_DEVICES=0 +python3 tools/train.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + --eval +``` + +- `-c`: 指定模型配置文件。 +- `--slim_config`: 指定压缩策略配置文件。 + +## 6. 模型评估与预测 + +### 6.1. 
指标评估 + +训练中模型参数默认保存在`output/picodet_lcnet_x1_0_layout`目录下。在评估指标时,需要设置`weights`指向保存的参数文件。评估数据集可以通过 `configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml` 修改`EvalDataset`中的 `image_dir`、`anno_path`和`dataset_dir` 设置。 + +```bash linenums="1" +# GPU 评估, weights 为待测权重 +python3 tools/eval.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + -o weights=./output/picodet_lcnet_x1_0_layout/best_model +``` + +会输出以下信息,打印出mAP、AP0.5等信息。 + +```python linenums="1" + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.935 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.979 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.956 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.404 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.782 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.969 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.539 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.938 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.949 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.495 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.818 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.978 +[08/15 07:07:09] ppdet.engine INFO: Total sample number: 11245, averge FPS: 24.405059207157436 +[08/15 07:07:09] ppdet.engine INFO: Best test bbox ap is 0.935. +``` + +若使用**提供的预训练模型进行评估**,或使用**FGD蒸馏训练的模型**,更换`weights`模型路径,执行如下命令进行评估: + +```bash linenums="1" +python3 tools/eval.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + -o weights=output/picodet_lcnet_x2_5_layout/best_model +``` + +- `-c`: 指定模型配置文件。 +- `--slim_config`: 指定蒸馏策略配置文件。 +- `-o weights`: 指定蒸馏算法训好的模型路径。 + +### 6.2 测试版面分析结果 + +预测使用的配置文件必须与训练一致,如您通过 `python3 tools/train.py -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml` 完成了模型的训练过程。 + +使用 PaddleDetection 训练好的模型,您可以使用如下命令进行模型预测。 + +```bash linenums="1" +python3 tools/infer.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + -o weights='output/picodet_lcnet_x1_0_layout/best_model.pdparams' \ + --infer_img='docs/images/layout.jpg' \ + --output_dir=output_dir/ \ + --draw_threshold=0.5 +``` + +- `--infer_img`: 推理单张图片,也可以通过`--infer_dir`推理文件中的所有图片。 +- `--output_dir`: 指定可视化结果保存路径。 +- `--draw_threshold`:指定绘制结果框的NMS阈值。 + +若使用**提供的预训练模型进行预测**,或使用**FGD蒸馏训练的模型**,更换`weights`模型路径,执行如下命令进行预测: + +```bash linenums="1" +python3 tools/infer.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + -o weights='output/picodet_lcnet_x2_5_layout/best_model.pdparams' \ + --infer_img='docs/images/layout.jpg' \ + --output_dir=output_dir/ \ + --draw_threshold=0.5 +``` + +## 7. 
模型导出与预测 + +### 7.1 模型导出 + +inference 模型(`paddle.jit.save`保存的模型) 一般是模型训练,把模型结构和模型参数保存在文件中的固化模型,多用于预测部署场景。 训练过程中保存的模型是checkpoints模型,保存的只有模型的参数,多用于恢复训练等。 与checkpoints模型相比,inference 模型会额外保存模型的结构信息,在预测部署、加速推理上性能优越,灵活方便,适合于实际系统集成。 + +版面分析模型转inference模型步骤如下: + +```bash linenums="1" +python3 tools/export_model.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + -o weights=output/picodet_lcnet_x1_0_layout/best_model \ + --output_dir=output_inference/ +``` + +- 如无需导出后处理,请指定:`-o export.benchmark=True`(如果-o已出现过,此处删掉-o) +- 如无需导出NMS,请指定:`-o export.nms=False` + +转换成功后,在目录下有三个文件: + +``` +output_inference/picodet_lcnet_x1_0_layout/ + ├── model.pdiparams # inference模型的参数文件 + ├── model.pdiparams.info # inference模型的参数信息,可忽略 + └── model.pdmodel # inference模型的模型结构文件 +``` + +若使用**提供的预训练模型转Inference模型**,或使用**FGD蒸馏训练的模型**,更换`weights`模型路径,模型转inference模型步骤如下: + +```bash linenums="1" +python3 tools/export_model.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + -o weights=./output/picodet_lcnet_x2_5_layout/best_model \ + --output_dir=output_inference/ +``` + +### 7.2 模型推理 + +若使用**提供的推理训练模型推理**,或使用**FGD蒸馏训练的模型**,更换`model_dir`推理模型路径,执行如下命令进行推理: + +```bash linenums="1" +python3 deploy/python/infer.py \ + --model_dir=output_inference/picodet_lcnet_x1_0_layout/ \ + --image_file=docs/images/layout.jpg \ + --device=CPU +``` + +- --device:指定GPU、CPU设备 + +模型推理完成,会看到以下log输出 + +``` +------------------------------------------ +----------- Model Configuration ----------- +Model Arch: PicoDet +Transform Order: +--transform op: Resize +--transform op: NormalizeImage +--transform op: Permute +--transform op: PadStride +-------------------------------------------- +class_id:0, confidence:0.9921, left_top:[20.18,35.66],right_bottom:[341.58,600.99] +class_id:0, confidence:0.9914, left_top:[19.77,611.42],right_bottom:[341.48,901.82] +class_id:0, confidence:0.9904, left_top:[369.36,375.10],right_bottom:[691.29,600.59] +class_id:0, confidence:0.9835, left_top:[369.60,608.60],right_bottom:[691.38,736.72] +class_id:0, confidence:0.9830, left_top:[369.58,805.38],right_bottom:[690.97,901.80] +class_id:0, confidence:0.9716, left_top:[383.68,271.44],right_bottom:[688.93,335.39] +class_id:0, confidence:0.9452, left_top:[370.82,34.48],right_bottom:[688.10,63.54] +class_id:1, confidence:0.8712, left_top:[370.84,771.03],right_bottom:[519.30,789.13] +class_id:3, confidence:0.9856, left_top:[371.28,67.85],right_bottom:[685.73,267.72] +save result to: output/layout.jpg +Test iter 0 +------------------ Inference Time Info ---------------------- +total_time(ms): 2196.0, img_num: 1 +average latency time(ms): 2196.00, QPS: 0.455373 +preprocess_time(ms): 2172.50, inference_time(ms): 11.90, postprocess_time(ms): 11.60 +``` + +- Model:模型结构 +- Transform Order:预处理操作 +- class_id、confidence、left_top、right_bottom:分别表示类别id、置信度、左上角坐标、右下角坐标 +- save result to:可视化版面分析结果保存路径,默认保存到`./output`文件夹 +- Inference Time Info:推理时间,其中preprocess_time表示预处理耗时,inference_time表示模型预测耗时,postprocess_time表示后处理耗时 + +可视化版面结果如下图所示 + +![img](./images/layout_res.jpg) + +## Citations + +```bibtex +@inproceedings{zhong2019publaynet, + title={PubLayNet: largest dataset ever for document layout analysis}, + author={Zhong, Xu and Tang, Jianbin and Yepes, Antonio Jimeno}, + booktitle={2019 International Conference on Document Analysis and Recognition (ICDAR)}, + year={2019}, + volume={}, + number={}, 
+ pages={1015-1022}, + doi={10.1109/ICDAR.2019.00166}, + ISSN={1520-5363}, + month={Sep.}, + organization={IEEE} +} + +@inproceedings{yang2022focal, + title={Focal and global knowledge distillation for detectors}, + author={Yang, Zhendong and Li, Zhe and Jiang, Xiaohu and Gong, Yuan and Yuan, Zehuan and Zhao, Danpei and Yuan, Chun}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={4643--4652}, + year={2022} +} +``` diff --git a/docs/ppstructure/model_train/train_table.en.md b/docs/ppstructure/model_train/train_table.en.md new file mode 100644 index 0000000000..5a75ba2bc3 --- /dev/null +++ b/docs/ppstructure/model_train/train_table.en.md @@ -0,0 +1,164 @@ +--- +comments: true +--- + +# Table Recognition + +## 1. pipeline + +The table recognition mainly contains three models + +1. Single line text detection-DB +2. Single line text recognition-CRNN +3. Table structure and cell coordinate prediction-SLANet + +The table recognition flow chart is as follows + +![tableocr_pipeline](./images/tableocr_pipeline.jpg) + +1. The coordinates of single-line text is detected by DB model, and then sends it to the recognition model to get the recognition result. +2. The table structure and cell coordinates is predicted by SLANet model. +3. The recognition result of the cell is combined by the coordinates, recognition result of the single line and the coordinates of the cell. +4. The cell recognition result and the table structure together construct the html string of the table. + +## 2. Performance + +We evaluated the algorithm on the PubTabNet[1] eval dataset, and the performance is as follows: + +|Method|Acc|[TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src)|Speed| +| --- | --- | --- | ---| +| EDD[2] |x| 88.30% |x| +| TableRec-RARE(ours) | 71.73%| 93.88% |779ms| +| SLANet(ours) | 76.31%| 95.89%|766ms| + +The performance indicators are explained as follows: + +- Acc: The accuracy of the table structure in each image, a wrong token is considered an error. +- TEDS: The accuracy of the model's restoration of table information. This indicator evaluates not only the table structure, but also the text content in the table. +- Speed: The inference speed of a single image when the model runs on the CPU machine and MKL is enabled. + +## 3. Result + +![img](./images/table_ch_result1.jpg) + +![img](./images/table_ch_result2.jpg) + +![img](./images/table_ch_result3.jpg) + +## 4. How to use + +### 4.1 Quick start + +PP-Structure currently provides table recognition models in both Chinese and English. For the model link, see [models_list](../models_list.en.md). The whl package is also provided for quick use, see [quickstart](../quick_start.en.md) for details. + +The following takes the Chinese table recognition model as an example to introduce how to recognize a table. + +Use the following commands to quickly complete the identification of a table. 
+ +```bash linenums="1" +cd PaddleOCR/ppstructure + +# download model +mkdir inference && cd inference +# Download the PP-OCRv3 text detection model and unzip it +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar +# Download the PP-OCRv3 text recognition model and unzip it +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar +# Download the PP-StructureV2 form recognition model and unzip it +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar +cd .. +# run +python3.7 table/predict_table.py \ + --det_model_dir=inference/ch_PP-OCRv3_det_infer \ + --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \ + --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \ + --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \ + --image_dir=docs/table/table.jpg \ + --output=../output/table + +``` + +After the operation is completed, the excel table of each image will be saved to the directory specified by the output field, and an html file will be produced in the directory to visually view the cell coordinates and the recognized table. + +**NOTE** + +1. If you want to use the English table recognition model, you need to download the English text detection and recognition model and the English table recognition model in [models_list](../models_list.en.md), and replace `table_structure_dict_ch.txt` with `table_structure_dict.txt`. +2. To use the TableRec-RARE model, you need to replace `table_structure_dict_ch.txt` with `table_structure_dict.txt`, and add parameter `--merge_no_span_structure=False` + +### 4.2 Training, Evaluation and Inference + +The training, evaluation and inference process of the text detection model can be referred to [detection](../../ppocr/model_train/detection.en.md) + +The training, evaluation and inference process of the text recognition model can be referred to [recognition](../../ppocr/model_train/recognition.en.md) + +The training, evaluation and inference process of the table recognition model can be referred to [table_recognition](./train_table.en.md) + +### 4.3 Calculate TEDS + +The table uses [TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src) as the evaluation metric of the model. Before the model evaluation, the three models in the pipeline need to be exported as inference models (we have provided them), and the gt for evaluation needs to be prepared. Examples of gt are as follows: + +```txt +PMC5755158_010_01.png
WeaningWeek 15Off-test
Weaning
Week 150.17 ± 0.080.16 ± 0.03
Off-test0.80 ± 0.240.19 ± 0.09
+``` + +Each line in gt consists of the file name and the html string of the table. The file name and the html string of the table are separated by `\t`. + +You can also use the following command to generate an evaluation gt file from the annotation file: + +```bash linenums="1" +python3 ppstructure/table/convert_label2html.py --ori_gt_path /path/to/your_label_file --save_path /path/to/save_file +``` + +Use the following command to evaluate. After the evaluation is completed, the teds indicator will be output. + +```bash linenums="1" +python3 table/eval_table.py \ + --det_model_dir=path/to/det_model_dir \ + --rec_model_dir=path/to/rec_model_dir \ + --table_model_dir=path/to/table_model_dir \ + --image_dir=docs/table/table.jpg \ + --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --det_limit_side_len=736 \ + --det_limit_type=min \ + --gt_path=path/to/gt.txt +``` + +Evaluate on the PubLatNet dataset using the English model + +```bash linenums="1" +cd PaddleOCR/ppstructure +# Download the model +mkdir inference && cd inference +# Download the text detection model trained on the PubTabNet dataset and unzip it +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar && tar xf en_ppocr_mobile_v2.0_table_det_infer.tar +# Download the text recognition model trained on the PubTabNet dataset and unzip it +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar && tar xf en_ppocr_mobile_v2.0_table_rec_infer.tar +# Download the table recognition model trained on the PubTabNet dataset and unzip it +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar +cd .. + +python3 table/eval_table.py \ + --det_model_dir=inference/en_ppocr_mobile_v2.0_table_det_infer \ + --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer \ + --table_model_dir=inference/en_ppstructure_mobile_v2.0_SLANet_infer \ + --image_dir=train_data/table/pubtabnet/val/ \ + --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --det_limit_side_len=736 \ + --det_limit_type=min \ + --rec_image_shape=3,32,320 \ + --gt_path=path/to/gt.txt +``` + +output is + +```bash linenums="1" +teds: 95.89 +``` + +## 5. Reference + +1. +2. diff --git a/docs/ppstructure/model_train/train_table.md b/docs/ppstructure/model_train/train_table.md new file mode 100644 index 0000000000..165e5bb118 --- /dev/null +++ b/docs/ppstructure/model_train/train_table.md @@ -0,0 +1,167 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# 表格识别 + +## 1. 表格识别 pipeline + +表格识别主要包含三个模型 + +1. 单行文本检测-DB +2. 单行文本识别-CRNN +3. 表格结构和cell坐标预测-SLANet + +具体流程图如下 + +![tableocr_pipeline](./images/tableocr_pipeline.jpg) + +流程说明: + +1. 图片由单行文字检测模型检测到单行文字的坐标,然后送入识别模型拿到识别结果。 +2. 图片由SLANet模型拿到表格的结构信息和单元格的坐标信息。 +3. 由单行文字的坐标、识别结果和单元格的坐标一起组合出单元格的识别结果。 +4. 单元格的识别结果和表格结构一起构造表格的html字符串。 + +## 2. 
性能 + +我们在 PubTabNet[1] 评估数据集上对算法进行了评估,性能如下 + +| 算法 | Acc | [TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src) | Speed | +| ------------------- | ------ | ----------------------------------------------------------------------------------------------------- | ----- | +| EDD[2] | x | 88.30% | x | +| TableRec-RARE(ours) | 71.73% | 93.88% | 779ms | +| SLANet(ours) | 76.31% | 95.89% | 766ms | + +性能指标解释如下: + +- Acc: 模型对每张图像里表格结构的识别准确率,错一个token就算错误。 +- TEDS: 模型对表格信息还原的准确度,此指标评价内容不仅包含表格结构,还包含表格内的文字内容。 +- Speed: 模型在CPU机器上,开启MKL的情况下,单张图片的推理速度。 + +## 3. 效果演示 + +![img](./images/table_ch_result1.jpg) + +![img](./images/table_ch_result2.jpg) + +![img](./images/table_ch_result3.jpg) + +## 4. 使用 + +### 4.1 快速开始 + +PP-Structure目前提供了中英文两种语言的表格识别模型,模型链接见 [models_list](../models_list.md)。也提供了whl包的形式方便快速使用,详见 [quickstart](../quick_start.md)。 + +下面以中文表格识别模型为例,介绍如何识别一张表格。 + +使用如下命令即可快速完成一张表格的识别。 + +```bash linenums="1" +cd PaddleOCR/ppstructure + +# 下载模型 +mkdir inference && cd inference +# 下载PP-OCRv3文本检测模型并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar +# 下载PP-OCRv3文本识别模型并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar +# 下载PP-StructureV2中文表格识别模型并解压 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar +cd .. +# 执行表格识别 +python table/predict_table.py \ + --det_model_dir=inference/ch_PP-OCRv3_det_infer \ + --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \ + --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \ + --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \ + --image_dir=docs/table/table.jpg \ + --output=../output/table +``` + +运行完成后,每张图片的excel表格会保存到output字段指定的目录下,同时在该目录下回生产一个html文件,用于可视化查看单元格坐标和识别的表格。 + +**NOTE** + +1. 如果想使用英文模型,需要在 [models_list](../../ppocr/model_list.md) 中下载英文文字检测识别模型和英文表格识别模型,同时替换`table_structure_dict_ch.txt`为`table_structure_dict.txt`即可。 +2. 如需使用TableRec-RARE模型,需要替换`table_structure_dict_ch.txt`为`table_structure_dict.txt`,同时参数`--merge_no_span_structure=False` + +### 4.2 模型训练、评估与推理 + +文本检测模型的训练、评估和推理流程可参考 [detection](../../ppocr/model_train/detection.md) + +文本识别模型的训练、评估和推理流程可参考 [recognition](../../ppocr/model_train/recognition.md) + +表格识别模型的训练、评估和推理流程可参考 [table_recognition](../../ppstructure/model_train/train_table.md) + +### 4.3 计算TEDS + +表格使用 [TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src) 作为模型的评估指标。在进行模型评估之前,需要将pipeline中的三个模型分别导出为inference模型(我们已经提供好),还需要准备评估的gt, gt示例如下: + +```txt +PMC5755158_010_01.png
WeaningWeek 15Off-test
Weaning
Week 150.17 ± 0.080.16 ± 0.03
Off-test0.80 ± 0.240.19 ± 0.09
+``` + +gt每一行都由文件名和表格的html字符串组成,文件名和表格的html字符串之间使用`\t`分隔。 + +也可使用如下命令,由标注文件生成评估的gt文件: + +```bash linenums="1" +python3 ppstructure/table/convert_label2html.py --ori_gt_path /path/to/your_label_file --save_path /path/to/save_file +``` + +准备完成后使用如下命令进行评估,评估完成后会输出teds指标。 + +```bash linenums="1" +cd PaddleOCR/ppstructure +python3 table/eval_table.py \ + --det_model_dir=path/to/det_model_dir \ + --rec_model_dir=path/to/rec_model_dir \ + --table_model_dir=path/to/table_model_dir \ + --image_dir=docs/table/table.jpg \ + --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --det_limit_side_len=736 \ + --det_limit_type=min \ + --gt_path=path/to/gt.txt +``` + +如使用英文表格识别模型在PubLatNet数据集上进行评估 + +```bash linenums="1" +cd PaddleOCR/ppstructure +# 下载模型 +mkdir inference && cd inference +# 下载基于PubTabNet数据集训练的文本检测模型并解压 +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar && tar xf en_ppocr_mobile_v2.0_table_det_infer.tar +# 下载基于PubTabNet数据集训练的文本识别模型并解压 +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar && tar xf en_ppocr_mobile_v2.0_table_rec_infer.tar +# 下载基于PubTabNet数据集训练的表格识别模型并解压 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar +cd .. + +python3 table/eval_table.py \ + --det_model_dir=inference/en_ppocr_mobile_v2.0_table_det_infer \ + --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer \ + --table_model_dir=inference/en_ppstructure_mobile_v2.0_SLANet_infer \ + --image_dir=train_data/table/pubtabnet/val/ \ + --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --det_limit_side_len=736 \ + --det_limit_type=min \ + --rec_image_shape=3,32,320 \ + --gt_path=path/to/gt.txt +``` + +将会输出 + +```bash linenums="1" +teds: 95.89 +``` + +## 5. Reference + +1. +2. diff --git a/docs/ppstructure/model_train/training.en.md b/docs/ppstructure/model_train/training.en.md new file mode 100644 index 0000000000..56eeb4b8f1 --- /dev/null +++ b/docs/ppstructure/model_train/training.en.md @@ -0,0 +1,132 @@ +--- +comments: true +--- + +# Model Training + +This article will introduce the basic concepts that is necessary for model training and tuning. + +At the same time, it will briefly introduce the structure of the training data and how to prepare the data to fine-tune model in vertical scenes. + +## 1. Yml Configuration + +The PaddleOCR uses configuration files to control network training and evaluation parameters. In the configuration file, you can set the model, optimizer, loss function, and pre- and post-processing parameters of the model. PaddleOCR reads these parameters from the configuration file, and then builds a complete training process to train the model. Fine-tuning can also be completed by modifying the parameters in the configuration file, which is simple and convenient. + +For the complete configuration file description, please refer to [Configuration File](../../ppocr/blog/config.en.md) + +## 2. Basic Concepts + +During the model training process, some hyper-parameters can be manually specified to obtain the optimal result at the least cost. Different data volumes may require different hyper-parameters. 
When you want to fine-tune the model based on your own data, there are several parameter adjustment strategies for reference: + +### 2.1 Learning Rate + +The learning rate is one of the most important hyper-parameters for training neural networks. It represents the step length of the gradient moving towards the optimal solution of the loss function in each iteration. +A variety of learning rate update strategies are provided by PaddleOCR, which can be specified in configuration files. For example, + +```yaml linenums="1" +Optimizer: + ... + lr: + name: Piecewise + decay_epochs : [700, 800] + values : [0.001, 0.0001] + warmup_epoch: 5 +``` + +`Piecewise` stands for piece-wise constant attenuation. Different learning rates are specified in different learning stages, and the learning rate stay the same in each stage. + +`warmup_epoch` means that in the first 5 epochs, the learning rate will be increased gradually from 0 to base_lr. For all strategies, please refer to the code [learning_rate.py](../../ppocr/optimizer/learning_rate.py). + +### 2.2 Regularization + +Regularization can effectively avoid algorithm over-fitting. PaddleOCR provides L1 and L2 regularization methods. +L1 and L2 regularization are the most widely used regularization methods. +L1 regularization adds a regularization term to the objective function to reduce the sum of absolute values of the parameters; +while in L2 regularization, the purpose of adding a regularization term is to reduce the sum of squared parameters. +The configuration method is as follows: + +```yaml linenums="1" +Optimizer: + ... + regularizer: + name: L2 + factor: 2.0e-05 +``` + +### 2.3 Evaluation Indicators + +(1) Detection stage: First, evaluate according to the IOU of the detection frame and the labeled frame. If the IOU is greater than a certain threshold, it is judged that the detection is accurate. Here, the detection frame and the label frame are different from the general general target detection frame, and they are represented by polygons. Detection accuracy: the percentage of the correct detection frame number in all detection frames is mainly used to judge the detection index. Detection recall rate: the percentage of correct detection frames in all marked frames, which is mainly an indicator of missed detection. + +(2) Recognition stage: Character recognition accuracy, that is, the ratio of correctly recognized text lines to the number of marked text lines. Only the entire line of text recognition pairs can be regarded as correct recognition. + +(3) End-to-end statistics: End-to-end recall rate: accurately detect and correctly identify the proportion of text lines in all labeled text lines; End-to-end accuracy rate: accurately detect and correctly identify the number of text lines in the detected text lines The standard for accurate detection is that the IOU of the detection box and the labeled box is greater than a certain threshold, and the text in the correctly identified detection box is the same as the labeled text. + +## 3. Data and Vertical Scenes + +### 3.1 Training Data + +The current open source models, data sets and magnitudes are as follows: + +- Detection: + - English data set, ICDAR2015 + - Chinese data set, LSVT street view data set training data 3w pictures + +- Identification: + - English data set, MJSynth and SynthText synthetic data, the data volume is tens of millions. + - Chinese data set, LSVT street view data set crops the image according to the truth value, and performs position calibration, a total of 30w images. 
In addition, based on the LSVT corpus, 500w of synthesized data. + - Small language data set, using different corpora and fonts, respectively generated 100w synthetic data set, and using ICDAR-MLT as the verification set. + +Among them, the public data sets are all open source, users can search and download by themselves, or refer to [Chinese data set](../../datasets/datasets.en.md), synthetic data is not open source, users can use open source synthesis tools to synthesize by themselves. Synthesis tools include [text_renderer](https://github.com/Sanster/text_renderer), [SynthText](https://github.com/ankush-me/SynthText), [TextRecognitionDataGenerator](https://github.com/Belval/TextRecognitionDataGenerator) etc. + +### 3.2 Vertical Scene + +PaddleOCR mainly focuses on general OCR. If you have vertical requirements, you can use PaddleOCR + vertical data to train yourself; +If there is a lack of labeled data, or if you do not want to invest in research and development costs, it is recommended to directly call the open API, which covers some of the more common vertical categories. + +### 3.3 Build Your Own Dataset + +There are several experiences for reference when constructing the data set: + +(1) The amount of data in the training set: + +a. The data required for detection is relatively small. For Fine-tune based on the PaddleOCR model, 500 sheets are generally required to achieve good results. + +b. Recognition is divided into English and Chinese. Generally, English scenarios require hundreds of thousands of data to achieve good results, while Chinese requires several million or more. + +(2) When the amount of training data is small, you can try the following three ways to get more data: + +a. Manually collect more training data, the most direct and effective way. + +b. Basic image processing or transformation based on PIL and opencv. For example, the three modules of ImageFont, Image, ImageDraw in PIL write text into the background, opencv's rotating affine transformation, Gaussian filtering and so on. + +c. Use data generation algorithms to synthesize data, such as algorithms such as pix2pix. + +## 4. FAQ + +**Q**: How to choose a suitable network input shape when training CRNN recognition? + + A: The general height is 32, the longest width is selected, there are two methods: + + (1) Calculate the aspect ratio distribution of training sample images. The selection of the maximum aspect ratio considers 80% of the training samples. + + (2) Count the number of texts in training samples. The selection of the longest number of characters considers the training sample that satisfies 80%. Then the aspect ratio of Chinese characters is approximately considered to be 1, and that of English is 3:1, and the longest width is estimated. + +**Q**: During the recognition training, the accuracy of the training set has reached 90, but the accuracy of the verification set has been kept at 70, what should I do? + + A: If the accuracy of the training set is 90 and the test set is more than 70, it should be over-fitting. There are two methods to try: + + (1) Add more augmentation methods or increase the [probability] of augmented prob (https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppocr/data/imaug/rec_img_aug.py#L341), The default is 0.4. 
+ + (2) Increase the [l2 dcay value] of the system (https://github.com/PaddlePaddle/PaddleOCR/blob/a501603d54ff5513fc4fc760319472e59da25424/configs/rec/ch_ppocr_v1.1/rec_chinese_lite_train_v1.1.yml#L47) + +**Q**: When the recognition model is trained, loss can drop normally, but acc is always 0 + + A: It is normal for the acc to be 0 at the beginning of the recognition model training, and the indicator will come up after a longer training period. + +*** + +Click the following links for detailed training tutorial: + +- [text detection model training](./detection.en.md) +- [text recognition model training](./recognition.en.md) +- [text direction classification model training](./angle_class.en.md) diff --git a/docs/ppstructure/model_train/training.md b/docs/ppstructure/model_train/training.md new file mode 100644 index 0000000000..4f137b66a9 --- /dev/null +++ b/docs/ppstructure/model_train/training.md @@ -0,0 +1,128 @@ +--- +comments: true +--- + +# PP-OCR模型训练 + +本文将介绍模型训练时需掌握的基本概念,和训练时的调优方法。 + +同时会简单介绍PaddleOCR模型训练数据的组成部分,以及如何在垂类场景中准备数据finetune模型。 + +## 1. 配置文件说明 + +PaddleOCR模型使用配置文件管理网络训练、评估的参数。在配置文件中,可以设置组建模型、优化器、损失函数、模型前后处理的参数,PaddleOCR从配置文件中读取到这些参数,进而组建出完整的训练流程,完成模型训练,在需要对模型进行优化的时,可以通过修改配置文件中的参数完成配置,使用简单且方便修改。 + +完整的配置文件说明可以参考[配置文件](../../ppocr/blog/config.md) + +## 2. 基本概念 + +模型训练过程中需要手动调整一些超参数,帮助模型以最小的代价获得最优指标。不同的数据量可能需要不同的超参,当您希望在自己的数据上finetune或对模型效果调优时,有以下几个参数调整策略可供参考: + +### 2.1 学习率 + +学习率是训练神经网络的重要超参数之一,它代表在每一次迭代中梯度向损失函数最优解移动的步长。 +在PaddleOCR中提供了多种学习率更新策略,可以通过配置文件修改,例如: + +```yaml linenums="1" +Optimizer: + ... + lr: + name: Piecewise + decay_epochs : [700, 800] + values : [0.001, 0.0001] + warmup_epoch: 5 +``` + +Piecewise 代表分段常数衰减,在不同的学习阶段指定不同的学习率,在每段内学习率相同。 +warmup_epoch 代表在前5个epoch中,学习率将逐渐从0增加到base_lr。全部策略可以参考代码[learning_rate.py](../../ppocr/optimizer/learning_rate.py) 。 + +### 2.2 正则化 + +正则化可以有效的避免算法过拟合,PaddleOCR中提供了L1、L2正则方法,L1 和 L2 正则化是最常用的正则化方法。L1 正则化向目标函数添加正则化项,以减少参数的绝对值总和;而 L2 正则化中,添加正则化项的目的在于减少参数平方的总和。配置方法如下: + +```yaml linenums="1" +Optimizer: + ... + regularizer: + name: L2 + factor: 2.0e-05 +``` + +### 2.3 评估指标 + +(1)检测阶段:先按照检测框和标注框的IOU评估,IOU大于某个阈值判断为检测准确。这里检测框和标注框不同于一般的通用目标检测框,是采用多边形进行表示。检测准确率:正确的检测框个数在全部检测框的占比,主要是判断检测指标。检测召回率:正确的检测框个数在全部标注框的占比,主要是判断漏检的指标。 + +(2)识别阶段: 字符识别准确率,即正确识别的文本行占标注的文本行数量的比例,只有整行文本识别对才算正确识别。 + +(3)端到端统计: 端对端召回率:准确检测并正确识别文本行在全部标注文本行的占比; 端到端准确率:准确检测并正确识别文本行在 检测到的文本行数量 的占比; 准确检测的标准是检测框与标注框的IOU大于某个阈值,正确识别的检测框中的文本与标注的文本相同。 + +## 3. 数据与垂类场景 + +### 3.1 训练数据 + +目前开源的模型,数据集和量级如下: + +- 检测: + - 英文数据集,ICDAR2015 + - 中文数据集,LSVT街景数据集训练数据3w张图片 + +- 识别: + - 英文数据集,MJSynth和SynthText合成数据,数据量上千万。 + - 中文数据集,LSVT街景数据集根据真值将图crop出来,并进行位置校准,总共30w张图像。此外基于LSVT的语料,合成数据500w。 + - 小语种数据集,使用不同语料和字体,分别生成了100w合成数据集,并使用ICDAR-MLT作为验证集。 + +其中,公开数据集都是开源的,用户可自行搜索下载,也可参考[中文数据集](../../datasets/datasets.md),合成数据暂不开源,用户可使用开源合成工具自行合成,可参考的合成工具包括[text_renderer](https://github.com/Sanster/text_renderer) 、[SynthText](https://github.com/ankush-me/SynthText) 、[TextRecognitionDataGenerator](https://github.com/Belval/TextRecognitionDataGenerator) 等。 + +### 3.2 垂类场景 + +PaddleOCR主要聚焦通用OCR,如果有垂类需求,您可以用PaddleOCR+垂类数据自己训练; +如果缺少带标注的数据,或者不想投入研发成本,建议直接调用开放的API,开放的API覆盖了目前比较常见的一些垂类。 + +### 3.3 自己构建数据集 + +在构建数据集时有几个经验可供参考: + +(1) 训练集的数据量: + +a. 检测需要的数据相对较少,在PaddleOCR模型的基础上进行Fine-tune,一般需要500张可达到不错的效果。 + +b. 识别分英文和中文,一般英文场景需要几十万数据可达到不错的效果,中文则需要几百万甚至更多。 + +(2)当训练数据量少时,可以尝试以下三种方式获取更多的数据: + +a. 人工采集更多的训练数据,最直接也是最有效的方式。 + +b. 基于PIL和opencv基本图像处理或者变换。例如PIL中ImageFont, Image, ImageDraw三个模块将文字写到背景中,opencv的旋转仿射变换,高斯滤波等。 + +c. 利用数据生成算法合成数据,例如pix2pix或[StyleText](https://github.com/PFCCLab/StyleText)等算法。 + +## 4. 
常见问题
+
+**Q**:训练CRNN识别时,如何选择合适的网络输入shape?
+
+  A:一般高度采用32,最长宽度的选择,有两种方法:
+
+  (1)统计训练样本图像的宽高比分布。最大宽高比的选取考虑满足80%的训练样本。
+
+  (2)统计训练样本文字数目。最长字符数目的选取考虑满足80%的训练样本。然后中文字符长宽比近似认为是1,英文认为3:1,预估一个最长宽度。
+
+**Q**:识别训练时,训练集精度已经到达90了,但验证集精度一直在70,涨不上去怎么办?
+
+  A:训练集精度90,测试集70多的话,应该是过拟合了,有两个可尝试的方法:
+
+  (1)加入更多的增广方式或者调大增广prob的[概率](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppocr/data/imaug/rec_img_aug.py#L341),默认为0.4。
+
+  (2)调大系统的[l2 decay值](https://github.com/PaddlePaddle/PaddleOCR/blob/a501603d54ff5513fc4fc760319472e59da25424/configs/rec/ch_ppocr_v1.1/rec_chinese_lite_train_v1.1.yml#L47)
+
+**Q**: 识别模型训练时,loss能正常下降,但acc一直为0
+
+  A:识别模型训练初期acc为0是正常的,多训一段时间指标就上来了。
+
+***
+
+具体的训练教程可点击下方链接跳转:
+
+- [文本检测模型训练](./detection.md)
+- [文本识别模型训练](./recognition.md)
+- [文本方向分类器训练](./angle_class.md)
+- [知识蒸馏](../model_compress/knowledge_distillation.md)
diff --git a/docs/ppstructure/models_list.en.md b/docs/ppstructure/models_list.en.md
new file mode 100644
index 0000000000..57157adc67
--- /dev/null
+++ b/docs/ppstructure/models_list.en.md
@@ -0,0 +1,57 @@
+---
+comments: true
+---
+
+# PP-Structure Model list
+
+## 1. Layout Analysis
+
+|model name| description | inference model size |download|dict path|
+| --- |----| --- | --- | --- |
+| picodet_lcnet_x1_0_fgd_layout | The layout analysis English model trained on the PubLayNet dataset based on PicoDet LCNet_x1_0 and FGD. The model can recognize the 5 types of areas **Text, Title, Table, Picture and List** | 9.7M | [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout.pdparams) | [PubLayNet dict](../../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt) |
+| ppyolov2_r50vd_dcn_365e_publaynet | The layout analysis English model trained on the PubLayNet dataset based on PP-YOLOv2 | 221.0M | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) / [trained model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet_pretrained.pdparams) | same as above |
+| picodet_lcnet_x1_0_fgd_layout_cdla | The layout analysis Chinese model trained on the CDLA dataset, the model can recognize 10 types of areas: **Text、Title、Figure、Figure caption、Table、Table caption、Header、Footer、Reference、Equation** | 9.7M | [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla.pdparams) | [CDLA dict](../../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt) |
+| picodet_lcnet_x1_0_fgd_layout_table | The layout analysis model trained on the table dataset, the model can detect tables in Chinese and English documents | 9.7M | [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table.pdparams) | [Table dict](../../ppocr/utils/dict/layout_dict/layout_table_dict.txt) |
+| ppyolov2_r50vd_dcn_365e_tableBank_word | The layout analysis model trained on the TableBank Word dataset based on PP-YOLOv2, the model can detect tables in English documents | 221.0M | [inference
model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | same as above | +| ppyolov2_r50vd_dcn_365e_tableBank_latex | The layout analysis model trained on the TableBank Latex dataset based on PP-YOLOv2, the model can detect tables in English documents | 221.0M | [inference model](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | same as above | + +## 2. OCR and Table Recognition + +### 2.1 OCR + +|model name| description | inference model size |download| +| --- |---|---| --- | +|en_ppocr_mobile_v2.0_table_det| Text detection model of English table scenes trained on PubTabNet dataset | 4.7M |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_det_train.tar) | +|en_ppocr_mobile_v2.0_table_rec| Text recognition model of English table scenes trained on PubTabNet dataset | 6.9M |[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_rec_train.tar) | + +If you need to use other OCR models, you can download the model in [PP-OCR model_list](../ppocr/model_list.en.md) or use the model you trained yourself to configure to `det_model_dir`, `rec_model_dir` field. + +### 2.2 Table Recognition + +|model| description |inference model size|download| +| --- |-----| --- | --- | +|en_ppocr_mobile_v2.0_table_structure| English table recognition model trained on PubTabNet dataset based on TableRec-RARE |6.8M|[inference model](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) | +|en_ppstructure_mobile_v2.0_SLANet|English table recognition model trained on PubTabNet dataset based on SLANet|9.2M|[inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_train.tar) | +|ch_ppstructure_mobile_v2.0_SLANet|Chinese table recognition model based on SLANet|9.3M|[inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) | + +## 3. KIE + +On XFUND_zh dataset, Accuracy and time cost of different models on V100 GPU are as follows. 
+ +|Model|Backbone|Task|Config|Hmean|Time cost(ms)|Download link| +| --- | --- | --- | --- | --- | --- |--- | +|VI-LayoutXLM| VI-LayoutXLM-base | SER | [ser_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml)|**93.19%**| 15.49| [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar)| +|LayoutXLM| LayoutXLM-base | SER | [ser_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml)|90.38%| 19.49 |[trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar)| +|LayoutLM| LayoutLM-base | SER | [ser_layoutlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutlm_xfund_zh.yml)|77.31%|-|[trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar)| +|LayoutLMv2| LayoutLMv2-base | SER | [ser_layoutlmv2_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutlmv2_xfund_zh.yml)|85.44%|31.46|[trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar)| +|VI-LayoutXLM| VI-LayoutXLM-base | RE | [re_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh_udml.yml)|**83.92%**|15.49|[trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar)| +|LayoutXLM| LayoutXLM-base | RE | [re_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutxlm_xfund_zh.yml)|74.83%|19.49|[trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar)| +|LayoutLMv2| LayoutLMv2-base | RE | [re_layoutlmv2_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutlmv2_xfund_zh.yml)|67.77%|31.46|[trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar)| + +* Note: The above time cost information just considers inference time without preprocess or postprocess, test environment: `V100 GPU + CUDA 10.2 + CUDNN 8.1.1 + TRT 7.2.3.4` + +On wildreceipt dataset, the algorithm result is as follows: + +|Model|Backbone|Config|Hmean|Download link| +| --- | --- | --- | --- | --- | +|SDMGR|VGG6|[configs/kie/sdmgr/kie_unet_sdmgr.yml](../../configs/kie/sdmgr/kie_unet_sdmgr.yml)|86.70%|[trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)| diff --git a/docs/ppstructure/models_list.md b/docs/ppstructure/models_list.md new file mode 100644 index 0000000000..fe587f4389 --- /dev/null +++ b/docs/ppstructure/models_list.md @@ -0,0 +1,57 @@ +--- +comments: true +--- + +# PP-Structure 系列模型列表 + +## 1. 
版面分析模型 + +|模型名称|模型简介|推理模型大小|下载地址|dict path| +| --- | --- | --- | --- | --- | +| picodet_lcnet_x1_0_fgd_layout | 基于PicoDet LCNet_x1_0和FGD蒸馏在PubLayNet 数据集训练的英文版面分析模型,可以划分**文字、标题、表格、图片以及列表**5类区域 | 9.7M | [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout.pdparams) | [PubLayNet dict](../../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt) | +| ppyolov2_r50vd_dcn_365e_publaynet | 基于PP-YOLOv2在PubLayNet数据集上训练的英文版面分析模型 | 221.0M | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) / [训练模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet_pretrained.pdparams) | 同上 | +| picodet_lcnet_x1_0_fgd_layout_cdla | CDLA数据集训练的中文版面分析模型,可以划分为**表格、图片、图片标题、表格、表格标题、页眉、脚本、引用、公式**10类区域 | 9.7M | [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla.pdparams) | [CDLA dict](../../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt) | +| picodet_lcnet_x1_0_fgd_layout_table | 表格数据集训练的版面分析模型,支持中英文文档表格区域的检测 | 9.7M | [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table.pdparams) | [Table dict](../../ppocr/utils/dict/layout_dict/layout_table_dict.txt) | +| ppyolov2_r50vd_dcn_365e_tableBank_word | 基于PP-YOLOv2在TableBank Word 数据集训练的版面分析模型,支持英文文档表格区域的检测 | 221.0M | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | 同上 | +| ppyolov2_r50vd_dcn_365e_tableBank_latex | 基于PP-YOLOv2在TableBank Latex数据集训练的版面分析模型,支持英文文档表格区域的检测 | 221.0M | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | 同上 | + +## 2. 
OCR和表格识别模型 + +### 2.1 OCR + +|模型名称|模型简介|推理模型大小|下载地址| +| --- | --- | --- | --- | +|en_ppocr_mobile_v2.0_table_det|PubTabNet数据集训练的英文表格场景的文字检测|4.7M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_det_train.tar) | +|en_ppocr_mobile_v2.0_table_rec|PubTabNet数据集训练的英文表格场景的文字识别|6.9M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_rec_train.tar) | + +如需要使用其他OCR模型,可以在 [PP-OCR model_list](../ppocr/model_list.md) 下载模型或者使用自己训练好的模型配置到 `det_model_dir`, `rec_model_dir`两个字段即可。 + +### 2.2 表格识别模型 + +|模型名称|模型简介|推理模型大小|下载地址| +| --- | --- | --- | --- | +|en_ppocr_mobile_v2.0_table_structure|基于TableRec-RARE在PubTabNet数据集上训练的英文表格识别模型|6.8M|[推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) | +|en_ppstructure_mobile_v2.0_SLANet|基于SLANet在PubTabNet数据集上训练的英文表格识别模型|9.2M|[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_train.tar) | +|ch_ppstructure_mobile_v2.0_SLANet|基于SLANet的中文表格识别模型|9.3M|[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) | + +## 3. KIE模型 + +在XFUND_zh数据集上,不同模型的精度与V100 GPU上速度信息如下所示。 + +|模型名称|模型简介 | 推理模型大小| 精度(hmean) | 预测耗时(ms) | 下载地址| +| --- | --- | --- |--- |--- | --- | +|ser_VI-LayoutXLM_xfund_zh|基于VI-LayoutXLM在xfund中文数据集上训练的SER模型|1.1G| 93.19% | 15.49 | [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar) | +|re_VI-LayoutXLM_xfund_zh|基于VI-LayoutXLM在xfund中文数据集上训练的RE模型|1.1G| 83.92% | 15.49 |[推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar) | +|ser_LayoutXLM_xfund_zh|基于LayoutXLM在xfund中文数据集上训练的SER模型|1.4G| 90.38% | 19.49 |[推理模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar) | +|re_LayoutXLM_xfund_zh|基于LayoutXLM在xfund中文数据集上训练的RE模型|1.4G| 74.83% | 19.49 |[推理模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar) | +|ser_LayoutLMv2_xfund_zh|基于LayoutLMv2在xfund中文数据集上训练的SER模型|778.0M| 85.44% | 31.46 |[推理模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar) | +|re_LayoutLMv2_xfund_zh|基于LayoutLMv2在xfun中文数据集上训练的RE模型|765.0M| 67.77% | 31.46 |[推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar) | +|ser_LayoutLM_xfund_zh|基于LayoutLM在xfund中文数据集上训练的SER模型|430.0M| 77.31% | - |[推理模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh_infer.tar) / 
[训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar) | + +* 注:上述预测耗时信息仅包含了inference模型的推理耗时,没有统计预处理与后处理耗时,测试环境为`V100 GPU + CUDA 10.2 + CUDNN 8.1.1 + TRT 7.2.3.4`。 + +在wildreceipt数据集上,SDMGR模型精度与下载地址如下所示。 + +|模型名称|模型简介|模型大小|精度|下载地址| +| --- | --- | --- |--- | --- | +|SDMGR|关键信息提取模型|78.0M| 86.70% | [推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar)| diff --git a/docs/ppstructure/overview.en.md b/docs/ppstructure/overview.en.md new file mode 100644 index 0000000000..d1e3f54714 --- /dev/null +++ b/docs/ppstructure/overview.en.md @@ -0,0 +1,104 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# PP-Structure + +## 1. Introduction + +PP-Structure is an intelligent document analysis system developed by the PaddleOCR team, which aims to help developers better complete tasks related to document understanding such as layout analysis and table recognition. + +The pipeline of PP-StructureV2 system is shown below. The document image first passes through the image direction correction module to identify the direction of the entire image and complete the direction correction. Then, two tasks of layout information analysis and key information extraction can be completed. + +- In the layout analysis task, the image first goes through the layout analysis model to divide the image into different areas such as text, table, and figure, and then analyze these areas separately. For example, the table area is sent to the form recognition module for structured recognition, and the text area is sent to the OCR engine for text recognition. Finally, the layout recovery module restores it to a word or pdf file with the same layout as the original image; +- In the key information extraction task, the OCR engine is first used to extract the text content, and then the SER(semantic entity recognition) module obtains the semantic entities in the image, and finally the RE(relationship extraction) module obtains the correspondence between the semantic entities, thereby extracting the required key information. + +![img](./images/195265734-6f4b5a7f-59b1-4fcc-af6d-89afc9bd51e1-20240705140834325.jpg) + +More technical details: 👉 [PP-StructureV2 Technical Report](https://arxiv.org/abs/2210.05391) + +PP-StructureV2 supports independent use or flexible collocation of each module. For example, you can use layout analysis alone or table recognition alone. Click the corresponding link below to get the tutorial for each independent module: + +- [Layout Analysis](./model_train/train_layout.en.md) +- [Table Recognition](./model_train/train_table.en.md) +- [Key Information Extraction](./model_train/train_kie.en.md) +- [Layout Recovery](./model_train/recovery_to_doc.en.md) + +## 2. 
Features + +The main features of PP-StructureV2 are as follows: + +- Support layout analysis of documents in the form of images/pdfs, which can be divided into areas such as **text, titles, tables, figures, formulas, etc.**; +- Support common Chinese and English **table detection** tasks; +- Support structured table recognition, and output the final result to **Excel file**; +- Support multimodal-based Key Information Extraction (KIE) tasks - **Semantic Entity Recognition** (SER) and **Relation Extraction (RE); +- Support **layout recovery**, that is, restore the document in word or pdf format with the same layout as the original image; +- Support customized training and multiple inference deployment methods such as python whl package quick start; +- Connect with the semi-automatic data labeling tool PPOCRLabel, which supports the labeling of layout analysis, table recognition, and SER. + +## 3. Results + +PP-StructureV2 supports the independent use or flexible collocation of each module. For example, layout analysis can be used alone, or table recognition can be used alone. Only the visualization effects of several representative usage methods are shown here. + +### 3.1 Layout analysis and table recognition + +The figure shows the pipeline of layout analysis + table recognition. The image is first divided into four areas of image, text, title and table by layout analysis, and then OCR detection and recognition is performed on the three areas of image, text and title, and the table is performed table recognition, where the image will also be stored for use. + +![img](./images/ppstructure.gif) + +### 3.1.1 Layout recognition returns the coordinates of a single word + +The following figure shows the result of layout analysis on single word, please refer to the [doc](./blog/return_word_pos.en.md). + +![show_0_mdf_v2](./images/799450d4-d2c5-4b61-b490-e160dc0f515c.jpeg) + +### 3.2 Layout recovery + +The following figure shows the effect of layout recovery based on the results of layout analysis and table recognition in the previous section. + +![img](./images/recovery.jpg) + +### 3.3 KIE + +- SER + +Different colored boxes in the figure represent different categories. + +![img](./images/185539141-68e71c75-5cf7-4529-b2ca-219d29fa5f68-20240705093932704.jpg) + +![img](./images/185310636-6ce02f7c-790d-479f-b163-ea97a5a04808-20240705094001639.jpg) + +![img](./images/185539517-ccf2372a-f026-4a7c-ad28-c741c770f60a-20240705094013236.png) + +![img](./images/197464552-69de557f-edff-4c7f-acbf-069df1ba097f.png) + +![img](./images/186095702-9acef674-12af-4d09-97fc-abf4ab32600e.png) + +- RE + +In the figure, the red box represents `Question`, the blue box represents `Answer`, and `Question` and `Answer` are connected by green lines. + +![img](./images/185393805-c67ff571-cf7e-4217-a4b0-8b396c4f22bb-20240705094037073.jpg) + +![img](./images/185540080-0431e006-9235-4b6d-b63d-0b3c6e1de48f-20240705094043151.jpg) + +![img](./images/186094813-3a8e16cc-42e5-4982-b9f4-0134dfb5688d.png) + +![img](./images/186095641-5843b4da-34d7-4c1c-943a-b1036a859fe3.png) + +## 4. Quick start + +Start from [Quick Start](./quick_start.en.md). + +## 5. Model List + +Some tasks need to use both the structured analysis models and the OCR models. For example, the table recognition task needs to use the table recognition model for structured analysis, and the OCR model to recognize the text in the table. Please select the appropriate models according to your specific needs. 
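+
+For example, a minimal sketch of wiring the two kinds of models together in Python (the directory paths are placeholders for wherever you unpack the downloaded inference models, and the keyword arguments mirror the fields listed in the PP-Structure quick start and model list rather than a guaranteed, stable API):
+
+```python linenums="1"
+import os
+
+import cv2
+from paddleocr import PPStructure, save_structure_res
+
+# Hypothetical local directories holding unpacked inference models
+# downloaded from the structure and OCR model zoos linked below.
+table_engine = PPStructure(
+    layout_model_dir='./inference/picodet_lcnet_x1_0_fgd_layout_infer',
+    table_model_dir='./inference/en_ppstructure_mobile_v2.0_SLANet_infer',
+    det_model_dir='./inference/en_PP-OCRv3_det_infer',
+    rec_model_dir='./inference/en_PP-OCRv3_rec_infer',
+    show_log=True)
+
+img_path = 'ppstructure/docs/table/1.png'
+result = table_engine(cv2.imread(img_path))
+save_structure_res(result, './output', os.path.basename(img_path).split('.')[0])
+```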
+ +For structural analysis related model downloads, please refer to: + +- [PP-Structure Model Zoo](./models_list.en.md) + +For OCR related model downloads, please refer to: + +- [PP-OCR Model Zoo](../ppocr/model_list.en.md) diff --git a/docs/ppstructure/overview.md b/docs/ppstructure/overview.md new file mode 100644 index 0000000000..92512e047a --- /dev/null +++ b/docs/ppstructure/overview.md @@ -0,0 +1,104 @@ +--- +typora-copy-images-to: images +comments: true +--- + +# PP-Structure 文档分析 + +## 1. 简介 + +PP-Structure是PaddleOCR团队自研的智能文档分析系统,旨在帮助开发者更好的完成版面分析、表格识别等文档理解相关任务。 + +PP-StructureV2系统流程图如下所示,文档图像首先经过图像矫正模块,判断整图方向并完成转正,随后可以完成版面信息分析与关键信息抽取2类任务。 + +- 版面分析任务中,图像首先经过版面分析模型,将图像划分为文本、表格、图像等不同区域,随后对这些区域分别进行识别,如,将表格区域送入表格识别模块进行结构化识别,将文本区域送入OCR引擎进行文字识别,最后使用版面恢复模块将其恢复为与原始图像布局一致的word或者pdf格式的文件; +- 关键信息抽取任务中,首先使用OCR引擎提取文本内容,然后由语义实体识别模块获取图像中的语义实体,最后经关系抽取模块获取语义实体之间的对应关系,从而提取需要的关键信息。 + +![img](./images/195265734-6f4b5a7f-59b1-4fcc-af6d-89afc9bd51e1-20240705140834325.jpg) + +更多技术细节:👉 PP-StructureV2技术报告 [中文版](docs/PP-StructureV2_introduction.md),[英文版](https://arxiv.org/abs/2210.05391)。 + +PP-StructureV2支持各个模块独立使用或灵活搭配,如,可以单独使用版面分析,或单独使用表格识别,点击下面相应链接获取各个独立模块的使用教程: + +- [版面分析](model_train/train_layout.md) +- [表格识别](model_train/train_table.md) +- [关键信息抽取](model_train/train_kie.md) +- [版面复原](model_train/recovery_to_doc.md) + +## 2. 特性 + +PP-StructureV2的主要特性如下: + +- 支持对图片/pdf形式的文档进行版面分析,可以划分**文字、标题、表格、图片、公式等**区域; +- 支持通用的中英文**表格检测**任务; +- 支持表格区域进行结构化识别,最终结果输出**Excel文件**; +- 支持基于多模态的关键信息抽取(Key Information Extraction,KIE)任务-**语义实体识别**(Semantic Entity Recognition,SER)和**关系抽取**(Relation Extraction,RE); +- 支持**版面复原**,即恢复为与原始图像布局一致的word或者pdf格式的文件; +- 支持自定义训练及python whl包调用等多种推理部署方式,简单易用; +- 与半自动数据标注工具PPOCRLabel打通,支持版面分析、表格识别、SER三种任务的标注。 + +## 3. 效果展示 + +PP-StructureV2支持各个模块独立使用或灵活搭配,如,可以单独使用版面分析,或单独使用表格识别,这里仅展示几种代表性使用方式的可视化效果。 + +### 3.1 版面分析和表格识别 + +下图展示了版面分析+表格识别的整体流程,图片先有版面分析划分为图像、文本、标题和表格四种区域,然后对图像、文本和标题三种区域进行OCR的检测识别,对表格进行表格识别,其中图像还会被存储下来以便使用。 + +![img](./images/ppstructure.gif) + +### 3.1.1 版面识别返回单字坐标 + +下图展示了基于上一节版面分析对文字进行定位的效果, 可参考[文档](blog/return_word_pos.md)。 + +![show_0_mdf_v2](./images/799450d4-d2c5-4b61-b490-e160dc0f515c.jpeg) + +### 3.2 版面恢复 + +下图展示了基于上一节版面分析和表格识别的结果进行版面恢复的效果。 + +![img](./images/recovery.jpg) + +### 3.3 关键信息抽取 + +- SER + +图中不同颜色的框表示不同的类别。 + +![img](./images/185539141-68e71c75-5cf7-4529-b2ca-219d29fa5f68-20240705093932704.jpg) + +![img](./images/185310636-6ce02f7c-790d-479f-b163-ea97a5a04808-20240705094001639.jpg) + +![img](./images/185539517-ccf2372a-f026-4a7c-ad28-c741c770f60a-20240705094013236.png) + +![img](./images/197464552-69de557f-edff-4c7f-acbf-069df1ba097f.png) + +![img](./images/186095702-9acef674-12af-4d09-97fc-abf4ab32600e.png) + +- RE + +图中红色框表示`问题`,蓝色框表示`答案`,`问题`和`答案`之间使用绿色线连接。 + +![img](./images/185393805-c67ff571-cf7e-4217-a4b0-8b396c4f22bb-20240705094037073.jpg) + +![img](./images/185540080-0431e006-9235-4b6d-b63d-0b3c6e1de48f-20240705094043151.jpg) + +![img](./images/186094813-3a8e16cc-42e5-4982-b9f4-0134dfb5688d.png) + +![img](./images/186095641-5843b4da-34d7-4c1c-943a-b1036a859fe3.png) + +## 4. 快速体验 + +请参考[快速使用](./quick_start.md)教程。 + +## 5. 
模型库 + +部分任务需要同时用到结构化分析模型和OCR模型,如表格识别需要使用表格识别模型进行结构化解析,同时也要用到OCR模型对表格内的文字进行识别,请根据具体需求选择合适的模型。 + +结构化分析相关模型下载可以参考: + +- [PP-Structure 模型库](./models_list.md) + +OCR相关模型下载可以参考: + +- [PP-OCR 模型库](../ppocr/model_list.md) diff --git a/docs/ppstructure/ppstructure_model.md b/docs/ppstructure/ppstructure_model.md new file mode 100644 index 0000000000..4b3310a9fc --- /dev/null +++ b/docs/ppstructure/ppstructure_model.md @@ -0,0 +1,60 @@ +--- +comments: true +hide: + - toc +--- + + +# PP-Structure 系列模型列表 + +## 1. 版面分析模型 + +| 模型名称 | 模型简介 | 推理模型大小 | 下载地址 | dict path | +| --------------------------------------- | ----- | ------------ | ------------------- | ------ | +| picodet_lcnet_x1_0_fgd_layout | 基于PicoDet LCNet_x1_0和FGD蒸馏在PubLayNet 数据集训练的英文版面分析模型,可以划分**文字、标题、表格、图片以及列表**5类区域 | 9.7M | [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout.pdparams) | [PubLayNet dict](../../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt) | +| ppyolov2_r50vd_dcn_365e_publaynet | 基于PP-YOLOv2在PubLayNet数据集上训练的英文版面分析模型 | 221.0M | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar) / [训练模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet_pretrained.pdparams) | 同上 | +| picodet_lcnet_x1_0_fgd_layout_cdla | CDLA数据集训练的中文版面分析模型,可以划分为**表格、图片、图片标题、表格、表格标题、页眉、脚本、引用、公式**10类区域 | 9.7M | [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla.pdparams) | [CDLA dict](../../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt) | +| picodet_lcnet_x1_0_fgd_layout_table | 表格数据集训练的版面分析模型,支持中英文文档表格区域的检测 | 9.7M | [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_table.pdparams) | [Table dict](../../ppocr/utils/dict/layout_dict/layout_table_dict.txt) | +| ppyolov2_r50vd_dcn_365e_tableBank_word | 基于PP-YOLOv2在TableBank Word 数据集训练的版面分析模型,支持英文文档表格区域的检测 | 221.0M | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar) | 同上 | +| ppyolov2_r50vd_dcn_365e_tableBank_latex | 基于PP-YOLOv2在TableBank Latex数据集训练的版面分析模型,支持英文文档表格区域的检测 | 221.0M | [推理模型](https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar) | 同上 | + +## 2. 
OCR和表格识别模型 + +### 2.1 OCR + +| 模型名称 | 模型简介 | 推理模型大小 | 下载地址 | +| ------------------------------ | ------------------------------------------- | ------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| en_ppocr_mobile_v2.0_table_det | PubTabNet数据集训练的英文表格场景的文字检测 | 4.7M | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_det_train.tar) | +| en_ppocr_mobile_v2.0_table_rec | PubTabNet数据集训练的英文表格场景的文字识别 | 6.9M | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_rec_train.tar) | + +如需要使用其他OCR模型,可以在 [PP-OCR model_list](../ppocr/model_list.md) 下载模型或者使用自己训练好的模型配置到 `det_model_dir`, `rec_model_dir`两个字段即可。 + +### 2.2 表格识别模型 + +| 模型名称 | 模型简介 | 推理模型大小 | 下载地址 | +| ------------------------------------ | ---------------------------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| en_ppocr_mobile_v2.0_table_structure | 基于TableRec-RARE在PubTabNet数据集上训练的英文表格识别模型 | 6.8M | [推理模型](https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/table/en_ppocr_mobile_v2.0_table_structure_train.tar) | +| en_ppstructure_mobile_v2.0_SLANet | 基于SLANet在PubTabNet数据集上训练的英文表格识别模型 | 9.2M | [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_train.tar) | +| ch_ppstructure_mobile_v2.0_SLANet | 基于SLANet的中文表格识别模型 | 9.3M | [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar) | + +## 3. 
KIE模型 + +在XFUND_zh数据集上,不同模型的精度与V100 GPU上速度信息如下所示。 + +| 模型名称 | 模型简介 | 推理模型大小 | 精度(hmean) | 预测耗时(ms) | 下载地址 | +| ------------------------- | ------------------------------------------------ | ------------ | ----------- | ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| ser_VI-LayoutXLM_xfund_zh | 基于VI-LayoutXLM在xfund中文数据集上训练的SER模型 | 1.1G | 93.19% | 15.49 | [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar) | +| re_VI-LayoutXLM_xfund_zh | 基于VI-LayoutXLM在xfund中文数据集上训练的RE模型 | 1.1G | 83.92% | 15.49 | [推理模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar) | +| ser_LayoutXLM_xfund_zh | 基于LayoutXLM在xfund中文数据集上训练的SER模型 | 1.4G | 90.38% | 19.49 | [推理模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar) | +| re_LayoutXLM_xfund_zh | 基于LayoutXLM在xfund中文数据集上训练的RE模型 | 1.4G | 74.83% | 19.49 | [推理模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar) | +| ser_LayoutLMv2_xfund_zh | 基于LayoutLMv2在xfund中文数据集上训练的SER模型 | 778.0M | 85.44% | 31.46 | [推理模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar) | +| re_LayoutLMv2_xfund_zh | 基于LayoutLMv2在xfun中文数据集上训练的RE模型 | 765.0M | 67.77% | 31.46 | [推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar) | +| ser_LayoutLM_xfund_zh | 基于LayoutLM在xfund中文数据集上训练的SER模型 | 430.0M | 77.31% | - | [推理模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh_infer.tar) / [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar) | + +* 注:上述预测耗时信息仅包含了inference模型的推理耗时,没有统计预处理与后处理耗时,测试环境为`V100 GPU + CUDA 10.2 + CUDNN 8.1.1 + TRT 7.2.3.4`。 + +在wildreceipt数据集上,SDMGR模型精度与下载地址如下所示。 + +| 模型名称 | 模型简介 | 模型大小 | 精度 | 下载地址 | +| -------- | ---------------- | -------- | ------ | ----------------------------------------------------------------------------------------------------- | +| SDMGR | 关键信息提取模型 | 78.0M | 86.70% | [推理模型 coming soon]() / [训练模型](https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/kie_vgg16.tar) | diff --git a/docs/ppstructure/quick_start.en.md b/docs/ppstructure/quick_start.en.md new file mode 100644 index 0000000000..be3ec3d5d2 --- /dev/null +++ b/docs/ppstructure/quick_start.en.md @@ -0,0 +1,341 @@ +--- +comments: true +--- + +# PP-Structure Quick Start + +## 1. Environment Preparation + +### 1.1 Install PaddlePaddle + +> If you do not have a Python environment, please refer to [Environment Preparation](../ppocr/environment.en.md). 
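+
+Whichever of the two install commands below you use, you can verify the installation afterwards. A minimal check (assuming the install succeeded; `run_check` prints whether PaddlePaddle can run on the available CPU/GPU):
+
+```python linenums="1"
+import paddle
+
+# Reports whether PaddlePaddle is installed correctly and usable on this machine
+paddle.utils.run_check()
+print(paddle.__version__)
+```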
+ +- If you have CUDA 9 or CUDA 10 installed on your machine, please run the following command to install + + ```bash linenums="1" + python3 -m pip install paddlepaddle-gpu -i https://mirror.baidu.com/pypi/simple + ``` + +- If you have no available GPU on your machine, please run the following command to install the CPU version + + ```bash linenums="1" + python3 -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple + ``` + +For more software version requirements, please refer to the instructions in [Installation Document](https://www.paddlepaddle.org.cn/install/quick) for operation. + +### 1.2 Install PaddleOCR Whl Package + +```bash linenums="1" +# Install paddleocr, version 2.6 is recommended +pip3 install "paddleocr>=2.6.0.3" + +# Install the image direction classification dependency package paddleclas (if you do not use the image direction classification, you can skip it) +pip3 install paddleclas>=2.4.3 +``` + +## 2. Quick Use + +### 2.1 Use by command line + +#### 2.1.1 image orientation + layout analysis + table recognition + +```bash linenums="1" +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --image_orientation=true +``` + +#### 2.1.2 layout analysis + table recognition + +```bash linenums="1" +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure +``` + +#### 2.1.3 layout analysis + +```bash linenums="1" +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --table=false --ocr=false +``` + +#### 2.1.4 table recognition + +```bash linenums="1" +paddleocr --image_dir=ppstructure/docs/table/table.jpg --type=structure --layout=false +``` + +#### 2.1.5 Key Information Extraction + +Key information extraction does not currently support use by the whl package. For detailed usage tutorials, please refer to: [inference document](./infer_deploy/python_infer.en.md). + +#### 2.1.6 layout recovery(PDF to Word) + +Two layout recovery methods are provided, For detailed usage tutorials, please refer to: [Layout Recovery](./model_train/recovery_to_doc.en.md). 
+ +- PDF parse +- OCR + +Recovery by using PDF parse (only support pdf as input): + +```bash linenums="1" +paddleocr --image_dir=ppstructure/recovery/UnrealText.pdf --type=structure --recovery=true --use_pdf2docx_api=true +``` + +Recovery by using OCR: + +```bash linenums="1" +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true --lang='en' +``` + +### 2.2 Use by python script + +#### 2.2.1 image orientation + layout analysis + table recognition + +```python linenums="1" +import os +import cv2 +from paddleocr import PPStructure,draw_structure_result,save_structure_res + +table_engine = PPStructure(show_log=True, image_orientation=True) + +save_folder = './output' +img_path = 'ppstructure/docs/table/1.png' +img = cv2.imread(img_path) +result = table_engine(img) +save_structure_res(result, save_folder,os.path.basename(img_path).split('.')[0]) + +for line in result: + line.pop('img') + print(line) + +from PIL import Image + +font_path = 'doc/fonts/simfang.ttf' # PaddleOCR下提供字体包 +image = Image.open(img_path).convert('RGB') +im_show = draw_structure_result(image, result,font_path=font_path) +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + +#### 2.2.2 layout analysis + table recognition + +```python linenums="1" +import os +import cv2 +from paddleocr import PPStructure,draw_structure_result,save_structure_res + +table_engine = PPStructure(show_log=True) + +save_folder = './output' +img_path = 'ppstructure/docs/table/1.png' +img = cv2.imread(img_path) +result = table_engine(img) +save_structure_res(result, save_folder,os.path.basename(img_path).split('.')[0]) + +for line in result: + line.pop('img') + print(line) + +from PIL import Image + +font_path = 'doc/fonts/simfang.ttf' # font provided in PaddleOCR +image = Image.open(img_path).convert('RGB') +im_show = draw_structure_result(image, result,font_path=font_path) +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + +#### 2.2.3 layout analysis + +```python linenums="1" +import os +import cv2 +from paddleocr import PPStructure,save_structure_res + +table_engine = PPStructure(table=False, ocr=False, show_log=True) + +save_folder = './output' +img_path = 'ppstructure/docs/table/1.png' +img = cv2.imread(img_path) +result = table_engine(img) +save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0]) + +for line in result: + line.pop('img') + print(line) +``` + +```python linenums="1" +import os +import cv2 +from paddleocr import PPStructure,save_structure_res + +ocr_engine = PPStructure(table=False, ocr=True, show_log=True) + +save_folder = './output' +img_path = 'ppstructure/recovery/UnrealText.pdf' +result = ocr_engine(img_path) +for index, res in enumerate(result): + save_structure_res(res, save_folder, os.path.basename(img_path).split('.')[0], index) + +for res in result: + for line in res: + line.pop('img') + print(line) +``` + +```python linenums="1" +import os +import cv2 +import numpy as np +from paddleocr import PPStructure,save_structure_res +from paddle.utils import try_import +from PIL import Image + +ocr_engine = PPStructure(table=False, ocr=True, show_log=True) + +save_folder = './output' +img_path = 'ppstructure/recovery/UnrealText.pdf' + +fitz = try_import("fitz") +imgs = [] +with fitz.open(img_path) as pdf: + for pg in range(0, pdf.page_count): + page = pdf[pg] + mat = fitz.Matrix(2, 2) + pm = page.get_pixmap(matrix=mat, alpha=False) + + # if width or height > 2000 pixels, don't enlarge the image + if pm.width > 2000 or pm.height > 2000: + 
pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False) + + img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) + img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) + imgs.append(img) + +for index, img in enumerate(imgs): + result = ocr_engine(img) + save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0], index) + for line in result: + line.pop('img') + print(line) +``` + +#### 2.2.4 table recognition + +```python linenums="1" +import os +import cv2 +from paddleocr import PPStructure,save_structure_res + +table_engine = PPStructure(layout=False, show_log=True) + +save_folder = './output' +img_path = 'ppstructure/docs/table/table.jpg' +img = cv2.imread(img_path) +result = table_engine(img) +save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0]) + +for line in result: + line.pop('img') + print(line) +``` + +#### 2.2.5 Key Information Extraction + +Key information extraction does not currently support use by the whl package. For detailed usage tutorials, please refer to: [Inference](../infer_deploy/python_infer.en.md). + +#### 2.2.6 layout recovery + +```python linenums="1" +import os +import cv2 +from paddleocr import PPStructure,save_structure_res +from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx + +# Chinese image +table_engine = PPStructure(recovery=True) +# English image +# table_engine = PPStructure(recovery=True, lang='en') + +save_folder = './output' +img_path = 'ppstructure/docs/table/1.png' +img = cv2.imread(img_path) +result = table_engine(img) +save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0]) + +for line in result: + line.pop('img') + print(line) + +h, w, _ = img.shape +res = sorted_layout_boxes(result, w) +convert_info_docx(img, res, save_folder, os.path.basename(img_path).split('.')[0]) +``` + +### 2.3 Result description + +The return of PP-Structure is a list of dicts, the example is as follows: + +#### 2.3.1 layout analysis + table recognition + +```bash linenums="1" +[ + { 'type': 'Text', + 'bbox': [34, 432, 345, 462], + 'res': ([[36.0, 437.0, 341.0, 437.0, 341.0, 446.0, 36.0, 447.0], [41.0, 454.0, 125.0, 453.0, 125.0, 459.0, 41.0, 460.0]], + [('Tigure-6. The performance of CNN and IPT models using difforen', 0.90060663), ('Tent ', 0.465441)]) + } +] +``` + +Each field in dict is described as follows: + +| field | description | +| --- |---| +|type| Type of image area. | +|bbox| The coordinates of the image area in the original image, respectively [upper left corner x, upper left corner y, lower right corner x, lower right corner y]. | +|res| OCR or table recognition result of the image area.
table: a dict with field descriptions as follows:
        `html`: html str of table.
        In the code usage mode, set return_ocr_result_in_table=True when calling to get the detection and recognition results of each text in the table area, corresponding to the following fields:<br>
        `boxes`: text detection boxes.
        `rec_res`: text recognition results.
OCR: A tuple containing the detection boxes and recognition results of each single text. | + +After the recognition is completed, each image will have a directory with the same name under the directory specified by the `output` field. Each table in the image will be stored as an excel, and the picture area will be cropped and saved. The filename of excel and picture is their coordinates in the image. + + ```text linenums="1" + /output/table/1/ + └─ res.txt + └─ [454, 360, 824, 658].xlsx table recognition result + └─ [16, 2, 828, 305].jpg picture in Image + └─ [17, 361, 404, 711].xlsx table recognition result + ``` + +#### 2.3.2 Key Information Extraction + +Please refer to: [Key Information Extraction](../ppocr/model_train/kie.en.md) . + +### 2.4 Parameter Description + +| field | description | default | +|---|---|---| +| output | result save path | ./output/table | +| table_max_len | long side of the image resize in table structure model | 488 | +| table_model_dir | Table structure model inference model path| None | +| table_char_dict_path | The dictionary path of table structure model | ../ppocr/utils/dict/table_structure_dict.txt | +| merge_no_span_structure | In the table recognition model, whether to merge '\' and '\' | False | +| layout_model_dir | Layout analysis model inference model path| None | +| layout_dict_path | The dictionary path of layout analysis model| ../ppocr/utils/dict/layout_publaynet_dict.txt | +| layout_score_threshold | The box threshold path of layout analysis model| 0.5| +| layout_nms_threshold | The nms threshold path of layout analysis model| 0.5| +| kie_algorithm | kie model algorithm| LayoutXLM| +| ser_model_dir | Ser model inference model path| None| +| ser_dict_path | The dictionary path of Ser model| ../train_data/XFUND/class_list_xfun.txt| +| mode | structure or kie | structure | +| image_orientation | Whether to perform image orientation classification in forward | False | +| layout | Whether to perform layout analysis in forward | True | +| table | Whether to perform table recognition in forward | True | +| ocr | Whether to perform ocr for non-table areas in layout analysis. When layout is False, it will be automatically set to False| True | +| recovery | Whether to perform layout recovery in forward| False | +| save_pdf | Whether to convert docx to pdf when recovery| False | +| structure_version | Structure version, optional PP-structure and PP-structurev2 | PP-structure | + +Most of the parameters are consistent with the PaddleOCR whl package, see [whl package documentation](../ppocr/blog/whl.en.md) + +## 3. Summary + +Through the content in this section, you can master the use of PP-Structure related functions through PaddleOCR whl package. Please refer to [documentation tutorial](../index.en.md) for more detailed usage tutorials including model training, inference and deployment, etc. diff --git a/docs/ppstructure/quick_start.md b/docs/ppstructure/quick_start.md new file mode 100644 index 0000000000..a5da40fa1b --- /dev/null +++ b/docs/ppstructure/quick_start.md @@ -0,0 +1,359 @@ +--- +comments: true +--- + +# PP-Structure 快速开始 + +## 1. 
准备环境
+
+### 1.1 安装PaddlePaddle
+>
+> 如果您没有基础的Python运行环境,请参考[运行环境准备](../ppocr/environment.md)。
+
+- 您的机器安装的是CUDA9或CUDA10,请运行以下命令安装
+
+  ```bash linenums="1"
+  python3 -m pip install paddlepaddle-gpu -i https://mirror.baidu.com/pypi/simple
+  ```
+
+- 您的机器是CPU,请运行以下命令安装
+
+  ```bash linenums="1"
+  python3 -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple
+  ```
+
+更多的版本需求,请参照[飞桨官网安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。
+
+### 1.2 安装PaddleOCR whl包
+
+```bash linenums="1"
+# 安装 paddleocr,推荐使用2.6版本
+pip3 install "paddleocr>=2.6.0.3"
+
+# 安装 图像方向分类依赖包paddleclas(如不需要图像方向分类功能,可跳过)
+pip3 install paddleclas>=2.4.3
+```
+
+## 2. 便捷使用
+
+### 2.1 命令行使用
+
+#### 2.1.1 图像方向分类+版面分析+表格识别
+
+```bash linenums="1"
+paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --image_orientation=true
+```
+
+#### 2.1.2 版面分析+表格识别
+
+```bash linenums="1"
+paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure
+```
+
+#### 2.1.3 版面分析
+
+```bash linenums="1"
+paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --table=false --ocr=false
+```
+
+#### 2.1.4 表格识别
+
+```bash linenums="1"
+paddleocr --image_dir=ppstructure/docs/table/table.jpg --type=structure --layout=false
+```
+
+#### 2.1.5 关键信息抽取
+
+关键信息抽取暂不支持通过whl包调用,详细使用教程请参考:[关键信息抽取教程](../ppocr/model_train/kie.md)。
+
+#### 2.1.6 版面恢复
+
+版面恢复分为2种方法,详细介绍请参考:[版面恢复教程](./model_train/recovery_to_doc.md):
+
+- PDF解析
+- OCR技术
+
+通过PDF解析(只支持pdf格式的输入):
+
+```bash linenums="1"
+paddleocr --image_dir=ppstructure/recovery/UnrealText.pdf --type=structure --recovery=true --use_pdf2docx_api=true
+```
+
+通过OCR技术:
+
+```bash linenums="1"
+# 中文测试图
+paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true
+# 英文测试图
+paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true --lang='en'
+# pdf测试文件
+paddleocr --image_dir=ppstructure/recovery/UnrealText.pdf --type=structure --recovery=true --lang='en'
+```
+
+### 2.2 Python脚本使用
+
+#### 2.2.1 图像方向分类+版面分析+表格识别
+
+```python linenums="1"
+import os
+import cv2
+from paddleocr import PPStructure,draw_structure_result,save_structure_res
+
+table_engine = PPStructure(show_log=True, image_orientation=True)
+
+save_folder = './output'
+img_path = 'ppstructure/docs/table/1.png'
+img = cv2.imread(img_path)
+result = table_engine(img)
+save_structure_res(result, save_folder,os.path.basename(img_path).split('.')[0])
+
+for line in result:
+    line.pop('img')
+    print(line)
+
+from PIL import Image
+
+font_path = 'doc/fonts/simfang.ttf' # PaddleOCR下提供字体包
+image = Image.open(img_path).convert('RGB')
+im_show = draw_structure_result(image, result,font_path=font_path)
+im_show = Image.fromarray(im_show)
+im_show.save('result.jpg')
+```
+
+#### 2.2.2 版面分析+表格识别
+
+```python linenums="1"
+import os
+import cv2
+from paddleocr import PPStructure,draw_structure_result,save_structure_res
+
+table_engine = PPStructure(show_log=True)
+
+save_folder = './output'
+img_path = 'ppstructure/docs/table/1.png'
+img = cv2.imread(img_path)
+result = table_engine(img)
+save_structure_res(result, save_folder,os.path.basename(img_path).split('.')[0])
+
+for line in result:
+    line.pop('img')
+    print(line)
+
+from PIL import Image
+
+font_path = 'doc/fonts/simfang.ttf' # PaddleOCR下提供字体包
+image 
= Image.open(img_path).convert('RGB') +im_show = draw_structure_result(image, result,font_path=font_path) +im_show = Image.fromarray(im_show) +im_show.save('result.jpg') +``` + +#### 2.2.3 版面分析 + +```python linenums="1" +import os +import cv2 +from paddleocr import PPStructure,save_structure_res + +table_engine = PPStructure(table=False, ocr=False, show_log=True) + +save_folder = './output' +img_path = 'ppstructure/docs/table/1.png' +img = cv2.imread(img_path) +result = table_engine(img) +save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0]) + +for line in result: + line.pop('img') + print(line) +``` + +```python linenums="1" +import os +import cv2 +from paddleocr import PPStructure,save_structure_res + +ocr_engine = PPStructure(table=False, ocr=True, show_log=True) + +save_folder = './output' +img_path = 'ppstructure/recovery/UnrealText.pdf' +result = ocr_engine(img_path) +for index, res in enumerate(result): + save_structure_res(res, save_folder, os.path.basename(img_path).split('.')[0], index) + +for res in result: + for line in res: + line.pop('img') + print(line) +``` + +```python linenums="1" +import os +import cv2 +import numpy as np +from paddleocr import PPStructure,save_structure_res +from paddle.utils import try_import +from PIL import Image + +ocr_engine = PPStructure(table=False, ocr=True, show_log=True) + +save_folder = './output' +img_path = 'ppstructure/recovery/UnrealText.pdf' + +fitz = try_import("fitz") +imgs = [] +with fitz.open(img_path) as pdf: + for pg in range(0, pdf.page_count): + page = pdf[pg] + mat = fitz.Matrix(2, 2) + pm = page.get_pixmap(matrix=mat, alpha=False) + + # if width or height > 2000 pixels, don't enlarge the image + if pm.width > 2000 or pm.height > 2000: + pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False) + + img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) + img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) + imgs.append(img) + +for index, img in enumerate(imgs): + result = ocr_engine(img) + save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0], index) + for line in result: + line.pop('img') + print(line) +``` + +#### 2.2.4 表格识别 + +```python linenums="1" +import os +import cv2 +from paddleocr import PPStructure,save_structure_res + +table_engine = PPStructure(layout=False, show_log=True) + +save_folder = './output' +img_path = 'ppstructure/docs/table/table.jpg' +img = cv2.imread(img_path) +result = table_engine(img) +save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0]) + +for line in result: + line.pop('img') + print(line) +``` + +#### 2.2.5 关键信息抽取 + +关键信息抽取暂不支持通过whl包调用,详细使用教程请参考:[inference文档](./infer_deploy/python_infer.md)。 + +#### 2.2.6 版面恢复 + +```python linenums="1" +import os +import cv2 +from paddleocr import PPStructure,save_structure_res +from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx + +# 中文测试图 +table_engine = PPStructure(recovery=True) +# 英文测试图 +# table_engine = PPStructure(recovery=True, lang='en') + +save_folder = './output' +img_path = 'ppstructure/docs/table/1.png' +img = cv2.imread(img_path) +result = table_engine(img) +save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0]) + +for line in result: + line.pop('img') + print(line) + +h, w, _ = img.shape +res = sorted_layout_boxes(result, w) +convert_info_docx(img, res, save_folder, os.path.basename(img_path).split('.')[0]) +``` + +### 2.3 返回结果说明 + +PP-Structure的返回结果为一个dict组成的list,示例如下: + +#### 
2.3.1 版面分析+表格识别 + +```bash linenums="1" +[ + { 'type': 'Text', + 'bbox': [34, 432, 345, 462], + 'res': ([[36.0, 437.0, 341.0, 437.0, 341.0, 446.0, 36.0, 447.0], [41.0, 454.0, 125.0, 453.0, 125.0, 459.0, 41.0, 460.0]], + [('Tigure-6. The performance of CNN and IPT models using difforen', 0.90060663), ('Tent ', 0.465441)]) + } +] +``` + +dict 里各个字段说明如下: + +| 字段 | 说明 | +| ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| type | 图片区域的类型 | +| bbox | 图片区域的在原图的坐标,分别[左上角x,左上角y,右下角x,右下角y] | +| res | 图片区域的OCR或表格识别结果。
表格: 一个dict,字段说明如下
        `html`: 表格的HTML字符串
        在代码使用模式下,前向传入return_ocr_result_in_table=True可以拿到表格中每个文本的检测识别结果,对应为如下字段:
        `boxes`: 文本检测坐标
        `rec_res`: 文本识别结果。
OCR: 一个包含各个单行文字的检测坐标和识别结果的元组 | + +运行完成后,每张图片会在`output`字段指定的目录下有一个同名目录,图片里的每个表格会存储为一个excel,图片区域会被裁剪之后保存下来,excel文件和图片名为表格在图片里的坐标。 + + ``` + /output/table/1/ + └─ res.txt + └─ [454, 360, 824, 658].xlsx 表格识别结果 + └─ [16, 2, 828, 305].jpg 被裁剪出的图片区域 + └─ [17, 361, 404, 711].xlsx 表格识别结果 + ``` + +#### 2.3.2 关键信息抽取 + +请参考:[关键信息抽取教程](../ppocr/model_train/kie.md)。 + +### 2.4 参数说明 + +| 字段 | 说明 | 默认值 | +| ----- | ---- | ------ | +| output | 结果保存地址 | ./output/table | +| table_max_len | 表格结构模型预测时,图像的长边resize尺度 | 488 | +| table_model_dir | 表格结构模型 inference 模型地址 | None | +| table_char_dict_path | 表格结构模型所用字典地址 | ../ppocr/utils/dict/table_structure_dict.txt | +| merge_no_span_structure | 表格识别模型中,是否对'\'和'\' 进行合并 | False | +| layout_model_dir | 版面分析模型 inference 模型地址 | None | +| layout_dict_path | 版面分析模型字典 | ../ppocr/utils/dict/layout_publaynet_dict.txt | +| layout_score_threshold | 版面分析模型检测框阈值 | 0.5 | +| layout_nms_threshold | 版面分析模型nms阈值 | 0.5 | +| kie_algorithm | kie模型算法 | LayoutXLM | +| ser_model_dir | ser模型 inference 模型地址 | None | +| ser_dict_path | ser模型字典 | ../train_data/XFUND/class_list_xfun.txt | +| mode | structure or kie | structure | +| image_orientation | 前向中是否执行图像方向分类 | False | +| layout | 前向中是否执行版面分析 | True | +| table | 前向中是否执行表格识别 | True | +| ocr | 对于版面分析中的非表格区域,是否执行ocr。当layout为False时会被自动设置为False | True | +| recovery | 前向中是否执行版面恢复 | False | +| save_pdf | 版面恢复导出docx文件的同时,是否导出pdf文件 | False | +| structure_version | 模型版本,可选 PP-structure和PP-structurev2 | PP-structure | + +大部分参数和PaddleOCR whl包保持一致,见 [whl包文档](../ppocr/blog/whl.md) + +## 3. 小结 + +通过本节内容,相信您已经熟练掌握通过PaddleOCR whl包调用PP-Structure相关功能的使用方法,您可以参考[文档教程](../index.md),获取包括模型训练、推理部署等更详细的使用教程。 diff --git a/docs/quick_start.en.md b/docs/quick_start.en.md new file mode 100644 index 0000000000..98eaf53fad --- /dev/null +++ b/docs/quick_start.en.md @@ -0,0 +1,14 @@ +--- +comments: true +hide: + - navigation +--- + +- Web online experience + - PP-OCRv4 online experience: + - PP-ChatOCR online experience: +- One line of code quick use: [Quick Start(Chinese/English/Multilingual/Document Analysis](./ppocr/quick_start.en.md) +- Full-process experience of training, inference, and high-performance deployment in the Paddle AI suite (PaddleX): + - PP-OCRv4: + - PP-ChatOCR: +- Mobile demo experience:[Installation DEMO](https://ai.baidu.com/easyedge/app/openSource?from=paddlelite)(Based on EasyEdge and Paddle-Lite, support iOS and Android systems) diff --git a/docs/quick_start.md b/docs/quick_start.md new file mode 100644 index 0000000000..4ec13a18db --- /dev/null +++ b/docs/quick_start.md @@ -0,0 +1,14 @@ +--- +comments: true +hide: + - navigation +--- + +- 在线免费体验: + - PP-OCRv4 在线体验地址: + - SLANet 在线体验地址: + - PP-ChatOCRv2-common 在线体验地址: + - PP-ChatOCRv2-doc 在线体验地址: + +- 一行命令快速使用:[快速开始(中英文/多语言/文档分析)](./ppocr/quick_start.md) +- 移动端 demo 体验:[安装包 DEMO 下载地址](https://ai.baidu.com/easyedge/app/openSource?from=paddlelite)(基于 EasyEdge 和 Paddle-Lite, 支持 iOS 和 Android 系统) diff --git a/docs/static/images/demo.jpg b/docs/static/images/demo.jpg new file mode 100644 index 0000000000..d8b5ca1891 Binary files /dev/null and b/docs/static/images/demo.jpg differ diff --git a/docs/static/images/favicon.ico b/docs/static/images/favicon.ico new file mode 100644 index 0000000000..639c44e3d6 Binary files /dev/null and b/docs/static/images/favicon.ico differ diff --git a/docs/static/images/logo.jpg b/docs/static/images/logo.jpg new file mode 100644 index 0000000000..4ba9055b1e Binary files /dev/null and b/docs/static/images/logo.jpg differ diff --git a/docs/update.en.md 
b/docs/update.en.md new file mode 100644 index 0000000000..392ccbe183 --- /dev/null +++ b/docs/update.en.md @@ -0,0 +1,69 @@ +--- +comments: true +--- + +### RECENT UPDATES + +#### 2022.5.9 release PaddleOCR v2.5, including + +- [PP-OCRv3](./ppocr_introduction_en.md#pp-ocrv3): With comparable speed, the effect of Chinese scene is further improved by 5% compared with PP-OCRv2, the effect of English scene is improved by 11%, and the average recognition accuracy of 80 language multilingual models is improved by more than 5%. +- [PPOCRLabelv2](https://github.com/PFCCLab/PPOCRLabel/blob/main/README.md): Add the annotation function for table recognition task, key information extraction task and irregular text image. +- Interactive e-book [*"Dive into OCR"*](./ocr_book_en.md), covers the cutting-edge theory and code practice of OCR full stack technology. + +#### 2022.5.7 Add support for metric and model logging during training to [Weights & Biases](https://docs.wandb.ai/) + +#### 2021.12.21 OCR open source online course starts. The lesson starts at 8:30 every night and lasts for ten days. Free registration: + +#### 2021.12.21 release PaddleOCR v2.4, release 1 text detection algorithm (PSENet), 3 text recognition algorithms (NRTR、SEED、SAR), 1 key information extraction algorithm (SDMGR) and 3 DocVQA algorithms (LayoutLM、LayoutLMv2,LayoutXLM) + +#### 2021.9.7 release PaddleOCR v2.3, [PP-OCRv2](#PP-OCRv2) is proposed. The CPU inference speed of PP-OCRv2 is 220% higher than that of PP-OCR server. The F-score of PP-OCRv2 is 7% higher than that of PP-OCR mobile + +#### 2021.8.3 released PaddleOCR v2.2, add a new structured documents analysis toolkit, i.e., [PP-Structure](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/ppstructure/README.md), support layout analysis and table recognition (One-key to export chart images to Excel files) + +#### 2021.4.8 release end-to-end text recognition algorithm [PGNet](https://www.aaai.org/AAAI21Papers/AAAI-2885.WangP.pdf) which is published in AAAI 2021. Find tutorial [here](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_en/pgnet_en.md);release multi language recognition [models](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_en/multi_languages_en.md), support more than 80 languages recognition; especically, the performance of [English recognition model](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_en/models_list_en.md#English) is Optimized + +#### 2021.1.21 update more than 25+ multilingual recognition models [models list](./models_list_en.md), including:English, Chinese, German, French, Japanese,Spanish,Portuguese Russia Arabic and so on. Models for more languages will continue to be updated [Develop Plan](https://github.com/PaddlePaddle/PaddleOCR/issues/1048) + +#### 2020.12.15 update Data synthesis tool, i.e., [Style-Text](https://github.com/PFCCLab/StyleText/blob/main/README.md),easy to synthesize a large number of images which are similar to the target scene image + +#### 2020.11.25 Update a new data annotation tool, i.e., [PPOCRLabel](https://github.com/PFCCLab/PPOCRLabel/blob/main/README.md), which is helpful to improve the labeling efficiency. 
Moreover, the labeling results can be used in training of the PP-OCR system directly + +#### 2020.9.22 Update the PP-OCR technical article, + +#### 2020.9.19 Update the ultra lightweight compressed ppocr_mobile_slim series models, the overall model size is 3.5M, suitable for mobile deployment + +#### 2020.9.17 update English recognition model and Multilingual recognition model, `English`, `Chinese`, `German`, `French`, `Japanese` and `Korean` have been supported. Models for more languages will continue to be updated + +#### 2020.8.24 Support the use of PaddleOCR through whl package installation,pelease refer [PaddleOCR Package](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_en/whl_en.md) + +#### 2020.8.16 Release text detection algorithm [SAST](https://arxiv.org/abs/1908.05498) and text recognition algorithm [SRN](https://arxiv.org/abs/2003.12294) + +#### 2020.7.23, Release the playback and PPT of live class on BiliBili station, PaddleOCR Introduction, [address](https://aistudio.baidu.com/aistudio/course/introduce/1519) + +#### 2020.7.15, Add mobile App demo , support both iOS and Android ( based on easyedge and Paddle Lite) + +#### 2020.7.15, Improve the deployment ability, add the C + + inference , serving deployment. In addtion, the benchmarks of the ultra-lightweight Chinese OCR model are provided + +#### 2020.7.15, Add several related datasets, data annotation and synthesis tools + +#### 2020.7.9 Add a new model to support recognize the character "space" + +#### 2020.7.9 Add the data augument and learning rate decay strategies during training + +#### 2020.6.8 Add [datasets](dataset/datasets_en.md) and keep updating + +#### 2020.6.5 Support exporting `attention` model to `inference_model` + +#### 2020.6.5 Support separate prediction and recognition, output result score + +#### 2020.5.30 Provide Lightweight Chinese OCR online experience + +#### 2020.5.30 Model prediction and training support on Windows system + +#### 2020.5.30 Open source general Chinese OCR model + +#### 2020.5.14 Release [PaddleOCR Open Class](https://www.bilibili.com/video/BV1nf4y1U7RX?p=4) + +#### 2020.5.14 Release [PaddleOCR Practice Notebook](https://aistudio.baidu.com/aistudio/projectdetail/467229) + +#### 2020.5.14 Open source 8.6M lightweight Chinese OCR model diff --git a/docs/update.md b/docs/update.md new file mode 100644 index 0000000000..0210698685 --- /dev/null +++ b/docs/update.md @@ -0,0 +1,78 @@ +--- +comments: true +--- + +### 更新 + +#### 2022.5.9 发布PaddleOCR v2.5。发布内容包括 + +- [PP-OCRv3](./ppocr_introduction.md#pp-ocrv3),速度可比情况下,中文场景效果相比于PP-OCRv2再提升5%,英文场景提升11%,80语种多语言模型平均识别准确率提升5%以上; +- 半自动标注工具[PPOCRLabelv2](https://github.com/PFCCLab/PPOCRLabel):新增表格文字图像、图像关键信息抽取任务和不规则文字图像的标注功能; +- OCR产业落地工具集:打通22种训练部署软硬件环境与方式,覆盖企业90%的训练部署环境需求 +- 交互式OCR开源电子书[《动手学OCR》](./ocr_book.md),覆盖OCR全栈技术的前沿理论与代码实践,并配套教学视频。 + +#### 2022.5.7 添加对[Weights & Biases](https://docs.wandb.ai/)训练日志记录工具的支持 + +#### 2021.12.21 《OCR十讲》课程开讲,12月21日起每晚八点半线上授课! 
【免费】报名地址: + +#### 2021.12.21 发布PaddleOCR v2.4。OCR算法新增1种文本检测算法(PSENet),3种文本识别算法(NRTR、SEED、SAR);文档结构化算法新增1种关键信息提取算法(SDMGR),3种DocVQA算法(LayoutLM、LayoutLMv2,LayoutXLM) + +#### 2021.9.7 发布PaddleOCR v2.3,发布[PP-OCRv2](#PP-OCRv2),CPU推理速度相比于PP-OCR server提升220%;效果相比于PP-OCR mobile 提升7% + +#### 2021.8.3 发布PaddleOCR v2.2,新增文档结构分析[PP-Structure](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/ppstructure/README_ch.md)工具包,支持版面分析与表格识别(含Excel导出) + +#### 2021.6.29 [FAQ](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/doc/doc_ch/FAQ.md)新增5个高频问题,总数248个,每周一都会更新,欢迎大家持续关注 + +#### 2021.4.8 release 2.1版本,新增AAAI 2021论文[端到端识别算法PGNet](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/doc/doc_ch/pgnet.md)开源,[多语言模型](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.2/doc/doc_ch/multi_languages.md)支持种类增加到80+ + +#### 2020.12.15 更新数据合成工具[Style-Text](https://github.com/PFCCLab/StyleText/blob/main/README_ch.md),可以批量合成大量与目标场景类似的图像,在多个场景验证,效果明显提升 + +#### 2020.12.07 [FAQ](../../doc/doc_ch/FAQ.md)新增5个高频问题,总数124个,并且计划以后每周一都会更新,欢迎大家持续关注 + +#### 2020.11.25 更新半自动标注工具[PPOCRLabel](https://github.com/PFCCLab/PPOCRLabel/blob/main/README_ch.md),辅助开发者高效完成标注任务,输出格式与PP-OCR训练任务完美衔接 + +#### 2020.9.22 更新PP-OCR技术文章, + +#### 2020.9.19 更新超轻量压缩ppocr_mobile_slim系列模型,整体模型3.5M(详见PP-OCR Pipeline),适合在移动端部署使用 + +#### 2020.9.17 更新超轻量ppocr_mobile系列和通用ppocr_server系列中英文ocr模型,媲美商业效果 + +#### 2020.9.17 更新[英文识别模型](./models_list.md#english-recognition-model)和[多语种识别模型](./models_list.md#english-recognition-model),已支持`德语、法语、日语、韩语`,更多语种识别模型将持续更新 + +#### 2020.8.26 更新OCR相关的84个常见问题及解答,具体参考[FAQ](./FAQ.md) + +#### 2020.8.24 支持通过whl包安装使用PaddleOCR,具体参考[Paddleocr Package使用说明](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/doc/doc_ch/whl.md) + +#### 2020.8.21 更新8月18日B站直播课回放和PPT,课节2,易学易用的OCR工具大礼包,[获取地址](https://aistudio.baidu.com/aistudio/education/group/info/1519) + +#### 2020.8.16 开源文本检测算法[SAST](https://arxiv.org/abs/1908.05498)和文本识别算法[SRN](https://arxiv.org/abs/2003.12294) + +#### 2020.7.23 发布7月21日B站直播课回放和PPT,课节1,PaddleOCR开源大礼包全面解读,[获取地址](https://aistudio.baidu.com/aistudio/course/introduce/1519) + +#### 2020.7.15 添加基于EasyEdge和Paddle-Lite的移动端DEMO,支持iOS和Android系统 + +#### 2020.7.15 完善预测部署,添加基于C++预测引擎推理、服务化部署和端侧部署方案,以及超轻量级中文OCR模型预测耗时Benchmark + +#### 2020.7.15 整理OCR相关数据集、常用数据标注以及合成工具 + +#### 2020.7.9 添加支持空格的识别模型,识别效果,预测及训练方式请参考快速开始和文本识别训练相关文档 + +#### 2020.7.9 添加数据增强、学习率衰减策略,具体参考[配置文件](./config.md) + +#### 2020.6.8 添加[数据集](dataset/datasets.md),并保持持续更新 + +#### 2020.6.5 支持 `attetnion` 模型导出 `inference_model` + +#### 2020.6.5 支持单独预测识别时,输出结果得分 + +#### 2020.5.30 提供超轻量级中文OCR在线体验 + +#### 2020.5.30 模型预测、训练支持Windows系统 + +#### 2020.5.30 开源通用中文OCR模型 + +#### 2020.5.14 发布[PaddleOCR公开课](https://www.bilibili.com/video/BV1nf4y1U7RX?p=4) + +#### 2020.5.14 发布[PaddleOCR实战练习](https://aistudio.baidu.com/aistudio/projectdetail/467229) + +#### 2020.5.14 开源8.6M超轻量级中文OCR模型 diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000000..425869d008 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,399 @@ +site_name: PaddleOCR 文档 +site_url: https://paddlepaddle.github.io/PaddleOCR/ +site_author: PaddleOCR PMC +site_description: + Awesome multilingual OCR toolkits based on PaddlePaddle (practical ultra lightweight OCR system, support 80+ languages recognition, provide data annotation and synthesis tools, support training and deployment among server, mobile, embedded and IoT devices) + +repo_name: PaddlePaddle/PaddleOCR +repo_url: https://github.com/PaddlePaddle/PaddleOCR + +copyright: Copyright © 2024 Maintained by PaddleOCR PMC. 
+ +edit_uri: edit/main/docs/ + +theme: + name: material + logo: ./static/images/logo.jpg + favicon: ./static/images/logo.jpg + custom_dir: overrides + features: + - announce.dismiss + - content.tooltips + - content.code.copy + - content.tabs.link + - content.footnote.tooltips + - content.action.edit + - content.action.view + - navigation.expand # 默认打开所有的字节 + - navigation.tabs # 顶级索引被作为tab + - navigation.tabs.sticky # tab始终可见 + - navigation.top # 开启顶部导航栏 + - navigation.tracking # 导航栏跟踪 + - navigation.footer + - navigation.indexes + - search.highlight # 搜索高亮 + - search.share # 搜索分享 + - search.suggest # 搜索建议 + - toc.follow # 目录跟踪-页面右侧的小目录 + + palette: + - media: "(prefers-color-scheme: light)" # 浅色 + scheme: default + primary: indigo + accent: indigo + toggle: + icon: material/brightness-7 + name: Switch to dark mode + - media: "(prefers-color-scheme: dark)" # 深色 + scheme: slate + primary: black + accent: indigo + toggle: + icon: material/brightness-4 + name: Switch to system preference + + icon: + logo: logo + previous: fontawesome/solid/angle-left + next: fontawesome/solid/angle-right + repo: fontawesome/brands/github + edit: material/pencil + view: material/eye + tag: + default-tag: fontawesome/solid/tag + hardware-tag: fontawesome/solid/microchip + software-tag: fontawesome/solid/laptop-code + admonition: + note: octicons/tag-16 + abstract: octicons/checklist-16 + info: octicons/info-16 + tip: octicons/squirrel-16 + success: octicons/check-16 + question: octicons/question-16 + warning: octicons/alert-16 + failure: octicons/x-circle-16 + danger: octicons/zap-16 + bug: octicons/bug-16 + example: octicons/beaker-16 + quote: octicons/quote-16 + +plugins: + - tags + - offline + - search: + separator: '[\s\u200b\-_,:!=\[\: )"`/]+|\.(?!\d)|&[lg]t;|(?!\b)(?=[A-Z][a-z])' + - i18n: + docs_structure: suffix + fallback_to_default: true + reconfigure_material: true + reconfigure_search: true + languages: + - locale: zh + name: 简体中文 + default: true + build: true + - locale: en + name: English + site_name: PaddleOCR Documentation + link: /en/ + nav_translations: + Home: Home + 快速开始: Quick Start + 模型: Model + 概述: Overview + PP-OCR 文本检测识别: PP-OCR + 概述: Overview + 快速开始: Quick Start + 快速安装: Quick Installation + 效果展示: Visualization + 运行环境: Environment Preparation + 模型库: Model + 模型训练: Model Training + 基本概念: Basic concepts + 文本检测: Text Detection + 文本识别: Text Recognition + 文本方向分类器: Text Angle Classification + 关键信息提取: Key Information Extraction + 模型微调: Fine-tune + 模型压缩: Model Compression + 模型量化: Model Quantization + 模型裁剪: Model Prune + 知识蒸馏: Knowledge Distillation + 推理部署: Model Deploy + 概述: Overview + 基于Python预测引擎推理: Python Inference + 基于C++预测引擎推理: CPP Inference + Visual Studio 2019 Community CMake 编译指南: Visual Studio 2019 Community CMake Compilation Guide + 服务化部署: Sever Deployment + Jetson部署: Jetson Deployment + 端侧部署: Device-side Deployment + 网页前端部署: Paddle.js Web Deployment + Paddle2ONNX模型转化与预测: Paddle2ONNX + 云上飞桨部署工具: Paddle Cloud + Benchmark: Benchmark + 博客: Blog + paddleocr package使用说明: Paddleocr Package Instructions + 多语言模型: Multi-language model + 动手学OCR: Dive into OCR + 切片操作: Slice + PaddleOCR模型推理参数解释: PaddleOCR Model Inference Parameter Explanation + 分布式训练: Distributed training + 项目克隆: Project Clone + 配置文件内容与生成: Configuration + 如何生产自定义超轻量模型?: How To Make Your own lightweight OCR model? 
+ PP-Structure文档分析: PP-Structure + 概述: Overview + 快速开始: Quick Start + 模型库: Model + 模型训练: Model Training + 基本概念: Basic concepts + 版面分析: Layout Analysis + 版面恢复: Recovery To Doc + 表格识别: Table Recognition + 关键信息提取: Key Information Extraction + 推理部署: Deploy + 概述: Overview + 基于Python预测引擎推理: Python Inference + 基于C++预测引擎推理: CPP Inference + 服务化部署: Sever Deployment + 博客: Blog + 返回识别位置: Return Recognition Location + 怎样完成基于图像数据的信息抽取任务: Key Information Extraction Pipeline + 前沿算法与模型: Academic Algorithms + 概述: algorithm/overview.md + 文本检测算法: Text Detection Algorithms + DB与DB++: DB and DB++ + 文本识别算法: Text Recognition Algorithms + 文本超分辨率算法: Text Super-Resolution Algorithm + 公式识别算法: Formulat Recognition + 端到端OCR算法: End-to-End OCR Algorithms + 表格识别算法: Table Recognition Algorithms + 关键信息抽取算法: Key Information Extraction Algorithms + 使用PaddleOCR架构添加新算法: Add new algorithms + 场景应用: Applications + 数据标注与合成: Data Annotation and Synthesis + 概述: Overview + 其它数据标注工具: Other data annotation tools + 其它数据合成工具: Others data synthesis tools + 数据集: Datasets + 通用中英文OCR数据集: General Chinese and English OCR dataset + 手写中文OCR数据集: Handwritten Chinese OCR Dataset + 垂类多语言OCR数据集: Vertical multi-language OCR dataset + 版面分析数据集: Layout Analysis Dataset + 表格识别数据集: Table recognition dataset + 关键信息提取数据集: Key Information Extraction Dataset + FAQ: FAQ + 社区: Community + 社区贡献: Community Contribution + 附录: Appendix + - locale: "null" + name: Help translating + build: false + fixed_link: "https://github.com/PaddlePaddle/PaddleOCR/discussions/13374" + - git-committers: + repository: PaddlePaddle/PaddleOCR + branch: main + token: !!python/object/apply:os.getenv ["MKDOCS_GIT_COMMITTERS_APIKEY"] + - git-revision-date-localized: + enable_creation_date: true + +markdown_extensions: + - abbr + - attr_list + - pymdownx.snippets + - pymdownx.critic + - pymdownx.caret + - pymdownx.keys + - pymdownx.mark + - pymdownx.tilde + - footnotes + - def_list + - md_in_html + - pymdownx.tasklist: + custom_checkbox: true + - toc: + permalink: true + - pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format + - pymdownx.emoji: + emoji_index: !!python/name:material.extensions.emoji.twemoji + emoji_generator: !!python/name:material.extensions.emoji.to_svg + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true + - pymdownx.inlinehilite + - pymdownx.tabbed: + alternate_style: true + - admonition + - pymdownx.details + - pymdownx.superfences + - pymdownx.arithmatex: + generic: true + +extra: + social: + - icon: fontawesome/brands/github + link: https://github.com/PaddlePaddle/PaddleOCR + - icon: fontawesome/brands/python + link: https://pypi.org/project/paddleocr/ + +extra_javascript: + - javascripts/katex.min.js + - https://unpkg.com/katex@0/dist/katex.min.js + - https://unpkg.com/katex@0/dist/contrib/auto-render.min.js + +extra_css: + - https://unpkg.com/katex@0/dist/katex.min.css + + +nav: + - Home: index.md + - 快速开始: quick_start.md + - 模型: + - 概览: model/index.md + - 多硬件安装飞桨: + - 多硬件安装飞桨: model/hardware/install_other_devices.md + - 支持硬件列表: model/hardware/supported_models.md + - PP-OCR 文本检测识别: + - 概述: ppocr/overview.md + - 快速开始: ppocr/quick_start.md + - 快速安装: ppocr/installation.md + - 效果展示: ppocr/visualization.md + - 运行环境: ppocr/environment.md + - 模型库: ppocr/model_list.md + - 模型训练: + - 基本概念: ppocr/model_train/training.md + - 文本检测: ppocr/model_train/detection.md + - 文本识别: ppocr/model_train/recognition.md + - 文本方向分类器: ppocr/model_train/angle_class.md + - 
关键信息提取: ppocr/model_train/kie.md + - 模型微调: ppocr/model_train/finetune.md + - 模型压缩: + - 模型量化: ppocr/model_compress/quantization.md + - 模型裁剪: ppocr/model_compress/prune.md + - 知识蒸馏: ppocr/model_compress/knowledge_distillation.md + - 推理部署: + - 概述: ppocr/infer_deploy/index.md + - 基于Python预测引擎推理: ppocr/infer_deploy/python_infer.md + - 基于C++预测引擎推理: ppocr/infer_deploy/cpp_infer.md + - Visual Studio 2019 Community CMake 编译指南: ppocr/infer_deploy/windows_vs2019_build.md + - 服务化部署: ppocr/infer_deploy/paddle_server.md + - Jetson部署: ppocr/infer_deploy/Jetson_infer.md + - 端侧部署: ppocr/infer_deploy/lite.md + - 网页前端部署: ppocr/infer_deploy/paddle_js.md + - Paddle2ONNX模型转化与预测: ppocr/infer_deploy/paddle2onnx.md + - 云上飞桨部署工具: ppocr/infer_deploy/paddle_cloud.md + - Benchmark: ppocr/infer_deploy/benchmark.md + - 博客: + - PP-OCRv3技术报告: ppocr/blog/PP-OCRv3_introduction.md + - PP-OCRv4技术报告: ppocr/blog/PP-OCRv4_introduction.md + - paddleocr package使用说明: ppocr/blog/whl.md + - 多语言模型: ppocr/blog/multi_languages.md + - 动手学OCR: ppocr/blog/ocr_book.md + - Enhanced CTC Loss: ppocr/blog/enhanced_ctc_loss.md + - 切片操作: ppocr/blog/slice.md + - PaddleOCR模型推理参数解释: ppocr/blog/inference_args.md + - 分布式训练: ppocr/blog/distributed_training.md + - 项目克隆: ppocr/blog/clone.md + - 配置文件内容与生成: ppocr/blog/config.md + - 如何生产自定义超轻量模型?: ppocr/blog/customize.md + - PP-Structure文档分析: + - 概述: ppstructure/overview.md + - 快速开始: ppstructure/quick_start.md + - 模型库: ppstructure/models_list.md + - 模型训练: + - 基本概念: ppstructure/model_train/training.md + - 版面分析: ppstructure/model_train/train_layout.md + - 表格识别: ppstructure/model_train/train_table.md + - 版面恢复: ppstructure/model_train/recovery_to_doc.md + - 关键信息提取: ppstructure/model_train/train_kie.md + - 推理部署: + - 概述: ppstructure/infer_deploy/index.md + - 基于Python预测引擎推理: ppstructure/infer_deploy/python_infer.md + - 基于C++预测引擎推理: ppstructure/infer_deploy/cpp_infer.md + - 服务化部署: ppstructure/infer_deploy/paddle_server.md + - 博客: + - 返回识别位置: ppstructure/blog/return_word_pos.md + - 怎样完成基于图像数据的信息抽取任务: ppstructure/blog/how_to_do_kie.md + - 前沿算法与模型: + - 概述: algorithm/overview.md + - 文本检测算法: + - DB与DB++: algorithm/text_detection/algorithm_det_db.md + - EAST: algorithm/text_detection/algorithm_det_east.md + - SAST: algorithm/text_detection/algorithm_det_sast.md + - PSENet: algorithm/text_detection/algorithm_det_psenet.md + - FCENet: algorithm/text_detection/algorithm_det_fcenet.md + - DRRG: algorithm/text_detection/algorithm_det_drrg.md + - CT: algorithm/text_detection/algorithm_det_ct.md + - 文本识别算法: + - CRNN: algorithm/text_recognition/algorithm_rec_crnn.md + - Rosetta: algorithm/text_recognition/algorithm_rec_rosetta.md + - STAR-Net: algorithm/text_recognition/algorithm_rec_starnet.md + - RARE: algorithm/text_recognition/algorithm_rec_rare.md + - SRN: algorithm/text_recognition/algorithm_rec_srn.md + - NRTR: algorithm/text_recognition/algorithm_rec_nrtr.md + - SAR: algorithm/text_recognition/algorithm_rec_sar.md + - SEED: algorithm/text_recognition/algorithm_rec_seed.md + - SVTR: algorithm/text_recognition/algorithm_rec_svtr.md + - ViTSTR: algorithm/text_recognition/algorithm_rec_vitstr.md + - ABINet: algorithm/text_recognition/algorithm_rec_abinet.md + - VisionLAN: algorithm/text_recognition/algorithm_rec_visionlan.md + - SPIN: algorithm/text_recognition/algorithm_rec_spin.md + - RobustScanner: algorithm/text_recognition/algorithm_rec_robustscanner.md + - RFL: algorithm/text_recognition/algorithm_rec_rfl.md + - ParseQ: algorithm/text_recognition/algorithm_rec_parseq.md + - CPPD: 
algorithm/text_recognition/algorithm_rec_cppd.md + - SATRN: algorithm/text_recognition/algorithm_rec_satrn.md + - 文本超分辨率算法: + - Text Gestalt: algorithm/super_resolution/algorithm_sr_gestalt.md + - Text Telescope: algorithm/super_resolution/algorithm_sr_telescope.md + - 公式识别算法: + - CAN: algorithm/formula_recognition/algorithm_rec_can.md + - 端到端OCR算法: + - PGNet: algorithm/end_to_end/algorithm_e2e_pgnet.md + - 表格识别算法: + - TableMaster: algorithm/table_recognition/algorithm_table_master.md + - TableSLANet: algorithm/table_recognition/algorithm_table_slanet.md + - 关键信息抽取算法: + - VI-LayoutXLM: algorithm/kie/algorithm_kie_vi_layoutxlm.md + - LayoutLM: algorithm/kie/algorithm_kie_layoutxlm.md + # - LayoutLMv2: algorithm/kie/algorithm_kie_layoutxlm.md + # - LayoutXLM: algorithm/kie/algorithm_kie_layoutxlm.md + - SDMGR: algorithm/kie/./algorithm_kie_sdmgr.md + - 使用PaddleOCR架构添加新算法: algorithm/add_new_algorithm.md + - 场景应用: + - 通用: + - 高精度中文场景文本识别模型SVTR: applications/高精度中文识别模型.md + - 手写体识别: applications/手写文字识别.md + - 制造: + - 数码管识别: applications/光功率计数码管字符识别.md + - 液晶屏读数识别: applications/液晶屏读数识别.md + - 包装生产日期: applications/包装生产日期识别.md + - PCB文字识别: applications/PCB字符识别.md + - 金融: + - 表单VQA: applications/多模态表单识别.md + - 增值税发票: applications/发票关键信息抽取.md + - 印章检测与识别: applications/印章弯曲文字识别.md + - 通用卡证识别: applications/快速构建卡证类OCR.md + - 合同比对: applications/扫描合同关键信息提取.md + - 交通: + - 车牌识别: applications/轻量级车牌识别.md + - 数据标注与合成: + - 概述: data_anno_synth/overview.md + - 其它数据标注工具: data_anno_synth/data_annotation.md + - 其它数据合成工具: data_anno_synth/data_synthesis.md + - 数据集: + - 通用中英文OCR数据集: datasets/datasets.md + - 手写中文OCR数据集: datasets/handwritten_datasets.md + - 垂类多语言OCR数据集: datasets/vertical_and_multilingual_datasets.md + - 版面分析数据集: datasets/layout_datasets.md + - 表格识别数据集: datasets/table_datasets.md + - 关键信息提取数据集: datasets/kie_datasets.md + - FAQ: FAQ.md + - 社区: + - 社区贡献: community/community_contribution.md + - 附录: community/code_and_doc.md diff --git a/overrides/partials/comments.html b/overrides/partials/comments.html new file mode 100644 index 0000000000..89071941a6 --- /dev/null +++ b/overrides/partials/comments.html @@ -0,0 +1,46 @@ +{% if page.meta.comments %} +

+  <h2 id="__comments">{{ lang.t("meta.comments") }}</h2>
+ + + + + + +{% endif %} diff --git a/paddleocr.py b/paddleocr.py index d69c8854c1..fa7ab72e71 100644 --- a/paddleocr.py +++ b/paddleocr.py @@ -18,12 +18,12 @@ __dir__ = os.path.dirname(__file__) -import paddle from paddle.utils import try_import sys.path.append(os.path.join(__dir__, "")) import cv2 +from copy import deepcopy import logging import numpy as np from pathlib import Path @@ -31,7 +31,6 @@ from io import BytesIO import pprint from PIL import Image -from tools.infer import predict_system def _import_file(module_name, file_path, make_importable=False): @@ -50,7 +49,6 @@ def _import_file(module_name, file_path, make_importable=False): ppstructure = importlib.import_module("ppstructure", "paddleocr") from ppocr.utils.logging import get_logger -logger = get_logger() from ppocr.utils.utility import ( check_and_read, get_image_file_list, @@ -63,11 +61,14 @@ def _import_file(module_name, file_path, make_importable=False): is_link, confirm_model_dir_url, ) +from tools.infer import predict_system from tools.infer.utility import draw_ocr, str2bool, check_gpu from ppstructure.utility import init_args, draw_structure_result from ppstructure.predict_system import StructureSystem, save_structure_res, to_excel +from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx logger = get_logger() + __all__ = [ "PaddleOCR", "PPStructure", @@ -76,6 +77,8 @@ def _import_file(module_name, file_path, make_importable=False): "save_structure_res", "download_with_progressbar", "to_excel", + "sorted_layout_boxes", + "convert_info_docx", ] SUPPORT_DET_MODEL = ["DB"] @@ -685,15 +688,30 @@ def ocr( """ OCR with PaddleOCR - args: - img: img for OCR, support ndarray, img_path and list or ndarray - det: use text detection or not. If False, only rec will be exec. Default is True - rec: use text recognition or not. If False, only det will be exec. Default is True - cls: use angle classifier or not. Default is True. If True, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False. - bin: binarize image to black and white. Default is False. - inv: invert image colors. Default is False. - alpha_color: set RGB color Tuple for transparent parts replacement. Default is pure white. - slice: use sliding window inference for large images, det and rec must be True. Requires int values for slice["horizontal_stride"], slice["vertical_stride"], slice["merge_x_thres"], slice["merge_y_thres] (See doc/doc_en/slice_en.md). Default is {}. + Args: + img: Image for OCR. It can be an ndarray, img_path, or a list of ndarrays. + det: Use text detection or not. If False, only text recognition will be executed. Default is True. + rec: Use text recognition or not. If False, only text detection will be executed. Default is True. + cls: Use angle classifier or not. Default is True. If True, the text with a rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. + bin: Binarize image to black and white. Default is False. + inv: Invert image colors. Default is False. + alpha_color: Set RGB color Tuple for transparent parts replacement. Default is pure white. + slice: Use sliding window inference for large images. Both det and rec must be True. Requires int values for slice["horizontal_stride"], slice["vertical_stride"], slice["merge_x_thres"], slice["merge_y_thres"] (See doc/doc_en/slice_en.md). 
Default is {}. + + Returns: + If both det and rec are True, returns a list of OCR results for each image. Each OCR result is a list of bounding boxes and recognized text for each detected text region. + If det is True and rec is False, returns a list of detected bounding boxes for each image. + If det is False and rec is True, returns a list of recognized text for each image. + If both det and rec are False, returns a list of angle classification results for each image. + + Raises: + AssertionError: If the input image is not of type ndarray, list, str, or bytes. + SystemExit: If det is True and the input is a list of images. + + Note: + - If the angle classifier is not initialized (use_angle_cls=False), it will not be used during the forward process. + - For PDF files, if the input is a list of images and the page_num is specified, only the first page_num images will be processed. + - The preprocess_image function is used to preprocess the input image by applying alpha color replacement, inversion, and binarization if specified. """ assert isinstance(img, (np.ndarray, list, str, bytes)) if isinstance(img, list) and det == True: @@ -763,7 +781,21 @@ def preprocess_image(_image): class PPStructure(StructureSystem): + """ + PPStructure class represents the structure analysis system for PaddleOCR. + """ + def __init__(self, **kwargs): + """ + Initializes the PPStructure object with the given parameters. + + Args: + **kwargs: Additional keyword arguments to customize the behavior of the structure analysis system. + + Raises: + AssertionError: If the structure version is not supported. + + """ params = parse_args(mMain=False) params.__dict__.update(**kwargs) assert ( @@ -842,6 +874,19 @@ def __call__( img_idx=0, alpha_color=(255, 255, 255), ): + """ + Performs structure analysis on the input image. + + Args: + img (str or numpy.ndarray): The input image to perform structure analysis on. + return_ocr_result_in_table (bool, optional): Whether to return OCR results in table format. Defaults to False. + img_idx (int, optional): The index of the image. Defaults to 0. + alpha_color (tuple, optional): The alpha color for transparent images. Defaults to (255, 255, 255). + + Returns: + list or dict: The structure analysis results. + + """ img, flag_gif, flag_pdf = check_img(img, alpha_color) if isinstance(img, list) and flag_pdf: res_list = [] @@ -857,6 +902,17 @@ def __call__( def main(): + """ + Main function for running PaddleOCR or PPStructure. + + This function takes command line arguments, processes the images, and performs OCR or structure analysis based on the specified type. 
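As a companion to the `ocr()` docstring above, a minimal call sketch (the image path is a placeholder; `use_angle_cls=True` enables the 180-degree classifier described by the `cls` argument):

```python
# Minimal usage sketch for the PaddleOCR.ocr interface documented above.
# "doc.png" is a placeholder image path.
from paddleocr import PaddleOCR

engine = PaddleOCR(use_angle_cls=True, lang="en")
result = engine.ocr("doc.png", det=True, rec=True, cls=True)

# With det=True and rec=True, the result holds one entry per image/PDF page;
# each entry is a list of [bounding_box, (text, confidence)] items.
for page in result:
    for box, (text, score) in page:
        print(box, text, score)
```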
+ + Args: + None + + Returns: + None + """ # for cmd args = parse_args(mMain=True) image_dir = args.image_dir @@ -939,9 +995,6 @@ def main(): save_structure_res(result, args.output, img_name, index) if args.recovery and result != []: - from copy import deepcopy - from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes - h, w, _ = img.shape result_cp = deepcopy(result) result_sorted = sorted_layout_boxes(result_cp, w) @@ -949,8 +1002,6 @@ def main(): if args.recovery and all_res != []: try: - from ppstructure.recovery.recovery_to_doc import convert_info_docx - convert_info_docx(img, all_res, args.output, img_name) except Exception as ex: logger.error( diff --git a/ppocr/data/__init__.py b/ppocr/data/__init__.py index 27d74c89d8..5678aebec1 100644 --- a/ppocr/data/__init__.py +++ b/ppocr/data/__init__.py @@ -38,6 +38,7 @@ from ppocr.data.pgnet_dataset import PGDataSet from ppocr.data.pubtab_dataset import PubTabDataSet from ppocr.data.multi_scale_sampler import MultiScaleSampler +from ppocr.data.latexocr_dataset import LaTeXOCRDataSet # for PaddleX dataset_type TextDetDataset = SimpleDataSet @@ -45,6 +46,7 @@ MSTextRecDataset = MultiScaleDataSet PubTabTableRecDataset = PubTabDataSet KieDataset = SimpleDataSet +LaTeXOCRDataSet = LaTeXOCRDataSet __all__ = ["build_dataloader", "transform", "create_operators", "set_signal_handlers"] @@ -94,6 +96,7 @@ def build_dataloader(config, mode, device, logger, seed=None): "MSTextRecDataset", "PubTabTableRecDataset", "KieDataset", + "LaTeXOCRDataSet", ] module_name = config[mode]["dataset"]["name"] assert module_name in support_dict, Exception( diff --git a/ppocr/data/collate_fn.py b/ppocr/data/collate_fn.py index f1f317510b..29bb3f1aa4 100644 --- a/ppocr/data/collate_fn.py +++ b/ppocr/data/collate_fn.py @@ -116,3 +116,18 @@ def __call__(self, batch): label_masks[i][:l] = 1 return images, image_masks, labels, label_masks + + +class LaTeXOCRCollator(object): + """ + batch: [ + image [batch_size, channel, maxHinbatch, maxWinbatch] + label [batch_size, maxLabelLen] + label_mask [batch_size, maxLabelLen] + ... 
+ ] + """ + + def __call__(self, batch): + images, labels, attention_mask = batch[0] + return images, labels, attention_mask diff --git a/ppocr/data/imaug/__init__.py b/ppocr/data/imaug/__init__.py index 350887933b..d76a15555d 100644 --- a/ppocr/data/imaug/__init__.py +++ b/ppocr/data/imaug/__init__.py @@ -61,6 +61,7 @@ from .fce_targets import FCENetTargets from .ct_process import * from .drrg_targets import DRRGTargets +from .latex_ocr_aug import * def transform(data, ops=None): diff --git a/ppocr/data/imaug/label_ops.py b/ppocr/data/imaug/label_ops.py index 46cabaed8c..7d4afec4e3 100644 --- a/ppocr/data/imaug/label_ops.py +++ b/ppocr/data/imaug/label_ops.py @@ -25,6 +25,7 @@ import copy import random from random import sample +from collections import defaultdict from ppocr.utils.logging import get_logger from ppocr.data.imaug.vqa.augment import order_by_tbyx @@ -1770,3 +1771,108 @@ def encodech(self, text): if len(text_list) == 0: return None, None, None return text_list, text_node_index, text_node_num + + +class LatexOCRLabelEncode(object): + def __init__( + self, + rec_char_dict_path, + **kwargs, + ): + from tokenizers import Tokenizer as TokenizerFast + + self.tokenizer = TokenizerFast.from_file(rec_char_dict_path) + self.model_input_names = ["input_ids", "token_type_ids", "attention_mask"] + self.pad_token_id = 0 + self.bos_token_id = 1 + self.eos_token_id = 2 + + def _convert_encoding( + self, + encoding, + return_token_type_ids=None, + return_attention_mask=None, + return_overflowing_tokens=False, + return_special_tokens_mask=False, + return_offsets_mapping=False, + return_length=False, + verbose=True, + ): + + if return_token_type_ids is None: + return_token_type_ids = "token_type_ids" in self.model_input_names + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + if return_overflowing_tokens and encoding.overflowing is not None: + encodings = [encoding] + encoding.overflowing + else: + encodings = [encoding] + + encoding_dict = defaultdict(list) + for e in encodings: + encoding_dict["input_ids"].append(e.ids) + + if return_token_type_ids: + encoding_dict["token_type_ids"].append(e.type_ids) + if return_attention_mask: + encoding_dict["attention_mask"].append(e.attention_mask) + if return_special_tokens_mask: + encoding_dict["special_tokens_mask"].append(e.special_tokens_mask) + if return_offsets_mapping: + encoding_dict["offset_mapping"].append(e.offsets) + if return_length: + encoding_dict["length"].append(len(e.ids)) + + return encoding_dict, encodings + + def encode( + self, + text, + text_pair=None, + return_token_type_ids=False, + add_special_tokens=True, + is_split_into_words=False, + ): + batched_input = text + encodings = self.tokenizer.encode_batch( + batched_input, + add_special_tokens=add_special_tokens, + is_pretokenized=is_split_into_words, + ) + tokens_and_encodings = [ + self._convert_encoding( + encoding=encoding, + return_token_type_ids=False, + return_attention_mask=None, + return_overflowing_tokens=False, + return_special_tokens_mask=False, + return_offsets_mapping=False, + return_length=False, + verbose=True, + ) + for encoding in encodings + ] + sanitized_tokens = {} + for key in tokens_and_encodings[0][0].keys(): + stack = [e for item, _ in tokens_and_encodings for e in item[key]] + sanitized_tokens[key] = stack + return sanitized_tokens + + def __call__(self, eqs): + topk = self.encode(eqs) + for k, p in zip(topk, [[self.bos_token_id, self.eos_token_id], [1, 1]]): + process_seq = [[p[0]] + x + [p[1]] 
for x in topk[k]] + max_length = 0 + for seq in process_seq: + max_length = max(max_length, len(seq)) + labels = np.zeros((len(process_seq), max_length), dtype="int64") + for idx, seq in enumerate(process_seq): + l = len(seq) + labels[idx][:l] = seq + topk[k] = labels + return ( + np.array(topk["input_ids"]).astype(np.int64), + np.array(topk["attention_mask"]).astype(np.int64), + max_length, + ) diff --git a/ppocr/data/imaug/latex_ocr_aug.py b/ppocr/data/imaug/latex_ocr_aug.py new file mode 100644 index 0000000000..db787f3459 --- /dev/null +++ b/ppocr/data/imaug/latex_ocr_aug.py @@ -0,0 +1,179 @@ +# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This code is refer from: +https://github.com/lukas-blecher/LaTeX-OCR/blob/main/pix2tex/dataset/transforms.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import math +import cv2 +import numpy as np +import albumentations as A +from PIL import Image + + +class LatexTrainTransform: + def __init__(self, bitmap_prob=0.04, **kwargs): + # your init code + self.bitmap_prob = bitmap_prob + self.train_transform = A.Compose( + [ + A.Compose( + [ + A.ShiftScaleRotate( + shift_limit=0, + scale_limit=(-0.15, 0), + rotate_limit=1, + border_mode=0, + interpolation=3, + value=[255, 255, 255], + p=1, + ), + A.GridDistortion( + distort_limit=0.1, + border_mode=0, + interpolation=3, + value=[255, 255, 255], + p=0.5, + ), + ], + p=0.15, + ), + A.RGBShift(r_shift_limit=15, g_shift_limit=15, b_shift_limit=15, p=0.3), + A.GaussNoise(10, p=0.2), + A.RandomBrightnessContrast(0.05, (-0.2, 0), True, p=0.2), + A.ImageCompression(95, p=0.3), + A.ToGray(always_apply=True), + ] + ) + + def __call__(self, data): + img = data["image"] + if np.random.random() < self.bitmap_prob: + img[img != 255] = 0 + img = self.train_transform(image=img)["image"] + data["image"] = img + return data + + +class LatexTestTransform: + def __init__(self, **kwargs): + # your init code + self.test_transform = A.Compose( + [ + A.ToGray(always_apply=True), + ] + ) + + def __call__(self, data): + img = data["image"] + img = self.test_transform(image=img)["image"] + data["image"] = img + return data + + +class MinMaxResize: + def __init__(self, min_dimensions=[32, 32], max_dimensions=[672, 192], **kwargs): + # your init code + self.min_dimensions = min_dimensions + self.max_dimensions = max_dimensions + # pass + + def pad_(self, img, divable=32): + threshold = 128 + data = np.array(img.convert("LA")) + if data[..., -1].var() == 0: + data = (data[..., 0]).astype(np.uint8) + else: + data = (255 - data[..., -1]).astype(np.uint8) + data = (data - data.min()) / (data.max() - data.min()) * 255 + if data.mean() > threshold: + # To invert the text to white + gray = 255 * (data < threshold).astype(np.uint8) + else: + gray = 255 * (data > threshold).astype(np.uint8) + data = 255 - data + + coords = cv2.findNonZero(gray) # Find all 
non-zero points (text) + a, b, w, h = cv2.boundingRect(coords) # Find minimum spanning bounding box + rect = data[b : b + h, a : a + w] + im = Image.fromarray(rect).convert("L") + dims = [] + for x in [w, h]: + div, mod = divmod(x, divable) + dims.append(divable * (div + (1 if mod > 0 else 0))) + padded = Image.new("L", dims, 255) + padded.paste(im, (0, 0, im.size[0], im.size[1])) + return padded + + def minmax_size_(self, img, max_dimensions, min_dimensions): + if max_dimensions is not None: + ratios = [a / b for a, b in zip(img.size, max_dimensions)] + if any([r > 1 for r in ratios]): + size = np.array(img.size) // max(ratios) + img = img.resize(tuple(size.astype(int)), Image.BILINEAR) + if min_dimensions is not None: + # hypothesis: there is a dim in img smaller than min_dimensions, and return a proper dim >= min_dimensions + padded_size = [ + max(img_dim, min_dim) + for img_dim, min_dim in zip(img.size, min_dimensions) + ] + if padded_size != list(img.size): # assert hypothesis + padded_im = Image.new("L", padded_size, 255) + padded_im.paste(img, img.getbbox()) + img = padded_im + return img + + def __call__(self, data): + img = data["image"] + h, w = img.shape[:2] + if ( + self.min_dimensions[0] <= w <= self.max_dimensions[0] + and self.min_dimensions[1] <= h <= self.max_dimensions[1] + ): + return data + else: + im = Image.fromarray(np.uint8(img)) + im = self.minmax_size_( + self.pad_(im), self.max_dimensions, self.min_dimensions + ) + im = np.array(im) + im = np.dstack((im, im, im)) + data["image"] = im + return data + + +class LatexImageFormat: + def __init__(self, **kwargs): + # your init code + pass + + def __call__(self, data): + img = data["image"] + im_h, im_w = img.shape[:2] + divide_h = math.ceil(im_h / 16) * 16 + divide_w = math.ceil(im_w / 16) * 16 + img = img[:, :, 0] + img = np.pad( + img, ((0, divide_h - im_h), (0, divide_w - im_w)), constant_values=(1, 1) + ) + img_expanded = img[:, :, np.newaxis].transpose(2, 0, 1) + data["image"] = img_expanded + return data diff --git a/ppocr/data/latexocr_dataset.py b/ppocr/data/latexocr_dataset.py new file mode 100644 index 0000000000..a1a747f040 --- /dev/null +++ b/ppocr/data/latexocr_dataset.py @@ -0,0 +1,172 @@ +# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
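The `LatexImageFormat` transform above rounds the image height and width up to the next multiple of 16 before adding the channel axis; a small standalone check of that arithmetic (the 37x150 input size is an arbitrary example):

```python
# Illustrative check of the round-up-to-multiple-of-16 padding performed by
# LatexImageFormat above; the 37x150 input size is an arbitrary example.
import math
import numpy as np

im_h, im_w = 37, 150
divide_h = math.ceil(im_h / 16) * 16  # 48
divide_w = math.ceil(im_w / 16) * 16  # 160

img = np.ones((im_h, im_w), dtype=np.float32)
padded = np.pad(img, ((0, divide_h - im_h), (0, divide_w - im_w)), constant_values=(1, 1))
img_expanded = padded[:, :, np.newaxis].transpose(2, 0, 1)  # add channel axis

assert padded.shape == (48, 160)
assert img_expanded.shape == (1, 48, 160)
```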
+ +""" +This code is refer from: +https://github.com/lukas-blecher/LaTeX-OCR/blob/main/pix2tex/dataset/dataset.py +""" + +import numpy as np +import cv2 +import math +import os +import json +import pickle +import random +import traceback +import paddle +from paddle.io import Dataset +from .imaug.label_ops import LatexOCRLabelEncode +from .imaug import transform, create_operators + + +class LaTeXOCRDataSet(Dataset): + def __init__(self, config, mode, logger, seed=None): + super(LaTeXOCRDataSet, self).__init__() + self.logger = logger + self.mode = mode.lower() + + global_config = config["Global"] + dataset_config = config[mode]["dataset"] + loader_config = config[mode]["loader"] + + pkl_path = dataset_config.pop("data") + self.min_dimensions = dataset_config.pop("min_dimensions") + self.max_dimensions = dataset_config.pop("max_dimensions") + self.batchsize = dataset_config.pop("batch_size_per_pair") + self.keep_smaller_batches = dataset_config.pop("keep_smaller_batches") + self.max_seq_len = global_config.pop("max_seq_len") + self.rec_char_dict_path = global_config.pop("rec_char_dict_path") + self.tokenizer = LatexOCRLabelEncode(self.rec_char_dict_path) + + file = open(pkl_path, "rb") + data = pickle.load(file) + temp = {} + for k in data: + if ( + self.min_dimensions[0] <= k[0] <= self.max_dimensions[0] + and self.min_dimensions[1] <= k[1] <= self.max_dimensions[1] + ): + temp[k] = data[k] + self.data = temp + self.do_shuffle = loader_config["shuffle"] + self.seed = seed + + if self.mode == "train" and self.do_shuffle: + random.seed(self.seed) + self.pairs = [] + for k in self.data: + info = np.array(self.data[k], dtype=object) + p = ( + paddle.randperm(len(info)) + if self.mode == "train" and self.do_shuffle + else paddle.arange(len(info)) + ) + for i in range(0, len(info), self.batchsize): + batch = info[p[i : i + self.batchsize]] + if len(batch.shape) == 1: + batch = batch[None, :] + if len(batch) < self.batchsize and not self.keep_smaller_batches: + continue + self.pairs.append(batch) + if self.do_shuffle: + self.pairs = np.random.permutation(np.array(self.pairs, dtype=object)) + else: + self.pairs = np.array(self.pairs, dtype=object) + + self.size = len(self.pairs) + self.set_epoch_as_seed(self.seed, dataset_config) + + self.ops = create_operators(dataset_config["transforms"], global_config) + self.ext_op_transform_idx = dataset_config.get("ext_op_transform_idx", 2) + self.need_reset = True + + def set_epoch_as_seed(self, seed, dataset_config): + if self.mode == "train": + try: + border_map_id = [ + index + for index, dictionary in enumerate(dataset_config["transforms"]) + if "MakeBorderMap" in dictionary + ][0] + shrink_map_id = [ + index + for index, dictionary in enumerate(dataset_config["transforms"]) + if "MakeShrinkMap" in dictionary + ][0] + dataset_config["transforms"][border_map_id]["MakeBorderMap"][ + "epoch" + ] = (seed if seed is not None else 0) + dataset_config["transforms"][shrink_map_id]["MakeShrinkMap"][ + "epoch" + ] = (seed if seed is not None else 0) + except Exception as E: + print(E) + return + + def shuffle_data_random(self): + random.seed(self.seed) + random.shuffle(self.data_lines) + return + + def __getitem__(self, idx): + batch = self.pairs[idx] + eqs, ims = batch.T + try: + max_width, max_height, max_length = 0, 0, 0 + + images_transform = [] + + for img_path in ims: + data = { + "img_path": img_path, + } + with open(data["img_path"], "rb") as f: + img = f.read() + data["image"] = img + item = transform(data, self.ops) + 
images_transform.append(np.array(item[0])) + image_concat = np.concatenate(images_transform, axis=0)[:, np.newaxis, :, :] + images_transform = image_concat.astype(np.float32) + labels, attention_mask, max_length = self.tokenizer(list(eqs)) + if self.max_seq_len < max_length: + rnd_idx = ( + np.random.randint(self.__len__()) + if self.mode == "train" + else (idx + 1) % self.__len__() + ) + return self.__getitem__(rnd_idx) + return (images_transform, labels, attention_mask) + + except: + + self.logger.error( + "When parsing line {}, error happened with msg: {}".format( + data["img_path"], traceback.format_exc() + ) + ) + outs = None + + if outs is None: + # during evaluation, we should fix the idx to get same results for many times of evaluation. + rnd_idx = ( + np.random.randint(self.__len__()) + if self.mode == "train" + else (idx + 1) % self.__len__() + ) + return self.__getitem__(rnd_idx) + return outs + + def __len__(self): + return self.size diff --git a/ppocr/losses/__init__.py b/ppocr/losses/__init__.py index ed66e9837a..915a28d165 100644 --- a/ppocr/losses/__init__.py +++ b/ppocr/losses/__init__.py @@ -45,6 +45,7 @@ from .rec_nrtr_loss import NRTRLoss from .rec_parseq_loss import ParseQLoss from .rec_cppd_loss import CPPDLoss +from .rec_latexocr_loss import LaTeXOCRLoss # cls loss from .cls_loss import ClsLoss @@ -107,6 +108,7 @@ def build_loss(config): "NRTRLoss", "ParseQLoss", "CPPDLoss", + "LaTeXOCRLoss", ] config = copy.deepcopy(config) module_name = config.pop("name") diff --git a/ppocr/losses/rec_latexocr_loss.py b/ppocr/losses/rec_latexocr_loss.py new file mode 100644 index 0000000000..d209c04200 --- /dev/null +++ b/ppocr/losses/rec_latexocr_loss.py @@ -0,0 +1,47 @@ +# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This code is refer from: +https://github.com/lucidrains/x-transformers/blob/main/x_transformers/autoregressive_wrapper.py +""" + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + + +class LaTeXOCRLoss(nn.Layer): + """ + LaTeXOCR adopt CrossEntropyLoss for network training. 
+ """ + + def __init__(self): + super(LaTeXOCRLoss, self).__init__() + self.ignore_index = -100 + self.cross = nn.CrossEntropyLoss( + reduction="mean", ignore_index=self.ignore_index + ) + + def forward(self, preds, batch): + word_probs = preds + labels = batch[1][:, 1:] + word_loss = self.cross( + paddle.reshape(word_probs, [-1, word_probs.shape[-1]]), + paddle.reshape(labels, [-1]), + ) + + loss = word_loss + return {"loss": loss} diff --git a/ppocr/metrics/__init__.py b/ppocr/metrics/__init__.py index 9ab515fcb7..dd28d73538 100644 --- a/ppocr/metrics/__init__.py +++ b/ppocr/metrics/__init__.py @@ -22,7 +22,7 @@ __all__ = ["build_metric"] from .det_metric import DetMetric, DetFCEMetric -from .rec_metric import RecMetric, CNTMetric, CANMetric +from .rec_metric import RecMetric, CNTMetric, CANMetric, LaTeXOCRMetric from .cls_metric import ClsMetric from .e2e_metric import E2EMetric from .distillation_metric import DistillationMetric @@ -50,6 +50,7 @@ def build_metric(config): "CTMetric", "CNTMetric", "CANMetric", + "LaTeXOCRMetric", ] config = copy.deepcopy(config) diff --git a/ppocr/metrics/bleu.py b/ppocr/metrics/bleu.py new file mode 100644 index 0000000000..672e7b4c03 --- /dev/null +++ b/ppocr/metrics/bleu.py @@ -0,0 +1,240 @@ +# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This code is refer from: +https://github.com/tensorflow/nmt/blob/master/nmt/scripts/bleu.py +""" + +import re +import math +import collections +from functools import lru_cache + + +def _get_ngrams(segment, max_order): + """Extracts all n-grams upto a given maximum order from an input segment. + + Args: + segment: text segment from which n-grams will be extracted. + max_order: maximum length in tokens of the n-grams returned by this + methods. + + Returns: + The Counter containing all n-grams upto max_order in segment + with a count of how many times each n-gram occurred. + """ + ngram_counts = collections.Counter() + for order in range(1, max_order + 1): + for i in range(0, len(segment) - order + 1): + ngram = tuple(segment[i : i + order]) + ngram_counts[ngram] += 1 + return ngram_counts + + +def compute_bleu(reference_corpus, translation_corpus, max_order=4, smooth=False): + """Computes BLEU score of translated segments against one or more references. + + Args: + reference_corpus: list of lists of references for each translation. Each + reference should be tokenized into a list of tokens. + translation_corpus: list of translations to score. Each translation + should be tokenized into a list of tokens. + max_order: Maximum n-gram order to use when computing BLEU score. + smooth: Whether or not to apply Lin et al. 2004 smoothing. + + Returns: + 3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram + precisions and brevity penalty. 
+ """ + matches_by_order = [0] * max_order + possible_matches_by_order = [0] * max_order + reference_length = 0 + translation_length = 0 + for references, translation in zip(reference_corpus, translation_corpus): + reference_length += min(len(r) for r in references) + translation_length += len(translation) + + merged_ref_ngram_counts = collections.Counter() + for reference in references: + merged_ref_ngram_counts |= _get_ngrams(reference, max_order) + translation_ngram_counts = _get_ngrams(translation, max_order) + overlap = translation_ngram_counts & merged_ref_ngram_counts + for ngram in overlap: + matches_by_order[len(ngram) - 1] += overlap[ngram] + for order in range(1, max_order + 1): + possible_matches = len(translation) - order + 1 + if possible_matches > 0: + possible_matches_by_order[order - 1] += possible_matches + + precisions = [0] * max_order + for i in range(0, max_order): + if smooth: + precisions[i] = (matches_by_order[i] + 1.0) / ( + possible_matches_by_order[i] + 1.0 + ) + else: + if possible_matches_by_order[i] > 0: + precisions[i] = ( + float(matches_by_order[i]) / possible_matches_by_order[i] + ) + else: + precisions[i] = 0.0 + + if min(precisions) > 0: + p_log_sum = sum((1.0 / max_order) * math.log(p) for p in precisions) + geo_mean = math.exp(p_log_sum) + else: + geo_mean = 0 + + ratio = float(translation_length) / reference_length + + if ratio > 1.0: + bp = 1.0 + else: + bp = math.exp(1 - 1.0 / ratio) + + bleu = geo_mean * bp + + return (bleu, precisions, bp, ratio, translation_length, reference_length) + + +class BaseTokenizer: + """A base dummy tokenizer to derive from.""" + + def signature(self): + """ + Returns a signature for the tokenizer. + :return: signature string + """ + return "none" + + def __call__(self, line): + """ + Tokenizes an input line with the tokenizer. + :param line: a segment to tokenize + :return: the tokenized line + """ + return line + + +class TokenizerRegexp(BaseTokenizer): + def signature(self): + return "re" + + def __init__(self): + self._re = [ + # language-dependent part (assuming Western languages) + (re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), r" \1 "), + # tokenize period and comma unless preceded by a digit + (re.compile(r"([^0-9])([\.,])"), r"\1 \2 "), + # tokenize period and comma unless followed by a digit + (re.compile(r"([\.,])([^0-9])"), r" \1 \2"), + # tokenize dash when preceded by a digit + (re.compile(r"([0-9])(-)"), r"\1 \2 "), + # one space only between words + # NOTE: Doing this in Python (below) is faster + # (re.compile(r'\s+'), r' '), + ] + + @lru_cache(maxsize=2**16) + def __call__(self, line): + """Common post-processing tokenizer for `13a` and `zh` tokenizers. + :param line: a segment to tokenize + :return: the tokenized line + """ + for _re, repl in self._re: + line = _re.sub(repl, line) + + # no leading or trailing spaces, single space within words + # return ' '.join(line.split()) + # This line is changed with regards to the original tokenizer (seen above) to return individual words + return line.split() + + +class Tokenizer13a(BaseTokenizer): + def signature(self): + return "13a" + + def __init__(self): + self._post_tokenizer = TokenizerRegexp() + + @lru_cache(maxsize=2**16) + def __call__(self, line): + """Tokenizes an input line using a relatively minimal tokenization + that is however equivalent to mteval-v13a, used by WMT. 
+ + :param line: a segment to tokenize + :return: the tokenized line + """ + + # language-independent part: + line = line.replace("", "") + line = line.replace("-\n", "") + line = line.replace("\n", " ") + + if "&" in line: + line = line.replace(""", '"') + line = line.replace("&", "&") + line = line.replace("<", "<") + line = line.replace(">", ">") + + return self._post_tokenizer(f" {line} ") + + +def compute_blue_score( + predictions, references, tokenizer=Tokenizer13a(), max_order=4, smooth=False +): + # if only one reference is provided make sure we still use list of lists + if isinstance(references[0], str): + references = [[ref] for ref in references] + + references = [[tokenizer(r) for r in ref] for ref in references] + predictions = [tokenizer(p) for p in predictions] + score = compute_bleu( + reference_corpus=references, + translation_corpus=predictions, + max_order=max_order, + smooth=smooth, + ) + (bleu, precisions, bp, ratio, translation_length, reference_length) = score + return bleu + + +def cal_distance(word1, word2): + m = len(word1) + n = len(word2) + if m * n == 0: + return m + n + dp = [[0] * (n + 1) for _ in range(m + 1)] + for i in range(m + 1): + dp[i][0] = i + for j in range(n + 1): + dp[0][j] = j + for i in range(1, m + 1): + for j in range(1, n + 1): + a = dp[i - 1][j] + 1 + b = dp[i][j - 1] + 1 + c = dp[i - 1][j - 1] + if word1[i - 1] != word2[j - 1]: + c += 1 + dp[i][j] = min(a, b, c) + return dp[m][n] + + +def compute_edit_distance(prediction, label): + prediction = prediction.strip().split(" ") + label = label.strip().split(" ") + distance = cal_distance(prediction, label) + return distance diff --git a/ppocr/metrics/rec_metric.py b/ppocr/metrics/rec_metric.py index e41dd36e09..dbb5ddeb76 100644 --- a/ppocr/metrics/rec_metric.py +++ b/ppocr/metrics/rec_metric.py @@ -17,6 +17,7 @@ import numpy as np import string +from .bleu import compute_blue_score, compute_edit_distance class RecMetric(object): @@ -177,3 +178,121 @@ def epoch_reset(self): self.exp_right = [] self.word_total_length = 0 self.exp_total_num = 0 + + +class LaTeXOCRMetric(object): + def __init__(self, main_indicator="exp_rate", cal_blue_score=False, **kwargs): + self.main_indicator = main_indicator + self.cal_blue_score = cal_blue_score + self.edit_right = [] + self.exp_right = [] + self.blue_right = [] + self.e1_right = [] + self.e2_right = [] + self.e3_right = [] + self.editdistance_total_length = 0 + self.exp_total_num = 0 + self.edit_dist = 0 + self.exp_rate = 0 + if self.cal_blue_score: + self.blue_score = 0 + self.e1 = 0 + self.e2 = 0 + self.e3 = 0 + self.reset() + self.epoch_reset() + + def __call__(self, preds, batch, **kwargs): + for k, v in kwargs.items(): + epoch_reset = v + if epoch_reset: + self.epoch_reset() + word_pred = preds + word_label = batch + line_right, e1, e2, e3 = 0, 0, 0, 0 + lev_dist = [] + for labels, prediction in zip(word_label, word_pred): + if prediction == labels: + line_right += 1 + distance = compute_edit_distance(prediction, labels) + lev_dist.append(Levenshtein.normalized_distance(prediction, labels)) + if distance <= 1: + e1 += 1 + if distance <= 2: + e2 += 1 + if distance <= 3: + e3 += 1 + + batch_size = len(lev_dist) + + self.edit_dist = sum(lev_dist) # float + self.exp_rate = line_right # float + if self.cal_blue_score: + self.blue_score = compute_blue_score(word_pred, word_label) + self.e1 = e1 + self.e2 = e2 + self.e3 = e3 + exp_length = len(word_label) + self.edit_right.append(self.edit_dist) + self.exp_right.append(self.exp_rate) + if 
self.cal_blue_score: + self.blue_right.append(self.blue_score * batch_size) + self.e1_right.append(self.e1) + self.e2_right.append(self.e2) + self.e3_right.append(self.e3) + self.editdistance_total_length = self.editdistance_total_length + exp_length + self.exp_total_num = self.exp_total_num + exp_length + + def get_metric(self): + """ + return { + 'edit distance': 0, + "blue_score": 0, + "exp_rate": 0, + } + """ + cur_edit_distance = sum(self.edit_right) / self.exp_total_num + cur_exp_rate = sum(self.exp_right) / self.exp_total_num + if self.cal_blue_score: + cur_blue_score = sum(self.blue_right) / self.editdistance_total_length + cur_exp_1 = sum(self.e1_right) / self.exp_total_num + cur_exp_2 = sum(self.e2_right) / self.exp_total_num + cur_exp_3 = sum(self.e3_right) / self.exp_total_num + self.reset() + if self.cal_blue_score: + return { + "blue_score ": cur_blue_score, + "edit distance ": cur_edit_distance, + "exp_rate ": cur_exp_rate, + "exp_rate<=1 ": cur_exp_1, + "exp_rate<=2 ": cur_exp_2, + "exp_rate<=3 ": cur_exp_3, + } + else: + return { + "edit distance": cur_edit_distance, + "exp_rate": cur_exp_rate, + "exp_rate<=1 ": cur_exp_1, + "exp_rate<=2 ": cur_exp_2, + "exp_rate<=3 ": cur_exp_3, + } + + def reset(self): + self.edit_dist = 0 + self.exp_rate = 0 + if self.cal_blue_score: + self.blue_score = 0 + self.e1 = 0 + self.e2 = 0 + self.e3 = 0 + + def epoch_reset(self): + self.edit_right = [] + self.exp_right = [] + if self.cal_blue_score: + self.blue_right = [] + self.e1_right = [] + self.e2_right = [] + self.e3_right = [] + self.editdistance_total_length = 0 + self.exp_total_num = 0 diff --git a/ppocr/modeling/backbones/__init__.py b/ppocr/modeling/backbones/__init__.py index 81d107c293..2a18a51b4b 100755 --- a/ppocr/modeling/backbones/__init__.py +++ b/ppocr/modeling/backbones/__init__.py @@ -59,6 +59,8 @@ def build_backbone(config, model_type): from .rec_vitstr import ViTSTR from .rec_resnet_rfl import ResNetRFL from .rec_densenet import DenseNet + from .rec_resnetv2 import ResNetV2 + from .rec_hybridvit import HybridTransformer from .rec_shallow_cnn import ShallowCNN from .rec_lcnetv3 import PPLCNetV3 from .rec_hgnet import PPHGNet_small @@ -89,6 +91,8 @@ def build_backbone(config, model_type): "ViT", "RepSVTR", "SVTRv2", + "ResNetV2", + "HybridTransformer", ] elif model_type == "e2e": from .e2e_resnet_vd_pg import ResNet diff --git a/ppocr/modeling/backbones/rec_hybridvit.py b/ppocr/modeling/backbones/rec_hybridvit.py new file mode 100644 index 0000000000..e873a781b6 --- /dev/null +++ b/ppocr/modeling/backbones/rec_hybridvit.py @@ -0,0 +1,529 @@ +# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
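The BLEU and edit-distance helpers added in `ppocr/metrics/bleu.py` above operate on whitespace-tokenized LaTeX strings; a small sanity-check sketch (the two formulas are made-up examples):

```python
# Small sanity check for the helpers defined in ppocr/metrics/bleu.py above.
# The two LaTeX strings are made-up examples.
from ppocr.metrics.bleu import compute_blue_score, compute_edit_distance

pred = "x ^ { 2 } + y ^ { 2 }"
label = "x ^ { 2 } - y ^ { 2 }"

# Token-level Levenshtein distance: exactly one token ("+" vs "-") differs.
assert compute_edit_distance(pred, label) == 1

# Corpus-level BLEU over a single prediction/reference pair, in [0, 1].
print(compute_blue_score([pred], [label]))
```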
+ +""" +This code is refer from: +https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer_hybrid.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from itertools import repeat +import collections +import math +from functools import partial + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppocr.modeling.backbones.rec_resnetv2 import ( + ResNetV2, + StdConv2dSame, + DropPath, + get_padding, +) +from paddle.nn.initializer import ( + TruncatedNormal, + Constant, + Normal, + KaimingUniform, + XavierUniform, +) + +normal_ = Normal(mean=0.0, std=1e-6) +zeros_ = Constant(value=0.0) +ones_ = Constant(value=1.0) +kaiming_normal_ = KaimingUniform(nonlinearity="relu") +trunc_normal_ = TruncatedNormal(std=0.02) +xavier_uniform_ = XavierUniform() + + +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable): + return x + return tuple(repeat(x, n)) + + return parse + + +to_1tuple = _ntuple(1) +to_2tuple = _ntuple(2) +to_3tuple = _ntuple(3) +to_4tuple = _ntuple(4) +to_ntuple = _ntuple + + +class Conv2dAlign(nn.Conv2D): + """Conv2d with Weight Standardization. Used for BiT ResNet-V2 models. + + Paper: `Micro-Batch Training with Batch-Channel Normalization and Weight Standardization` - + https://arxiv.org/abs/1903.10520v2 + """ + + def __init__( + self, + in_channel, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + eps=1e-6, + ): + + super().__init__( + in_channel, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias_attr=bias, + weight_attr=True, + ) + self.eps = eps + + def forward(self, x): + x = F.conv2d( + x, + self.weight, + self.bias, + self._stride, + self._padding, + self._dilation, + self._groups, + ) + return x + + +class HybridEmbed(nn.Layer): + """CNN Feature Map Embedding + Extract feature map from CNN, flatten, project to embedding dim. 
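+
+    With the hard-coded 1024-channel backbone output and the (1, 1) patch size set in
+    __init__, an [N, 1, H, W] image that the CNN reduces by 16x ends up as roughly
+    [N, (H // 16) * (W // 16), embed_dim] tokens after the 1x1 projection (illustrative
+    shapes; the positional grid itself is fixed at 42 x 12 = 504 patches).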
+ """ + + def __init__( + self, + backbone, + img_size=224, + patch_size=1, + feature_size=None, + in_chans=3, + embed_dim=768, + ): + super().__init__() + assert isinstance(backbone, nn.Layer) + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.backbone = backbone + feature_dim = 1024 + feature_size = (42, 12) + patch_size = (1, 1) + assert ( + feature_size[0] % patch_size[0] == 0 + and feature_size[1] % patch_size[1] == 0 + ) + + self.grid_size = ( + feature_size[0] // patch_size[0], + feature_size[1] // patch_size[1], + ) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.proj = nn.Conv2D( + feature_dim, + embed_dim, + kernel_size=patch_size, + stride=patch_size, + weight_attr=True, + bias_attr=True, + ) + + def forward(self, x): + + x = self.backbone(x) + if isinstance(x, (list, tuple)): + x = x[-1] # last feature if backbone outputs list/tuple of features + x = self.proj(x).flatten(2).transpose([0, 2, 1]) + + return x + + +class myLinear(nn.Linear): + def __init__(self, in_channel, out_channels, weight_attr=True, bias_attr=True): + super().__init__( + in_channel, out_channels, weight_attr=weight_attr, bias_attr=bias_attr + ) + + def forward(self, x): + return paddle.matmul(x, self.weight, transpose_y=True) + self.bias + + +class Attention(nn.Layer): + def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0.0, proj_drop=0.0): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = myLinear(dim, dim, weight_attr=True, bias_attr=True) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv = ( + self.qkv(x) + .reshape([B, N, 3, self.num_heads, C // self.num_heads]) + .transpose([2, 0, 3, 1, 4]) + ) + q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple) + + attn = (q @ k.transpose([0, 1, 3, 2])) * self.scale + + attn = F.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose([0, 2, 1, 3]).reshape([B, N, C]) + + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Mlp(nn.Layer): + """MLP as used in Vision Transformer, MLP-Mixer and related networks""" + + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + drop_probs = to_2tuple(drop) + + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.drop1 = nn.Dropout(drop_probs[0]) + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop2 = nn.Dropout(drop_probs[1]) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop1(x) + x = self.fc2(x) + x = self.drop2(x) + return x + + +class Block(nn.Layer): + + def __init__( + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop, + ) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = 
norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + + def forward(self, x): + + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class HybridTransformer(nn.Layer): + """Implementation of HybridTransformer. + + Args: + x: input images with shape [N, 1, H, W] + label: LaTeX-OCR labels with shape [N, L] , L is the max sequence length + attention_mask: LaTeX-OCR attention mask with shape [N, L] , L is the max sequence length + + Returns: + The encoded features with shape [N, 1, H//16, W//16] + """ + + def __init__( + self, + backbone_layers=[2, 3, 7], + input_channel=1, + is_predict=False, + is_export=False, + img_size=(224, 224), + patch_size=16, + num_classes=1000, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + representation_size=None, + distilled=False, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.0, + embed_layer=None, + norm_layer=None, + act_layer=None, + weight_init="", + **kwargs, + ): + super(HybridTransformer, self).__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = ( + embed_dim # num_features for consistency with other models + ) + self.num_tokens = 2 if distilled else 1 + norm_layer = norm_layer or partial(nn.LayerNorm, epsilon=1e-6) + act_layer = act_layer or nn.GELU + self.height, self.width = img_size + self.patch_size = patch_size + backbone = ResNetV2( + layers=backbone_layers, + num_classes=0, + global_pool="", + in_chans=input_channel, + preact=False, + stem_type="same", + conv_layer=StdConv2dSame, + is_export=is_export, + ) + min_patch_size = 2 ** (len(backbone_layers) + 1) + self.patch_embed = HybridEmbed( + img_size=img_size, + patch_size=patch_size // min_patch_size, + in_chans=input_channel, + embed_dim=embed_dim, + backbone=backbone, + ) + num_patches = self.patch_embed.num_patches + + self.cls_token = paddle.create_parameter([1, 1, embed_dim], dtype="float32") + self.dist_token = ( + paddle.create_parameter( + [1, 1, embed_dim], + dtype="float32", + ) + if distilled + else None + ) + self.pos_embed = paddle.create_parameter( + [1, num_patches + self.num_tokens, embed_dim], dtype="float32" + ) + self.pos_drop = nn.Dropout(p=drop_rate) + zeros_(self.cls_token) + if self.dist_token is not None: + zeros_(self.dist_token) + zeros_(self.pos_embed) + + dpr = [ + x.item() for x in paddle.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + self.blocks = nn.Sequential( + *[ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + ) + for i in range(depth) + ] + ) + self.norm = norm_layer(embed_dim) + + # Representation layer + if representation_size and not distilled: + self.num_features = representation_size + self.pre_logits = nn.Sequential( + ("fc", nn.Linear(embed_dim, representation_size)), ("act", nn.Tanh()) + ) + else: + self.pre_logits = nn.Identity() + + # Classifier head(s) + self.head = ( + nn.Linear(self.num_features, num_classes) + if num_classes > 0 + else nn.Identity() + ) + self.head_dist = None + if distilled: + self.head_dist = ( + nn.Linear(self.embed_dim, self.num_classes) + if num_classes > 0 + else nn.Identity() + ) + self.init_weights(weight_init) + self.out_channels = embed_dim + self.is_predict = is_predict + self.is_export = 
is_export + + def init_weights(self, mode=""): + assert mode in ("jax", "jax_nlhb", "nlhb", "") + head_bias = -math.log(self.num_classes) if "nlhb" in mode else 0.0 + trunc_normal_(self.pos_embed) + trunc_normal_(self.cls_token) + self.apply(_init_vit_weights) + + def _init_weights(self, m): + # this fn left here for compat with downstream users + _init_vit_weights(m) + + def load_pretrained(self, checkpoint_path, prefix=""): + raise NotImplementedError + + def no_weight_decay(self): + return {"pos_embed", "cls_token", "dist_token"} + + def get_classifier(self): + if self.dist_token is None: + return self.head + else: + return self.head, self.head_dist + + def reset_classifier(self, num_classes, global_pool=""): + self.num_classes = num_classes + self.head = ( + nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + ) + if self.num_tokens == 2: + self.head_dist = ( + nn.Linear(self.embed_dim, self.num_classes) + if num_classes > 0 + else nn.Identity() + ) + + def forward_features(self, x): + B, c, h, w = x.shape + x = self.patch_embed(x) + cls_tokens = self.cls_token.expand( + [B, -1, -1] + ) # stole cls_tokens impl from Phil Wang, thanks + x = paddle.concat((cls_tokens, x), axis=1) + h, w = h // self.patch_size, w // self.patch_size + repeat_tensor = ( + paddle.arange(h) * (self.width // self.patch_size - w) + ).reshape([-1, 1]) + repeat_tensor = paddle.repeat_interleave( + repeat_tensor, paddle.to_tensor(w), axis=1 + ).reshape([-1]) + pos_emb_ind = repeat_tensor + paddle.arange(h * w) + pos_emb_ind = paddle.concat( + (paddle.zeros([1], dtype="int64"), pos_emb_ind + 1), axis=0 + ).cast(paddle.int64) + x += self.pos_embed[:, pos_emb_ind] + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + return x + + def forward(self, input_data): + + if self.training: + x, label, attention_mask = input_data + else: + if isinstance(input_data, list): + x = input_data[0] + else: + x = input_data + x = self.forward_features(x) + x = self.head(x) + if self.training: + return x, label, attention_mask + else: + return x + + +def _init_vit_weights( + module: nn.Layer, name: str = "", head_bias: float = 0.0, jax_impl: bool = False +): + """ViT weight initialization + * When called without n, head_bias, jax_impl args it will behave exactly the same + as my original init for compatibility with prev hparam / downstream use cases (ie DeiT). + * When called w/ valid n (module name) and jax_impl=True, will (hopefully) match JAX impl + """ + if isinstance(module, nn.Linear): + if name.startswith("head"): + zeros_(module.weight) + constant_ = Constant(value=head_bias) + constant_(module.bias, head_bias) + elif name.startswith("pre_logits"): + zeros_(module.bias) + else: + if jax_impl: + xavier_uniform_(module.weight) + if module.bias is not None: + if "mlp" in name: + normal_(module.bias) + else: + zeros_(module.bias) + else: + trunc_normal_(module.weight) + if module.bias is not None: + zeros_(module.bias) + elif jax_impl and isinstance(module, nn.Conv2D): + # NOTE conv was left to pytorch default in my original init + if module.bias is not None: + zeros_(module.bias) + elif isinstance(module, (nn.LayerNorm, nn.GroupNorm, nn.BatchNorm2D)): + zeros_(module.bias) + ones_(module.weight) diff --git a/ppocr/modeling/backbones/rec_resnetv2.py b/ppocr/modeling/backbones/rec_resnetv2.py new file mode 100644 index 0000000000..083e08c7b9 --- /dev/null +++ b/ppocr/modeling/backbones/rec_resnetv2.py @@ -0,0 +1,1283 @@ +# copyright (c) 2024 PaddlePaddle Authors. 
All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This code is refer from: +https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/resnetv2.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import collections.abc +from itertools import repeat +from collections import OrderedDict # pylint: disable=g-importing-member + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import TruncatedNormal, Constant, Normal, KaimingUniform +from functools import partial +from typing import Union, Callable, Type, List, Tuple + +IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5) +IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5) +normal_ = Normal(mean=0.0, std=0.01) +zeros_ = Constant(value=0.0) +ones_ = Constant(value=1.0) +kaiming_normal_ = KaimingUniform(nonlinearity="relu") + + +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable): + return x + return tuple(repeat(x, n)) + + return parse + + +to_1tuple = _ntuple(1) +to_2tuple = _ntuple(2) +to_3tuple = _ntuple(3) +to_4tuple = _ntuple(4) +to_ntuple = _ntuple + + +class StdConv2dSame(nn.Conv2D): + def __init__( + self, + in_channel, + out_channels, + kernel_size, + stride=1, + padding="SAME", + dilation=1, + groups=1, + bias_attr=False, + eps=1e-6, + is_export=False, + ): + padding, is_dynamic = get_padding_value( + padding, kernel_size, stride=stride, dilation=dilation + ) + super().__init__( + in_channel, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias_attr=bias_attr, + ) + self.same_pad = is_dynamic + self.export = is_export + self.eps = eps + + def forward(self, x): + if self.same_pad: + if self.export: + x = pad_same_export(x, self._kernel_size, self._stride, self._dilation) + else: + x = pad_same(x, self._kernel_size, self._stride, self._dilation) + running_mean = paddle.to_tensor([0] * self._out_channels, dtype="float32") + running_variance = paddle.to_tensor([1] * self._out_channels, dtype="float32") + if self.export: + weight = paddle.reshape( + F.batch_norm( + self.weight.reshape([1, self._out_channels, -1]), + running_mean, + running_variance, + momentum=0.0, + epsilon=self.eps, + use_global_stats=False, + ), + self.weight.shape, + ) + else: + weight = paddle.reshape( + F.batch_norm( + self.weight.reshape([1, self._out_channels, -1]), + running_mean, + running_variance, + training=True, + momentum=0.0, + epsilon=self.eps, + ), + self.weight.shape, + ) + x = F.conv2d( + x, + weight, + self.bias, + self._stride, + self._padding, + self._dilation, + self._groups, + ) + return x + + +class StdConv2d(nn.Conv2D): + """Conv2d with Weight Standardization. Used for BiT ResNet-V2 models. 
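+
+    Each forward pass re-standardizes the kernel before convolving, i.e. roughly
+    W_hat = (W - mean(W)) / sqrt(var(W) + eps) per output channel, implemented here by
+    running batch_norm over the weight reshaped to [1, out_channels, -1].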
+ + Paper: `Micro-Batch Training with Batch-Channel Normalization and Weight Standardization` - + https://arxiv.org/abs/1903.10520v2 + """ + + def __init__( + self, + in_channel, + out_channels, + kernel_size, + stride=1, + padding=None, + dilation=1, + groups=1, + bias=False, + eps=1e-6, + ): + if padding is None: + padding = get_padding(kernel_size, stride, dilation) + super().__init__( + in_channel, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias_attr=bias, + ) + self.eps = eps + + def forward(self, x): + weight = F.batch_norm( + self.weight.reshape(1, self.out_channels, -1), + None, + None, + training=True, + momentum=0.0, + epsilon=self.eps, + ).reshape_as(self.weight) + x = F.conv2d( + x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups + ) + return x + + +class MaxPool2dSame(nn.MaxPool2D): + """Tensorflow like 'SAME' wrapper for 2D max pooling""" + + def __init__( + self, + kernel_size: int, + stride=None, + padding=0, + dilation=1, + ceil_mode=False, + is_export=False, + ): + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + dilation = to_2tuple(dilation) + self.export = is_export + super(MaxPool2dSame, self).__init__( + kernel_size, stride, (0, 0), dilation, ceil_mode + ) + + def forward(self, x): + if self.export: + x = pad_same_export(x, self.ksize, self.stride, value=-float("inf")) + else: + x = pad_same(x, self.ksize, self.stride, value=-float("inf")) + return F.max_pool2d(x, self.ksize, self.stride, (0, 0), self.ceil_mode) + + +def get_padding(kernel_size: int, stride: int = 1, dilation: int = 1, **_) -> int: + padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 + return padding + + +def is_static_pad(kernel_size: int, stride: int = 1, dilation: int = 1, **_): + return stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0 + + +def get_padding_value(padding, kernel_size, **kwargs) -> Tuple[Tuple, bool]: + dynamic = False + if isinstance(padding, str): + # for any string padding, the padding will be calculated for you, one of three ways + padding = padding.lower() + if padding == "same": + # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact + if is_static_pad(kernel_size, **kwargs): + # static case, no extra overhead + padding = get_padding(kernel_size, **kwargs) + else: + # dynamic 'SAME' padding, has runtime/GPU memory overhead + padding = 0 + dynamic = True + elif padding == "valid": + # 'VALID' padding, same as padding=0 + padding = 0 + else: + # Default to PyTorch style 'same'-ish symmetric padding + padding = get_padding(kernel_size, **kwargs) + return padding, dynamic + + +def create_pool2d(pool_type, kernel_size, stride=None, is_export=False, **kwargs): + stride = stride or kernel_size + padding = kwargs.pop("padding", "") + padding, is_dynamic = get_padding_value( + padding, kernel_size, stride=stride, **kwargs + ) + if is_dynamic: + if pool_type == "avg": + return AvgPool2dSame( + kernel_size, stride=stride, is_export=is_export, **kwargs + ) + elif pool_type == "max": + return MaxPool2dSame( + kernel_size, stride=stride, is_export=is_export, **kwargs + ) + else: + assert False, f"Unsupported pool type {pool_type}" + + +def get_same_padding(x, k, s, d): + return max((math.ceil(x / s) - 1) * s + (k - 1) * d + 1 - x, 0) + + +def get_same_padding_export(x, k, s, d): + x = paddle.to_tensor(x) + k = paddle.to_tensor(k) + s = paddle.to_tensor(s) + d = paddle.to_tensor(d) + return paddle.max((paddle.ceil(x / s) - 1) * s + (k - 1) 
* d + 1 - x, 0) + + +def pad_same_export(x, k, s, d=(1, 1), value=0): + ih, iw = x.shape[-2:] + pad_h, pad_w = get_same_padding_export( + ih, k[0], s[0], d[0] + ), get_same_padding_export(iw, k[1], s[1], d[1]) + pad_h = pad_h.cast(paddle.int32) + pad_w = pad_w.cast(paddle.int32) + pad_list = paddle.to_tensor( + [ + (pad_w // 2), + (pad_w - pad_w // 2).cast(paddle.int32), + (pad_h // 2).cast(paddle.int32), + (pad_h - pad_h // 2).cast(paddle.int32), + ] + ) + + if pad_h > 0 or pad_w > 0: + if len(pad_list.shape) == 2: + pad_list = pad_list.squeeze(1) + x = F.pad(x, pad_list.cast(paddle.int32), value=value) + return x + + +def pad_same(x, k, s, d=(1, 1), value=0, pad_h=None, pad_w=None): + ih, iw = x.shape[-2:] + + pad_h, pad_w = get_same_padding(ih, k[0], s[0], d[0]), get_same_padding( + iw, k[1], s[1], d[1] + ) + if pad_h > 0 or pad_w > 0: + x = F.pad( + x, + [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2], + value=value, + ) + return x + + +class AvgPool2dSame(nn.AvgPool2D): + """Tensorflow like 'SAME' wrapper for 2D average pooling""" + + def __init__( + self, + kernel_size: int, + stride=None, + padding=0, + ceil_mode=False, + count_include_pad=True, + ): + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + super(AvgPool2dSame, self).__init__( + kernel_size, stride, (0, 0), ceil_mode, count_include_pad + ) + + def forward(self, x): + x = pad_same(x, self.kernel_size, self.stride) + return F.avg_pool2d( + x, + self.kernel_size, + self.stride, + self.padding, + self.ceil_mode, + self.count_include_pad, + ) + + +def drop_path( + x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True +): + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * ( + x.ndim - 1 + ) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0 and scale_by_keep: + random_tensor.div_(keep_prob) + return x * random_tensor + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob=None, scale_by_keep=True): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + self.scale_by_keep = scale_by_keep + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) + + +def adaptive_pool_feat_mult(pool_type="avg"): + if pool_type == "catavgmax": + return 2 + else: + return 1 + + +class SelectAdaptivePool2d(nn.Layer): + """Selectable global pooling layer with dynamic input kernel size""" + + def __init__(self, output_size=1, pool_type="fast", flatten=False): + super(SelectAdaptivePool2d, self).__init__() + self.pool_type = ( + pool_type or "" + ) # convert other falsy values to empty string for consistent TS typing + self.flatten = nn.Flatten(1) if flatten else nn.Identity() + if pool_type == "": + self.pool = nn.Identity() # pass through + + def is_identity(self): + return not self.pool_type + + def forward(self, x): + x = self.pool(x) + x = self.flatten(x) + return x + + def feat_mult(self): + return adaptive_pool_feat_mult(self.pool_type) + + def __repr__(self): + return ( + self.__class__.__name__ + + " (" + + "pool_type=" + + self.pool_type + + ", flatten=" + + str(self.flatten) + + ")" + ) + + +def _create_pool(num_features, num_classes, pool_type="avg", use_conv=False): + flatten_in_pool = not use_conv # flatten when we use a Linear layer after pooling + if not pool_type: + assert ( + 
num_classes == 0 or use_conv + ), "Pooling can only be disabled if classifier is also removed or conv classifier is used" + flatten_in_pool = ( + False # disable flattening if pooling is pass-through (no pooling) + ) + global_pool = SelectAdaptivePool2d(pool_type=pool_type, flatten=flatten_in_pool) + num_pooled_features = num_features * global_pool.feat_mult() + return global_pool, num_pooled_features + + +def _create_fc(num_features, num_classes, use_conv=False): + if num_classes <= 0: + fc = nn.Identity() # pass-through (no classifier) + elif use_conv: + fc = nn.Conv2D(num_features, num_classes, 1, bias_attr=True) + else: + fc = nn.Linear(num_features, num_classes, bias_attr=True) + return fc + + +class ClassifierHead(nn.Layer): + """Classifier head w/ configurable global pooling and dropout.""" + + def __init__( + self, in_chs, num_classes, pool_type="avg", drop_rate=0.0, use_conv=False + ): + super(ClassifierHead, self).__init__() + self.drop_rate = drop_rate + self.global_pool, num_pooled_features = _create_pool( + in_chs, num_classes, pool_type, use_conv=use_conv + ) + self.fc = _create_fc(num_pooled_features, num_classes, use_conv=use_conv) + self.flatten = nn.Flatten(1) if use_conv and pool_type else nn.Identity() + + def forward(self, x): + x = self.global_pool(x) + if self.drop_rate: + x = F.dropout(x, p=float(self.drop_rate), training=self.training) + x = self.fc(x) + x = self.flatten(x) + return x + + +class EvoNormBatch2d(nn.Layer): + def __init__( + self, num_features, apply_act=True, momentum=0.1, eps=1e-5, drop_block=None + ): + super(EvoNormBatch2d, self).__init__() + self.apply_act = apply_act # apply activation (non-linearity) + self.momentum = momentum + self.eps = eps + self.weight = paddle.create_parameter( + paddle.ones(num_features), dtype="float32" + ) + self.bias = paddle.create_parameter(paddle.zeros(num_features), dtype="float32") + self.v = ( + paddle.create_parameter(paddle.ones(num_features), dtype="float32") + if apply_act + else None + ) + self.register_buffer("running_var", paddle.ones([num_features])) + self.reset_parameters() + + def reset_parameters(self): + ones_(self.weight) + zeros_(self.bias) + if self.apply_act: + ones_(self.v) + + def forward(self, x): + x_type = x.dtype + if self.v is not None: + running_var = self.running_var.view(1, -1, 1, 1) + if self.training: + var = x.var(dim=(0, 2, 3), unbiased=False, keepdim=True) + n = x.numel() / x.shape[1] + running_var = var.detach() * self.momentum * ( + n / (n - 1) + ) + running_var * (1 - self.momentum) + self.running_var.copy_(running_var.view(self.running_var.shape)) + else: + var = running_var + v = self.v.to(dtype=x_type).reshape(1, -1, 1, 1) + d = x * v + ( + x.var(dim=(2, 3), unbiased=False, keepdim=True) + self.eps + ).sqrt().to(dtype=x_type) + d = d.max((var + self.eps).sqrt().to(dtype=x_type)) + x = x / d + return x * self.weight.view(1, -1, 1, 1) + self.bias.view(1, -1, 1, 1) + + +class EvoNormSample2d(nn.Layer): + def __init__( + self, num_features, apply_act=True, groups=32, eps=1e-5, drop_block=None + ): + super(EvoNormSample2d, self).__init__() + self.apply_act = apply_act + self.groups = groups + self.eps = eps + self.weight = paddle.create_parameter( + paddle.ones(num_features), dtype="float32" + ) + self.bias = paddle.create_parameter(paddle.zeros(num_features), dtype="float32") + self.v = ( + paddle.create_parameter(paddle.ones(num_features), dtype="float32") + if apply_act + else None + ) + self.reset_parameters() + + def reset_parameters(self): + ones_(self.weight) + 
zeros_(self.bias) + if self.apply_act: + ones_(self.v) + + def forward(self, x): + B, C, H, W = x.shape + if self.v is not None: + n = x * (x * self.v.view(1, -1, 1, 1)).sigmoid() + x = x.reshape(B, self.groups, -1) + x = ( + n.reshape(B, self.groups, -1) + / (x.var(dim=-1, unbiased=False, keepdim=True) + self.eps).sqrt() + ) + x = x.reshape(B, C, H, W) + return x * self.weight.reshape([1, -1, 1, 1]) + self.bias.reshape([1, -1, 1, 1]) + + +from paddle.common_ops_import import ( + LayerHelper, + check_type, + check_variable_and_dtype, +) + + +def group_norm( + input, + groups, + epsilon=1e-05, + weight=None, + bias=None, + act=None, + data_layout="NCHW", + name=None, +): + helper = LayerHelper("group_norm", **locals()) + dtype = helper.input_dtype() + check_variable_and_dtype( + input, + "input", + ["float16", "uint16", "float32", "float64"], + "group_norm", + ) + # create intput and parameters + inputs = {"X": input} + input_shape = input.shape + if len(input_shape) < 2: + raise ValueError( + f"The dimensions of Op(static.nn.group_norm)'s input should be more than 1. But received {len(input_shape)}" + ) + if data_layout != "NCHW" and data_layout != "NHWC": + raise ValueError( + "Param(data_layout) of Op(static.nn.group_norm) got wrong value: received " + + data_layout + + " but only NCHW or NHWC supported." + ) + channel_num = input_shape[1] if data_layout == "NCHW" else input_shape[-1] + param_shape = [channel_num] + inputs["Scale"] = weight + inputs["Bias"] = bias + # create output + mean_out = helper.create_variable(dtype=dtype, stop_gradient=True) + variance_out = helper.create_variable(dtype=dtype, stop_gradient=True) + group_norm_out = helper.create_variable(dtype=dtype) + + helper.append_op( + type="group_norm", + inputs=inputs, + outputs={ + "Y": group_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={ + "epsilon": epsilon, + "groups": groups, + "data_layout": data_layout, + }, + ) + + return helper.append_activation(group_norm_out) + + +class GroupNormAct(nn.GroupNorm): + # NOTE num_channel and num_groups order flipped for easier layer swaps / binding of fixed args + def __init__( + self, + num_channels, + num_groups=32, + eps=1e-5, + affine=True, + apply_act=True, + act_layer=nn.ReLU, + drop_block=None, + ): + super(GroupNormAct, self).__init__(num_groups, num_channels, epsilon=eps) + if affine: + self.weight = paddle.create_parameter([num_channels], dtype="float32") + self.bias = paddle.create_parameter([num_channels], dtype="float32") + ones_(self.weight) + zeros_(self.bias) + if act_layer is not None and apply_act: + act_args = {} + self.act = act_layer(**act_args) + else: + self.act = nn.Identity() + + def forward(self, x): + x = group_norm( + x, self._num_groups, self._epsilon, weight=self.weight, bias=self.bias + ) + x = self.act(x) + return x + + +class BatchNormAct2d(nn.BatchNorm2D): + def __init__( + self, + num_features, + eps=1e-5, + momentum=0.1, + affine=True, + track_running_stats=True, + apply_act=True, + act_layer=nn.ReLU, + drop_block=None, + ): + super(BatchNormAct2d, self).__init__( + num_features, + epsilon=eps, + momentum=momentum, + use_global_stats=track_running_stats, + ) + if act_layer is not None and apply_act: + act_args = dict() + self.act = act_layer(**act_args) + else: + self.act = nn.Identity() + + def _forward_python(self, x): + return super(BatchNormAct2d, self).forward(x) + + def forward(self, x): + x = self._forward_python(x) + x = self.act(x) + return x + + +def adapt_input_conv(in_chans, conv_weight): + conv_type = 
conv_weight.dtype + conv_weight = ( + conv_weight.float() + ) # Some weights are in torch.half, ensure it's float for sum on CPU + O, I, J, K = conv_weight.shape + if in_chans == 1: + if I > 3: + assert conv_weight.shape[1] % 3 == 0 + # For models with space2depth stems + conv_weight = conv_weight.reshape(O, I // 3, 3, J, K) + conv_weight = conv_weight.sum(dim=2, keepdim=False) + else: + conv_weight = conv_weight.sum(dim=1, keepdim=True) + elif in_chans != 3: + if I != 3: + raise NotImplementedError("Weight format not supported by conversion.") + else: + # NOTE this strategy should be better than random init, but there could be other combinations of + # the original RGB input layer weights that'd work better for specific cases. + repeat = int(math.ceil(in_chans / 3)) + conv_weight = conv_weight.repeat(1, repeat, 1, 1)[:, :in_chans, :, :] + conv_weight *= 3 / float(in_chans) + conv_weight = conv_weight.to(conv_type) + return conv_weight + + +def named_apply( + fn: Callable, module: nn.Layer, name="", depth_first=True, include_root=False +) -> nn.Layer: + if not depth_first and include_root: + fn(module=module, name=name) + for child_name, child_module in module.named_children(): + child_name = ".".join((name, child_name)) if name else child_name + named_apply( + fn=fn, + module=child_module, + name=child_name, + depth_first=depth_first, + include_root=True, + ) + if depth_first and include_root: + fn(module=module, name=name) + return module + + +def _cfg(url="", **kwargs): + return { + "url": url, + "num_classes": 1000, + "input_size": (3, 224, 224), + "pool_size": (7, 7), + "crop_pct": 0.875, + "interpolation": "bilinear", + "mean": IMAGENET_INCEPTION_MEAN, + "std": IMAGENET_INCEPTION_STD, + "first_conv": "stem.conv", + "classifier": "head.fc", + **kwargs, + } + + +def make_div(v, divisor=8): + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class PreActBottleneck(nn.Layer): + """Pre-activation (v2) bottleneck block. + + Follows the implementation of "Identity Mappings in Deep Residual Networks": + https://github.com/KaimingHe/resnet-1k-layers/blob/master/resnet-pre-act.lua + + Except it puts the stride on 3x3 conv when available. 
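+
+    Concretely, the residual branch below runs norm1 -> 1x1 conv1 -> norm2 -> 3x3 conv2
+    (which carries the stride) -> norm3 -> 1x1 conv3, while the shortcut is the input
+    (optionally projected from the pre-activated tensor), and the two are summed with
+    no activation after the addition.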
+ """ + + def __init__( + self, + in_chs, + out_chs=None, + bottle_ratio=0.25, + stride=1, + dilation=1, + first_dilation=None, + groups=1, + act_layer=None, + conv_layer=None, + norm_layer=None, + proj_layer=None, + drop_path_rate=0.0, + is_export=False, + ): + super().__init__() + first_dilation = first_dilation or dilation + conv_layer = conv_layer or StdConv2d + norm_layer = norm_layer or partial(GroupNormAct, num_groups=32) + out_chs = out_chs or in_chs + mid_chs = make_div(out_chs * bottle_ratio) + + if proj_layer is not None: + self.downsample = proj_layer( + in_chs, + out_chs, + stride=stride, + dilation=dilation, + first_dilation=first_dilation, + preact=True, + conv_layer=conv_layer, + norm_layer=norm_layer, + ) + else: + self.downsample = None + + self.norm1 = norm_layer(in_chs) + self.conv1 = conv_layer(in_chs, mid_chs, 1, is_export=is_export) + self.norm2 = norm_layer(mid_chs) + self.conv2 = conv_layer( + mid_chs, + mid_chs, + 3, + stride=stride, + dilation=first_dilation, + groups=groups, + is_export=is_export, + ) + self.norm3 = norm_layer(mid_chs) + self.conv3 = conv_layer(mid_chs, out_chs, 1, is_export=is_export) + self.drop_path = ( + DropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() + ) + + def zero_init_last(self): + zeros_(self.conv3.weight) + + def forward(self, x): + x_preact = self.norm1(x) + + # shortcut branch + shortcut = x + if self.downsample is not None: + shortcut = self.downsample(x_preact) + + # residual branch + x = self.conv1(x_preact) + x = self.conv2(self.norm2(x)) + x = self.conv3(self.norm3(x)) + x = self.drop_path(x) + return x + shortcut + + +class Bottleneck(nn.Layer): + """Non Pre-activation bottleneck block, equiv to V1.5/V1b Bottleneck. Used for ViT.""" + + def __init__( + self, + in_chs, + out_chs=None, + bottle_ratio=0.25, + stride=1, + dilation=1, + first_dilation=None, + groups=1, + act_layer=None, + conv_layer=None, + norm_layer=None, + proj_layer=None, + drop_path_rate=0.0, + is_export=False, + ): + super().__init__() + first_dilation = first_dilation or dilation + act_layer = act_layer or nn.ReLU + conv_layer = conv_layer or StdConv2d + norm_layer = norm_layer or partial(GroupNormAct, num_groups=32) + out_chs = out_chs or in_chs + mid_chs = make_div(out_chs * bottle_ratio) + + if proj_layer is not None: + self.downsample = proj_layer( + in_chs, + out_chs, + stride=stride, + dilation=dilation, + preact=False, + conv_layer=conv_layer, + norm_layer=norm_layer, + is_export=is_export, + ) + else: + self.downsample = None + + self.conv1 = conv_layer(in_chs, mid_chs, 1, is_export=is_export) + self.norm1 = norm_layer(mid_chs) + self.conv2 = conv_layer( + mid_chs, + mid_chs, + 3, + stride=stride, + dilation=first_dilation, + groups=groups, + is_export=is_export, + ) + self.norm2 = norm_layer(mid_chs) + self.conv3 = conv_layer(mid_chs, out_chs, 1, is_export=is_export) + self.norm3 = norm_layer(out_chs, apply_act=False) + self.drop_path = ( + DropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() + ) + self.act3 = act_layer() + + def zero_init_last(self): + zeros_(self.norm3.weight) + + def forward(self, x): + # shortcut branch + shortcut = x + if self.downsample is not None: + shortcut = self.downsample(x) + + # residual + x = self.conv1(x) + x = self.norm1(x) + x = self.conv2(x) + x = self.norm2(x) + x = self.conv3(x) + x = self.norm3(x) + x = self.drop_path(x) + x = self.act3(x + shortcut) + return x + + +class DownsampleConv(nn.Layer): + def __init__( + self, + in_chs, + out_chs, + stride=1, + dilation=1, + 
first_dilation=None, + preact=True, + conv_layer=None, + norm_layer=None, + is_export=False, + ): + super(DownsampleConv, self).__init__() + self.conv = conv_layer(in_chs, out_chs, 1, stride=stride, is_export=is_export) + self.norm = nn.Identity() if preact else norm_layer(out_chs, apply_act=False) + + def forward(self, x): + return self.norm(self.conv(x)) + + +class DownsampleAvg(nn.Layer): + def __init__( + self, + in_chs, + out_chs, + stride=1, + dilation=1, + first_dilation=None, + preact=True, + conv_layer=None, + norm_layer=None, + is_export=False, + ): + """AvgPool Downsampling as in 'D' ResNet variants. This is not in RegNet space but I might experiment.""" + super(DownsampleAvg, self).__init__() + avg_stride = stride if dilation == 1 else 1 + if stride > 1 or dilation > 1: + avg_pool_fn = ( + AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2D + ) + self.pool = avg_pool_fn(2, avg_stride, ceil_mode=True, exclusive=False) + else: + self.pool = nn.Identity() + self.conv = conv_layer(in_chs, out_chs, 1, stride=1, is_export=is_export) + self.norm = nn.Identity() if preact else norm_layer(out_chs, apply_act=False) + + def forward(self, x): + return self.norm(self.conv(self.pool(x))) + + +class ResNetStage(nn.Layer): + """ResNet Stage.""" + + def __init__( + self, + in_chs, + out_chs, + stride, + dilation, + depth, + bottle_ratio=0.25, + groups=1, + avg_down=False, + block_dpr=None, + block_fn=PreActBottleneck, + is_export=False, + act_layer=None, + conv_layer=None, + norm_layer=None, + **block_kwargs, + ): + super(ResNetStage, self).__init__() + first_dilation = 1 if dilation in (1, 2) else 2 + layer_kwargs = dict( + act_layer=act_layer, conv_layer=conv_layer, norm_layer=norm_layer + ) + proj_layer = DownsampleAvg if avg_down else DownsampleConv + prev_chs = in_chs + self.blocks = nn.Sequential() + for block_idx in range(depth): + drop_path_rate = block_dpr[block_idx] if block_dpr else 0.0 + stride = stride if block_idx == 0 else 1 + self.blocks.add_sublayer( + str(block_idx), + block_fn( + prev_chs, + out_chs, + stride=stride, + dilation=dilation, + bottle_ratio=bottle_ratio, + groups=groups, + first_dilation=first_dilation, + proj_layer=proj_layer, + drop_path_rate=drop_path_rate, + is_export=is_export, + **layer_kwargs, + **block_kwargs, + ), + ) + prev_chs = out_chs + first_dilation = dilation + proj_layer = None + + def forward(self, x): + x = self.blocks(x) + return x + + +def is_stem_deep(stem_type): + return any([s in stem_type for s in ("deep", "tiered")]) + + +def create_resnetv2_stem( + in_chs, + out_chs=64, + stem_type="", + preact=True, + conv_layer=StdConv2d, + norm_layer=partial(GroupNormAct, num_groups=32), + is_export=False, +): + stem = OrderedDict() + assert stem_type in ( + "", + "fixed", + "same", + "deep", + "deep_fixed", + "deep_same", + "tiered", + ) + + # NOTE conv padding mode can be changed by overriding the conv_layer def + if is_stem_deep(stem_type): + # A 3 deep 3x3 conv stack as in ResNet V1D models + if "tiered" in stem_type: + stem_chs = (3 * out_chs // 8, out_chs // 2) # 'T' resnets in resnet.py + else: + stem_chs = (out_chs // 2, out_chs // 2) # 'D' ResNets + stem["conv1"] = conv_layer( + in_chs, stem_chs[0], kernel_size=3, stride=2, is_export=is_export + ) + stem["norm1"] = norm_layer(stem_chs[0]) + stem["conv2"] = conv_layer( + stem_chs[0], stem_chs[1], kernel_size=3, stride=1, is_export=is_export + ) + stem["norm2"] = norm_layer(stem_chs[1]) + stem["conv3"] = conv_layer( + stem_chs[1], out_chs, kernel_size=3, stride=1, 
is_export=is_export + ) + if not preact: + stem["norm3"] = norm_layer(out_chs) + else: + # The usual 7x7 stem conv + stem["conv"] = conv_layer( + in_chs, out_chs, kernel_size=7, stride=2, is_export=is_export + ) + if not preact: + stem["norm"] = norm_layer(out_chs) + + if "fixed" in stem_type: + # 'fixed' SAME padding approximation that is used in BiT models + stem["pad"] = paddle.nn.Pad2D( + 1, mode="constant", value=0.0, data_format="NCHW", name=None + ) + stem["pool"] = nn.MaxPool2D(kernel_size=3, stride=2, padding=0) + elif "same" in stem_type: + # full, input size based 'SAME' padding, used in ViT Hybrid model + stem["pool"] = create_pool2d( + "max", kernel_size=3, stride=2, padding="same", is_export=is_export + ) + else: + # the usual Pypaddle symmetric padding + stem["pool"] = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + stem_seq = nn.Sequential() + for key, value in stem.items(): + stem_seq.add_sublayer(key, value) + + return stem_seq + + +class ResNetV2(nn.Layer): + """Implementation of Pre-activation (v2) ResNet mode. + + Args: + x: input images with shape [N, 1, H, W] + + Returns: + The extracted features [N, 1, H//16, W//16] + """ + + def __init__( + self, + layers, + channels=(256, 512, 1024, 2048), + num_classes=1000, + in_chans=3, + global_pool="avg", + output_stride=32, + width_factor=1, + stem_chs=64, + stem_type="", + avg_down=False, + preact=True, + act_layer=nn.ReLU, + conv_layer=StdConv2d, + norm_layer=partial(GroupNormAct, num_groups=32), + drop_rate=0.0, + drop_path_rate=0.0, + zero_init_last=False, + is_export=False, + ): + super().__init__() + self.num_classes = num_classes + self.drop_rate = drop_rate + self.is_export = is_export + wf = width_factor + self.feature_info = [] + stem_chs = make_div(stem_chs * wf) + self.stem = create_resnetv2_stem( + in_chans, + stem_chs, + stem_type, + preact, + conv_layer=conv_layer, + norm_layer=norm_layer, + is_export=is_export, + ) + stem_feat = ( + ("stem.conv3" if is_stem_deep(stem_type) else "stem.conv") + if preact + else "stem.norm" + ) + self.feature_info.append(dict(num_chs=stem_chs, reduction=2, module=stem_feat)) + + prev_chs = stem_chs + curr_stride = 4 + dilation = 1 + block_dprs = [ + x.tolist() + for x in paddle.linspace(0, drop_path_rate, sum(layers)).split(layers) + ] + block_fn = PreActBottleneck if preact else Bottleneck + self.stages = nn.Sequential() + for stage_idx, (d, c, bdpr) in enumerate(zip(layers, channels, block_dprs)): + out_chs = make_div(c * wf) + stride = 1 if stage_idx == 0 else 2 + if curr_stride >= output_stride: + dilation *= stride + stride = 1 + stage = ResNetStage( + prev_chs, + out_chs, + stride=stride, + dilation=dilation, + depth=d, + avg_down=avg_down, + act_layer=act_layer, + conv_layer=conv_layer, + norm_layer=norm_layer, + block_dpr=bdpr, + block_fn=block_fn, + is_export=is_export, + ) + prev_chs = out_chs + curr_stride *= stride + self.feature_info += [ + dict( + num_chs=prev_chs, + reduction=curr_stride, + module=f"stages.{stage_idx}", + ) + ] + self.stages.add_sublayer(str(stage_idx), stage) + + self.num_features = prev_chs + self.norm = norm_layer(self.num_features) if preact else nn.Identity() + self.head = ClassifierHead( + self.num_features, + num_classes, + pool_type=global_pool, + drop_rate=self.drop_rate, + use_conv=True, + ) + + self.init_weights(zero_init_last=zero_init_last) + + def init_weights(self, zero_init_last=True): + named_apply(partial(_init_weights, zero_init_last=zero_init_last), self) + + def load_pretrained(self, checkpoint_path, prefix="resnet/"): + 
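+        # Copies BiT-style .npz weights: _load_weights() below transposes HWIO kernels
+        # to OIHW and writes them into the stem, the final norm, each stage's blocks
+        # and, when the shapes match, the classifier head.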
_load_weights(self, checkpoint_path, prefix) + + def get_classifier(self): + return self.head.fc + + def reset_classifier(self, num_classes, global_pool="avg"): + self.num_classes = num_classes + self.head = ClassifierHead( + self.num_features, + num_classes, + pool_type=global_pool, + drop_rate=self.drop_rate, + use_conv=True, + ) + + def forward_features(self, x): + x = self.stem(x) + x = self.stages(x) + x = self.norm(x) + return x + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + +def _init_weights(module: nn.Layer, name: str = "", zero_init_last=True): + if isinstance(module, nn.Linear) or ( + "head.fc" in name and isinstance(module, nn.Conv2D) + ): + normal_(module.weight) + zeros_(module.bias) + elif isinstance(module, nn.Conv2D): + kaiming_normal_(module.weight) + if module.bias is not None: + zeros_(module.bias) + elif isinstance(module, (nn.BatchNorm2D, nn.LayerNorm, nn.GroupNorm)): + ones_(module.weight) + zeros_(module.bias) + elif zero_init_last and hasattr(module, "zero_init_last"): + module.zero_init_last() + + +@paddle.no_grad() +def _load_weights(model: nn.Layer, checkpoint_path: str, prefix: str = "resnet/"): + import numpy as np + + def t2p(conv_weights): + """Possibly convert HWIO to OIHW.""" + if conv_weights.ndim == 4: + conv_weights = conv_weights.transpose([3, 2, 0, 1]) + return paddle.to_tensor(conv_weights) + + weights = np.load(checkpoint_path) + stem_conv_w = adapt_input_conv( + model.stem.conv.weight.shape[1], + t2p(weights[f"{prefix}root_block/standardized_conv2d/kernel"]), + ) + model.stem.conv.weight.copy_(stem_conv_w) + model.norm.weight.copy_(t2p(weights[f"{prefix}group_norm/gamma"])) + model.norm.bias.copy_(t2p(weights[f"{prefix}group_norm/beta"])) + if ( + isinstance(getattr(model.head, "fc", None), nn.Conv2D) + and model.head.fc.weight.shape[0] + == weights[f"{prefix}head/conv2d/kernel"].shape[-1] + ): + model.head.fc.weight.copy_(t2p(weights[f"{prefix}head/conv2d/kernel"])) + model.head.fc.bias.copy_(t2p(weights[f"{prefix}head/conv2d/bias"])) + for i, (sname, stage) in enumerate(model.stages.named_children()): + for j, (bname, block) in enumerate(stage.blocks.named_children()): + cname = "standardized_conv2d" + block_prefix = f"{prefix}block{i + 1}/unit{j + 1:02d}/" + block.conv1.weight.copy_(t2p(weights[f"{block_prefix}a/{cname}/kernel"])) + block.conv2.weight.copy_(t2p(weights[f"{block_prefix}b/{cname}/kernel"])) + block.conv3.weight.copy_(t2p(weights[f"{block_prefix}c/{cname}/kernel"])) + block.norm1.weight.copy_(t2p(weights[f"{block_prefix}a/group_norm/gamma"])) + block.norm2.weight.copy_(t2p(weights[f"{block_prefix}b/group_norm/gamma"])) + block.norm3.weight.copy_(t2p(weights[f"{block_prefix}c/group_norm/gamma"])) + block.norm1.bias.copy_(t2p(weights[f"{block_prefix}a/group_norm/beta"])) + block.norm2.bias.copy_(t2p(weights[f"{block_prefix}b/group_norm/beta"])) + block.norm3.bias.copy_(t2p(weights[f"{block_prefix}c/group_norm/beta"])) + if block.downsample is not None: + w = weights[f"{block_prefix}a/proj/{cname}/kernel"] + block.downsample.conv.weight.copy_(t2p(w)) diff --git a/ppocr/modeling/heads/__init__.py b/ppocr/modeling/heads/__init__.py index f9a9528eb0..bcf60e98a2 100755 --- a/ppocr/modeling/heads/__init__.py +++ b/ppocr/modeling/heads/__init__.py @@ -40,6 +40,7 @@ def build_head(config): from .rec_visionlan_head import VLHead from .rec_rfl_head import RFLHead from .rec_can_head import CANHead + from .rec_latexocr_head import LaTeXOCRHead from .rec_satrn_head import SATRNHead from .rec_parseq_head 
import ParseQHead from .rec_cppd_head import CPPDHead @@ -81,6 +82,7 @@ def build_head(config): "RFLHead", "DRRGHead", "CANHead", + "LaTeXOCRHead", "SATRNHead", "PFHeadLocal", "ParseQHead", diff --git a/ppocr/modeling/heads/rec_latexocr_head.py b/ppocr/modeling/heads/rec_latexocr_head.py new file mode 100644 index 0000000000..4e368da0dd --- /dev/null +++ b/ppocr/modeling/heads/rec_latexocr_head.py @@ -0,0 +1,1027 @@ +# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This code is refer from: +https://github.com/lukas-blecher/LaTeX-OCR/blob/main/pix2tex/models/transformer.py +""" + +import math +import paddle +from paddle import nn, einsum +import paddle.nn.functional as F +from functools import partial +from inspect import isfunction +from collections import namedtuple + +from paddle.nn.initializer import ( + TruncatedNormal, + Constant, + Normal, + KaimingUniform, + XavierUniform, +) + +zeros_ = Constant(value=0.0) +ones_ = Constant(value=1.0) +normal_ = Normal(std=0.02) +DEFAULT_DIM_HEAD = 64 + +Intermediates = namedtuple("Intermediates", ["pre_softmax_attn", "post_softmax_attn"]) + +LayerIntermediates = namedtuple("Intermediates", ["hiddens", "attn_intermediates"]) + +# helpers + + +def exists(val): + return val is not None + + +def default(val, d): + if exists(val): + return val + return d() if isfunction(d) else d + + +class always: + def __init__(self, val): + self.val = val + + def __call__(self, *args, **kwargs): + return self.val + + +class not_equals: + def __init__(self, val): + self.val = val + + def __call__(self, x, *args, **kwargs): + return x != self.val + + +class equals: + def __init__(self, val): + self.val = val + + def __call__(self, x, *args, **kwargs): + return x == self.val + + +def max_neg_value(tensor): + return -paddle.finfo(tensor.dtype).max + + +def pick_and_pop(keys, d): + values = list(map(lambda key: d.pop(key), keys)) + return dict(zip(keys, values)) + + +def group_dict_by_key(cond, d): + return_val = [dict(), dict()] + for key in d.keys(): + match = bool(cond(key)) + ind = int(not match) + return_val[ind][key] = d[key] + return (*return_val,) + + +def string_begins_with(prefix, str): + return str.startswith(prefix) + + +def group_by_key_prefix(prefix, d): + return group_dict_by_key(partial(string_begins_with, prefix), d) + + +def groupby_prefix_and_trim(prefix, d): + kwargs_with_prefix, kwargs = group_dict_by_key( + partial(string_begins_with, prefix), d + ) + kwargs_without_prefix = dict( + map(lambda x: (x[0][len(prefix) :], x[1]), tuple(kwargs_with_prefix.items())) + ) + return kwargs_without_prefix, kwargs + + +# positional embeddings + + +class DepthWiseConv1d(nn.Layer): + def __init__( + self, dim_in, dim_out, kernel_size, padding=0, stride=1, bias=True, groups=False + ): + super().__init__() + groups = default(groups, dim_in) + self.net = nn.Sequential( + nn.Conv1D( + dim_in, + dim_in, + kernel_size=kernel_size, + padding=padding, + groups=dim_in, + stride=stride, + bias_attr=bias, + ), 
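+            # pointwise 1x1 convolution projecting the depthwise output from dim_in to dim_out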
+ nn.Conv1D(dim_in, dim_out, 1), + ) + + def forward(self, x): + return self.net(x) + + +class AbsolutePositionalEmbedding(nn.Layer): + def __init__(self, dim, max_seq_len): + super().__init__() + self.emb = nn.Embedding(max_seq_len, dim) + self.init_() + + def init_(self): + + normal_(self.emb.weight) + + def forward(self, x): + n = paddle.arange(x.shape[1]) + return self.emb(n)[None, :, :] + + +class FixedPositionalEmbedding(nn.Layer): + def __init__(self, dim): + super().__init__() + inv_freq = 1.0 / (10000 ** (paddle.arange(0, dim, 2).float() / dim)) + self.register_buffer("inv_freq", inv_freq) + + def forward(self, x, seq_dim=1, offset=0): + t = ( + paddle.arange( + x.shape[seq_dim], + ).type_as(self.inv_freq) + + offset + ) + sinusoid_inp = paddle.einsum("i , j -> i j", t, self.inv_freq) + emb = paddle.concat((sinusoid_inp.sin(), sinusoid_inp.cos()), axis=-1) + return emb[None, :, :] + + +class Scale(nn.Layer): + def __init__(self, value, fn): + super().__init__() + self.value = value + self.fn = fn + + def forward(self, x, **kwargs): + x, *rest = self.fn(x, **kwargs) + return (x * self.value, *rest) + + +class Rezero(nn.Layer): + def __init__(self, fn): + super().__init__() + self.fn = fn + self.g = paddle.create_parameter([1], dtype="float32") + zeros_(self.g) + + def forward(self, x, **kwargs): + x, *rest = self.fn(x, **kwargs) + return (x * self.g, *rest) + + +class ScaleNorm(nn.Layer): + def __init__(self, dim, eps=1e-5): + super().__init__() + self.scale = dim**-0.5 + self.eps = eps + self.g = paddle.create_parameter([1], dtype="float32") + ones_(self.g) + + def forward(self, x): + norm = paddle.norm(x, axis=-1, keepdim=True) * self.scale + return x / norm.clamp(min=self.eps) * self.g + + +class RMSNorm(nn.Layer): + def __init__(self, dim, eps=1e-8): + super().__init__() + self.scale = dim**-0.5 + self.eps = eps + self.g = paddle.create_parameter([dim]) + ones_(self.g) + + def forward(self, x): + norm = paddle.norm(x, axis=-1, keepdim=True) * self.scale + return x / norm.clamp(min=self.eps) * self.g + + +class Residual(nn.Layer): + def forward(self, x, residual): + return x + residual + + +class GEGLU(nn.Layer): + def __init__(self, dim_in, dim_out): + super().__init__() + self.proj = nn.Linear(dim_in, dim_out * 2) + + def forward(self, x): + x, gate = self.proj(x).chunk(2, axis=-1) + return x * F.gelu(gate) + + +class FeedForward(nn.Layer): + def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0): + super().__init__() + inner_dim = int(dim * mult) + dim_out = default(dim_out, dim) + project_in = ( + nn.Sequential(nn.Linear(dim, inner_dim), nn.GELU()) + if not glu + else GEGLU(dim, inner_dim) + ) + + self.net = nn.Sequential( + project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out) + ) + + def forward(self, x): + return self.net(x) + + +class Attention(nn.Layer): + def __init__( + self, + dim, + dim_head=DEFAULT_DIM_HEAD, + heads=8, + causal=False, + mask=None, + talking_heads=False, + collab_heads=False, + collab_compression=0.3, + sparse_topk=None, + use_entmax15=False, + num_mem_kv=0, + dropout=0.0, + on_attn=False, + gate_values=False, + is_export=False, + ): + super().__init__() + self.scale = dim_head**-0.5 + self.heads = heads + self.causal = causal + self.mask = mask + self.is_export = is_export + + qk_dim = v_dim = dim_head * heads + + # collaborative heads + self.collab_heads = collab_heads + if self.collab_heads: + qk_dim = int(collab_compression * qk_dim) + self.collab_mixing = nn.Parameter(paddle.randn(heads, qk_dim)) + + self.to_q = 
nn.Linear(dim, qk_dim, bias_attr=False) + self.to_k = nn.Linear(dim, qk_dim, bias_attr=False) + self.to_v = nn.Linear(dim, v_dim, bias_attr=False) + + self.dropout = nn.Dropout(dropout) + + # add GLU gating for aggregated values, from alphafold2 + self.to_v_gate = None + if gate_values: + self.to_v_gate = nn.Linear(dim, v_dim) + zeros_(self.to_v_gate.weight) + ones_(self.to_v_gate.bias) + + # talking heads + self.talking_heads = talking_heads + if talking_heads: + self.pre_softmax_proj = nn.Parameter(paddle.randn(heads, heads)) + self.post_softmax_proj = nn.Parameter(paddle.randn(heads, heads)) + + # explicit topk sparse attention + self.sparse_topk = sparse_topk + + self.attn_fn = F.softmax + + # add memory key / values + self.num_mem_kv = num_mem_kv + if num_mem_kv > 0: + self.mem_k = nn.Parameter(paddle.randn(heads, num_mem_kv, dim_head)) + self.mem_v = nn.Parameter(paddle.randn(heads, num_mem_kv, dim_head)) + + # attention on attention + self.attn_on_attn = on_attn + self.to_out = ( + nn.Sequential(nn.Linear(v_dim, dim * 2), nn.GLU()) + if on_attn + else nn.Linear(v_dim, dim) + ) + + def forward( + self, + x, + context=None, + mask=None, + context_mask=None, + rel_pos=None, + sinusoidal_emb=None, + rotary_pos_emb=None, + prev_attn=None, + mem=None, + seq_len=0, + ): + b, n, _, h, talking_heads, collab_heads, has_context = ( + *x.shape, + self.heads, + self.talking_heads, + self.collab_heads, + exists(context), + ) + kv_input = default(context, x) + + q_input = x + k_input = kv_input + v_input = kv_input + + if exists(mem): + k_input = paddle.concat((mem, k_input), axis=-2) + v_input = paddle.concat((mem, v_input), axis=-2) + + if exists(sinusoidal_emb): + # in shortformer, the query would start at a position offset depending on the past cached memory + offset = k_input.shape[-2] - q_input.shape[-2] + q_input = q_input + sinusoidal_emb(q_input, offset=offset) + k_input = k_input + sinusoidal_emb(k_input) + q = self.to_q(q_input) + k = self.to_k(k_input) + v = self.to_v(v_input) + + def rearrange_q_k_v(x, h, is_export): + if is_export: + b, n, h_d = paddle.shape(x) + else: + b, n, h_d = x.shape + d = h_d // h + return x.reshape([b, n, h, d]).transpose([0, 2, 1, 3]) + + q, k, v = map( + lambda t: rearrange_q_k_v(t, h, is_export=self.is_export), (q, k, v) + ) + + input_mask = None + if any(map(exists, (mask, context_mask))): + q_mask = default( + mask, + lambda: paddle.ones( + (b, n), + ).cast(paddle.bool), + ) + k_mask = q_mask if not exists(context) else context_mask + k_mask = default( + k_mask, lambda: paddle.ones((b, k.shape[-2])).cast(paddle.bool) + ) + + q_mask = q_mask.reshape([q_mask.shape[0], 1, q_mask.shape[1], 1]) + k_mask = k_mask.reshape([k_mask.shape[0], 1, 1, k_mask.shape[1]]) + input_mask = q_mask * k_mask + + if collab_heads: + k = k.expand(-1, h, -1, -1) + dots = einsum("b h i d, b h j d -> b h i j", q, k) * self.scale + + mask_value = max_neg_value(dots) + + if exists(prev_attn): + dots = dots + prev_attn + + pre_softmax_attn = dots.clone() + + if talking_heads: + dots = einsum( + "b h i j, h k -> b k i j", dots, self.pre_softmax_proj + ).contiguous() + + if exists(rel_pos): + dots = rel_pos(dots) + + input_mask = input_mask.cast(paddle.bool) + if exists(input_mask): + + dots.masked_fill_(~input_mask, mask_value) + del input_mask + + if self.causal: + i, j = dots.shape[-2:] + r = paddle.arange(i) + r_shape = r.shape[0] + mask = r.reshape([1, 1, r_shape, 1]) < r.reshape([1, 1, 1, r_shape]) + + if self.is_export: + pad_list = [ + paddle.to_tensor(0, dtype="int32"), + 
paddle.to_tensor(0, dtype="int32"), + paddle.to_tensor(j - i, dtype="int32"), + paddle.to_tensor(0, dtype="int32"), + ] + mask = F.pad( + mask.cast(paddle.int32), + paddle.to_tensor(pad_list).cast(paddle.int32), + value=False, + ).cast(paddle.bool) + dots = dots.masked_fill_(mask, mask_value) + else: + mask = F.pad(mask.cast(paddle.int32), (0, 0, j - i, 0), value=False) + dots.masked_fill_(mask, mask_value) + del mask + if exists(self.sparse_topk) and self.sparse_topk < dots.shape[-1]: + top, _ = dots.topk(self.sparse_topk, dim=-1) + vk = top[..., -1].unsqueeze(-1).expand_as(dots) + mask = dots < vk + dots.masked_fill_(mask, mask_value) + del mask + + attn = self.attn_fn(dots, axis=-1) + post_softmax_attn = attn.clone() + + attn = self.dropout(attn) + + if talking_heads: + attn = einsum( + "b h i j, h k -> b k i j", attn, self.post_softmax_proj + ).contiguous() + out = einsum("b h i j, b h j d -> b h i d", attn, v) + + b, h, n, d = out.shape + out = out.transpose([0, 2, 1, 3]).reshape([b, n, h * d]) + + if exists(self.to_v_gate): + gates = self.gate_v(x) + out = out * gates.sigmoid() + + intermediates = Intermediates( + pre_softmax_attn=pre_softmax_attn, post_softmax_attn=post_softmax_attn + ) + + return self.to_out(out), intermediates + + +class AttentionLayers(nn.Layer): + def __init__( + self, + dim, + depth, + heads=8, + causal=False, + cross_attend=False, + only_cross=False, + use_scalenorm=False, + use_rmsnorm=False, + use_rezero=False, + rel_pos_bias=False, + rel_pos_num_buckets=32, + rel_pos_max_distance=128, + position_infused_attn=False, + rotary_pos_emb=False, + rotary_emb_dim=None, + custom_layers=None, + sandwich_coef=None, + par_ratio=None, + residual_attn=False, + cross_residual_attn=False, + macaron=False, + pre_norm=True, + gate_residual=False, + is_export=False, + **kwargs, + ): + super().__init__() + ff_kwargs, kwargs = groupby_prefix_and_trim("ff_", kwargs) + attn_kwargs, _ = groupby_prefix_and_trim("attn_", kwargs) + + dim_head = attn_kwargs.get("dim_head", DEFAULT_DIM_HEAD) + + self.dim = dim + self.depth = depth + self.layers = nn.LayerList([]) + + self.has_pos_emb = position_infused_attn or rel_pos_bias or rotary_pos_emb + self.pia_pos_emb = ( + FixedPositionalEmbedding(dim) if position_infused_attn else None + ) + + assert ( + rel_pos_num_buckets <= rel_pos_max_distance + ), "number of relative position buckets must be less than the relative position max distance" + + self.pre_norm = pre_norm + + self.residual_attn = residual_attn + self.cross_residual_attn = cross_residual_attn + self.cross_attend = cross_attend + self.rel_pos = None + + norm_class = ScaleNorm if use_scalenorm else nn.LayerNorm + norm_class = RMSNorm if use_rmsnorm else norm_class + norm_fn = partial(norm_class, dim) + + norm_fn = nn.Identity if use_rezero else norm_fn + branch_fn = Rezero if use_rezero else None + + if cross_attend and not only_cross: + default_block = ("a", "c", "f") + elif cross_attend and only_cross: + default_block = ("c", "f") + else: + default_block = ("a", "f") + if macaron: + default_block = ("f",) + default_block + + if exists(custom_layers): + layer_types = custom_layers + elif exists(par_ratio): + par_depth = depth * len(default_block) + assert 1 < par_ratio <= par_depth, "par ratio out of range" + default_block = tuple(filter(not_equals("f"), default_block)) + par_attn = par_depth // par_ratio + depth_cut = ( + par_depth * 2 // 3 + ) # 2 / 3 attention layer cutoff suggested by PAR paper + par_width = (depth_cut + depth_cut // par_attn) // par_attn + assert ( + 
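
# A minimal editor's sketch (illustration only) of the causal mask applied in
# Attention.forward above: a query at position i may only attend to keys at
# positions j <= i, so entries with j > i receive a large negative score
# before the softmax.  paddle.where is used here in place of the in-place
# masked_fill_ from the code above; the effect is the same.
import paddle

n = 4
r = paddle.arange(n)
future = r.reshape([1, 1, n, 1]) < r.reshape([1, 1, 1, n])   # True where j > i
dots = paddle.zeros([1, 1, n, n])                            # toy attention scores
dots = paddle.where(future, paddle.full_like(dots, -1e9), dots)
# softmax over the last axis now gives ~0 weight to future positions
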
len(default_block) <= par_width + ), "default block is too large for par_ratio" + par_block = default_block + ("f",) * (par_width - len(default_block)) + par_head = par_block * par_attn + layer_types = par_head + ("f",) * (par_depth - len(par_head)) + elif exists(sandwich_coef): + assert ( + sandwich_coef > 0 and sandwich_coef <= depth + ), "sandwich coefficient should be less than the depth" + layer_types = ( + ("a",) * sandwich_coef + + default_block * (depth - sandwich_coef) + + ("f",) * sandwich_coef + ) + else: + layer_types = default_block * depth + + self.layer_types = layer_types + self.num_attn_layers = len(list(filter(equals("a"), layer_types))) + for layer_type in self.layer_types: + if layer_type == "a": + layer = Attention( + dim, heads=heads, causal=causal, is_export=is_export, **attn_kwargs + ) + elif layer_type == "c": + layer = Attention(dim, heads=heads, is_export=is_export, **attn_kwargs) + elif layer_type == "f": + layer = FeedForward(dim, **ff_kwargs) + layer = layer if not macaron else Scale(0.5, layer) + else: + raise Exception(f"invalid layer type {layer_type}") + if isinstance(layer, Attention) and exists(branch_fn): + layer = branch_fn(layer) + residual_fn = Residual() + self.layers.append(nn.LayerList([norm_fn(), layer, residual_fn])) + + def forward( + self, + x, + context=None, + mask=None, + context_mask=None, + mems=None, + seq_len=0, + return_hiddens=False, + ): + assert not ( + self.cross_attend ^ exists(context) + ), "context must be passed in if cross_attend is set to True" + + hiddens = [] + intermediates = [] + prev_attn = None + prev_cross_attn = None + rotary_pos_emb = None + + mems = mems.copy() if exists(mems) else [None] * self.num_attn_layers + + for ind, (layer_type, (norm, block, residual_fn)) in enumerate( + zip(self.layer_types, self.layers) + ): + is_last = ind == (len(self.layers) - 1) + + if layer_type == "a": + hiddens.append(x) + layer_mem = mems.pop(0) + + residual = x + + if self.pre_norm: + x = norm(x) + + if layer_type == "a": + out, inter = block( + x, + mask=mask, + sinusoidal_emb=self.pia_pos_emb, + rel_pos=self.rel_pos, + rotary_pos_emb=rotary_pos_emb, + prev_attn=prev_attn, + mem=layer_mem, + ) + elif layer_type == "c": + out, inter = block( + x, + context=context, + mask=mask, + context_mask=context_mask, + prev_attn=prev_cross_attn, + ) + elif layer_type == "f": + out = block(x) + + x = residual_fn(out, residual) + + if layer_type in ("a", "c"): + intermediates.append(inter) + + if layer_type == "a" and self.residual_attn: + prev_attn = inter.pre_softmax_attn + elif layer_type == "c" and self.cross_residual_attn: + prev_cross_attn = inter.pre_softmax_attn + + if not self.pre_norm and not is_last: + x = norm(x) + + if return_hiddens: + intermediates = LayerIntermediates( + hiddens=hiddens, attn_intermediates=intermediates + ) + + return x, intermediates + + return x + + +class Encoder(AttentionLayers): + def __init__(self, **kwargs): + assert "causal" not in kwargs, "cannot set causality on encoder" + super().__init__(causal=False, **kwargs) + + +class Decoder(AttentionLayers): + def __init__(self, **kwargs): + assert "causal" not in kwargs, "cannot set causality on decoder" + super().__init__(causal=True, **kwargs) + + +class CrossAttender(AttentionLayers): + def __init__(self, **kwargs): + super().__init__(cross_attend=True, only_cross=True, **kwargs) + + +def create_latex_parameter(shape): + return paddle.create_parameter( + shape=shape, + dtype="float32", + 
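
# A minimal editor's sketch (illustration only, a Linear stands in for an
# Attention/FeedForward block) of the pre-norm residual pattern used by the
# layer loop in AttentionLayers.forward above: each block sees a normalized
# input, and its output is added back onto the un-normalized residual.
import paddle
import paddle.nn as nn

dim = 256
norm = nn.LayerNorm(dim)
block = nn.Linear(dim, dim)          # stand-in for one attention or feed-forward layer

x = paddle.randn([2, 10, dim])
residual = x
out = block(norm(x))                 # pre_norm=True: normalize before the block
x = out + residual                   # Residual() simply adds the two tensors
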
default_initializer=paddle.nn.initializer.Assign(paddle.randn(shape)), + ) + + +class TransformerDecoder(nn.Layer): + def __init__( + self, + *, + num_tokens, + max_seq_len, + attn_layers, + emb_dim=None, + max_mem_len=0.0, + emb_dropout=0.0, + num_memory_tokens=None, + tie_embedding=False, + use_pos_emb=True, + is_export=False, + ): + super().__init__() + assert isinstance( + attn_layers, AttentionLayers + ), "attention layers must be one of Encoder or Decoder" + + dim = attn_layers.dim + emb_dim = default(emb_dim, dim) + + self.max_seq_len = max_seq_len + self.max_mem_len = max_mem_len + + self.token_emb = nn.Embedding(num_tokens, emb_dim) + self.pos_emb = ( + AbsolutePositionalEmbedding(emb_dim, max_seq_len) + if (use_pos_emb and not attn_layers.has_pos_emb) + else always(0) + ) + self.emb_dropout = nn.Dropout(emb_dropout) + + self.project_emb = nn.Linear(emb_dim, dim) if emb_dim != dim else nn.Identity() + self.attn_layers = attn_layers + self.norm = nn.LayerNorm(dim) + self.is_export = is_export + + self.init_() + + self.to_logits = ( + nn.Linear(dim, num_tokens) + if not tie_embedding + else lambda t: t @ self.token_emb.weight.t() + ) + + # memory tokens (like [cls]) from Memory Transformers paper + num_memory_tokens = default(num_memory_tokens, 0) + self.num_memory_tokens = num_memory_tokens + if num_memory_tokens > 0: + self.memory_tokens = create_latex_parameter([num_memory_tokens, dim]) + + # let funnel encoder know number of memory tokens, if specified + # TODO: think of a cleaner solution + if hasattr(attn_layers, "num_memory_tokens"): + attn_layers.num_memory_tokens = num_memory_tokens + + def init_(self): + normal_(self.token_emb.weight) + + def forward( + self, + x, + return_embeddings=False, + mask=None, + return_mems=False, + return_attn=False, + seq_len=0, + mems=None, + **kwargs, + ): + b, n, num_mem = *x.shape, self.num_memory_tokens + x = self.token_emb(x) + x = x + self.pos_emb(x) + + x = self.emb_dropout(x) + x = self.project_emb(x) + + x, intermediates = self.attn_layers( + x, mask=mask, mems=mems, return_hiddens=True, seq_len=seq_len, **kwargs + ) + x = self.norm(x) + mem, x = x[:, :num_mem], x[:, num_mem:] + out = self.to_logits(x) if not return_embeddings else x + if return_mems: + hiddens = intermediates.hiddens + new_mems = ( + list(map(lambda pair: paddle.concat(pair, axis=-2), zip(mems, hiddens))) + if exists(mems) + else hiddens + ) + new_mems = list( + map(lambda t: t[..., -self.max_mem_len :, :].detach(), new_mems) + ) + return out, new_mems + + if return_attn: + attn_maps = list( + map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates) + ) + return out, attn_maps + + return out + + +def top_p(logits, thres=0.9): + sorted_logits, sorted_indices = paddle.sort(logits, descending=True) + cum_probs = paddle.cumsum(F.softmax(sorted_logits, axis=-1), axis=-1) + + sorted_indices_to_remove = cum_probs > (1 - thres) + sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone() + sorted_indices_to_remove[:, 0] = 0 + + sorted_logits[sorted_indices_to_remove] = float("-inf") + return sorted_logits.scatter(1, sorted_indices, sorted_logits) + + +# topk + + +def top_k(logits, thres=0.9): + k = int((1 - thres) * logits.shape[-1]) + val, ind = paddle.topk(logits, k) + probs = paddle.full_like(logits, float("-inf")) + probs = paddle.put_along_axis(probs, ind, val, 1) + return probs + + +class LaTeXOCRHead(nn.Layer): + """Implementation of LaTeX OCR decoder. 
+ + Args: + encoded_feat: The encoded features with shape[N, 1, H//16, W//16] + tgt_seq: LaTeX-OCR labels with shape [N, L] , L is the max sequence length + xi: The first N-1 LaTeX-OCR sequences in tgt_seq with shape [N, L-1] + mask: The first N-1 LaTeX-OCR attention mask with shape [N, L-1] , L is the max sequence length + + Returns: + The predicted LaTeX sequences with shape [N, L-1, C], C is the number of LaTeX classes + """ + + def __init__( + self, + net=None, + in_channels=256, + out_channels=256, + pad_value=0, + decoder_args=None, + is_export=False, + ): + super().__init__() + decoder = Decoder( + dim=256, depth=4, heads=8, is_export=is_export, **decoder_args + ) + transformer_decoder = TransformerDecoder( + num_tokens=8000, + max_seq_len=512, + attn_layers=decoder, + is_export=is_export, + ) + self.temperature = 0.333 + self.bos_token = 1 + self.eos_token = 2 + self.max_length = 512 + self.pad_value = pad_value + + self.net = transformer_decoder + self.max_seq_len = self.net.max_seq_len + self.is_export = is_export + + @paddle.no_grad() + def generate( + self, + start_tokens, + seq_len, + eos_token=None, + temperature=1.0, + filter_logits_fn=top_k, + filter_thres=0.9, + **kwargs, + ): + was_training = self.net.training + num_dims = len(start_tokens.shape) + + if num_dims == 1: + start_tokens = start_tokens[None, :] + + b, t = start_tokens.shape + + self.net.eval() + out = start_tokens + mask = kwargs.pop("mask", None) + + if mask is None: + mask = paddle.full_like(out, True, dtype=paddle.bool) + + for _ in range(seq_len): + x = out[:, -self.max_seq_len :] + mask = mask[:, -self.max_seq_len :] + logits = self.net(x, mask=mask, **kwargs)[:, -1, :] + if filter_logits_fn in {top_k, top_p}: + filtered_logits = filter_logits_fn(logits, thres=filter_thres) + + probs = F.softmax(filtered_logits / temperature, axis=-1) + else: + raise NotImplementedError("The filter_logits_fn is not supported ") + + sample = paddle.multinomial(probs, 1) + out = paddle.concat((out, sample), axis=-1) + pad_mask = paddle.full(shape=[mask.shape[0], 1], fill_value=1, dtype="bool") + mask = paddle.concat((mask, pad_mask), axis=1) + if ( + eos_token is not None + and ( + paddle.cumsum((out == eos_token).cast(paddle.int64), 1)[:, -1] >= 1 + ).all() + ): + break + out = out[:, t:] + if num_dims == 1: + out = out.squeeze(0) + return out + + @paddle.no_grad() + def generate_export( + self, + start_tokens, + seq_len, + eos_token=None, + context=None, + temperature=1.0, + filter_logits_fn=None, + filter_thres=0.9, + **kwargs, + ): + was_training = self.net.training + num_dims = len(start_tokens.shape) + + if num_dims == 1: + start_tokens = start_tokens[None, :] + + b, t = start_tokens.shape + + self.net.eval() + out = start_tokens + mask = kwargs.pop("mask", None) + + if mask is None: + mask = paddle.full_like(out, True, dtype=paddle.bool) + + i_idx = paddle.full([], 0) + while i_idx < paddle.to_tensor(seq_len): + x = out[:, -self.max_seq_len :] + paddle.jit.api.set_dynamic_shape(x, [-1, -1]) + mask = mask[:, -self.max_seq_len :] + paddle.jit.api.set_dynamic_shape(mask, [-1, -1]) + logits = self.net(x, mask=mask, context=context, seq_len=i_idx, **kwargs)[ + :, -1, : + ] + if filter_logits_fn in {top_k, top_p}: + filtered_logits = filter_logits_fn(logits, thres=filter_thres) + + probs = F.softmax(filtered_logits / temperature, axis=-1) + + sample = paddle.multinomial(probs, 1) + out = paddle.concat((out, sample), axis=-1) + + pad_mask = paddle.full(shape=[mask.shape[0], 1], fill_value=1, dtype="bool") + mask = 
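
# A minimal editor's sketch (illustration only, toy vocabulary and logits) of
# one decoding step of generate() above: filter the logits with top_k, apply
# the temperature, and draw the next token id with multinomial sampling.
import paddle
import paddle.nn.functional as F

logits = paddle.to_tensor([[2.0, 1.0, 0.5, -1.0, -3.0]])   # [batch=1, vocab=5]
thres, temperature = 0.9, 0.333                            # values used by LaTeXOCRHead
k = max(int((1 - thres) * logits.shape[-1]), 1)            # keep at least one logit
val, ind = paddle.topk(logits, k)
filtered = paddle.full_like(logits, float("-inf"))
filtered = paddle.put_along_axis(filtered, ind, val, 1)
probs = F.softmax(filtered / temperature, axis=-1)
next_token = paddle.multinomial(probs, 1)                  # [1, 1] sampled token id
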
paddle.concat((mask, pad_mask), axis=1) + if ( + eos_token is not None + and ( + paddle.cumsum((out == eos_token).cast(paddle.int64), 1)[:, -1] >= 1 + ).all() + ): + out = out[:, t:] + if num_dims == 1: + out = out.squeeze(0) + return out + i_idx += 1 + out = out[:, t:] + if num_dims == 1: + out = out.squeeze(0) + return out + + # forward for export + def forward(self, inputs, targets=None): + if not self.training: + encoded_feat = inputs + batch_num = encoded_feat.shape[0] + bos_tensor = paddle.full([batch_num, 1], self.bos_token, dtype=paddle.int64) + if self.is_export: + word_pred = self.generate_export( + bos_tensor, + self.max_seq_len, + eos_token=self.eos_token, + context=encoded_feat, + temperature=self.temperature, + filter_logits_fn=top_k, + ) + else: + word_pred = self.generate( + bos_tensor, + self.max_seq_len, + eos_token=self.eos_token, + context=encoded_feat, + temperature=self.temperature, + filter_logits_fn=top_k, + ) + return word_pred + + encoded_feat, tgt_seq, mask = inputs + kwargs = {"context": encoded_feat, "mask": mask.cast(paddle.bool)} + x = tgt_seq + xi = x[:, :-1] + + mask = kwargs.get("mask", None) + if mask is not None and mask.shape[1] == x.shape[1]: + mask = mask[:, :-1] + kwargs["mask"] = mask + out = self.net(xi, **kwargs) + + return out diff --git a/ppocr/modeling/heads/table_att_head.py b/ppocr/modeling/heads/table_att_head.py index 67376a0644..50a26e8117 100644 --- a/ppocr/modeling/heads/table_att_head.py +++ b/ppocr/modeling/heads/table_att_head.py @@ -368,7 +368,7 @@ def forward(self, inputs, targets=None): loc_preds = loc_preds[:, : max_len + 1] else: structure_ids = paddle.zeros( - (batch_size, self.max_text_length + 1), dtype=paddle.int64 + (batch_size, self.max_text_length + 1), dtype="int32" ) pre_chars = paddle.zeros(shape=[batch_size], dtype="int32") max_text_length = paddle.to_tensor(self.max_text_length) diff --git a/ppocr/postprocess/__init__.py b/ppocr/postprocess/__init__.py index e0a6a87fd3..04579a376a 100644 --- a/ppocr/postprocess/__init__.py +++ b/ppocr/postprocess/__init__.py @@ -42,6 +42,7 @@ SATRNLabelDecode, ParseQLabelDecode, CPPDLabelDecode, + LaTeXOCRDecode, ) from .cls_postprocess import ClsPostProcess from .pg_postprocess import PGPostProcess @@ -96,6 +97,7 @@ def build_post_process(config, global_config=None): "SATRNLabelDecode", "ParseQLabelDecode", "CPPDLabelDecode", + "LaTeXOCRDecode", ] if config["name"] == "PSEPostProcess": diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py index 46b629d531..3902c3f92d 100644 --- a/ppocr/postprocess/rec_postprocess.py +++ b/ppocr/postprocess/rec_postprocess.py @@ -1210,3 +1210,55 @@ def __call__(self, preds, label=None, *args, **kwargs): def add_special_char(self, dict_character): dict_character = [""] + dict_character return dict_character + + +class LaTeXOCRDecode(object): + """Convert between latex-symbol and symbol-index""" + + def __init__(self, rec_char_dict_path, **kwargs): + from tokenizers import Tokenizer as TokenizerFast + + super(LaTeXOCRDecode, self).__init__() + self.tokenizer = TokenizerFast.from_file(rec_char_dict_path) + + def post_process(self, s): + text_reg = r"(\\(operatorname|mathrm|text|mathbf)\s?\*? 
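
# A minimal editor's sketch (illustration only, toy token ids) of the
# teacher-forcing split used by LaTeXOCRHead.forward above during training:
# the decoder is fed the first L-1 tokens and its [N, L-1, C] logits are
# typically compared against the same sequence shifted left by one position.
import paddle

tgt_seq = paddle.to_tensor([[1, 57, 92, 14, 2]])   # [BOS] ... [EOS], toy ids
xi = tgt_seq[:, :-1]                               # decoder input:   [[1, 57, 92, 14]]
shifted = tgt_seq[:, 1:]                           # training target: [[57, 92, 14, 2]]
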
{.*?})" + letter = "[a-zA-Z]" + noletter = "[\W_^\d]" + names = [x[0].replace(" ", "") for x in re.findall(text_reg, s)] + s = re.sub(text_reg, lambda match: str(names.pop(0)), s) + news = s + while True: + s = news + news = re.sub(r"(?!\\ )(%s)\s+?(%s)" % (noletter, noletter), r"\1\2", s) + news = re.sub(r"(?!\\ )(%s)\s+?(%s)" % (noletter, letter), r"\1\2", news) + news = re.sub(r"(%s)\s+?(%s)" % (letter, noletter), r"\1\2", news) + if news == s: + break + return s + + def decode(self, tokens): + if len(tokens.shape) == 1: + tokens = tokens[None, :] + dec = [self.tokenizer.decode(tok) for tok in tokens] + dec_str_list = [ + "".join(detok.split(" ")) + .replace("Ġ", " ") + .replace("[EOS]", "") + .replace("[BOS]", "") + .replace("[PAD]", "") + .strip() + for detok in dec + ] + return [self.post_process(dec_str) for dec_str in dec_str_list] + + def __call__(self, preds, label=None, mode="eval", *args, **kwargs): + if mode == "train": + preds_idx = np.array(preds.argmax(axis=2)) + text = self.decode(preds_idx) + else: + text = self.decode(np.array(preds)) + if label is None: + return text + label = self.decode(np.array(label)) + return text, label diff --git a/ppocr/utils/dict/bn_dict.txt b/ppocr/utils/dict/bn_dict.txt new file mode 100644 index 0000000000..655c5853a8 --- /dev/null +++ b/ppocr/utils/dict/bn_dict.txt @@ -0,0 +1,477 @@ + +! +# +$ +% +& +' +( ++ + +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +' +" +ঃ +, +; +! +? +। +— + ঃ- +- +( +) +[ +] +{ +} +√ +... +. +/ +\ += +< +> +৳ +° +% +অ +আ +ই +ঈ +উ +ঊ +ঋ +এ +ঐ +ও +ঔ +্‌ +া +ি +ী +ু +ূ +ে +ৈ +ো +ৌ +ৃ +‍্য +‍্র +‍্ব +ক +খ +গ +ঘ +ঙ +চ +ছ +জ +ঝ +ঞ +ট +ঠ +ড +ঢ +ণ +ত +থ +দ +ধ +ন +প +ফ +ব +ভ +ম +য +র +ল +শ +ষ +স +হ +ড় +ঢ় +য় +ং +ৎ +ঁ +ক্ক +ক্ট +ক্ত +ত্র +ক্ব +ক্ম +ক্য +ক্র +ক্ল +ক্ষ +ক্ষ্ণ +ক্ষ্ব +ক্ষ্ম +ক্ষ্য +ক্স +র্ক +র্ক্য +খ্য +খ্র +র্খ +গ্ণ +গ্ধ +গ্ধ্য +গ্ধ্র +গ্ন +গ্ন্য +গ্ব +গ্ম +গ্য +গ্র +গ্র্য +গ্ল +র্গ +র্গ্য +র্গ্র +ঘ্ন +ঘ্য +ঘ্র +র্ঘ্য +র্ঘ +ঙ্ক +ঙ্ক্য +ঙ্ক্ষ +ঙ্খ +ঙ্খ্য +ঙ্গ +ঙ্গ্য +ঙ্ঘ +ঙ্ঘ্য +ঙ্ঘ্র +ঙ্ম +র্ঙ্গ +চ্চ +চ্ছ +চ্ছ্ব +চ্ছ্র +চ্ঞ +চ্ব +চ্য +র্চ্য +র্চ +র্ছ +জ্জ +জ্জ্ব +জ্ঝ +জ্ঞ +জ্ব +জ্য +জ্র +র্জ্য +র্জ্জ +র্জ্ঞ +র্জ +র্ঝ +ঞ্চ +ঞ্ছ +ঞ্জ +ঞ্ঝ +ট্ট +ট্ব +ট্ম +ট্য +ট্র +র্ট +ড্ড +ড্ব +ড্ম +ড্য +ড্র +র্ড +ঢ্য +ঢ্র +র্ঢ্য +ণ্ট +ণ্ঠ +ণ্ঠ্য +ণ্ড +ণ্ড্য +ণ্ড্র +ণ্ঢ +ণ্ণ +ণ্ব +ণ্ম +ণ্য +র্ণ্য +র্ণ +ত্ত +ত্ত্ব +ত্ত্য +ত্থ +ত্ন +ত্ব +ত্ম +ত্ম্য +ত্য +ত্র +ত্র্য +র্ত্য +র্ত +র্ত্ম +র্ত্র +থ্ব +থ্য +থ্র +র্থ্য +র্থ +দ্গ +দ্ঘ +দ্দ +দ্দ্ব +দ্ধ +দ্ব +দ্ভ +দ্ভ্র +দ্ম +দ্য +দ্র +দ্র্য +র্দ +র্দ্ব +র্দ্র +ধ্ন +ধ্ব +ধ্ম +ধ্য +ধ্র +র্ধ +র্ধ্ব +ন্ট +ন্ট্র +ন্ঠ +ন্ড +ন্ড্র +ন্ত +ন্ত্ব +ন্ত্য +ন্ত্র +ন্ত্র্য +ন্থ +ন্থ্র +ন্দ +ন্দ্য +ন্দ্ব +ন্দ্র +ন্ধ +ন্ধ্য +ন্ধ্র +ন্ন +ন্ব +ন্ম +ন্য +র্ন +প্ট +প্ত +প্ন +প্প +প্য +প্র +প্র্য +প্ল +প্স +র্প +ফ্র +ফ্ল +র্ফ +ব্জ +ব্দ +ব্ধ +ব্ব +ব্য +ব্র +ব্ল +র্ব্য +র্ব +ভ্ব +ভ্য +ভ্র +ভ্ল +র্ভ +ম্ন +ম্প +ম্প্র +ম্ফ +ম্ব +ম্ব্র +ম্ভ +ম্ভ্র +ম্ম +ম্য +ম্র +ম্ল +র্ম্য +র্ম +য্য +র্য +ল্ক +ল্ক্য +ল্গ +ল্ট +ল্ড +ল্প +ল্ফ +ল্ব +ল্ভ +ল্ম +ল্য +ল্ল +র্ল +শ্চ +শ্ছ +শ্ন +শ্ব +শ্ম +শ্য +শ্র +শ্ল +র্শ্য +র্শ +র্শ্ব +ষ্ক +ষ্ক্ব +ষ্ক্র +ষ্ট +ষ্ট্য +ষ্ট্র +ষ্ঠ +ষ্ঠ্য +ষ্ণ +ষ্ণ্ব +ষ্প +ষ্প্র +ষ্ফ +ষ্ব +ষ্ম +ষ্য +র্ষ্য +র্ষ +র্ষ্ট +র্ষ্ণ +র্ষ্ণ্য +স্ক +স্ক্র +স্খ +স্ট +স্ট্র +স্ত +স্ত্ব +স্ত্য +স্ত্র +স্থ +স্থ্য +স্ন +স্ন্য +স্প +স্প্র +স্প্ল +স্ফ +স্ব +স্ম +স্য +স্র +স্ল +স্ক্ল +র্স +হ্ণ +হ্ন +হ্ব +হ্ম +হ্য +হ্র +হ্ল +র্হ্য +র্হ +ড়্গ +র্ৎ +০ +১ +২ +৩ +৪ 
+৫ +৬ +৭ +৮ +৯ diff --git a/ppocr/utils/dict/latex_ocr_tokenizer.json b/ppocr/utils/dict/latex_ocr_tokenizer.json new file mode 100644 index 0000000000..e8fd4f6d82 --- /dev/null +++ b/ppocr/utils/dict/latex_ocr_tokenizer.json @@ -0,0 +1 @@ +{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[PAD]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[BOS]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[EOS]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"ByteLevel","add_prefix_space":false,"trim_offsets":true},"post_processor":null,"decoder":null,"model":{"dropout":null,"unk_token":null,"continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[PAD]":0,"[BOS]":1,"[EOS]":2,"!":3,"\"":4,"#":5,"$":6,"&":7,"'":8,"(":9,")":10,"*":11,"+":12,",":13,"-":14,".":15,"/":16,"0":17,"1":18,"2":19,"3":20,"4":21,"5":22,"6":23,"7":24,"8":25,"9":26,":":27,";":28,"<":29,"=":30,">":31,"?":32,"@":33,"A":34,"B":35,"C":36,"D":37,"E":38,"F":39,"G":40,"H":41,"I":42,"J":43,"K":44,"L":45,"M":46,"N":47,"O":48,"P":49,"Q":50,"R":51,"S":52,"T":53,"U":54,"V":55,"W":56,"X":57,"Y":58,"Z":59,"[":60,"\\":61,"]":62,"^":63,"_":64,"`":65,"a":66,"b":67,"c":68,"d":69,"e":70,"f":71,"g":72,"h":73,"i":74,"j":75,"k":76,"l":77,"m":78,"n":79,"o":80,"p":81,"q":82,"r":83,"s":84,"t":85,"u":86,"v":87,"w":88,"x":89,"y":90,"z":91,"{":92,"|":93,"}":94,"~":95,"½":96,"¿":97,"ï":98,"Ċ":99,"č":100,"Ġ":101,"Ġ}":102,"Ġ{":103,"Ġ\\":104,"Ġ_":105,"Ġ^":106,"Ġ2":107,"Ġ)":108,"Ġ(":109,"Ġ1":110,"ra":111,"Ġ=":112,"Ġ-":113,"čĊ":114,"Ġ,":115,"fra":116,"frac":117,"Ġ+":118,"ma":119,"ta":120,"ig":121,"Ġ0":122,"ar":123,"al":124,"le":125,"Ġi":126,"th":127,"Ġx":128,"ft":129,"igh":130,"me":131,"righ":132,"math":133,"Ġn":134,"Ġ.":135,"Ġ\\,":136,"in":137,"ph":138,"Ġd":139,"left":140,"Ġa":141,"right":142,"am":143,"eta":144,"ti":145,"Ġm":146,"mu":147,"Ġ3":148,"Ġk":149,"Ġt":150,"Ġe":151,"Ġr":152,"Ġ&":153,"Ġc":154,"Ġp":155,"si":156,"rm":157,"de":158,"mathrm":159,"Ġ4":160,"Ġs":161,"pr":162,"Ġ~":163,"pha":164,"Ġl":165,"alpha":166,"da":167,"ĠA":168,"Ġ\\;":169,"ot":170,"pi":171,"par":172,"tial":173,"partial":174,"ime":175,"prime":176,"psi":177,"dot":178,"Ġj":179,"Ġb":180,"Ġf":181,"lta":182,"Ġ|":183,"amma":184,"bda":185,"ambda":186,"phi":187,"Ġq":188,"bf":189,"Ġg":190,"nu":191,"Ġz":192,"ray":193,"array":194,"ĠM":195,"ĠT":196,"Ġy":197,"cal":198,"bar":199,"ĠN":200,"igma":201,"ĠR":202,"rt":203,"lambda":204,"int":205,"ĠB":206,"ve":207,"ng":208,"qu":209,"ĠL":210,"Ġ/":211,"lo":212,"beta":213,"ngle":214,"Ġu":215,"delta":216,"sq":217,"sqrt":218,"theta":219,"Ġ\\\\":220,"gamma":221,"Ġ]":222,"sigma":223,"ga":224,"mega":225,"ĠD":226,"ĠF":227,"Ġ[":228,"ĠS":229,"mathbf":230,"su":231,"ĠP":232,"lon":233,"Ġv":234,"sum":235,"psilon":236,"ĠV":237,"ĠC":238,"cdot":239,"epsilon":240,"at":241,"hat":242,"ad":243,"quad":244,"Ġh":245,"ho":246,"rho":247,"hi":248,"to":249,"ĠE":250,"la":251,"ĠH":252,"lde":253,"tilde":254,"ĠQ":255,"Ġ5":256,"var":257,"ĠX":258,"ĠG":259,"be":260,"nd":261,"omega":262,"end":263,"gin":264,"begin":265,"tau":266,"Ġ6":267,"na":268,"vec":269,"ĠI":270,"Ġo":271,"rangle":272,"Ġ*":273,"De":274,"Delta":275,"Gamma":276,"pe":277,"fty":278,"infty":279,"ĠK":280,"xi":281,"Ġ8":282,"ow":283,"ĠJ":284,"ĠU":285,"row":286,"tar":287,"ge":288,"Phi":289,"ap":290,"ĠW":291,"co":292,"mes":293,"times":294,"sin":295,"ĠZ":296,"langle":297,"ope":298,"rna":299,
"rato":300,"operato":301,"rname":302,"operatorname":303,"tarrow":304,"lin":305,"line":306,"varphi":307,"pm":308,"rline":309,"Lambda":310,"Ġ\\!":311,"Ġ;":312,"dots":313,"cos":314,"Ġw":315,"rightarrow":316,"big":317,"chi":318,"ove":319,"Ġ\\}":320,"overline":321,"Ġ7":322,"ex":323,"pa":324,"st":325,"pro":326,"qquad":327,"iv":328,"equ":329,"equiv":330,"ĠO":331,"ln":332,"Omega":333,"ll":334,"Ġ9":335,"kap":336,"kappa":337,"Big":338,"Ġ\\{":339,"dag":340,"ĠY":341,"\\{":342,"varepsilon":343,"cdots":344,"Ġ:":345,"mathcal":346,"Psi":347,"Ġ>":348,"bo":349,"bol":350,"Ġ<":351,"ger":352,"dagger":353,"ldots":354,"ell":355,"bla":356,"nabla":357,"exp":358,"yle":359,"style":360,"zeta":361,"Sigma":362,"wi":363,"wide":364,"sim":365,"leq":366,"Ġ!":367,"bigg":368,"mathb":369,"mathbb":370,"Ġ\\:":371,"hbar":372,"otimes":373,"bold":374,"\\}":375,"mi":376,"prox":377,"approx":378,"Pi":379,"log":380,"mid":381,"sp":382,"vert":383,"di":384,"prod":385,"per":386,"perp":387,"ystyle":388,"laystyle":389,"splaystyle":390,"displaystyle":391,"meq":392,"simeq":393,"ed":394,"wed":395,"wedge":396,"widetilde":397,"sy":398,"sym":399,"symbol":400,"boldsymbol":401,"ck":402,"tex":403,"text":404,"ri":405,"Th":406,"Theta":407,"geq":408,"se":409,"eq":410,"nde":411,"unde":412,"tan":413,"sc":414,"ast":415,"rc":416,"set":417,"pt":418,"widehat":419,"ci":420,"circ":421,"re":422,"ript":423,"script":424,"underline":425,"Ġ\\|":426,"rel":427,"neq":428,"sta":429,"stack":430,"stackrel":431,"sinh":432,"op":433,"us":434,"cosh":435,"Bigg":436,"ce":437,"textstyle":438,"star":439,"not":440,"frak":441,"mathfrak":442,"mp":443,"biggr":444,"lus":445,"oplus":446,"vartheta":447,"biggl":448,"Bigr":449,"bra":450,"Bigl":451,"fo":452,"sf":453,"sub":454,"subset":455,"ngrightarrow":456,"ec":457,"boldmath":458,"rall":459,"forall":460,"scriptstyle":461,"ect":462,"parrow":463,"uparrow":464,"bj":465,"bject":466,"pto":467,"propto":468,"Ġ'":469,"longrightarrow":470,"bigl":471,"bigr":472,"oint":473,"ps":474,"maps":475,"mapsto":476,"om":477,"lle":478,"\\|":479,"ddot":480,"cu":481,"bin":482,"binom":483,"vdots":484,"angle":485,"leftrightarrow":486,"over":487,"or":488,"mathsf":489,"cup":490,"brace":491,"no":492,"arc":493,"flo":494,"floor":495,"tri":496,"triangle":497,"Xi":498,"cot":499,"bot":500,"cong":501,"it":502,"mbe":503,"numbe":504,"nonumbe":505,"nonumber":506,"cap":507,"Righ":508,"Rightarrow":509,"ze":510,"size":511,"textrm":512,"ne":513,"arctan":514,"ralle":515,"paralle":516,"parallel":517,"cfrac":518,"Ġ--":519,"object":520,"ĠObject":521,"brack":522,"sh":523,"arrow":524,"own":525,"varrho":526,"subseteq":527,"rbrace":528,"textbf":529,"imath":530,"od":531,"down":532,"he":533,"land":534,"scriptscriptstyle":535,"scriptsize":536,"che":537,"check":538,"sla":539,"overrightarrow":540,"downarrow":541,"Biggl":542,"gg":543,"nto":544,"phanto":545,"phantom":546,"exi":547,"hline":548,"sts":549,"exists":550,"Biggr":551,"bu":552,"rfloor":553,"ddots":554,"io":555,"iota":556,"llet":557,"bullet":558,"colon":559,"inus":560,"Upsilon":561,"lfloor":562,"lbrack":563,"underbrace":564,"neg":565,"Im":566,"mathit":567,"tin":568,"tiny":569,"jmath":570,"lef":571,"slash":572,"vee":573,"minus":574,"setminus":575,"Re":576,"iint":577,"leftarrow":578,"Ve":579,"Vert":580,"atop":581,"sup":582,"bigcup":583,"wp":584,"dim":585,"sec":586,"supset":587,"Lo":588,"lor":589,"pmod":590,"mod":591,"bigoplus":592,"il":593,"bmod":594,"coth":595,"Le":596,"ftrightarrow":597,"Leftrightarrow":598,"ngleftrightarrow":599,"sma":600,"upsilon":601,"\\,":602,"csc":603,"eph":604,"aleph":605,"bigwedge":606,"arcsin":607,"small":608,"o
dot":609,"overset":610,"rbrack":611,"mit":612,"lbrace":613,"li":614,"arp":615,"arge":616,"Ġ\\#":617,"bre":618,"textsf":619,"Longrightarrow":620,"breve":621,"em":622,"yset":623,"varpi":624,"ptyset":625,"emptyset":626,"ff":627,"iff":628,"nt":629,"er":630,"lap":631,"lnot":632,"dash":633,"under":634,"slant":635,"arg":636,"underset":637,"Bo":638,"Box":639,"Ġ\"":640,"spa":641,"space":642,"deg":643,"iiint":644,"oo":645,"otnot":646,"footnot":647,"arpoo":648,"footnote":649,"rlap":650,"es":651,"imp":652,"sb":653,"te":654,"bigtriangle":655,"lies":656,"implies":657,"\\;":658,"ker":659,"footnotesize":660,"tharpoo":661,"up":662,"acu":663,"acute":664,"longleftrightarrow":665,"eil":666,"lce":667,"rceil":668,"lceil":669,"vphantom":670,"en":671,"thin":672,"ack":673,"back":674,"tt":675,"backslash":676,"xrightarrow":677,"vdash":678,"top":679,"rightharpoo":680,"varsigma":681,"Longleftrightarrow":682,"mathop":683,"large":684,"bigcap":685,"leqslant":686,"Ġ`":687,"overbrace":688,"nup":689,"rightharpoonup":690,"bigotimes":691,"triangleq":692,"Large":693,"ru":694,"null":695,"bigtriangleup":696,"varno":697,"thing":698,"varnothing":699,"doteq":700,"Ġ\\_":701,"overleftarrow":702,"hf":703,"bigstar":704,"enspace":705,"\\!":706,"stru":707,"strut":708,"ominus":709,"div":710,"ond":711,"amond":712,"ddagger":713,"Ġcm":714,"ni":715,"sk":716,"diamond":717,"rVert":718,"prot":719,"protect":720,"ip":721,"varDelta":722,"notin":723,"skip":724,"lVert":725,"Ġ\\/":726,"dotsc":727,"ill":728,"ule":729,"\\:":730,"hfill":731,"krightarrow":732,"okrightarrow":733,"hookrightarrow":734,"sharp":735,"Vdash":736,"bigvee":737,"subsetneq":738,"supseteq":739,"Ġ?":740,"ngmapsto":741,"longmapsto":742,"cdotp":743,"geqslant":744,"bigtriangledown":745,"dotsb":746,"lim":747,"fl":748,"triangleleft":749,"flat":750,"sl":751,"box":752,"Ġ---":753,"sqcup":754,"jlim":755,"ls":756,"mo":757,"dels":758,"ref":759,"models":760,"tag":761,"Pr":762,"mal":763,"ou":764,"llap":765,"thinspace":766,"enskip":767,"Vec":768,"ebox":769,"kebox":770,"nor":771,"rd":772,"squ":773,"vline":774,"¿½":775,"�":776,"Ġ�":777,"makebox":778,"surd":779,"normal":780,"are":781,"square":782,"pou":783,"mathrel":784,"varOmega":785,"nds":786,"smallsetminus":787,"pounds":788,"ns":789,"ss":790,"smi":791,"mathor":792,"rightlef":793,"textup":794,"tharpoons":795,"smile":796,"mathord":797,"rightleftharpoons":798,"cc":799,"Ġ\\-":800,"succ":801,"ftarrow":802,"rtimes":803,"det":804,"prec":805,"texttt":806,"oslash":807,"Ġ\\&":808,"arrowvert":809,"lg":810,"Ġmm":811,"inter":812,"ngleftarrow":813,"hfil":814,"intercal":815,"frow":816,"Ġ\\*":817,"frown":818,"mpe":819,"Ġpt":820,"varpro":821,"searrow":822,"bumpe":823,"varprojlim":824,"bumpeq":825,"Down":826,"SS":827,"cd":828,"ere":829,"gcd":830,"ohe":831,"tw":832,"leme":833,"there":834,"injlim":835,"tit":836,"adrightarrow":837,"varinjlim":838,"comp":839,"textit":840,"fore":841,"overleftrightarrow":842,"Downarrow":843,"oheadrightarrow":844,"twoheadrightarrow":845,"lement":846,"therefore":847,"complement":848,"ca":849,"thi":850,"longleftarrow":851,"bigm":852,"triangleright":853,"nearrow":854,"\\#":855,"nce":856,"ral":857,"cance":858,"thick":859,"cancel":860,"Uparrow":861,"nat":862,"ural":863,"mathstrut":864,"suit":865,"bigcirc":866,"smallskip":867,"diamondsuit":868,"normalsize":869,"natural":870,"gt":871,"less":872,"mathtt":873,"bigsqcup":874,"thicksim":875,"lesssim":876,"bow":877,"llde":878,"tie":879,"nullde":880,"miter":881,"limiter":882,"kern":883,"bowtie":884,"nulldelimiter":885,"nulldelimiterspace":886,"Da":887,"hphantom":888,"ro":889,"vDa":890,"barwedge":891,"be
th":892,"eqno":893,"vDash":894,"AR":895,"Di":896,"GE":897,"LAR":898,"dskip":899,"ts":900,"Ġ@":901,"medskip":902,"ndown":903,"gets":904,"coprod":905,"dotsm":906,"smash":907,"rightharpoondown":908,"Diamond":909,"LARGE":910,"nrightarrow":911,"nleftrightarrow":912,"rsim":913,"rne":914,"warrow":915,"mathc":916,"corne":917,"textnormal":918,"preceq":919,"gtrsim":920,"roup":921,"corner":922,"Ġ\\[":923,"Ġ\\]":924,"mathope":925,"lefteq":926,"lose":927,"varkappa":928,"Bigm":929,"Biggm":930,"mathclose":931,"mathopen":932,"lefteqn":933,"Bar":934,"Ti":935,"lr":936,"swarrow":937,"uge":938,"vru":939,"xleftarrow":940,"mathnormal":941,"rightrightarrow":942,"rightleftarrow":943,"sqsubseteq":944,"succeq":945,"Tilde":946,"lrcorner":947,"vrule":948,"rightrightarrows":949,"rightleftarrows":950,"AA":951,"Hat":952,"ak":953,"ble":954,"dou":955,"hss":956,"min":957,"nright":958,"nleftarrow":959,"uph":960,"wbre":961,"allo":962,"side":963,"sqcap":964,"hom":965,"bigodot":966,"arpoonright":967,"blebarwedge":968,"doublebarwedge":969,"upharpoonright":970,"wbreak":971,"allowbreak":972,"sideset":973,"--":974,"Huge":975,"amal":976,"do":977,"fbox":978,"group":979,"hskip":980,"lse":981,"pprox":982,"rk":983,"rgroup":984,"rapprox":985,"Ġin":986,"arrayco":987,"sure":988,"varlim":989,"pmb":990,"cite":991,"substack":992,"leftrightarrows":993,"supsetneq":994,"Longleftarrow":995,"updownarrow":996,"ensure":997,"lgroup":998,"gtrapprox":999,"amalg":1000,"lsep":1001,"arraycolsep":1002,"ensuremath":1003,"asym":1004,"ch":1005,"dig":1006,"ddag":1007,"ew":1008,"gra":1009,"gime":1010,"jo":1011,"ltimes":1012,"nleq":1013,"tch":1014,"frame":1015,"max":1016,"thde":1017,"inrel":1018,"ver":1019,"withde":1020,"ointop":1021,"notag":1022,"smallint":1023,"skew":1024,"lims":1025,"asymp":1026,"digamma":1027,"grave":1028,"gimel":1029,"joinrel":1030,"framebox":1031,"withdelims":1032,"Ar":1033,"Rrightarrow":1034,"ae":1035,"ag":1036,"fill":1037,"hspace":1038,"huge":1039,"lq":1040,"nwarrow":1041,"wline":1042,"Ġ14":1043,"mark":1044,"led":1045,"inf":1046,"inde":1047,"Ġex":1048,"pitch":1049,"dotsi":1050,"intop":1051,"rowvert":1052,"llcorner":1053,"black":1054,"leqq":1055,"biggm":1056,"approxeq":1057,"diag":1058,"textsc":1059,"textsl":1060,"circled":1061,"fork":1062,"cur":1063,"newline":1064,"negthick":1065,"atopwithdelims":1066,"Leftarrow":1067,"footnotemark":1068,"uplus":1069,"subsetneqq":1070,"---":1071,"varlimsup":1072,"varliminf":1073,"verb":1074,"Arrowvert":1075,"pitchfork":1076,"blacksquare":1077,"diagup":1078,"negthickspace":1079,"23":1080,"25":1081,"\\-":1082,"\\/":1083,"ape":1084,"ckap":1085,"dddot":1086,"erline":1087,"ever":1088,"ij":1089,"ice":1090,"ly":1091,"md":1092,"nda":1093,"nnu":1094,"nmid":1095,"nRightarrow":1096,"nVdash":1097,"of":1098,"off":1099,"sho":1100,"spe":1101,"wr":1102,"ymath":1103,"Ġ#":1104,"Ġ\\'":1105,"Ġ\\^":1106,"Ġ10":1107,"Ġ15":1108,"mannu":1109,"igarrow":1110,"fter":1111,"meral":1112,"leftrightharpoo":1113,"rightsqu":1114,"def":1115,"arrayst":1116,"rtmid":1117,"interline":1118,"vearrow":1119,"ngeq":1120,"hoice":1121,"lax":1122,"varGamma":1123,"varpropto":1124,"vartriangle":1125,"varUpsilon":1126,"biguplus":1127,"expa":1128,"Ġ<$":1129,"mathbin":1130,"perca":1131,"textcircled":1132,"textmd":1133,"scsh":1134,"cial":1135,"retch":1136,"relax":1137,"overwithdelims":1138,"noinde":1139,"owns":1140,"veebar":1141,"underbar":1142,"underrightarrow":1143,"upperca":1144,"backsimeq":1145,"trianglelefteq":1146,"boxtimes":1147,"boxed":1148,"preccur":1149,"thickap":1150,"root":1151,"romannu":1152,"mathchoice":1153,"index":1154,"circledcirc":1155,"cu
rvearrow":1156,"everymath":1157,"lyeq":1158,"ndafter":1159,"offinterline":1160,"shortmid":1161,"special":1162,"leftrightharpoons":1163,"rightsquigarrow":1164,"arraystretch":1165,"expandafter":1166,"scshape":1167,"noindent":1168,"uppercase":1169,"preccurlyeq":1170,"thickapprox":1171,"romannumeral":1172,"curvearrowright":1173,"offinterlineskip":1174},"merges":["Ġ }","Ġ {","Ġ \\","Ġ _","Ġ ^","Ġ 2","Ġ )","Ġ (","Ġ 1","r a","Ġ =","Ġ -","č Ċ","Ġ ,","f ra","fra c","Ġ +","m a","t a","i g","Ġ 0","a r","a l","l e","Ġ i","t h","Ġ x","f t","ig h","m e","r igh","ma th","Ġ n","Ġ .","Ġ\\ ,","i n","p h","Ġ d","le ft","Ġ a","righ t","a m","e ta","t i","Ġ m","m u","Ġ 3","Ġ k","Ġ t","Ġ e","Ġ r","Ġ &","Ġ c","Ġ p","s i","r m","d e","math rm","Ġ 4","Ġ s","p r","Ġ ~","ph a","Ġ l","al pha","d a","Ġ A","Ġ\\ ;","o t","p i","p ar","ti al","par tial","i me","pr ime","p si","d ot","Ġ j","Ġ b","Ġ f","l ta","Ġ |","am ma","b da","am bda","ph i","Ġ q","b f","Ġ g","n u","Ġ z","ra y","ar ray","Ġ M","Ġ T","Ġ y","c al","b ar","Ġ N","ig ma","Ġ R","r t","l ambda","in t","Ġ B","v e","n g","q u","Ġ L","Ġ /","l o","b eta","ng le","Ġ u","de lta","s q","sq rt","th eta","Ġ\\ \\","g amma","Ġ ]","s igma","g a","me ga","Ġ D","Ġ F","Ġ [","Ġ S","math bf","s u","Ġ P","lo n","Ġ v","su m","psi lon","Ġ V","Ġ C","c dot","e psilon","a t","h at","a d","qu ad","Ġ h","h o","r ho","h i","t o","Ġ E","l a","Ġ H","l de","ti lde","Ġ Q","Ġ 5","v ar","Ġ X","Ġ G","b e","n d","o mega","e nd","g in","be gin","ta u","Ġ 6","n a","ve c","Ġ I","Ġ o","ra ngle","Ġ *","D e","De lta","G amma","p e","ft y","in fty","Ġ K","x i","Ġ 8","o w","Ġ J","Ġ U","r ow","ta r","g e","P hi","a p","Ġ W","c o","me s","ti mes","s in","Ġ Z","la ngle","o pe","r na","ra to","ope rato","rna me","operato rname","tar row","l in","lin e","var phi","p m","r line","L ambda","Ġ\\ !","Ġ ;","dot s","co s","Ġ w","righ tarrow","b ig","c hi","o ve","Ġ\\ }","ove rline","Ġ 7","e x","p a","s t","pr o","q quad","i v","e qu","equ iv","Ġ O","l n","O mega","l l","Ġ 9","k ap","kap pa","B ig","Ġ\\ {","da g","Ġ Y","\\ {","var epsilon","cdot s","Ġ :","math cal","P si","Ġ >","b o","bo l","Ġ <","ge r","dag ger","l dots","e ll","b la","na bla","ex p","y le","st yle","z eta","S igma","w i","wi de","si m","le q","Ġ !","big g","math b","mathb b","Ġ\\ :","h bar","o times","bol d","\\ }","m i","pro x","ap prox","P i","lo g","mi d","s p","ve rt","d i","pro d","pe r","per p","y style","la ystyle","sp laystyle","di splaystyle","me q","si meq","e d","w ed","wed ge","wide tilde","s y","sy m","sym bol","bold symbol","c k","t ex","tex t","r i","T h","Th eta","ge q","s e","e q","n de","u nde","ta n","s c","a st","r c","se t","p t","wide hat","c i","ci rc","r e","ri pt","sc ript","unde rline","Ġ\\ |","re l","n eq","s ta","sta ck","stack rel","sin h","o p","u s","cos h","Big g","c e","text style","s tar","n ot","fra k","math frak","m p","bigg r","l us","op lus","var theta","bigg l","Big r","b ra","Big l","f o","s f","su b","sub set","ng rightarrow","e c","bold math","ra ll","fo rall","script style","ec t","par row","u parrow","b j","bj ect","p to","pro pto","Ġ '","lo ngrightarrow","big l","big r","o int","p s","ma ps","maps to","o m","l le","\\ |","d dot","c u","b in","bin om","v dots","a ngle","left rightarrow","ove r","o r","math sf","cu p","bra ce","n o","ar c","f lo","flo or","t ri","tri angle","X i","c ot","b ot","co ng","i t","m be","nu mbe","no numbe","nonumbe r","c ap","R igh","Righ tarrow","z e","si ze","text rm","n e","arc tan","ra lle","pa ralle","paralle l","c frac","Ġ- -","o bject","ĠO bject","bra ck","s h","ar 
row","ow n","var rho","subset eq","r brace","text bf","i math","o d","d own","h e","la nd","script scriptstyle","script size","c he","che ck","s la","over rightarrow","down arrow","Bigg l","g g","n to","pha nto","phanto m","e xi","h line","st s","exi sts","Bigg r","b u","r floor","d dots","i o","io ta","lle t","bu llet","co lon","in us","U psilon","l floor","l brack","unde rbrace","ne g","I m","math it","t in","tin y","j math","le f","sla sh","ve e","m inus","set minus","R e","i int","lef tarrow","V e","Ve rt","at op","su p","big cup","w p","di m","se c","sup set","L o","lo r","pm od","m od","big oplus","i l","b mod","co th","L e","ft rightarrow","Le ftrightarrow","ng leftrightarrow","s ma","u psilon","\\ ,","c sc","e ph","al eph","big wedge","arc sin","sma ll","o dot","over set","r brack","mi t","l brace","l i","ar p","ar ge","Ġ\\ #","b re","text sf","Lo ngrightarrow","bre ve","e m","y set","var pi","pt yset","em ptyset","f f","i ff","n t","e r","la p","ln ot","da sh","unde r","sla nt","ar g","under set","B o","Bo x","Ġ \"","s pa","spa ce","de g","i iint","o o","ot not","fo otnot","arp oo","footnot e","r lap","e s","i mp","s b","t e","big triangle","li es","imp lies","\\ ;","k er","footnote size","th arpoo","u p","a cu","acu te","lo ngleftrightarrow","e il","l ce","rc eil","lce il","v phantom","e n","th in","a ck","b ack","t t","back slash","x rightarrow","v dash","to p","righ tharpoo","var sigma","Lo ngleftrightarrow","math op","l arge","big cap","leq slant","Ġ `","over brace","nu p","rightharpoo nup","big otimes","triangle q","L arge","r u","nu ll","bigtriangle up","var no","thin g","varno thing","dot eq","Ġ\\ _","over leftarrow","h f","big star","en space","\\ !","st ru","stru t","om inus","d iv","o nd","am ond","d dagger","Ġc m","n i","s k","di amond","r Vert","pr ot","prot ect","i p","var Delta","not in","sk ip","l Vert","Ġ\\ /","dots c","i ll","u le","\\ :","hf ill","k rightarrow","o krightarrow","ho okrightarrow","sh arp","V dash","big vee","subset neq","supset eq","Ġ ?","ng mapsto","lo ngmapsto","cdot p","geq slant","bigtriangle down","dots b","li m","f l","triangle left","fl at","s l","bo x","Ġ-- -","sq cup","j lim","l s","m o","de ls","re f","mo dels","ta g","P r","ma l","o u","l lap","thin space","en skip","V ec","e box","k ebox","n or","r d","s qu","v line","¿ ½","ï ¿½","Ġ �","ma kebox","su rd","nor mal","ar e","squ are","p ou","math rel","var Omega","nd s","small setminus","pou nds","n s","s s","s mi","math or","right lef","text up","tharpoo ns","smi le","mathor d","rightlef tharpoons","c c","Ġ\\ -","su cc","f tarrow","r times","de t","pr ec","text tt","o slash","Ġ\\ &","arrow vert","l g","Ġm m","int er","ngle ftarrow","hf il","inter cal","f row","Ġ\\ *","frow n","m pe","Ġp t","var pro","se arrow","bu mpe","varpro jlim","bumpe q","D own","S S","c d","e re","g cd","o he","t w","le me","th ere","in jlim","ti t","ad rightarrow","var injlim","co mp","tex tit","fo re","over leftrightarrow","Down arrow","ohe adrightarrow","tw oheadrightarrow","leme nt","there fore","comp lement","c a","th i","lo ngleftarrow","big m","triangle right","ne arrow","\\ #","n ce","ra l","ca nce","thi ck","cance l","U parrow","n at","u ral","math strut","su it","big circ","small skip","diamond suit","normal size","nat ural","g t","le ss","math tt","big sqcup","thick sim","less sim","b ow","l lde","ti e","nu llde","mit er","li miter","ker n","bow tie","nullde limiter","nulldelimiter space","D a","h phantom","r o","v Da","bar wedge","be th","eq no","vDa sh","A R","D i","G E","L AR","d skip","t s","Ġ @","me 
dskip","nd own","ge ts","co prod","dots m","sma sh","rightharpoo ndown","Di amond","LAR GE","n rightarrow","n leftrightarrow","r sim","r ne","w arrow","math c","co rne","text normal","prec eq","gt rsim","ro up","corne r","Ġ\\ [","Ġ\\ ]","math ope","left eq","lo se","var kappa","Big m","Bigg m","mathc lose","mathope n","lefteq n","B ar","T i","l r","s warrow","u ge","v ru","x leftarrow","math normal","right rightarrow","right leftarrow","sq subseteq","succ eq","Ti lde","lr corner","vru le","rightrightarrow s","rightleftarrow s","A A","H at","a k","b le","d ou","h ss","m in","n right","n leftarrow","u ph","w bre","al lo","si de","sq cap","ho m","big odot","arpoo nright","ble barwedge","dou blebarwedge","uph arpoonright","wbre ak","allo wbreak","side set","- -","H uge","a mal","d o","f box","g roup","h skip","l se","p prox","r k","r group","ra pprox","Ġi n","array co","su re","var lim","pm b","ci te","sub stack","leftrightarrow s","supset neq","Lo ngleftarrow","up downarrow","en sure","lg roup","gt rapprox","amal g","lse p","arrayco lsep","ensure math","a sym","c h","d ig","d dag","e w","g ra","g ime","j o","l times","n leq","t ch","fra me","ma x","th de","in rel","ve r","wi thde","oint op","no tag","small int","sk ew","lim s","asym p","dig amma","gra ve","gime l","jo inrel","frame box","withde lims","A r","R rightarrow","a e","a g","f ill","h space","h uge","l q","n warrow","w line","Ġ1 4","ma rk","le d","in f","in de","Ġe x","pi tch","dot si","int op","row vert","ll corner","bla ck","leq q","bigg m","approx eq","di ag","text sc","text sl","circ led","fo rk","cu r","ne wline","neg thick","atop withdelims","Le ftarrow","footnote mark","up lus","subsetneq q","-- -","varlim sup","varlim inf","ver b","Ar rowvert","pitch fork","black square","diag up","negthick space","2 3","2 5","\\ -","\\ /","a pe","c kap","d ddot","e rline","e ver","i j","i ce","l y","m d","n da","n nu","n mid","n Rightarrow","n Vdash","o f","o ff","s ho","s pe","w r","y math","Ġ #","Ġ\\ '","Ġ\\ ^","Ġ1 0","Ġ1 5","ma nnu","ig arrow","ft er","me ral","left rightharpoo","right squ","de f","array st","rt mid","int erline","ve arrow","ng eq","ho ice","la x","var Gamma","var propto","var triangle","var Upsilon","big uplus","ex pa","Ġ< $","mathb in","per ca","text circled","text md","sc sh","ci al","re tch","re lax","over withdelims","no inde","own s","vee bar","under bar","under rightarrow","up perca","back simeq","triangleleft eq","box times","box ed","prec cur","thi ckap","ro ot","ro mannu","mathc hoice","inde x","circled circ","cur vearrow","ever ymath","ly eq","nda fter","off interline","sho rtmid","spe cial","leftrightharpoo ns","rightsqu igarrow","arrayst retch","expa ndafter","scsh ape","noinde nt","upperca se","preccur lyeq","thickap prox","romannu meral","curvearrow right","offinterline skip"]}} diff --git a/ppocr/utils/formula_utils/math_txt2pkl.py b/ppocr/utils/formula_utils/math_txt2pkl.py new file mode 100644 index 0000000000..e7ddcb5d44 --- /dev/null +++ b/ppocr/utils/formula_utils/math_txt2pkl.py @@ -0,0 +1,70 @@ +# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import pickle +from tqdm import tqdm +import os +from paddle.utils import try_import +from collections import defaultdict +import glob +from os.path import join +import argparse + + +def txt2pickle(images, equations, save_dir): + imagesize = try_import("imagesize") + save_p = os.path.join(save_dir, "latexocr_{}.pkl".format(images.split("/")[-1])) + min_dimensions = (32, 32) + max_dimensions = (672, 192) + max_length = 512 + data = defaultdict(lambda: []) + if images is not None and equations is not None: + images_list = [ + path.replace("\\", "/") for path in glob.glob(join(images, "*.png")) + ] + indices = [int(os.path.basename(img).split(".")[0]) for img in images_list] + eqs = open(equations, "r").read().split("\n") + for i, im in tqdm(enumerate(images_list), total=len(images_list)): + width, height = imagesize.get(im) + if ( + min_dimensions[0] <= width <= max_dimensions[0] + and min_dimensions[1] <= height <= max_dimensions[1] + ): + data[(width, height)].append((eqs[indices[i]], im)) + data = dict(data) + with open(save_p, "wb") as file: + pickle.dump(data, file) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--image_dir", + type=str, + default=".", + help="Input_label or input path to be converted", + ) + parser.add_argument( + "--mathtxt_path", + type=str, + default=".", + help="Input_label or input path to be converted", + ) + parser.add_argument( + "--output_dir", type=str, default="out_label.txt", help="Output file name" + ) + + args = parser.parse_args() + txt2pickle(args.image_dir, args.mathtxt_path, args.output_dir) diff --git a/ppstructure/predict_system.py b/ppstructure/predict_system.py index a2d21df79b..69f8e27765 100644 --- a/ppstructure/predict_system.py +++ b/ppstructure/predict_system.py @@ -121,7 +121,7 @@ def __call__(self, img, return_ocr_result_in_table=False, img_idx=0): time_dict["layout"] += elapse else: h, w = ori_im.shape[:2] - layout_res = [dict(bbox=None, label="table")] + layout_res = [dict(bbox=None, label="table", score=0.0)] # As reported in issues such as #10270 and #11665, the old # implementation, which recognizes texts from the layout regions, diff --git a/ppstructure/table/matcher.py b/ppstructure/table/matcher.py index 51e6250f47..ae32b4b153 100755 --- a/ppstructure/table/matcher.py +++ b/ppstructure/table/matcher.py @@ -14,6 +14,7 @@ import numpy as np from ppstructure.table.table_master_match import deal_eb_token, deal_bb +import html def distance(box_1, box_2): @@ -133,6 +134,8 @@ def get_pred_html(self, pred_structures, matched_index, ocr_contents): and " " != content[-1] ): content += " " + # escape content + content = html.escape(content) end_html.extend(content) if b_with: end_html.extend("") diff --git a/pyproject.toml b/pyproject.toml index bc437656a1..a382edd138 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,8 +56,15 @@ dependencies = [ "beautifulsoup4", "fonttools>=4.24.0", "fire>=0.3.0", + "requests" ] +[project.urls] +homepage = "https://github.com/PaddlePaddle/PaddleOCR" +documentation = "https://github.com/PaddlePaddle/PaddleOCR/blob/main/README.md" +repository = "https://github.com/PaddlePaddle/PaddleOCR.git" +issues = "https://github.com/PaddlePaddle/PaddleOCR/issues" + [project.scripts] paddleocr = "paddleocr.paddleocr:main" diff --git a/requirements.txt b/requirements.txt index e513a2e8d9..61a6022de1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 
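
# A minimal editor's sketch (illustration only; the directory names are made
# up) of driving the txt2pickle helper added above.  It scans <images>/*.png,
# looks up each formula by the integer file name in the equations file, keeps
# images within the (32, 32)-(672, 192) size range, groups samples by
# (width, height), and writes latexocr_<images>.pkl into save_dir.
# Requires the `imagesize` package, which the helper imports via try_import.
from ppocr.utils.formula_utils.math_txt2pkl import txt2pickle

txt2pickle(
    images="train_images",   # folder of <index>.png formula crops (hypothetical path)
    equations="math.txt",    # one LaTeX string per line (hypothetical path)
    save_dir="train_data",   # receives latexocr_train_images.pkl
)
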
+12,4 @@ cython Pillow pyyaml requests +albumentations==1.4.10 diff --git a/tools/eval.py b/tools/eval.py index 9ac5498b75..59a36e15a9 100755 --- a/tools/eval.py +++ b/tools/eval.py @@ -105,6 +105,8 @@ def main(): if "model_type" in config["Architecture"].keys(): if config["Architecture"]["algorithm"] == "CAN": model_type = "can" + elif config["Architecture"]["algorithm"] == "LaTeXOCR": + model_type = "latexocr" else: model_type = config["Architecture"]["model_type"] else: diff --git a/tools/export_model.py b/tools/export_model.py index 8ca31c9d58..c10f81d223 100755 --- a/tools/export_model.py +++ b/tools/export_model.py @@ -131,6 +131,11 @@ def export_single_model( ] ] model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] == "LaTeXOCR": + other_shape = [ + paddle.static.InputSpec(shape=[None, 1, None, None], dtype="float32"), + ] + model = to_static(model, input_spec=other_shape) elif arch_config["algorithm"] in ["LayoutLM", "LayoutLMv2", "LayoutXLM"]: input_spec = [ paddle.static.InputSpec(shape=[None, 512], dtype="int64"), # input_ids diff --git a/tools/infer/predict_det.py b/tools/infer/predict_det.py index ce73508a17..02bce45b73 100755 --- a/tools/infer/predict_det.py +++ b/tools/infer/predict_det.py @@ -293,7 +293,7 @@ def predict(self, img): et = time.time() return dt_boxes, et - st - def __call__(self, img): + def __call__(self, img, use_slice=False): # For image like poster with one side much greater than the other side, # splitting recursively and processing with overlap to enhance performance. MIN_BOUND_DISTANCE = 50 @@ -302,6 +302,7 @@ def __call__(self, img): if ( img.shape[0] / img.shape[1] > 2 and img.shape[0] > self.args.det_limit_side_len + and use_slice ): start_h = 0 end_h = 0 @@ -349,6 +350,7 @@ def __call__(self, img): elif ( img.shape[1] / img.shape[0] > 3 and img.shape[1] > self.args.det_limit_side_len * 3 + and use_slice ): start_w = 0 end_w = 0 diff --git a/tools/infer/predict_rec.py b/tools/infer/predict_rec.py index 06b318eb35..239b09ef19 100755 --- a/tools/infer/predict_rec.py +++ b/tools/infer/predict_rec.py @@ -133,6 +133,11 @@ def __init__(self, args, logger=None): "character_dict_path": args.rec_char_dict_path, "use_space_char": args.use_space_char, } + elif self.rec_algorithm == "LaTeXOCR": + postprocess_params = { + "name": "LaTeXOCRDecode", + "rec_char_dict_path": args.rec_char_dict_path, + } elif self.rec_algorithm == "ParseQ": postprocess_params = { "name": "ParseQLabelDecode", @@ -450,6 +455,90 @@ def norm_img_can(self, img, image_shape): return img + def pad_(self, img, divable=32): + threshold = 128 + data = np.array(img.convert("LA")) + if data[..., -1].var() == 0: + data = (data[..., 0]).astype(np.uint8) + else: + data = (255 - data[..., -1]).astype(np.uint8) + data = (data - data.min()) / (data.max() - data.min()) * 255 + if data.mean() > threshold: + # To invert the text to white + gray = 255 * (data < threshold).astype(np.uint8) + else: + gray = 255 * (data > threshold).astype(np.uint8) + data = 255 - data + + coords = cv2.findNonZero(gray) # Find all non-zero points (text) + a, b, w, h = cv2.boundingRect(coords) # Find minimum spanning bounding box + rect = data[b : b + h, a : a + w] + im = Image.fromarray(rect).convert("L") + dims = [] + for x in [w, h]: + div, mod = divmod(x, divable) + dims.append(divable * (div + (1 if mod > 0 else 0))) + padded = Image.new("L", dims, 255) + padded.paste(im, (0, 0, im.size[0], im.size[1])) + return padded + + def minmax_size_( + self, + img, + max_dimensions, + min_dimensions, + 
): + if max_dimensions is not None: + ratios = [a / b for a, b in zip(img.size, max_dimensions)] + if any([r > 1 for r in ratios]): + size = np.array(img.size) // max(ratios) + img = img.resize(tuple(size.astype(int)), Image.BILINEAR) + if min_dimensions is not None: + # hypothesis: there is a dim in img smaller than min_dimensions, and return a proper dim >= min_dimensions + padded_size = [ + max(img_dim, min_dim) + for img_dim, min_dim in zip(img.size, min_dimensions) + ] + if padded_size != list(img.size): # assert hypothesis + padded_im = Image.new("L", padded_size, 255) + padded_im.paste(img, img.getbbox()) + img = padded_im + return img + + def norm_img_latexocr(self, img): + # CAN only predict gray scale image + shape = (1, 1, 3) + mean = [0.7931, 0.7931, 0.7931] + std = [0.1738, 0.1738, 0.1738] + scale = 255.0 + min_dimensions = [32, 32] + max_dimensions = [672, 192] + mean = np.array(mean).reshape(shape).astype("float32") + std = np.array(std).reshape(shape).astype("float32") + + im_h, im_w = img.shape[:2] + if ( + min_dimensions[0] <= im_w <= max_dimensions[0] + and min_dimensions[1] <= im_h <= max_dimensions[1] + ): + pass + else: + img = Image.fromarray(np.uint8(img)) + img = self.minmax_size_(self.pad_(img), max_dimensions, min_dimensions) + img = np.array(img) + im_h, im_w = img.shape[:2] + img = np.dstack([img, img, img]) + img = (img.astype("float32") * scale - mean) / std + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + divide_h = math.ceil(im_h / 16) * 16 + divide_w = math.ceil(im_w / 16) * 16 + img = np.pad( + img, ((0, divide_h - im_h), (0, divide_w - im_w)), constant_values=(1, 1) + ) + img = img[:, :, np.newaxis].transpose(2, 0, 1) + img = img.astype("float32") + return img + def __call__(self, img_list): img_num = len(img_list) # Calculate the aspect ratio of all text bars @@ -552,6 +641,10 @@ def __call__(self, img_list): word_label_list = [] norm_img_mask_batch.append(norm_image_mask) word_label_list.append(word_label) + elif self.rec_algorithm == "LaTeXOCR": + norm_img = self.norm_img_latexocr(img_list[indices[ino]]) + norm_img = norm_img[np.newaxis, :] + norm_img_batch.append(norm_img) else: norm_img = self.resize_norm_img( img_list[indices[ino]], max_wh_ratio @@ -666,6 +759,29 @@ def __call__(self, img_list): if self.benchmark: self.autolog.times.stamp() preds = outputs + elif self.rec_algorithm == "LaTeXOCR": + inputs = [norm_img_batch] + if self.use_onnx: + input_dict = {} + input_dict[self.input_tensor.name] = norm_img_batch + outputs = self.predictor.run(self.output_tensors, input_dict) + preds = outputs + else: + input_names = self.predictor.get_input_names() + input_tensor = [] + for i in range(len(input_names)): + input_tensor_i = self.predictor.get_input_handle(input_names[i]) + input_tensor_i.copy_from_cpu(inputs[i]) + input_tensor.append(input_tensor_i) + self.input_tensor = input_tensor + self.predictor.run() + outputs = [] + for output_tensor in self.output_tensors: + output = output_tensor.copy_to_cpu() + outputs.append(output) + if self.benchmark: + self.autolog.times.stamp() + preds = outputs else: if self.use_onnx: input_dict = {} @@ -692,6 +808,9 @@ def __call__(self, img_list): wh_ratio_list=wh_ratio_list, max_wh_ratio=max_wh_ratio, ) + elif self.postprocess_params["name"] == "LaTeXOCRDecode": + preds = [p.reshape([-1]) for p in preds] + rec_result = self.postprocess_op(preds) else: rec_result = self.postprocess_op(preds) for rno in range(len(rec_result)): diff --git a/tools/infer/predict_system.py b/tools/infer/predict_system.py index 
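
# A minimal editor's sketch (illustration only, numpy-only toy input) of the
# final padding step in norm_img_latexocr above: the normalized grayscale
# image is padded on the bottom and right so that both sides become multiples
# of 16, matching the encoder's /16 downsampling of H and W.
import math
import numpy as np

img = np.random.rand(60, 200).astype("float32")      # toy normalized H x W image
im_h, im_w = img.shape
divide_h = math.ceil(im_h / 16) * 16                  # 60  -> 64
divide_w = math.ceil(im_w / 16) * 16                  # 200 -> 208
padded = np.pad(
    img, ((0, divide_h - im_h), (0, divide_w - im_w)), constant_values=(1, 1)
)
chw = padded[:, :, np.newaxis].transpose(2, 0, 1)     # HWC -> CHW, single channel
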
aaf63922c5..bcb0758eb6 100755 --- a/tools/infer/predict_system.py +++ b/tools/infer/predict_system.py @@ -91,7 +91,7 @@ def __call__(self, img, cls=True, slice={}): elapsed = [] dt_slice_boxes = [] for slice_crop, v_start, h_start in slice_gen: - dt_boxes, elapse = self.text_detector(slice_crop) + dt_boxes, elapse = self.text_detector(slice_crop, use_slice=True) if dt_boxes.size: dt_boxes[:, :, 0] += h_start dt_boxes[:, :, 1] += v_start diff --git a/tools/infer/utility.py b/tools/infer/utility.py index 6f6b8c5dd7..f019e97e86 100644 --- a/tools/infer/utility.py +++ b/tools/infer/utility.py @@ -15,7 +15,6 @@ import argparse import os import sys -import platform import cv2 import numpy as np import paddle @@ -23,11 +22,13 @@ from PIL import Image, ImageDraw, ImageFont import math from paddle import inference -import time import random from ppocr.utils.logging import get_logger +logger = get_logger() + + def str2bool(v): return v.lower() in ("true", "yes", "t", "y", "1") @@ -333,20 +334,22 @@ def get_output_tensors(args, mode, predictor): def get_infer_gpuid(): - sysstr = platform.system() - if sysstr == "Windows": - return 0 + """ + Get the GPU ID to be used for inference. + Returns: + int: The GPU ID to be used for inference. + """ if not paddle.device.is_compiled_with_rocm: - cmd = "env | grep CUDA_VISIBLE_DEVICES" + gpu_id_str = os.environ.get("CUDA_VISIBLE_DEVICES", "0") else: - cmd = "env | grep HIP_VISIBLE_DEVICES" - env_cuda = os.popen(cmd).readlines() - if len(env_cuda) == 0: - return 0 - else: - gpu_id = env_cuda[0].strip().split("=")[1] - return int(gpu_id[0]) + gpu_id_str = os.environ.get("HIP_VISIBLE_DEVICES", "0") + + gpu_ids = gpu_id_str.split(",") + logger.warning( + "The first GPU is used for inference by default, GPU ID: {}".format(gpu_ids[0]) + ) + return int(gpu_ids[0]) def draw_e2e_res(dt_boxes, strs, img_path): diff --git a/tools/infer_rec.py b/tools/infer_rec.py index 0e04c8b636..22df30f866 100755 --- a/tools/infer_rec.py +++ b/tools/infer_rec.py @@ -183,6 +183,8 @@ def main(): elif isinstance(post_result, list) and isinstance(post_result[0], int): # for RFLearning CNT branch info = str(post_result[0]) + elif config["Architecture"]["algorithm"] == "LaTeXOCR": + info = str(post_result[0]) else: if len(post_result[0]) >= 2: info = post_result[0][0] + "\t" + str(post_result[0][1]) diff --git a/tools/program.py b/tools/program.py index b2f3dbf107..1cc5bbac1c 100755 --- a/tools/program.py +++ b/tools/program.py @@ -324,6 +324,8 @@ def train( preds = model(batch) elif algorithm in ["CAN"]: preds = model(batch[:3]) + elif algorithm in ["LaTeXOCR"]: + preds = model(batch) else: preds = model(images) preds = to_float32(preds) @@ -339,6 +341,8 @@ def train( preds = model(batch) elif algorithm in ["CAN"]: preds = model(batch[:3]) + elif algorithm in ["LaTeXOCR"]: + preds = model(batch) else: preds = model(images) loss = loss_class(preds, batch) @@ -360,6 +364,10 @@ def train( elif algorithm in ["CAN"]: model_type = "can" eval_class(preds[0], batch[2:], epoch_reset=(idx == 0)) + elif algorithm in ["LaTeXOCR"]: + model_type = "latexocr" + post_result = post_process_class(preds, batch[1], mode="train") + eval_class(post_result[0], post_result[1], epoch_reset=(idx == 0)) else: if config["Loss"]["name"] in [ "MultiLoss", @@ -442,7 +450,7 @@ def train( and dist.get_rank() == 0 ): if model_average: - Model_Average = paddle.incubate.optimizer.ModelAverage( + Model_Average = paddle.incubate.ModelAverage( 0.15, parameters=model.parameters(), min_average_window=10000, @@ -600,6 +608,8 @@ def 
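
# A minimal editor's sketch (illustration only) of the environment-variable
# logic that now backs get_infer_gpuid above: read CUDA_VISIBLE_DEVICES
# (HIP_VISIBLE_DEVICES on ROCm builds), fall back to "0", and use the first
# listed device for inference.
import os

gpu_id_str = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
first_gpu_id = int(gpu_id_str.split(",")[0])   # e.g. "2,3" -> 2
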
eval( preds = model(batch) elif model_type in ["can"]: preds = model(batch[:3]) + elif model_type in ["latexocr"]: + preds = model(batch) elif model_type in ["sr"]: preds = model(batch) sr_img = preds["sr_img"] @@ -614,6 +624,8 @@ def eval( preds = model(batch) elif model_type in ["can"]: preds = model(batch[:3]) + elif model_type in ["latexocr"]: + preds = model(batch) elif model_type in ["sr"]: preds = model(batch) sr_img = preds["sr_img"] @@ -640,6 +652,9 @@ def eval( eval_class(preds, batch_numpy) elif model_type in ["can"]: eval_class(preds[0], batch_numpy[2:], epoch_reset=(idx == 0)) + elif model_type in ["latexocr"]: + post_result = post_process_class(preds, batch[1], "eval") + eval_class(post_result[0], post_result[1], epoch_reset=(idx == 0)) else: post_result = post_process_class(preds, batch_numpy[1]) eval_class(post_result, batch_numpy) @@ -777,6 +792,7 @@ def preprocess(is_train=False): "SVTR_HGNet", "ParseQ", "CPPD", + "LaTeXOCR", ] if use_xpu:
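
# A minimal editor's sketch (illustration only) of using the new LaTeXOCRDecode
# post-process on its own, with the tokenizer file added by this patch.  The
# token ids below are taken from the vocabulary shown above ("\", "frac",
# "Ġ{", "Ġ1", "Ġ}", "Ġ{", "Ġ2", "Ġ}") and should decode to roughly
# "\frac{1}{2}" after whitespace normalization; in practice the ids come from
# the model's sampling loop.  Requires the `tokenizers` package.
import numpy as np
from ppocr.postprocess.rec_postprocess import LaTeXOCRDecode

decoder = LaTeXOCRDecode(
    rec_char_dict_path="ppocr/utils/dict/latex_ocr_tokenizer.json"
)
preds = np.array([[61, 117, 103, 110, 102, 103, 107, 102]])   # shape [N, L]
texts = decoder(preds, mode="eval")   # list of detokenized, cleaned LaTeX strings
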