feat: add direction classifier
gutenye committed May 20, 2024
1 parent f6a8d2b commit 386cc70
Showing 19 changed files with 206 additions and 143 deletions.
3 changes: 2 additions & 1 deletion .vscode/settings.json
@@ -90,7 +90,8 @@
"*.ipp": "cpp",
"format": "cpp",
"type_traits": "cpp",
"__memory": "cpp"
"__memory": "cpp",
"memory": "cpp"
},
"[jsonc]": {
"editor.defaultFormatter": "esbenp.prettier-vscode"
2 changes: 1 addition & 1 deletion README.md
@@ -74,7 +74,7 @@ Ocr.create({
detectionUnclipRatiop?: number // RN only
detectionUseDilate?: boolean // RN only
detectionUsePolygonScore?: boolean // RN only
detectionUseDirectionClassify?: boolean // RN only
useDirectionClassify?: boolean // RN only
onnxOptions?: {} // Node only. Pass to ONNX Runtime
}): Promise<Ocr>
8 changes: 6 additions & 2 deletions ake
@@ -30,9 +30,13 @@ def 'main react-native example' [] {
}

# Runc cpp example
def 'main cpp example' [] {
def 'main cpp example' [path?: string] {
cd packages/react-native/cpp/example
./ake start
if $path == null {
./ake start
} else {
./ake start $path
}
}

def publish [package: string, version: string] {
1 change: 1 addition & 0 deletions assets/ch_ppocr_mobile_v2.0_cls_infer.onnx
Binary file added assets/direction.jpg
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
"name": "@gutenye/ocr",
"version": "1.4.1",
"version": "1.4.2",
"workspaces": [
"packages/browser",
"packages/browser/example",
3 changes: 3 additions & 0 deletions packages/models/assets/ch_ppocr_mobile_v2.0_cls_infer.onnx
Git LFS file not shown
2 changes: 1 addition & 1 deletion packages/models/package.json
@@ -1,7 +1,7 @@
{
"name": "@gutenye/ocr-models",
"description": "Guten OCR is a high accurate text detection (OCR) Javascript/Typescript library that runs on Node.js, Browser, React Native and C++. Based on PaddleOCR and ONNX runtime",
"version": "1.4.0",
"version": "1.4.2",
"type": "module",
"license": "MIT",
"repository": {
161 changes: 83 additions & 78 deletions packages/react-native/cpp/classifier_process.cpp
@@ -13,95 +13,100 @@
// limitations under the License.

#include "classifier_process.h"
#include <iostream>
#include "timer.h"

const std::vector<int> cls_image_shape {3, 48, 192};
cv::Mat ClsResizeImg(cv::Mat img) {
int imgC, imgH, imgW;
imgC = cls_image_shape[0];
imgH = cls_image_shape[1];
imgW = cls_image_shape[2];

float ratio = static_cast<float>(img.cols) / static_cast<float>(img.rows);

int resize_w, resize_h;
if (ceilf(imgH * ratio) > imgW)
resize_w = imgW;
else
resize_w = int(ceilf(imgH * ratio));
cv::Mat resize_img;
cv::resize(img, resize_img, cv::Size(resize_w, imgH), 0.f, 0.f, cv::INTER_LINEAR);
if (resize_w < imgW) {
cv::copyMakeBorder(resize_img, resize_img, 0, 0, 0, imgW - resize_w, cv::BORDER_CONSTANT, cv::Scalar(0, 0, 0));
}
return resize_img;
}
const std::vector<int> classifier_image_shape {3, 48, 192};

cv::Mat classifier_resize_image(cv::Mat source_image, int resize_height, int max_resize_width, Options &options);

ClassifierPredictor::ClassifierPredictor(Options &options)
: m_options {options}, m_onnx {Onnx(options.models.classifier_model_path)} {}

ClassifierResult ClassifierPredictor::predict(const cv::Mat &source_image, const float thresh) {
ModelPerformance performance {};

Timer timer;
timer.start();
auto image = preprocess(source_image);
timer.end();
performance.preprocess_time = timer.get_average_ms();

// Run predictor
std::vector<int64_t> input_shape {1, image.channels, image.height, image.width};
timer.start();
// TODO: hangs on run
auto model_output = m_onnx.run(image.data, input_shape);
timer.end();
performance.predict_time = timer.get_average_ms();

// Process Output
timer.start();
std::cout << "[DEBUG] 0" << std::endl;
auto result_image = postprocess(model_output, source_image, thresh);
timer.end();
performance.postprocess_time = timer.get_average_ms();

ClassifierPredictor::ClassifierPredictor(const std::string &modelDir, const int cpu_thread_num,
const std::string &cpu_power_mode) {}
performance.total_time = performance.preprocess_time + performance.predict_time + performance.postprocess_time;

void ClassifierPredictor::preprocess(const cv::Mat &img) {
return ClassifierResult {.data = result_image, .performance = performance};
}

ImageRaw ClassifierPredictor::preprocess(const cv::Mat &source_image) {
std::vector<float> mean = {0.5f, 0.5f, 0.5f};
std::vector<float> scale = {1 / 0.5f, 1 / 0.5f, 1 / 0.5f};
cv::Mat crop_img;
img.copyTo(crop_img);
cv::Mat resize_img;

int index = 0;
float wh_ratio = static_cast<float>(crop_img.cols) / static_cast<float>(crop_img.rows);
cv::Mat resized_image =
classifier_resize_image(source_image, classifier_image_shape[1], classifier_image_shape[2], m_options);
resized_image.convertTo(resized_image, CV_32FC3, 1 / 255.f);

resize_img = ClsResizeImg(crop_img);
resize_img.convertTo(resize_img, CV_32FC3, 1 / 255.f);
const float *destination_image = reinterpret_cast<const float *>(resized_image.data);
std::vector<float> data(resized_image.rows * resized_image.cols * 3);
NHWC3ToNC3HW(destination_image, data.data(), resized_image.rows * resized_image.cols, mean, scale);

const float *dimg = reinterpret_cast<const float *>(resize_img.data);
ImageRaw image_raw {.data = data, .width = resized_image.cols, .height = resized_image.rows, .channels = 3};

return image_raw;
}

cv::Mat ClassifierPredictor::postprocess(const cv::Mat &srcimg, const float thresh) {
// Get output and run postprocess
// std::unique_ptr<const Tensor> softmax_out(
// std::move(predictor_->GetOutput(0)));
// auto *softmax_scores = softmax_out->mutable_data<float>();
// auto softmax_out_shape = softmax_out->shape();
// float score = 0;
// int label = 0;
// for (int i = 0; i < softmax_out_shape[1]; i++)
// {
// if (softmax_scores[i] > score)
// {
// score = softmax_scores[i];
// label = i;
// }
// }
// if (label % 2 == 1 && score > thresh)
// {
// cv::rotate(srcimg, srcimg, 1);
// }
// return srcimg;
cv::Mat a {};
return a;
ClassifierResultData ClassifierPredictor::postprocess(ModelOutput &model_output, const cv::Mat &source_image,
const float thresh) {
std::cout << "[DEBUG] 1" << std::endl;
auto softmax_scores = model_output.data;
auto softmax_out_shape = model_output.shape;
float score = 0;
int label = 0;
for (int i = 0; i < softmax_out_shape[1]; i++) {
if (softmax_scores[i] > score) {
score = softmax_scores[i];
label = i;
}
}
if (label % 2 == 1 && score > thresh) {
std::cout << "[DEBUG] 2" << std::endl;
cv::rotate(source_image, source_image, 1);
std::cout << "[DEBUG] 3" << std::endl;
}
return source_image;
}

cv::Mat ClassifierPredictor::predict(const cv::Mat &img, double *preprocess_time, double *predictTime,
double *postprocessTime, const float thresh) {
cv::Mat src_img;
img.copyTo(src_img);
// Timer tic;
// tic.start();
preprocess(img);
// tic.end();
// *preprocess_time = tic.get_average_ms();
// std::cout << "cls predictor preprocess costs" << *preprocess_time;

// tic.start();
// predictor_->Run();
// tic.end();
// *predictTime = tic.get_average_ms();
// std::cout << "cls predictor predict costs" << *predictTime;

// tic.start();
cv::Mat srcimg = postprocess(src_img, thresh);
// tic.end();
// *postprocessTime = tic.get_average_ms();
// std::cout << "cls predictor predict costs" << *postprocessTime;
return srcimg;
cv::Mat classifier_resize_image(cv::Mat source_image, int resize_height, int max_resize_width, Options &options) {
auto source_width = source_image.cols;
auto source_height = source_image.rows;
float wh_ratio = static_cast<float>(source_width) / static_cast<float>(source_height);
auto resize_width = int(ceilf(resize_height * wh_ratio));
if (resize_width > max_resize_width) {
resize_width = max_resize_width;
}
if (options.is_debug) {
std::cout << "[DEBUG] Classifier resize image from " << source_width << "x" << source_height << " to "
<< resize_width << "x" << resize_height << std::endl;
}
cv::Mat resized_image;
cv::resize(source_image, resized_image, cv::Size(resize_width, resize_height), 0.f, 0.f, cv::INTER_LINEAR);
if (resize_width < max_resize_width) {
cv::copyMakeBorder(resized_image, resized_image, 0, 0, 0, max_resize_width - resize_width, cv::BORDER_CONSTANT,
cv::Scalar(0, 0, 0));
}
return resized_image;
}
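
A short worked example of the resize arithmetic above (not part of the commit; the 48x192 target shape comes from classifier_image_shape, while the crop sizes are assumed for illustration):

// Self-contained sketch of the classifier_resize_image padding rule.
// Assumptions: the 3x48x192 classifier input shape declared above; the
// example crop dimensions below are invented for illustration only.
#include <cmath>
#include <cstdio>

int main() {
  const int resize_height = 48, max_resize_width = 192;
  const int crops[][2] = {{200, 50}, {100, 50}};  // assumed {width, height} text crops
  for (const auto &crop : crops) {
    float wh_ratio = static_cast<float>(crop[0]) / static_cast<float>(crop[1]);
    int resize_width = static_cast<int>(std::ceil(resize_height * wh_ratio));
    if (resize_width > max_resize_width) resize_width = max_resize_width;
    int right_padding = max_resize_width - resize_width;  // filled with black via copyMakeBorder
    std::printf("%dx%d -> %dx%d, right padding %d px\n", crop[0], crop[1], resize_width, resize_height,
                right_padding);
  }
  return 0;
}
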
26 changes: 20 additions & 6 deletions packages/react-native/cpp/classifier_process.h
@@ -13,19 +13,33 @@
// limitations under the License.

#pragma once

#include "onnx.h"
#include "opencv2/core.hpp"
#include "opencv2/imgproc.hpp"
#include "options.h"
#include "shared.h"
#include "utils.h"

using ClassifierResultData = cv::Mat;

struct ClassifierResult {
ClassifierResultData data {};
ModelPerformance performance {};
};

class ClassifierPredictor {
public:
explicit ClassifierPredictor(const std::string &modelDir, const int cpu_thread_num,
const std::string &cpu_power_mode);
explicit ClassifierPredictor(Options &options);

cv::Mat predict(const cv::Mat &rgb_image, double *preprocess_time, double *predictTime, double *postprocessTime,
const float thresh);
ClassifierResult predict(const cv::Mat &image, const float thresh);

private:
void preprocess(const cv::Mat &rgba_image);
cv::Mat postprocess(const cv::Mat &img, const float thresh);
Options m_options {};

Onnx m_onnx;

ImageRaw preprocess(const cv::Mat &image);

ClassifierResultData postprocess(ModelOutput &model_output, const cv::Mat &source_image, const float thresh);
};
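
For context, a hypothetical caller sketch (not part of this commit) showing how the ClassifierPredictor interface declared above might be used. The model path, image path, and 0.9 threshold are assumptions, and Options is taken to be default-constructible as suggested by the Options m_options {} member:

// Hypothetical usage of the new ClassifierPredictor API; file paths and the
// 0.9 threshold are assumptions, not values taken from this commit.
#include <opencv2/imgcodecs.hpp>
#include "classifier_process.h"

int main() {
  Options options {};  // assumed default-constructible
  options.models.classifier_model_path = "assets/ch_ppocr_mobile_v2.0_cls_infer.onnx";  // assumed path
  ClassifierPredictor classifier {options};

  cv::Mat image = cv::imread("assets/direction.jpg");  // sample image added in this commit
  ClassifierResult result = classifier.predict(image, 0.9f);

  // result.data holds the (possibly rotated) crop; result.performance holds timing info.
  cv::imwrite("direction_out.jpg", result.data);
  return 0;
}
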
13 changes: 5 additions & 8 deletions packages/react-native/cpp/detection_process.cpp
@@ -14,21 +14,18 @@

#include "detection_process.h"
#include <cmath>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "db_post_process.h"
#include "timer.h"

void resize_image(const cv::Mat image, cv::Mat &resized_image, Options &options);

DetectionPredictor::DetectionPredictor(Options &options, const int cpu_thread_num, const std::string &cpu_power_mode)
DetectionPredictor::DetectionPredictor(Options &options)
: m_options {options}, m_onnx {Onnx(options.models.detection_model_path)} {}

DetectionResult DetectionPredictor::predict(cv::Mat &image) {
ModelPerformance performance;
ModelPerformance performance {};

cv::Mat source_image;
image.copyTo(source_image);
@@ -38,12 +35,12 @@ DetectionResult DetectionPredictor::predict(cv::Mat &image) {
auto preprocess_result = preprocess(image);
timer.end();
performance.preprocess_time = timer.get_average_ms();

auto &model_input = preprocess_result.model_input;
auto &resized_image = preprocess_result.resized_image;

// Run predictor
std::vector<int64_t> input_shape = {1, model_input.channels, model_input.height, model_input.width};

timer.start();
auto model_output = m_onnx.run(model_input.data, input_shape);
timer.end();
Expand All @@ -60,8 +57,8 @@ DetectionResult DetectionPredictor::predict(cv::Mat &image) {
return DetectionResult {.data = filter_boxes, .performance = performance};
}

PreprocessResult DetectionPredictor::preprocess(const cv::Mat &source_image) {
PreprocessResult result {};
DetectionPreprocessResult DetectionPredictor::preprocess(const cv::Mat &source_image) {
DetectionPreprocessResult result {};
resize_image(source_image, result.resized_image, m_options);

cv::Mat model_data;
9 changes: 6 additions & 3 deletions packages/react-native/cpp/detection_process.h
@@ -15,14 +15,16 @@
#pragma once

#include <map>
#include <string>
#include <vector>
#include "onnx.h"
#include "opencv2/core.hpp"
#include "opencv2/imgproc.hpp"
#include "options.h"
#include "shared.h"
#include "utils.h"

struct PreprocessResult {
struct DetectionPreprocessResult {
ImageRaw model_input {};
cv::Mat resized_image {};
};
@@ -36,15 +38,16 @@

class DetectionPredictor {
public:
explicit DetectionPredictor(Options &options, const int cpu_thread_num, const std::string &cpu_power_mode);
explicit DetectionPredictor(Options &options);

DetectionResult predict(cv::Mat &rgb_image);

private:
Options m_options {};

Onnx m_onnx;

PreprocessResult preprocess(const cv::Mat &image);
DetectionPreprocessResult preprocess(const cv::Mat &image);

DetectionResultData postprocess(ModelOutput &model_output, const cv::Mat &source_image, const cv::Mat &resized_image,
Options &options);
