From 05734c1e3ae0a3070217e6fccdaffd75852c7dc2 Mon Sep 17 00:00:00 2001
From: Guten Ye <ywzhaifei@gmail.com>
Date: Tue, 28 May 2024 21:39:30 +0800
Subject: [PATCH] feat(cpp): detect returns vector<TextLine>

---
 README.md                                     |  9 ++++++++-
 .../react-native/cpp/example/cpp-example.cpp  |  6 +++---
 packages/react-native/cpp/native-ocr.cpp      | 19 ++++++++-----------
 packages/react-native/cpp/native-ocr.h        |  2 +-
 .../react-native/cpp/recognition_process.cpp  | 15 +++++++++------
 .../react-native/cpp/recognition_process.h    | 18 ++++++++++++++----
 6 files changed, 43 insertions(+), 26 deletions(-)
diff --git a/README.md b/README.md
index 0bf01c9..7db974f 100644
--- a/README.md
+++ b/README.md
@@ -80,7 +80,14 @@ Ocr.create({
 
 ocr.detect(imagePath, { 
   onnxOptions?: {}     // Node only. Pass to ONNX Runtime
-}): Promise<Result>
+}): Promise<TextLine[]>
+
+TextLine {
+  text: string
+  score: number
+  frame: { top, left, width, height }
+}
+
 ```
 
 ## Development
diff --git a/packages/react-native/cpp/example/cpp-example.cpp b/packages/react-native/cpp/example/cpp-example.cpp
index d7606ef..4af3f3f 100644
--- a/packages/react-native/cpp/example/cpp-example.cpp
+++ b/packages/react-native/cpp/example/cpp-example.cpp
@@ -26,10 +26,10 @@ int main(int argc, char* argv[]) {
     fs::remove_all(debug_output_dir);
     fs::create_directory(debug_output_dir);
     NativeOcr* ocr = new NativeOcr(rawOptions, asset_dir, debug_output_dir);
-    auto lines = ocr->detect(image_path);
+    auto text_lines = ocr->detect(image_path);
 
-    // for (auto line : lines) {
-    //   std::cout << line << std::endl;
+    // for (auto text_line : text_lines) {
+    //   std::cout << text_line.score << " " << text_line.text << std::endl;
     // }
     return 0;
   } catch (const std::exception& e) {
diff --git a/packages/react-native/cpp/native-ocr.cpp b/packages/react-native/cpp/native-ocr.cpp
index 4a8d3be..fe074ad 100644
--- a/packages/react-native/cpp/native-ocr.cpp
+++ b/packages/react-native/cpp/native-ocr.cpp
@@ -42,7 +42,7 @@ NativeOcr::NativeOcr(std::unordered_map<std::string, std::any> rawOptions, const
   m_dictionary.push_back(" ");
 }
 
-std::vector<std::string> NativeOcr::detect(std::string &image_path) {
+std::vector<TextLine> NativeOcr::detect(std::string &image_path) {
   Timer timer;
   timer.start();
 
@@ -70,8 +70,7 @@ std::vector<std::string> NativeOcr::detect(std::string &image_path) {
   cv::Mat image_copy;
   image.copyTo(image_copy);
 
-  std::vector<std::string> recognition_text;
-  std::vector<float> recognition_text_score;
+  std::vector<TextLine> text_lines;
   std::vector<ClassifierResult> classifier_results;
   std::vector<RecognitionResult> recognition_results;
   for (int i = detection_result.data.size() - 1; i >= 0; i--) {
@@ -98,8 +97,7 @@ std::vector<std::string> NativeOcr::detect(std::string &image_path) {
     cv::Mat resized_image;
     auto recognition_result = m_recognition_predictor->predict(crop_image, m_dictionary, resized_image);
     recognition_results.push_back(recognition_result);
-    recognition_text.push_back(recognition_result.data.first);
-    recognition_text_score.push_back(recognition_result.data.second);
+    text_lines.push_back(recognition_result.data);
 
     // if (m_options.is_debug) {
     //   auto output_path =
@@ -157,15 +155,14 @@ std::vector<std::string> NativeOcr::detect(std::string &image_path) {
   }
 
   // print recognized text
-  std::vector<std::string> lines(recognition_text.size());
-  for (int i = 0; i < lines.size(); i++) {
-    if (m_options.is_debug) {
-      std::cout << "[DEBUG] " << i << "\t" << recognition_text_score[i] << "\t" << recognition_text[i] << std::endl;
+  if (m_options.is_debug) {
+    for (size_t index = 0; index < text_lines.size(); index++) {
+      const auto &text_line = text_lines[index];  // bind by reference: printing must not copy the TextLine (and its string)
+      std::cout << "[DEBUG] " << index << "\t" << text_line.score << "\t" << text_line.text << std::endl;
     }
-    lines[i] = recognition_text[i];
   }
 
-  return lines;
+  return text_lines;
 }
 
 cv::Mat get_rotate_crop_image(cv::Mat source_image, std::vector<std::vector<int>> box) {
diff --git a/packages/react-native/cpp/native-ocr.h b/packages/react-native/cpp/native-ocr.h
index 7789cb4..c99cc59 100644
--- a/packages/react-native/cpp/native-ocr.h
+++ b/packages/react-native/cpp/native-ocr.h
@@ -31,7 +31,7 @@ class NativeOcr {
   NativeOcr(std::unordered_map<std::string, std::any> rawOptions, const std::string &assetDir,
             const std::string &debugOutputDir);
 
-  std::vector<std::string> detect(std::string &image_path);
+  std::vector<TextLine> detect(std::string &image_path);
 
 private:
   Options m_options;
diff --git a/packages/react-native/cpp/recognition_process.cpp b/packages/react-native/cpp/recognition_process.cpp
index b161c0e..d39d4f4 100644
--- a/packages/react-native/cpp/recognition_process.cpp
+++ b/packages/react-native/cpp/recognition_process.cpp
@@ -47,14 +47,13 @@ RecognitionResult RecognitionPredictor::predict(const cv::Mat &source_image, std
   performance.predict_time = timer.get_average_ms();
 
   timer.start();
-  auto res = postprocess(model_output, source_image, charactor_dict);
+  auto text_line = postprocess(model_output, source_image, charactor_dict);
   timer.end();
-  auto postprocessTime = timer.get_average_ms();
   performance.postprocess_time = timer.get_average_ms();
 
   performance.total_time = performance.preprocess_time + performance.predict_time + performance.postprocess_time;
 
-  return RecognitionResult {.data = res, .performance = performance};
+  return RecognitionResult {.data = text_line, .performance = performance};
 }
 
 ImageRaw RecognitionPredictor::preprocess(const cv::Mat &source_image, cv::Mat &resized_image) {
@@ -76,8 +75,8 @@ ImageRaw RecognitionPredictor::preprocess(const cv::Mat &source_image, cv::Mat &
   return image_raw;
 }
 
-std::pair<std::string, float> RecognitionPredictor::postprocess(ModelOutput &model_output, const cv::Mat &source_image,
-                                                                std::vector<std::string> charactor_dict) {
+TextLine RecognitionPredictor::postprocess(ModelOutput &model_output, const cv::Mat &source_image,
+                                           std::vector<std::string> charactor_dict) {
   auto predict_batch = model_output.data;
   auto predict_shape = model_output.shape;
 
@@ -101,7 +100,11 @@ std::pair<std::string, float> RecognitionPredictor::postprocess(ModelOutput &mod
     last_index = argmax_idx;
   }
   score /= count;
-  return std::make_pair(text, score);
+
+  return TextLine {
+      .text = text,
+      .score = score,
+  };
 }
 
 cv::Mat recognition_resize_image(cv::Mat source_image, int resize_height, int resize_max_width, Options &options) {
diff --git a/packages/react-native/cpp/recognition_process.h b/packages/react-native/cpp/recognition_process.h
index 2738b08..535d6d7 100644
--- a/packages/react-native/cpp/recognition_process.h
+++ b/packages/react-native/cpp/recognition_process.h
@@ -21,10 +21,21 @@
 #include "shared.h"
 #include "utils.h"
 
-using RecognitionResultData = std::pair<std::string, float>;
+struct Frame {  // Rectangle carried by TextLine; NOTE(review): units/origin are not defined here (presumably image pixels) — confirm
+  double width {};   // value-initialized to 0
+  double height {};  // value-initialized to 0
+  double top {};     // NOTE(review): presumably the offset of the top edge — confirm
+  double left {};    // NOTE(review): presumably the offset of the left edge — confirm
+};
+
+struct TextLine {  // One recognized line of text; NativeOcr::detect returns a vector of these
+  std::string text {};  // text produced by RecognitionPredictor::postprocess
+  float score {};       // postprocess's accumulated score divided by its count
+  Frame frame {};       // not assigned by postprocess; stays value-initialized unless a caller fills it in
+};
 
 struct RecognitionResult {
-  RecognitionResultData data {};
+  TextLine data {};  // text line returned by RecognitionPredictor::postprocess
   ModelPerformance performance {};
 };
 
@@ -40,6 +51,5 @@ class RecognitionPredictor {
 
   ImageRaw preprocess(const cv::Mat &rgba_image, cv::Mat &resized_image);
 
-  RecognitionResultData postprocess(ModelOutput &model_output, const cv::Mat &rgba_image,
-                                    std::vector<std::string> charactor_dict);
+  TextLine postprocess(ModelOutput &model_output, const cv::Mat &rgba_image, std::vector<std::string> charactor_dict);
 };