Merge pull request #272 from priyanshuverma-dev/feat-ocr-magic

feat: OCR Magic Tool added
kom-senapati · Nov 5, 2024 · 02a7803 · 02a7803
2 parents 48f05f6 + 7b61876
commit 02a7803
Show file tree

Hide file tree

Showing 8 changed files with 183 additions and 3 deletions.
diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
@@ -123,6 +123,14 @@ flask db migrate -m "Describe migration"
 flask db upgrade
 ```
 
+### OCR Support (Optional)
+
+Install the Tesseract OCR engine, which the `/api/ocr` endpoint calls through `pytesseract`. On Debian/Ubuntu:
+
+```
+sudo apt install tesseract-ocr
+```
+
 ---
 
 ## Running the Project

diff --git a/app/api_routes.py b/app/api_routes.py
@@ -12,6 +12,8 @@
 from .helpers import create_default_chatbots
 from .data_fetcher import fetch_contribution_data
 from datetime import datetime
+import PIL
+import pytesseract
 import re
 from flask_jwt_extended import (
     create_access_token,
@@ -316,7 +318,9 @@ def api_chatbot(chatbot_id: int) -> Union[Response, tuple[Response, int]]:
     query: str = data.get("query")
     apikey = request.headers["apikey"]
     engine = request.headers["engine"]
-    chat_to_pass: List[Dict[str, str]] = [{"role": "system", "content": chatbot.latest_version.prompt}]
+    chat_to_pass: List[Dict[str, str]] = [
+        {"role": "system", "content": chatbot.latest_version.prompt}
+    ]
     for chat in chats:
         chat_to_pass.append({"role": "user", "content": chat.user_query})
         chat_to_pass.append({"role": "assistant", "content": chat.response})
@@ -822,3 +826,27 @@ def api_translate():
 
     except Exception as e:
         return jsonify({"success": False, "message": str(e)}), 500
+
+
+@api_bp.route("/api/ocr", methods=["POST"])
+@jwt_required()
+def api_ocr():
+    try:
+        if "file" not in request.files:
+            return jsonify({"success": False, "error": "No file provided"}), 400
+
+        file = request.files["file"]
+        base_path = os.path.dirname(os.path.abspath(__file__))
+        temp_audio_dir = os.path.join(base_path, "temp_images")
+        os.makedirs(temp_audio_dir, exist_ok=True)
+        filepath = os.path.join(temp_audio_dir, file.filename)
+        file.save(filepath)
+
+        image = PIL.Image.open(filepath)
+        text = pytesseract.image_to_string(image)
+
+        os.remove(filepath)
+        return jsonify({"success": True, "text": text}), 200
+
+    except Exception as e:
+        return jsonify({"success": False, "message": str(e)}), 500
diff --git a/client/src/components/modals/command-modal.tsx b/client/src/components/modals/command-modal.tsx
@@ -20,10 +20,12 @@ import {
   Languages,
   PanelTopInactive,
   Plus,
+  TextCursorInput,
 } from "lucide-react";
 import {
   useCreateChatbotModal,
   useImagineModal,
+  useOcrMagic,
   useSettingsModal,
   useTranslateMagicModal,
   useTtsMagicModal,
@@ -38,6 +40,7 @@ export function CommandModal() {
   const settingsModal = useSettingsModal();
   const imagineModal = useImagineModal();
   const ttsModal = useTtsMagicModal();
+  const ocrModal = useOcrMagic();
   const translateModal = useTranslateMagicModal();
   const navigate = useNavigate();
 
@@ -72,6 +75,10 @@ export function CommandModal() {
               <Languages />
               <span>{t("commandbox.translate")}</span>
             </CommandItem>
+            <CommandItem onSelect={() => ocrModal.onOpen()}>
+              <TextCursorInput />
+              <span>Text Extractor (OCR)</span>
+            </CommandItem>
             <CommandItem onSelect={() => imagineModal.onOpen()}>
               <Image />
               <span>{t("commandbox.image_generation")}</span>

diff --git a/client/src/components/modals/ocr-magic-modal.tsx b/client/src/components/modals/ocr-magic-modal.tsx
@@ -0,0 +1,131 @@
+import {
+  AlertDialog,
+  AlertDialogContent,
+  AlertDialogDescription,
+  AlertDialogFooter,
+  AlertDialogHeader,
+  AlertDialogTitle,
+} from "@/components/ui/alert-dialog";
+import { SERVER_URL } from "@/lib/utils";
+import { useOcrMagic } from "@/stores/modal-store";
+import axios from "axios";
+import { useState } from "react";
+import { Button } from "../ui/button";
+import toast from "react-hot-toast";
+
+import { X } from "lucide-react";
+
+import { Input } from "../ui/input";
+import { Skeleton } from "../ui/skeleton";
+
+/**
+ * Modal that uploads an image to the `/api/ocr` endpoint and displays the
+ * text extracted from it.
+ *
+ * Visibility is driven by the `useOcrMagic` store. The selected file, its
+ * preview URL, the in-flight flag and the OCR result live in local state.
+ */
+export default function OcrMagicModal() {
+  const modal = useOcrMagic();
+  const [loading, setLoading] = useState(false);
+  const [ocrText, setOcrText] = useState("");
+
+  const [selectedFile, setSelectedFile] = useState<File | null>(null);
+  const [imagePreview, setImagePreview] = useState<string | null>(null);
+
+  /** Tracks the file input: stores the chosen file and a preview URL for it. */
+  const handleFileChange = (event: React.ChangeEvent<HTMLInputElement>) => {
+    const file = event.target.files?.[0] ?? null;
+
+    // Object URLs stay allocated until revoked, so release the previous
+    // preview before replacing it.
+    if (imagePreview) {
+      URL.revokeObjectURL(imagePreview);
+    }
+
+    // Mirror the input even when it was cleared, so a stale file is never
+    // submitted or previewed.
+    setSelectedFile(file);
+    setImagePreview(file ? URL.createObjectURL(file) : null);
+  };
+
+  /** Posts the selected image to the OCR endpoint and stores the result. */
+  const handleOcrSubmit = async (event: React.FormEvent<HTMLFormElement>) => {
+    event.preventDefault();
+    setOcrText("");
+    if (!selectedFile) {
+      toast.error("Please select a file!");
+      return;
+    }
+
+    const formData = new FormData();
+    formData.append("file", selectedFile);
+    setLoading(true);
+    try {
+      const token = localStorage.getItem("token");
+
+      const authHeaders = {
+        Authorization: `Bearer ${token || ""}`,
+      };
+      const response = await axios.post(`${SERVER_URL}/api/ocr`, formData, {
+        headers: authHeaders,
+      });
+      setOcrText(response.data.text);
+    } catch (error) {
+      console.error("Error fetching OCR text:", error);
+      // Tell the user as well; a console message alone looks like nothing
+      // happened.
+      toast.error("Failed to extract text from the image.");
+    } finally {
+      setLoading(false);
+    }
+  };
+
+  return (
+    <AlertDialog open={modal.isOpen} onOpenChange={() => modal.onClose()}>
+      <AlertDialogContent>
+        <AlertDialogHeader>
+          <AlertDialogTitle>
+            <div className="flex items-center justify-between">
+              <p>OCR Magic Tool</p>
+              <Button
+                variant={"outline"}
+                size={"icon"}
+                className="rounded-full"
+                onClick={() => modal.onClose()}
+              >
+                <X />
+              </Button>
+            </div>
+          </AlertDialogTitle>
+          <AlertDialogDescription>
+            Extract Text from Image.
+          </AlertDialogDescription>
+          <div className="grid gap-4 w-full">
+            {imagePreview && (
+              <div className="relative aspect-video">
+                <img
+                  src={imagePreview}
+                  alt="Uploaded image preview"
+                  className="object-contain w-full h-full"
+                />
+              </div>
+            )}
+            {loading && (
+              <div className="space-y-2">
+                <Skeleton className="h-4 w-full" />
+                <Skeleton className="h-4 w-[90%]" />
+                <Skeleton className="h-4 w-[75%]" />
+              </div>
+            )}
+            {ocrText && (
+              <div className="p-4 bg-muted rounded-md">
+                {/* Keep the line breaks of the recognised text. */}
+                <p className="text-sm whitespace-pre-wrap">{ocrText}</p>
+              </div>
+            )}
+            <div className="flex items-center gap-4">
+              <form
+                onSubmit={handleOcrSubmit}
+                className="w-full flex items-center flex-col gap-4"
+              >
+                <Input
+                  disabled={loading}
+                  type="file"
+                  onChange={handleFileChange}
+                  accept="image/*"
+                  className="cursor-pointer"
+                />
+                <Button
+                  disabled={loading}
+                  className="w-full"
+                  variant={"outline"}
+                  type="submit"
+                >
+                  {loading ? "Extracting..." : "Extract"}
+                </Button>
+              </form>
+            </div>
+          </div>
+        </AlertDialogHeader>
+        <AlertDialogFooter></AlertDialogFooter>
+      </AlertDialogContent>
+    </AlertDialog>
+  );
+}
diff --git a/client/src/contexts/modals.tsx b/client/src/contexts/modals.tsx
@@ -1,6 +1,7 @@
 import CreateChatbotModal from "@/components/modals/create-chatbot-modal";
 import DeleteChatbotModal from "@/components/modals/delete-chatbot-modal";
 import ImagineModal from "@/components/modals/imgine-modal";
+import OcrMagicModal from "@/components/modals/ocr-magic-modal";
 import SettingsModal from "@/components/modals/settings-modal";
 import ShareModal from "@/components/modals/share-modal";
 import TranslateMagicModal from "@/components/modals/translate-magic-modal";
@@ -16,6 +17,7 @@ export default function Modals() {
       <UpdateProfileModal />
       <SettingsModal />
       <ShareModal />
+      <OcrMagicModal />
       <TtsMagicModal />
       <TranslateMagicModal />
       <ImagineModal />

diff --git a/client/src/stores/modal-store.ts b/client/src/stores/modal-store.ts
@@ -10,3 +10,4 @@ export const useShareModal = create<DefaultModal>(defaultModalValues);
 export const useTtsMagicModal = create<DefaultModal>(defaultModalValues);
 export const useTranslateMagicModal = create<DefaultModal>(defaultModalValues);
 export const useImagineModal = create<DefaultModal>(defaultModalValues);
+export const useOcrMagic = create<DefaultModal>(defaultModalValues);
diff --git a/dev-requirements.txt b/dev-requirements.txt
@@ -17,4 +17,6 @@ anthropic
 gTTS
 beautifulsoup4
 Markdown
-translate
+translate
+pytesseract
+pillow
diff --git a/requirements.txt b/requirements.txt
@@ -15,4 +15,6 @@ anthropic
 gTTS
 beautifulsoup4
 Markdown
-translate
+translate
+pytesseract
+pillow