feat: convert images to flashcards

Related-to: #1156
Related-to: #1483
2anki · Dec 8, 2024 · bd8c86c · bd8c86c
1 parent 6e988b1
commit bd8c86c
Show file tree

Hide file tree

Showing 10 changed files with 223 additions and 37 deletions.
diff --git a/src/controllers/SettingsController/SettingsController.test.ts b/src/controllers/SettingsController/SettingsController.test.ts
@@ -48,6 +48,7 @@ describe('SettingsController', () => {
       'perserve-newlines': 'true',
       'vertex-ai-pdf-questions': 'false',
       'disable-indented-bullets': 'false',
+      'image-quiz-html-to-anki': 'false',
     });
   });
 
@@ -74,6 +75,7 @@ describe('SettingsController', () => {
       'max-one-toggle-per-card': 'true',
       'perserve-newlines': 'false',
       'page-emoji': 'first-emoji',
+      'image-quiz-html-to-anki': 'false',
     });
   });
 });
diff --git a/src/controllers/SettingsController/supportedOptions.ts b/src/controllers/SettingsController/supportedOptions.ts
@@ -130,6 +130,12 @@ const supportedOptions = (): CardOption[] => {
       'Disable indented bullets from becoming separate cards. This applies to bullet lists.',
       false
     ),
+    new CardOption(
+      'image-quiz-html-to-anki',
+      'Convert Image Quiz HTML to Anki Cards',
+      'Use OCR to extract images and answers from HTML quizzes and convert them into Anki flashcards for review. This is a premium feature.',
+      false
+    ),
   ];
 
   return v.filter(Boolean);

diff --git a/src/lib/parser/PrepareDeck.ts b/src/lib/parser/PrepareDeck.ts
@@ -1,10 +1,11 @@
 import getDeckFilename from '../anki/getDeckFilename';
 import { DeckParser, DeckParserInput } from './DeckParser';
 import Deck from './Deck';
-import { isPDFFile, isPPTFile } from '../storage/checks';
+import { isImageFile, isPDFFile, isPPTFile } from '../storage/checks';
 import { convertPDFToHTML } from './experimental/VertexAPI/convertPDFToHTML';
 import { convertPDFToImages } from '../pdf/convertPDFToImages';
 import { convertPPTToPDF } from '../pdf/ConvertPPTToPDF';
+import { convertImageToHTML } from './experimental/VertexAPI/convertImageToHTML';
 
 interface PrepareDeckResult {
   name: string;
@@ -16,8 +17,21 @@ export async function PrepareDeck(
   input: DeckParserInput
 ): Promise<PrepareDeckResult> {
   for (const file of input.files) {
-    if ((!isPDFFile(file.name) && !isPPTFile(file.name)) || !file.contents)
+    if (!file.contents) {
       continue;
+    }
+
+    if (
+      isImageFile(file.name) &&
+      input.settings.imageQuizHtmlToAnki &&
+      input.noLimits
+    ) {
+      file.contents = await convertImageToHTML(
+        file.contents?.toString('base64')
+      );
+    }
+
+    if (!isPDFFile(file.name) && !isPPTFile(file.name)) continue;
 
     if (
       isPDFFile(file.name) &&

diff --git a/src/lib/parser/Settings/Settings.ts b/src/lib/parser/Settings/Settings.ts
@@ -69,6 +69,8 @@ export class Settings {
   readonly vertexAIPDFQuestions: boolean;
   readonly disableIndentedBulletPoints: boolean;
 
+  readonly imageQuizHtmlToAnki: boolean;
+
   constructor(input: { [key: string]: string }) {
     this.deckName = input.deckName;
     if (this.deckName && !this.deckName.trim()) {
@@ -103,6 +105,7 @@ export class Settings {
     this.vertexAIPDFQuestions = input['vertex-ai-pdf-questions'] === 'true';
     this.disableIndentedBulletPoints =
       input['disable-indented-bullets'] === 'true';
+    this.imageQuizHtmlToAnki = input['image-quiz-html-to-anki'] === 'true';
     /* Is this really needed? */
     if (this.parentBlockId) {
       this.addNotionLink = true;
@@ -143,6 +146,7 @@ export class Settings {
       'max-one-toggle-per-card': 'true',
       'perserve-newlines': 'false',
       'page-emoji': 'first-emoji',
+      'image-quiz-html-to-anki': 'false',
     };
   }
 }
diff --git a/src/lib/parser/experimental/VertexAPI/constants.ts b/src/lib/parser/experimental/VertexAPI/constants.ts
@@ -0,0 +1,20 @@
+import { HarmBlockThreshold, HarmCategory } from '@google-cloud/vertexai';
+
+/**
+ * Safety settings for Vertex AI generative models: each listed harm category
+ * is paired with the BLOCK_NONE threshold.
+ */
+export const SAFETY_SETTINGS = [
+  HarmCategory.HARM_CATEGORY_HATE_SPEECH,
+  HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
+  HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
+  HarmCategory.HARM_CATEGORY_HARASSMENT,
+].map((category) => ({
+  category,
+  threshold: HarmBlockThreshold.BLOCK_NONE,
+}));
diff --git a/src/lib/parser/experimental/VertexAPI/convertImageToHTML.ts b/src/lib/parser/experimental/VertexAPI/convertImageToHTML.ts
@@ -0,0 +1,76 @@
+import { VertexAI } from '@google-cloud/vertexai';
+import { SAFETY_SETTINGS } from './constants';
+
+/** MIME type used when the image format cannot be recognised. */
+const FALLBACK_IMAGE_MIME_TYPE = 'image/png';
+
+/**
+ * Guesses the MIME type of a base64-encoded image from its leading magic bytes.
+ * Only PNG, JPEG and WebP are recognised; anything else keeps the previously
+ * hard-coded `image/png` label.
+ */
+const detectImageMimeType = (base64Data: string): string => {
+  // 24 base64 characters decode to 18 bytes, enough for every signature below.
+  const header = Buffer.from(base64Data.slice(0, 24), 'base64');
+  const hasSignature = (signature: number[], offset = 0): boolean =>
+    signature.every((byte, index) => header[offset + index] === byte);
+
+  if (hasSignature([0x89, 0x50, 0x4e, 0x47])) {
+    return 'image/png';
+  }
+  if (hasSignature([0xff, 0xd8, 0xff])) {
+    return 'image/jpeg';
+  }
+  // WebP layout: "RIFF" <4-byte size> "WEBP".
+  if (
+    hasSignature([0x52, 0x49, 0x46, 0x46]) &&
+    hasSignature([0x57, 0x45, 0x42, 0x50], 8)
+  ) {
+    return 'image/webp';
+  }
+  return FALLBACK_IMAGE_MIME_TYPE;
+};
+
+/**
+ * Sends an image to the Vertex AI generative model and returns the HTML
+ * (toggle list of questions and answers) that the model streams back.
+ *
+ * This is best-effort: request or stream failures are logged and whatever text
+ * was collected up to that point (possibly an empty string) is returned.
+ *
+ * @param imageData base64-encoded image bytes
+ * @param mimeType optional MIME type of the image; when omitted it is detected
+ * from the image's magic bytes, falling back to `image/png`
+ * @returns the generated HTML, or an empty string when nothing was produced
+ */
+export const convertImageToHTML = async (
+  imageData: string,
+  mimeType?: string
+): Promise<string> => {
+  // Nothing to recognise: skip the remote call entirely.
+  if (!imageData) {
+    return '';
+  }
+
+  const vertexAI = new VertexAI({
+    project: 'notion-to-anki',
+    location: 'europe-west3',
+  });
+  const model = 'gemini-1.5-flash-002';
+
+  const generativeModel = vertexAI.preview.getGenerativeModel({
+    model,
+    generationConfig: {
+      maxOutputTokens: 8192,
+      temperature: 1,
+      topP: 0.95,
+    },
+    safetySettings: SAFETY_SETTINGS,
+  });
+
+  const prompt = {
+    text: `Convert the text in this image to the following format: 
+
+        <ul class=\"toggle\">
+          <li>
+           <details>
+            <summary>
+                n) question
+            </summary>
+        <p>A) ..., </p>
+        <p>B)... </p>
+        etc. 
+        <p>and finally Answer: D</p>
+           </details>
+          </li>
+          </ul>
+
+        —
+        - Extra rules: n=is the number for the question, question=the question text
+    - Add newline between the options
+    - If you are not able to detect the pattern above, try converting this into a question and answer format`,
+  };
+
+  const image = {
+    inlineData: {
+      // Label the payload with its real format instead of always claiming PNG.
+      mimeType: mimeType ?? detectImageMimeType(imageData),
+      data: imageData,
+    },
+  };
+
+  const request = {
+    contents: [{ role: 'user', parts: [prompt, image] }],
+  };
+
+  let htmlContent = '';
+  try {
+    const streamingResp = await generativeModel.generateContentStream(request);
+    for await (const item of streamingResp.stream) {
+      // Optional chaining tolerates chunks with a missing or empty
+      // `candidates` list instead of throwing and cutting the stream short.
+      const parts = item.candidates?.[0]?.content?.parts ?? [];
+      htmlContent += parts.map((part) => part.text ?? '').join('');
+    }
+  } catch (error) {
+    console.error('Error generating content stream:', error);
+  }
+
+  return htmlContent;
+};
diff --git a/src/lib/storage/checks.ts b/src/lib/storage/checks.ts
@@ -42,3 +42,12 @@ export const isPotentialZipFile = (
   }
   return filename.trim().endsWith('.') || !filename.includes('.');
 };
+
+/**
+ * Returns a truthy result when `name` passes `isImageFileEmbedable` and ends,
+ * case-insensitively, with one of the listed image extensions.
+ */
+export const isImageFile = (name: string) =>
+  isImageFileEmbedable(name) &&
+  ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg'].some((extension) =>
+    name.toLowerCase().endsWith(extension)
+  );
diff --git a/src/lib/zip/zip.tsx b/src/lib/zip/zip.tsx
@@ -2,8 +2,16 @@ import { strFromU8, unzipSync } from 'fflate';
 import { Body } from 'aws-sdk/clients/s3';
 import { renderToStaticMarkup } from 'react-dom/server';
 import { getUploadLimits } from '../misc/getUploadLimits';
-import { isHTMLFile, isMarkdownFile, isPDFFile } from '../storage/checks';
+import {
+  isHTMLFile,
+  isImageFile,
+  isMarkdownFile,
+  isPDFFile,
+} from '../storage/checks';
 import { processAndPrepareArchiveData } from './fallback/processAndPrepareArchiveData';
+import { convertImageToHTML } from '../parser/experimental/VertexAPI/convertImageToHTML';
+import Settings from '../parser/Settings';
+import { getRandomUUID } from '../../shared/helpers/getRandomUUID';
 
 interface File {
   name: string;
@@ -14,14 +22,16 @@ class ZipHandler {
   files: File[];
   zipFileCount: number;
   maxZipFiles: number;
+  combinedHTML: string;
 
+  /**
+   * Starts with no collected files, a zip counter of zero and an empty OCR
+   * HTML buffer.
+   *
+   * @param maxNestedZipFiles stored as `maxZipFiles`; `processZip` throws once
+   * `zipFileCount` reaches this value
+   */
   constructor(maxNestedZipFiles: number) {
     this.files = [];
     this.zipFileCount = 0;
     this.maxZipFiles = maxNestedZipFiles;
+    this.combinedHTML = '';
   }
 
-  async build(zipData: Uint8Array, paying: boolean) {
+  async build(zipData: Uint8Array, paying: boolean, settings: Settings) {
     const size = Buffer.byteLength(zipData);
     const limits = getUploadLimits(paying);
 
@@ -38,50 +48,89 @@ class ZipHandler {
       );
     }
 
-    await this.processZip(zipData, paying);
+    await this.processZip(zipData, paying, settings);
   }
 
-  private async processZip(zipData: Uint8Array, paying: boolean) {
+  /**
+   * Unpacks one archive, passes every entry to `handleFile`, then calls
+   * `addCombinedHTMLToFiles`. Unpacking failures are delegated to
+   * `handleZipError`.
+   *
+   * @param zipData raw bytes of the archive
+   * @param paying forwarded to the per-file handlers and the fallback
+   * @param settings forwarded to the per-file handlers
+   * @throws Error when `zipFileCount` has reached `maxZipFiles`, or whatever
+   * `handleZipError` rethrows
+   */
+  private async processZip(
+    zipData: Uint8Array,
+    paying: boolean,
+    settings: Settings
+  ) {
     if (this.zipFileCount >= this.maxZipFiles) {
       throw new Error('Too many zip files in the upload.');
     }
 
     try {
       const loadedZip = unzipSync(zipData, {
-        filter(file) {
-          return !file.name.endsWith('/');
-        },
+        filter: (file) => !file.name.endsWith('/'),
       });
 
       for (const name in loadedZip) {
         const file = loadedZip[name];
-        let contents = file;
-
-        if (name.includes('__MACOSX/') || isPDFFile(name)) {
-          continue;
-        }
-
-        if (name.endsWith('.zip')) {
-          this.zipFileCount++;
-          await this.processZip(file, paying);
-        } else if ((isHTMLFile(name) || isMarkdownFile(name)) && contents) {
-          this.files.push({ name, contents: strFromU8(file) });
-        } else if (contents) {
-          this.files.push({ name, contents });
-        }
+        await this.handleFile(name, file, paying, settings);
       }
+
+      this.addCombinedHTMLToFiles(paying, settings);
     } catch (error: unknown) {
-      // Code 13 indicates we need to use fallback archive processing
-      const isArchiveProcessingError = (error as { code?: number }).code === 13;
-
-      if (isArchiveProcessingError) {
-        // Use fallback method to process archive
-        const foundFiles = await processAndPrepareArchiveData(zipData, paying);
-        this.files.push(...foundFiles);
-        console.log('Processed files using fallback method:', this.files);
-      } else {
-        throw error;
-      }
+      // handleZipError is async: await it so fallback files are in place before
+      // this method resolves and so a rethrown error reaches the caller instead
+      // of becoming an unhandled rejection.
+      await this.handleZipError(error, zipData, paying);
+    }
+  }
+
+  /**
+   * Routes a single archive entry:
+   * - `__MACOSX/` entries and PDFs are ignored;
+   * - nested `.zip` entries bump `zipFileCount` and are processed recursively;
+   * - HTML and Markdown entries are stored as decoded strings;
+   * - images are converted to HTML when `paying` and the
+   *   `imageQuizHtmlToAnki` setting are both set;
+   * - everything else is stored as raw bytes.
+   */
+  private async handleFile(
+    name: string,
+    file: Uint8Array,
+    paying: boolean,
+    settings: Settings
+  ) {
+    const isIgnored = name.includes('__MACOSX/') || isPDFFile(name);
+    if (isIgnored) {
+      return;
+    }
+
+    if (name.endsWith('.zip')) {
+      this.zipFileCount++;
+      await this.processZip(file, paying, settings);
+      return;
+    }
+
+    if (isHTMLFile(name) || isMarkdownFile(name)) {
+      this.files.push({ name, contents: strFromU8(file) });
+      return;
+    }
+
+    const shouldConvertImage =
+      paying && settings.imageQuizHtmlToAnki && isImageFile(name);
+    if (shouldConvertImage) {
+      await this.convertAndAddImageToHTML(name, file);
+      return;
+    }
+
+    this.files.push({ name, contents: file });
+  }
+
+  /**
+   * Base64-encodes the image bytes, converts them with `convertImageToHTML`,
+   * appends the result to `combinedHTML` and logs the conversion.
+   */
+  private async convertAndAddImageToHTML(name: string, file: Uint8Array) {
+    const encodedImage = Buffer.from(file).toString('base64');
+    const convertedHTML = await convertImageToHTML(encodedImage);
+    this.combinedHTML = this.combinedHTML + convertedHTML;
+    console.log('Converted image to HTML:', name, convertedHTML);
+  }
+
+  /**
+   * Wraps the OCR output collected in `combinedHTML` in a minimal HTML
+   * document, adds it to `files` as `ocr-<uuid>.html` and clears the buffer.
+   * Does nothing when the buffer is empty or `paying` is false.
+   *
+   * @param paying whether the premium path is enabled
+   * @param settings supplies `deckName` for the document title
+   */
+  private addCombinedHTMLToFiles(paying: boolean, settings: Settings) {
+    if (!this.combinedHTML || !paying) {
+      return;
+    }
+
+    // The deck name comes from the settings input (presumably user-supplied);
+    // escape it so markup characters cannot break out of <title>.
+    const title = (settings.deckName ?? 'Image Quiz')
+      .replace(/&/g, '&amp;')
+      .replace(/</g, '&lt;')
+      .replace(/>/g, '&gt;');
+
+    const finalHTML = `<!DOCTYPE html>
+<html>
+<head><title>${title}</title></head>
+<body>
+${this.combinedHTML}
+</body>
+</html>`;
+    this.files.push({
+      name: `ocr-${getRandomUUID()}.html`,
+      contents: finalHTML,
+    });
+
+    // processZip calls this once per archive, nested ones included. Clear the
+    // buffer so content already emitted for an inner archive is not emitted
+    // again by the enclosing one.
+    this.combinedHTML = '';
+  }
+
+  /**
+   * Handles a failure raised while processing `zipData`. Errors whose `code`
+   * is 13 are retried through `processAndPrepareArchiveData`, and the files it
+   * returns are appended to `files`; any other error is rethrown.
+   *
+   * @param error the value caught by the caller
+   * @param zipData raw bytes of the archive that failed
+   * @param paying forwarded to the fallback processor
+   */
+  private async handleZipError(
+    error: unknown,
+    zipData: Uint8Array,
+    paying: boolean
+  ) {
+    // Code 13 indicates we need to use fallback archive processing
+    const isArchiveProcessingError = (error as { code?: number }).code === 13;
+
+    if (isArchiveProcessingError) {
+      // Use fallback method to process archive
+      const foundFiles = await processAndPrepareArchiveData(zipData, paying);
+      this.files.push(...foundFiles);
+      console.log('Processed files using fallback method:', this.files);
+    } else {
+      throw error;
     }
   }
 

diff --git a/src/usecases/uploads/getPackagesFromZip.ts b/src/usecases/uploads/getPackagesFromZip.ts
@@ -23,7 +23,7 @@ export const getPackagesFromZip = async (
     return { packages: [] };
   }
 
-  await zipHandler.build(fileContents as Uint8Array, paying);
+  await zipHandler.build(fileContents as Uint8Array, paying, settings);
 
   const fileNames = zipHandler.getFileNames();
 

diff --git a/src/usecases/uploads/worker.ts b/src/usecases/uploads/worker.ts
@@ -4,7 +4,11 @@ import Settings from '../../lib/parser/Settings';
 import Package from '../../lib/parser/Package';
 import fs from 'fs';
 import { PrepareDeck } from '../../lib/parser/PrepareDeck';
-import { isPotentialZipFile, isZIPFile } from '../../lib/storage/checks';
+import {
+  isImageFile,
+  isPotentialZipFile,
+  isZIPFile,
+} from '../../lib/storage/checks';
 import { getPackagesFromZip } from './getPackagesFromZip';
 import Workspace from '../../lib/parser/WorkSpace';
 import { isZipContentFileSupported } from './isZipContentFileSupported';
@@ -28,7 +32,9 @@ function doGenerationWork(data: GenerationData) {
       const filename = file.originalname;
       const key = file.key;
 
-      if (isZipContentFileSupported(filename)) {
+      const allowImageQuizHtmlToAnki =
+        paying && settings.imageQuizHtmlToAnki && isImageFile(filename);
+      if (isZipContentFileSupported(filename) || allowImageQuizHtmlToAnki) {
         const d = await PrepareDeck({
           name: filename,
           files: [{ name: filename, contents: fileContents }],