From 7aca698330263bf9066c7b4448c660972b9779ae Mon Sep 17 00:00:00 2001 From: Alexander Alemayhu Date: Sun, 1 Dec 2024 17:24:11 +0100 Subject: [PATCH] feat: Support PDF file uploads Closes: https://github.com/2anki/server/issues/1471 --- src/lib/parser/DeckParser.ts | 20 ++-- src/lib/parser/PrepareDeck.ts | 23 ++-- src/lib/parser/WorkSpace.ts | 1 + src/lib/parser/exporters/embedFile.ts | 34 ++++-- src/lib/parser/pdf/convertPDFToImages.ts | 100 ++++++++++++++++++ src/lib/storage/checks.ts | 7 +- src/usecases/uploads/allowPDFUpload.ts | 9 -- src/usecases/uploads/getPackagesFromZip.ts | 6 +- .../uploads/isZipContentFileSupported.ts | 4 +- src/usecases/uploads/worker.ts | 6 +- 10 files changed, 165 insertions(+), 45 deletions(-) create mode 100644 src/lib/parser/pdf/convertPDFToImages.ts delete mode 100644 src/usecases/uploads/allowPDFUpload.ts diff --git a/src/lib/parser/DeckParser.ts b/src/lib/parser/DeckParser.ts index c72eb775..e298a03e 100644 --- a/src/lib/parser/DeckParser.ts +++ b/src/lib/parser/DeckParser.ts @@ -390,11 +390,12 @@ export class DeckParser { images.each((_i, elem) => { const originalName = dom(elem).attr('src'); if (originalName && isImageFileEmbedable(originalName)) { - const newName = embedFile( + const newName = embedFile({ exporter, - this.files, - decodeURIComponent(originalName) - ); + files: this.files, + filePath: decodeURIComponent(originalName), + workspace: ws, + }); if (newName) { dom(elem).attr('src', newName); card.media.push(newName); @@ -418,11 +419,12 @@ export class DeckParser { '' ); } - const newFileName = embedFile( + const newFileName = embedFile({ exporter, - this.files, - global.decodeURIComponent(audiofile) - ); + files: this.files, + filePath: global.decodeURIComponent(audiofile), + workspace: ws, + }); if (newFileName) { card.back += `[sound:${newFileName}]`; card.media.push(newFileName); @@ -608,6 +610,7 @@ export class DeckParser { : validSummary; if (toggle || this.settings.maxOne) { const toggleHTML = toggle.html(); + 
// Strip the summary from the toggle body to build the card back. if (toggleHTML) { let b = toggleHTML.replace(summary.html() || '', ''); if (this.settings.isTextOnlyBack) { @@ -630,6 +633,7 @@ export class DeckParser { } return mangleBackSide; })(); + // Build the note from the parsed front and back HTML. const note = new Note(front || '', backSide); note.notionId = parentUL.attr('id'); if (note.notionId && this.settings.addNotionLink) { diff --git a/src/lib/parser/PrepareDeck.ts b/src/lib/parser/PrepareDeck.ts index 617509d1..d1a485d6 100644 --- a/src/lib/parser/PrepareDeck.ts +++ b/src/lib/parser/PrepareDeck.ts @@ -1,8 +1,12 @@ +import fs from 'fs'; +import path from 'path'; + import getDeckFilename from '../anki/getDeckFilename'; import { DeckParser, DeckParserInput } from './DeckParser'; import Deck from './Deck'; import { isPDFFile } from '../storage/checks'; import { convertPDFToHTML } from './experimental/VertexAPI/convertPDFToHTML'; +import { convertPDFToImages } from './pdf/convertPDFToImages'; interface PrepareDeckResult { name: string; @@ -13,14 +17,17 @@ export async function PrepareDeck( input: DeckParserInput ): Promise { - if (input.noLimits && input.settings.vertexAIPDFQuestions) { - // Check for PDF files and convert their contents to HTML - for (const file of input.files) { - if (isPDFFile(file.name) && file.contents) { - file.contents = await convertPDFToHTML( - file.contents.toString('base64') - ); - } + for (const file of input.files) { + if (!isPDFFile(file.name) || !file.contents) continue; + + if (input.noLimits && input.settings.vertexAIPDFQuestions) { + file.contents = await convertPDFToHTML(file.contents.toString('base64')); + } else { + file.contents = await convertPDFToImages(file.contents, input.workspace, input.noLimits); + fs.writeFileSync( + path.join(input.workspace.location, 'input.html'), + file.contents.toString() + ); } } diff --git a/src/lib/parser/WorkSpace.ts b/src/lib/parser/WorkSpace.ts index 768b3dea..84aa98eb 100644 --- a/src/lib/parser/WorkSpace.ts +++ 
b/src/lib/parser/WorkSpace.ts @@ -18,6 +18,7 @@ class Workspace { } private ensureExists() { + // Create the workspace directory lazily on first use. if (!fs.existsSync(this.location)) { fs.mkdirSync(this.location, { recursive: true }); } diff --git a/src/lib/parser/exporters/embedFile.ts b/src/lib/parser/exporters/embedFile.ts index 3b78687e..f72c7690 100644 --- a/src/lib/parser/exporters/embedFile.ts +++ b/src/lib/parser/exporters/embedFile.ts @@ -1,13 +1,28 @@ +import fs from 'fs'; +import path from 'path'; + import { File } from '../../zip/zip'; import { SuffixFrom } from '../../misc/file'; import getUniqueFileName from '../../misc/getUniqueFileName'; import CustomExporter from './CustomExporter'; +// Files materialized on disk inside the workspace take precedence over zip entries. +import Workspace from '../WorkSpace'; const getFile = ( exporter: CustomExporter, files: File[], - filePath: string + filePath: string, + workspace: Workspace ): File | undefined => { + const fullPath = path.resolve(workspace.location, filePath); + if (fullPath.startsWith(workspace.location + path.sep) && fs.existsSync(fullPath)) { + const buffer = fs.readFileSync(fullPath); + return { + name: fullPath, + contents: buffer, + } as File; + } + const asRootFile = files.find((f) => f.name === filePath); if (asRootFile) { return asRootFile; @@ -34,13 +49,18 @@ const getFile = ( return undefined; }; -export const embedFile = ( - exporter: CustomExporter, - files: File[], - filePath: string -): string | null => { +interface EmbedFileInput { + exporter: CustomExporter; + files: File[]; + filePath: string; + workspace: Workspace; +} + +export const embedFile = (input: EmbedFileInput): string | null => { + const { exporter, files, filePath, workspace } = input; + const suffix = SuffixFrom(filePath); - const file = getFile(exporter, files, filePath); + const file = getFile(exporter, files, filePath, workspace); if (file) { const newName = getUniqueFileName(filePath) + suffix; diff --git a/src/lib/parser/pdf/convertPDFToImages.ts 
b/src/lib/parser/pdf/convertPDFToImages.ts new file mode 100644 index 00000000..c5f3842c --- /dev/null +++ b/src/lib/parser/pdf/convertPDFToImages.ts @@ -0,0 +1,100 @@ +import { writeFile } from 'fs/promises'; +import path from 'path'; +import { execFile } from 'child_process'; +import Workspace from '../WorkSpace'; +import { S3 } from 'aws-sdk'; + +function getPageCount(pdfPath: string): Promise { + return new Promise((resolve, reject) => { + execFile('/usr/local/bin/pdfinfo', [pdfPath], (error, stdout) => { + if (error) { + reject(new Error('Failed to execute pdfinfo')); + return; + } + + const pageCount = parseInt( + stdout + .split('\n') + .find((line) => line.startsWith('Pages:')) + ?.split(/\s+/)[1] || '0' + ); + + if (!pageCount) { + reject(new Error('Failed to get page count')); + return; + } + + resolve(pageCount); + }); + }); +} + +function convertPage(pdfPath: string, page: number): Promise { + return new Promise((resolve, reject) => { + const outputBase = `${pdfPath}-page${page}`; + execFile( + 'pdftoppm', + [ + '-png', + '-f', + page.toString(), + '-l', + page.toString(), + pdfPath, + outputBase, + ], + (error) => { + if (error) { + reject(new Error(`Failed to convert page ${page} to PNG`)); + return; + } + resolve(outputBase + `-${page}.png`); + } + ); + }); +} + +function combineIntoHTML(imagePaths: string[]): string { + const html = ` + + + ${Array.from({ length: imagePaths.length / 2 }, (_, i) => { + const front = path.basename(imagePaths[i * 2]); + const back = path.basename(imagePaths[i * 2 + 1]); + return `
    +
+    <ul class="toggle">
+      <li>
+        <details>
+          <summary><img src="${front}"></summary>
+          <img src="${back}">
+        </details>
+      </li>
+    </ul>
`; + }).join('\n')} + +`; + + return html; +} + +export async function convertPDFToImages( + pdfBuffer: S3.Body, + workspace: Workspace, + noLimits = false +): Promise { + const pdfPath = path.join(workspace.location, 'input.pdf'); + await writeFile(pdfPath, Buffer.from(pdfBuffer as Buffer)); + + const pageCount = await getPageCount(pdfPath); + if (!noLimits && pageCount > 100) { + throw new Error('PDF exceeds maximum page limit of 100'); + } + + const imagePaths = await Promise.all( + Array.from({ length: pageCount }, (_, i) => convertPage(pdfPath, i + 1)) + ); + + const html = await combineIntoHTML(imagePaths); + return Buffer.from(html); +} diff --git a/src/lib/storage/checks.ts b/src/lib/storage/checks.ts index cb178b8a..29c6047b 100644 --- a/src/lib/storage/checks.ts +++ b/src/lib/storage/checks.ts @@ -16,8 +16,11 @@ export const isTwitterURL = (url: string) => /twitter\.com/.exec(url); export const isVimeoURL = (url: string) => /vimeo\.com/.exec(url); -export const isImageFileEmbedable = (url: string) => - !url.startsWith('http') && !url.startsWith('data:image'); +export const isImageFileEmbedable = (url: string) => { + const isLocalPath = !url.startsWith('http') && !url.startsWith('data:image'); + const hasTraversal = url.includes('../') || url.includes('..\\'); + return isLocalPath && !hasTraversal; +}; export const isCSVFile = (fileName: string) => /.csv$/i.exec(fileName); diff --git a/src/usecases/uploads/allowPDFUpload.ts b/src/usecases/uploads/allowPDFUpload.ts deleted file mode 100644 index a0b89582..00000000 --- a/src/usecases/uploads/allowPDFUpload.ts +++ /dev/null @@ -1,9 +0,0 @@ -import { isPDFFile } from '../../lib/storage/checks'; - -export const allowPDFUpload = ( - fileName: string, - premium: boolean, - vertexAIPDFQuestions: boolean -): null | false | boolean => { - return isPDFFile(fileName) && premium && vertexAIPDFQuestions; -}; diff --git a/src/usecases/uploads/getPackagesFromZip.ts b/src/usecases/uploads/getPackagesFromZip.ts index 
e8acf3a7..a7f020da 100644 --- a/src/usecases/uploads/getPackagesFromZip.ts +++ b/src/usecases/uploads/getPackagesFromZip.ts @@ -6,7 +6,6 @@ import Package from '../../lib/parser/Package'; import { checkFlashcardsLimits } from '../../lib/User/checkFlashcardsLimits'; import { PackageResult } from './GeneratePackagesUseCase'; import Workspace from '../../lib/parser/WorkSpace'; -import { allowPDFUpload } from './allowPDFUpload'; import { getMaxUploadCount } from '../../lib/misc/getMaxUploadCount'; import { isZipContentFileSupported } from './isZipContentFileSupported'; @@ -33,10 +32,7 @@ export const getPackagesFromZip = async ( /** * XXX: Should we also support files without extensions? */ - if ( - isZipContentFileSupported(fileName) || - allowPDFUpload(fileName, paying, settings.vertexAIPDFQuestions) - ) { + if (isZipContentFileSupported(fileName)) { const deck = await PrepareDeck({ name: fileName, files: zipHandler.files, diff --git a/src/usecases/uploads/isZipContentFileSupported.ts b/src/usecases/uploads/isZipContentFileSupported.ts index a2505662..8f03067c 100644 --- a/src/usecases/uploads/isZipContentFileSupported.ts +++ b/src/usecases/uploads/isZipContentFileSupported.ts @@ -3,6 +3,7 @@ import { isMarkdownFile, isPlainText, isCSVFile, + isPDFFile, } from '../../lib/storage/checks'; /** @@ -12,4 +13,5 @@ export const isZipContentFileSupported = (filename: string) => isHTMLFile(filename) ?? isMarkdownFile(filename) ?? isPlainText(filename) ?? - isCSVFile(filename); + isCSVFile(filename) ?? 
+ isPDFFile(filename); diff --git a/src/usecases/uploads/worker.ts b/src/usecases/uploads/worker.ts index 7134b0bc..fd0aec7f 100644 --- a/src/usecases/uploads/worker.ts +++ b/src/usecases/uploads/worker.ts @@ -7,7 +7,6 @@ import { PrepareDeck } from '../../lib/parser/PrepareDeck'; import { isZIPFile } from '../../lib/storage/checks'; import { getPackagesFromZip } from './getPackagesFromZip'; import Workspace from '../../lib/parser/WorkSpace'; -import { allowPDFUpload } from './allowPDFUpload'; import { isZipContentFileSupported } from './isZipContentFileSupported'; interface GenerationData { @@ -29,10 +28,7 @@ function doGenerationWork(data: GenerationData) { const filename = file.originalname; const key = file.key; - if ( - isZipContentFileSupported(filename) || - allowPDFUpload(filename, paying, settings.vertexAIPDFQuestions) - ) { + if (isZipContentFileSupported(filename)) { const d = await PrepareDeck({ name: filename, files: [{ name: filename, contents: fileContents }],