feat: Support PDF file uploads

Closes: #1471
2anki · Dec 1, 2024 · c66b02f · c66b02f
1 parent 2477eab
commit c66b02f
Show file tree

Hide file tree

Showing 10 changed files with 182 additions and 45 deletions.
diff --git a/src/lib/parser/DeckParser.ts b/src/lib/parser/DeckParser.ts
@@ -390,11 +390,12 @@ export class DeckParser {
               images.each((_i, elem) => {
                 const originalName = dom(elem).attr('src');
                 if (originalName && isImageFileEmbedable(originalName)) {
-                  const newName = embedFile(
+                  const newName = embedFile({
                     exporter,
-                    this.files,
-                    decodeURIComponent(originalName)
-                  );
+                    files: this.files,
+                    filePath: decodeURIComponent(originalName),
+                    workspace: ws,
+                  });
                   if (newName) {
                     dom(elem).attr('src', newName);
                     card.media.push(newName);
@@ -418,11 +419,12 @@ export class DeckParser {
               ''
             );
           }
-          const newFileName = embedFile(
+          const newFileName = embedFile({
             exporter,
-            this.files,
-            global.decodeURIComponent(audiofile)
-          );
+            files: this.files,
+            filePath: global.decodeURIComponent(audiofile),
+            workspace: ws,
+          });
           if (newFileName) {
             card.back += `[sound:${newFileName}]`;
             card.media.push(newFileName);

diff --git a/src/lib/parser/PrepareDeck.ts b/src/lib/parser/PrepareDeck.ts
@@ -1,8 +1,12 @@
+import fs from 'fs';
+import path from 'path';
+
 import getDeckFilename from '../anki/getDeckFilename';
 import { DeckParser, DeckParserInput } from './DeckParser';
 import Deck from './Deck';
 import { isPDFFile } from '../storage/checks';
 import { convertPDFToHTML } from './experimental/VertexAPI/convertPDFToHTML';
+import { convertPDFToImages } from './pdf/convertPDFToImages';
 
 interface PrepareDeckResult {
   name: string;
@@ -13,14 +17,22 @@ interface PrepareDeckResult {
 export async function PrepareDeck(
   input: DeckParserInput
 ): Promise<PrepareDeckResult> {
-  if (input.noLimits && input.settings.vertexAIPDFQuestions) {
-    // Check for PDF files and convert their contents to HTML
-    for (const file of input.files) {
-      if (isPDFFile(file.name) && file.contents) {
-        file.contents = await convertPDFToHTML(
-          file.contents.toString('base64')
-        );
-      }
+  for (const file of input.files) {
+    if (!isPDFFile(file.name) || !file.contents) continue;
+
+    if (input.noLimits && input.settings.vertexAIPDFQuestions) {
+      file.contents = await convertPDFToHTML(file.contents.toString('base64'));
+    } else {
+      file.contents = await convertPDFToImages({
+        name: file.name,
+        workspace: input.workspace,
+        noLimits: input.noLimits,
+        contents: file.contents,
+      });
+      fs.writeFileSync(
+        path.join(input.workspace.location, 'input.html'),
+        file.contents.toString()
+      );
     }
   }
 

diff --git a/src/lib/parser/WorkSpace.ts b/src/lib/parser/WorkSpace.ts
@@ -18,6 +18,7 @@ class Workspace {
   }
 
   private ensureExists() {
+    console.log('Ensuring workspace exists', this.location);
     if (!fs.existsSync(this.location)) {
       fs.mkdirSync(this.location, { recursive: true });
     }

diff --git a/src/lib/parser/exporters/embedFile.ts b/src/lib/parser/exporters/embedFile.ts
@@ -1,13 +1,27 @@
+import fs, { existsSync } from 'fs';
+import path from 'path';
+
 import { File } from '../../zip/zip';
 import { SuffixFrom } from '../../misc/file';
 import getUniqueFileName from '../../misc/getUniqueFileName';
 import CustomExporter from './CustomExporter';
+import Workspace from '../WorkSpace';
 
 const getFile = (
   exporter: CustomExporter,
   files: File[],
-  filePath: string
+  filePath: string,
+  workspace: Workspace
 ): File | undefined => {
+  const fullPath = path.resolve(workspace.location, filePath);
+  if (fullPath.startsWith(workspace.location) && existsSync(fullPath)) {
+    const buffer = fs.readFileSync(fullPath);
+    return {
+      name: fullPath,
+      contents: buffer,
+    } as File;
+  }
+
   const asRootFile = files.find((f) => f.name === filePath);
   if (asRootFile) {
     return asRootFile;
@@ -34,13 +48,18 @@ const getFile = (
   return undefined;
 };
 
-export const embedFile = (
-  exporter: CustomExporter,
-  files: File[],
-  filePath: string
-): string | null => {
+interface EmbedFileInput { // Named arguments accepted by embedFile.
+  exporter: CustomExporter; // Forwarded to getFile for the file lookup.
+  files: File[]; // Candidate files; getFile searches them by name.
+  filePath: string; // Path of the file to embed; getFile also resolves it against workspace.location.
+  workspace: Workspace; // Workspace whose location is checked on disk before `files` is searched.
+}
+
+export const embedFile = (input: EmbedFileInput): string | null => {
+  const { exporter, files, filePath, workspace } = input;
+
   const suffix = SuffixFrom(filePath);
-  const file = getFile(exporter, files, filePath);
+  const file = getFile(exporter, files, filePath, workspace);
 
   if (file) {
     const newName = getUniqueFileName(filePath) + suffix;

diff --git a/src/lib/parser/pdf/convertPDFToImages.ts b/src/lib/parser/pdf/convertPDFToImages.ts
@@ -0,0 +1,115 @@
+import { writeFile } from 'fs/promises';
+import path from 'path';
+import { execFile } from 'child_process';
+import Workspace from '../WorkSpace';
+import { S3 } from 'aws-sdk';
+
+/**
+ * Asks `pdfinfo` how many pages the PDF at `pdfPath` has.
+ *
+ * @param pdfPath - Path of the PDF file handed to `pdfinfo`.
+ * @returns A promise for the number parsed from the `Pages:` line of the output.
+ * @throws Rejects with `Failed to execute pdfinfo` when the command errors, and
+ * with `Failed to get page count` when no non-zero page count can be parsed.
+ */
+function getPageCount(pdfPath: string): Promise<number> {
+  return new Promise<number>((resolve, reject) => {
+    execFile('/usr/local/bin/pdfinfo', [pdfPath], (error, stdout) => {
+      if (error) {
+        reject(new Error('Failed to execute pdfinfo'));
+        return;
+      }
+
+      // Take the second whitespace-separated field of the line that starts
+      // with "Pages:"; fall back to '0' when there is no such line or field.
+      const pagesLine = stdout
+        .split('\n')
+        .find((line) => line.startsWith('Pages:'));
+      const rawCount = pagesLine?.split(/\s+/)[1] || '0';
+      const parsed = parseInt(rawCount);
+
+      // Zero and NaN are both falsy, so either one is reported as a failure.
+      if (parsed) {
+        resolve(parsed);
+      } else {
+        reject(new Error('Failed to get page count'));
+      }
+    });
+  });
+}
+
+/**
+ * Renders a single page of a PDF to a PNG file by running `pdftoppm`.
+ *
+ * @param pdfPath - Path of the source PDF; also the prefix of the output file.
+ * @param page - 1-based page number, passed as both `-f` and `-l`.
+ * @param totalPages - Page count of the whole document; it determines how many
+ * digits the page number has in the generated file name.
+ * @returns A promise for the path of the PNG that `pdftoppm` is expected to
+ * have written.
+ * @throws Rejects with `Failed to convert page <n> to PNG` when the command
+ * errors.
+ */
+function convertPage(
+  pdfPath: string,
+  page: number,
+  totalPages: number
+): Promise<string> {
+  return new Promise((resolve, reject) => {
+    const outputBase = `${pdfPath}-page${page}`;
+    execFile(
+      'pdftoppm',
+      [
+        '-png',
+        '-f',
+        page.toString(),
+        '-l',
+        page.toString(),
+        pdfPath,
+        outputBase,
+      ],
+      (error) => {
+        if (error) {
+          reject(new Error(`Failed to convert page ${page} to PNG`));
+          return;
+        }
+        // pdftoppm appends "-<page>" zero-padded to the number of digits in
+        // the document's total page count (1 digit below 10 pages, 2 below
+        // 100, 3 below 1000, ...). A fixed width of 2 produced names that do
+        // not exist for documents with 100 or more pages.
+        const width = String(totalPages).length;
+        const pageNum = String(page).padStart(width, '0');
+        resolve(`${outputBase}-${pageNum}.png`);
+      }
+    );
+  });
+}
+
+/**
+ * Builds an HTML document in which consecutive page images form toggle
+ * entries: the first image of each pair goes in the `<summary>` and the
+ * second in the `<details>` body.
+ *
+ * Images are referenced by base name only. The names are percent-encoded
+ * because the deck parser runs `decodeURIComponent` on every image `src`
+ * before looking the file up; an unencoded `%` in a file name would otherwise
+ * be misread or make the decode throw.
+ *
+ * @param imagePaths - Page image paths in page order. A trailing unpaired
+ * image is left out, since it has no partner to pair with.
+ * @param title - Text for the document `<title>`; HTML-escaped before use.
+ * @returns The complete HTML document as a string.
+ */
+function combineIntoHTML(imagePaths: string[], title: string): string {
+  const escapeHTML = (text: string): string =>
+    text
+      .replace(/&/g, '&amp;')
+      .replace(/</g, '&lt;')
+      .replace(/>/g, '&gt;')
+      .replace(/"/g, '&quot;');
+
+  // encodeURIComponent output contains no quotes, angle brackets or
+  // ampersands, so the result is also safe inside a double-quoted attribute.
+  const toSrc = (imagePath: string): string =>
+    encodeURIComponent(path.basename(imagePath));
+
+  const cards: string[] = [];
+  for (let i = 0; i + 1 < imagePaths.length; i += 2) {
+    const frontPath = imagePaths[i];
+    const backPath = imagePaths[i + 1];
+    if (frontPath === undefined || backPath === undefined) {
+      break;
+    }
+    const front = toSrc(frontPath);
+    const back = toSrc(backPath);
+    cards.push(`<ul class="toggle">
+    <li>
+      <details>
+        <summary>
+        <img src="${front}" />
+        </summary>
+        <img src="${back}" />
+      </details>
+    </li>
+    </ul>`);
+  }
+
+  return `<!DOCTYPE html>
+<html>
+<head><title>${escapeHTML(title)}</title></head>
+<body>
+  ${cards.join('\n')}
+</body>
+</html>`;
+}
+
+interface ConvertPDFToImagesInput { // Named arguments accepted by convertPDFToImages.
+  workspace: Workspace; // The PDF is written under workspace.location.
+  noLimits: boolean; // When false, PDFs with more than 100 pages are rejected.
+  contents?: S3.Body; // PDF data; copied into a Buffer and written to disk.
+  name?: string; // File name for the PDF inside the workspace; 'Default.pdf' when omitted.
+}
+
+/**
+ * Writes an uploaded PDF into the workspace, renders every page to a PNG and
+ * returns an HTML document that pairs the page images up as toggle entries.
+ *
+ * @param input - Workspace, PDF contents, optional file name and the
+ * `noLimits` flag.
+ * @returns The generated HTML as a Buffer.
+ * @throws Error when `contents` is missing, when the page count cannot be
+ * determined, when a page fails to convert, or when `noLimits` is false and
+ * the PDF has more than 100 pages.
+ */
+export async function convertPDFToImages(
+  input: ConvertPDFToImagesInput
+): Promise<Buffer> {
+  const { contents, workspace, noLimits, name } = input;
+  if (contents == null) {
+    throw new Error('PDF contents are missing');
+  }
+
+  // `name` comes from the upload and may be an archive entry path. Keep only
+  // its final segment so the PDF and its page images always land directly in
+  // the workspace: this blocks `../` escapes, avoids writing into
+  // subdirectories that do not exist, and matches the base-name image
+  // references produced by combineIntoHTML.
+  const baseName = path.basename(name ?? 'Default.pdf');
+  const safeName =
+    baseName === '' || baseName === '.' || baseName === '..'
+      ? 'Default.pdf'
+      : baseName;
+  const pdfPath = path.join(workspace.location, safeName);
+  await writeFile(pdfPath, Buffer.from(contents as Buffer));
+
+  const pageCount = await getPageCount(pdfPath);
+  const title = path.basename(pdfPath);
+  if (!noLimits && pageCount > 100) {
+    throw new Error('PDF exceeds maximum page limit of 100');
+  }
+
+  // Convert pages in small batches instead of spawning one pdftoppm process
+  // per page all at once; page order is preserved.
+  const maxParallel = 8;
+  const imagePaths: string[] = [];
+  for (let first = 1; first <= pageCount; first += maxParallel) {
+    const batchSize = Math.min(maxParallel, pageCount - first + 1);
+    const batch = await Promise.all(
+      Array.from({ length: batchSize }, (_, i) =>
+        convertPage(pdfPath, first + i, pageCount)
+      )
+    );
+    imagePaths.push(...batch);
+  }
+
+  const html = combineIntoHTML(imagePaths, title);
+  return Buffer.from(html);
+}
diff --git a/src/lib/storage/checks.ts b/src/lib/storage/checks.ts
@@ -16,8 +16,11 @@ export const isTwitterURL = (url: string) => /twitter\.com/.exec(url);
 
 export const isVimeoURL = (url: string) => /vimeo\.com/.exec(url);
 
-export const isImageFileEmbedable = (url: string) =>
-  !url.startsWith('http') && !url.startsWith('data:image');
+/**
+ * Decides whether an image reference may be embedded from local files.
+ *
+ * A reference qualifies when it is not an `http…` URL or a `data:image` URI
+ * and contains no `..` path segment. The check covers both the raw value and
+ * its percent-decoded form, because the deck parser decodes the value before
+ * resolving it; checking only the raw value let `..%2F` or `%2e%2e/` through.
+ * Values that cannot be percent-decoded are rejected, since the later decode
+ * would throw on them.
+ *
+ * @param url - The `src` attribute value as found in the document.
+ * @returns `true` when the reference is a local path without traversal.
+ */
+export const isImageFileEmbedable = (url: string) => {
+  const isLocalPath = !url.startsWith('http') && !url.startsWith('data:image');
+  if (!isLocalPath) {
+    return false;
+  }
+
+  let decoded: string;
+  try {
+    decoded = decodeURIComponent(url);
+  } catch {
+    return false;
+  }
+
+  // Treat both separator styles as segment boundaries.
+  const hasTraversal = [url, decoded].some((candidate) =>
+    candidate.split(/[\\/]/).includes('..')
+  );
+  return !hasTraversal;
+};
 
 export const isCSVFile = (fileName: string) => /.csv$/i.exec(fileName);
 

diff --git a/src/usecases/uploads/allowPDFUpload.ts b/src/usecases/uploads/allowPDFUpload.ts
diff --git a/src/usecases/uploads/getPackagesFromZip.ts b/src/usecases/uploads/getPackagesFromZip.ts
@@ -6,7 +6,6 @@ import Package from '../../lib/parser/Package';
 import { checkFlashcardsLimits } from '../../lib/User/checkFlashcardsLimits';
 import { PackageResult } from './GeneratePackagesUseCase';
 import Workspace from '../../lib/parser/WorkSpace';
-import { allowPDFUpload } from './allowPDFUpload';
 import { getMaxUploadCount } from '../../lib/misc/getMaxUploadCount';
 
 import { isZipContentFileSupported } from './isZipContentFileSupported';
@@ -33,10 +32,7 @@ export const getPackagesFromZip = async (
     /**
      * XXX: Should we also support files without extensions?
      */
-    if (
-      isZipContentFileSupported(fileName) ||
-      allowPDFUpload(fileName, paying, settings.vertexAIPDFQuestions)
-    ) {
+    if (isZipContentFileSupported(fileName)) {
       const deck = await PrepareDeck({
         name: fileName,
         files: zipHandler.files,

diff --git a/src/usecases/uploads/isZipContentFileSupported.ts b/src/usecases/uploads/isZipContentFileSupported.ts
@@ -3,6 +3,7 @@ import {
   isMarkdownFile,
   isPlainText,
   isCSVFile,
+  isPDFFile,
 } from '../../lib/storage/checks';
 
 /**
@@ -12,4 +13,5 @@ export const isZipContentFileSupported = (filename: string) =>
   isHTMLFile(filename) ??
   isMarkdownFile(filename) ??
   isPlainText(filename) ??
-  isCSVFile(filename);
+  isCSVFile(filename) ??
+  isPDFFile(filename);
diff --git a/src/usecases/uploads/worker.ts b/src/usecases/uploads/worker.ts
@@ -7,7 +7,6 @@ import { PrepareDeck } from '../../lib/parser/PrepareDeck';
 import { isZIPFile } from '../../lib/storage/checks';
 import { getPackagesFromZip } from './getPackagesFromZip';
 import Workspace from '../../lib/parser/WorkSpace';
-import { allowPDFUpload } from './allowPDFUpload';
 import { isZipContentFileSupported } from './isZipContentFileSupported';
 
 interface GenerationData {
@@ -29,10 +28,7 @@ function doGenerationWork(data: GenerationData) {
       const filename = file.originalname;
       const key = file.key;
 
-      if (
-        isZipContentFileSupported(filename) ||
-        allowPDFUpload(filename, paying, settings.vertexAIPDFQuestions)
-      ) {
+      if (isZipContentFileSupported(filename)) {
         const d = await PrepareDeck({
           name: filename,
           files: [{ name: filename, contents: fileContents }],