From 7aca698330263bf9066c7b4448c660972b9779ae Mon Sep 17 00:00:00 2001 From: Alexander Alemayhu Date: Sun, 1 Dec 2024 17:24:11 +0100 Subject: [PATCH] feat: Support PDF file uploads Closes: https://github.com/2anki/server/issues/1471 --- src/lib/parser/DeckParser.ts | 20 ++-- src/lib/parser/PrepareDeck.ts | 23 ++-- src/lib/parser/WorkSpace.ts | 1 + src/lib/parser/exporters/embedFile.ts | 34 ++++-- src/lib/parser/pdf/convertPDFToImages.ts | 100 ++++++++++++++++++ src/lib/storage/checks.ts | 7 +- src/usecases/uploads/allowPDFUpload.ts | 9 -- src/usecases/uploads/getPackagesFromZip.ts | 6 +- .../uploads/isZipContentFileSupported.ts | 4 +- src/usecases/uploads/worker.ts | 6 +- 10 files changed, 165 insertions(+), 45 deletions(-) create mode 100644 src/lib/parser/pdf/convertPDFToImages.ts delete mode 100644 src/usecases/uploads/allowPDFUpload.ts diff --git a/src/lib/parser/DeckParser.ts b/src/lib/parser/DeckParser.ts index c72eb775..e298a03e 100644 --- a/src/lib/parser/DeckParser.ts +++ b/src/lib/parser/DeckParser.ts @@ -390,11 +390,12 @@ export class DeckParser { images.each((_i, elem) => { const originalName = dom(elem).attr('src'); if (originalName && isImageFileEmbedable(originalName)) { - const newName = embedFile( + const newName = embedFile({ exporter, - this.files, - decodeURIComponent(originalName) - ); + files: this.files, + filePath: decodeURIComponent(originalName), + workspace: ws, + }); if (newName) { dom(elem).attr('src', newName); card.media.push(newName); @@ -418,11 +419,12 @@ export class DeckParser { '' ); } - const newFileName = embedFile( + const newFileName = embedFile({ exporter, - this.files, - global.decodeURIComponent(audiofile) - ); + files: this.files, + filePath: global.decodeURIComponent(audiofile), + workspace: ws, + }); if (newFileName) { card.back += `[sound:${newFileName}]`; card.media.push(newFileName); @@ -608,6 +610,7 @@ export class DeckParser { : validSummary; if (toggle || this.settings.maxOne) { const toggleHTML = toggle.html(); + 
// Strip the summary from the toggle body to build the card back. if (toggleHTML) { let b = toggleHTML.replace(summary.html() || '', ''); if (this.settings.isTextOnlyBack) { @@ -630,6 +633,7 @@ export class DeckParser { } return mangleBackSide; })(); + // Build the note from the parsed front and back HTML. const note = new Note(front || '', backSide); note.notionId = parentUL.attr('id'); if (note.notionId && this.settings.addNotionLink) { diff --git a/src/lib/parser/PrepareDeck.ts b/src/lib/parser/PrepareDeck.ts index 617509d1..d1a485d6 100644 --- a/src/lib/parser/PrepareDeck.ts +++ b/src/lib/parser/PrepareDeck.ts @@ -1,8 +1,12 @@ +import fs from 'fs'; +import path from 'path'; + import getDeckFilename from '../anki/getDeckFilename'; import { DeckParser, DeckParserInput } from './DeckParser'; import Deck from './Deck'; import { isPDFFile } from '../storage/checks'; import { convertPDFToHTML } from './experimental/VertexAPI/convertPDFToHTML'; +import { convertPDFToImages } from './pdf/convertPDFToImages'; interface PrepareDeckResult { name: string; @@ -13,14 +17,17 @@ export async function PrepareDeck( input: DeckParserInput ): Promise { - if (input.noLimits && input.settings.vertexAIPDFQuestions) { - // Check for PDF files and convert their contents to HTML - for (const file of input.files) { - if (isPDFFile(file.name) && file.contents) { - file.contents = await convertPDFToHTML( - file.contents.toString('base64') - ); - } + for (const file of input.files) { + if (!isPDFFile(file.name) || !file.contents) continue; + + if (input.noLimits && input.settings.vertexAIPDFQuestions) { + file.contents = await convertPDFToHTML(file.contents.toString('base64')); + } else { + file.contents = await convertPDFToImages(file.contents, input.workspace, input.noLimits); + fs.writeFileSync( + path.join(input.workspace.location, 'input.html'), + file.contents.toString() + ); } } diff --git a/src/lib/parser/WorkSpace.ts b/src/lib/parser/WorkSpace.ts index 768b3dea..84aa98eb 100644 --- a/src/lib/parser/WorkSpace.ts +++ 
b/src/lib/parser/WorkSpace.ts @@ -18,6 +18,7 @@ class Workspace { } private ensureExists() { + // Create the workspace directory lazily on first use. if (!fs.existsSync(this.location)) { fs.mkdirSync(this.location, { recursive: true }); } diff --git a/src/lib/parser/exporters/embedFile.ts b/src/lib/parser/exporters/embedFile.ts index 3b78687e..f72c7690 100644 --- a/src/lib/parser/exporters/embedFile.ts +++ b/src/lib/parser/exporters/embedFile.ts @@ -1,13 +1,28 @@ +import fs from 'fs'; +import path from 'path'; + import { File } from '../../zip/zip'; import { SuffixFrom } from '../../misc/file'; import getUniqueFileName from '../../misc/getUniqueFileName'; import CustomExporter from './CustomExporter'; +// Files materialized on disk inside the workspace take precedence over zip entries. +import Workspace from '../WorkSpace'; const getFile = ( exporter: CustomExporter, files: File[], - filePath: string + filePath: string, + workspace: Workspace ): File | undefined => { + const fullPath = path.resolve(workspace.location, filePath); + if (fullPath.startsWith(workspace.location + path.sep) && fs.existsSync(fullPath)) { + const buffer = fs.readFileSync(fullPath); + return { + name: fullPath, + contents: buffer, + } as File; + } + const asRootFile = files.find((f) => f.name === filePath); if (asRootFile) { return asRootFile; @@ -34,13 +49,18 @@ const getFile = ( return undefined; }; -export const embedFile = ( - exporter: CustomExporter, - files: File[], - filePath: string -): string | null => { +interface EmbedFileInput { + exporter: CustomExporter; + files: File[]; + filePath: string; + workspace: Workspace; +} + +export const embedFile = (input: EmbedFileInput): string | null => { + const { exporter, files, filePath, workspace } = input; + const suffix = SuffixFrom(filePath); - const file = getFile(exporter, files, filePath); + const file = getFile(exporter, files, filePath, workspace); if (file) { const newName = getUniqueFileName(filePath) + suffix; diff --git a/src/lib/parser/pdf/convertPDFToImages.ts 
b/src/lib/parser/pdf/convertPDFToImages.ts new file mode 100644 index 00000000..c5f3842c --- /dev/null +++ b/src/lib/parser/pdf/convertPDFToImages.ts @@ -0,0 +1,100 @@ +import { writeFile } from 'fs/promises'; +import path from 'path'; +import { execFile } from 'child_process'; +import Workspace from '../WorkSpace'; +import { S3 } from 'aws-sdk'; + +function getPageCount(pdfPath: string): Promise { + return new Promise((resolve, reject) => { + execFile('/usr/local/bin/pdfinfo', [pdfPath], (error, stdout) => { + if (error) { + reject(new Error('Failed to execute pdfinfo')); + return; + } + + const pageCount = parseInt( + stdout + .split('\n') + .find((line) => line.startsWith('Pages:')) + ?.split(/\s+/)[1] || '0' + ); + + if (!pageCount) { + reject(new Error('Failed to get page count')); + return; + } + + resolve(pageCount); + }); + }); +} + +function convertPage(pdfPath: string, page: number): Promise { + return new Promise((resolve, reject) => { + const outputBase = `${pdfPath}-page${page}`; + execFile( + 'pdftoppm', + [ + '-png', + '-f', + page.toString(), + '-l', + page.toString(), + pdfPath, + outputBase, + ], + (error) => { + if (error) { + reject(new Error(`Failed to convert page ${page} to PNG`)); + return; + } + resolve(outputBase + `-${page}.png`); + } + ); + }); +} + +function combineIntoHTML(imagePaths: string[]): string { + const html = ` + + + ${Array.from({ length: imagePaths.length / 2 }, (_, i) => { + const front = path.basename(imagePaths[i * 2]); + const back = path.basename(imagePaths[i * 2 + 1]); + return `
    +
+    <ul class="toggle">
+      <li>
+        <details>
+          <summary><img src="${front}"></summary>
+          <img src="${back}">
+        </details>
+      </li>
+    </ul>
`; + }).join('\n')} + +`; + + return html; +} + +export async function convertPDFToImages( + pdfBuffer: S3.Body, + workspace: Workspace, + noLimits = false +): Promise { + const pdfPath = path.join(workspace.location, 'input.pdf'); + await writeFile(pdfPath, Buffer.from(pdfBuffer as Buffer)); + + const pageCount = await getPageCount(pdfPath); + if (!noLimits && pageCount > 100) { + throw new Error('PDF exceeds maximum page limit of 100'); + } + + const imagePaths = await Promise.all( + Array.from({ length: pageCount }, (_, i) => convertPage(pdfPath, i + 1)) + ); + + const html = await combineIntoHTML(imagePaths); + return Buffer.from(html); +} diff --git a/src/lib/storage/checks.ts b/src/lib/storage/checks.ts index cb178b8a..29c6047b 100644 --- a/src/lib/storage/checks.ts +++ b/src/lib/storage/checks.ts @@ -16,8 +16,11 @@ export const isTwitterURL = (url: string) => /twitter\.com/.exec(url); export const isVimeoURL = (url: string) => /vimeo\.com/.exec(url); -export const isImageFileEmbedable = (url: string) => - !url.startsWith('http') && !url.startsWith('data:image'); +export const isImageFileEmbedable = (url: string) => { + const isLocalPath = !url.startsWith('http') && !url.startsWith('data:image'); + const hasTraversal = url.includes('../') || url.includes('..\\'); + return isLocalPath && !hasTraversal; +}; export const isCSVFile = (fileName: string) => /.csv$/i.exec(fileName); diff --git a/src/usecases/uploads/allowPDFUpload.ts b/src/usecases/uploads/allowPDFUpload.ts deleted file mode 100644 index a0b89582..00000000 --- a/src/usecases/uploads/allowPDFUpload.ts +++ /dev/null @@ -1,9 +0,0 @@ -import { isPDFFile } from '../../lib/storage/checks'; - -export const allowPDFUpload = ( - fileName: string, - premium: boolean, - vertexAIPDFQuestions: boolean -): null | false | boolean => { - return isPDFFile(fileName) && premium && vertexAIPDFQuestions; -}; diff --git a/src/usecases/uploads/getPackagesFromZip.ts b/src/usecases/uploads/getPackagesFromZip.ts index 
e8acf3a7..a7f020da 100644 --- a/src/usecases/uploads/getPackagesFromZip.ts +++ b/src/usecases/uploads/getPackagesFromZip.ts @@ -6,7 +6,6 @@ import Package from '../../lib/parser/Package'; import { checkFlashcardsLimits } from '../../lib/User/checkFlashcardsLimits'; import { PackageResult } from './GeneratePackagesUseCase'; import Workspace from '../../lib/parser/WorkSpace'; -import { allowPDFUpload } from './allowPDFUpload'; import { getMaxUploadCount } from '../../lib/misc/getMaxUploadCount'; import { isZipContentFileSupported } from './isZipContentFileSupported'; @@ -33,10 +32,7 @@ export const getPackagesFromZip = async ( /** * XXX: Should we also support files without extensions? */ - if ( - isZipContentFileSupported(fileName) || - allowPDFUpload(fileName, paying, settings.vertexAIPDFQuestions) - ) { + if (isZipContentFileSupported(fileName)) { const deck = await PrepareDeck({ name: fileName, files: zipHandler.files, diff --git a/src/usecases/uploads/isZipContentFileSupported.ts b/src/usecases/uploads/isZipContentFileSupported.ts index a2505662..8f03067c 100644 --- a/src/usecases/uploads/isZipContentFileSupported.ts +++ b/src/usecases/uploads/isZipContentFileSupported.ts @@ -3,6 +3,7 @@ import { isMarkdownFile, isPlainText, isCSVFile, + isPDFFile, } from '../../lib/storage/checks'; /** @@ -12,4 +13,5 @@ export const isZipContentFileSupported = (filename: string) => isHTMLFile(filename) ?? isMarkdownFile(filename) ?? isPlainText(filename) ?? - isCSVFile(filename); + isCSVFile(filename) ?? 
+ isPDFFile(filename); diff --git a/src/usecases/uploads/worker.ts b/src/usecases/uploads/worker.ts index 7134b0bc..fd0aec7f 100644 --- a/src/usecases/uploads/worker.ts +++ b/src/usecases/uploads/worker.ts @@ -7,7 +7,6 @@ import { PrepareDeck } from '../../lib/parser/PrepareDeck'; import { isZIPFile } from '../../lib/storage/checks'; import { getPackagesFromZip } from './getPackagesFromZip'; import Workspace from '../../lib/parser/WorkSpace'; -import { allowPDFUpload } from './allowPDFUpload'; import { isZipContentFileSupported } from './isZipContentFileSupported'; interface GenerationData { @@ -29,10 +28,7 @@ function doGenerationWork(data: GenerationData) { const filename = file.originalname; const key = file.key; - if ( - isZipContentFileSupported(filename) || - allowPDFUpload(filename, paying, settings.vertexAIPDFQuestions) - ) { + if (isZipContentFileSupported(filename)) { const d = await PrepareDeck({ name: filename, files: [{ name: filename, contents: fileContents }],