diff --git a/src/lib/misc/isLimitError.test.ts b/src/lib/misc/isLimitError.test.ts index e3e23b7a..f11590bd 100644 --- a/src/lib/misc/isLimitError.test.ts +++ b/src/lib/misc/isLimitError.test.ts @@ -1,21 +1,22 @@ -import { PDF_EXCEEDS_MAX_PAGE_LIMIT } from '../parser/pdf/convertPDFToImages'; +import { PDF_EXCEEDS_MAX_PAGE_LIMIT } from '../pdf/convertPDFToImages'; import { isLimitError } from './isLimitError'; -const MOCK_MSG = "

Your request has hit the limit

If you already have an account, please login and try again. If you are still experiencing issues, please contact support@2anki.net.

"; +const MOCK_MSG = + '

Your request has hit the limit

If you already have an account, please login and try again. If you are still experiencing issues, please contact support@2anki.net.

'; describe('isLimitError', () => { it('returns true ', () => { - expect(isLimitError(new Error("File too large"))).toBe(true); - expect(isLimitError(new Error("You can only add 100 cards"))).toBe(true); + expect(isLimitError(new Error('File too large'))).toBe(true); + expect(isLimitError(new Error('You can only add 100 cards'))).toBe(true); expect(isLimitError(new Error(PDF_EXCEEDS_MAX_PAGE_LIMIT))).toBe(true); }); it('returns true for html', () => { expect(isLimitError(new Error(MOCK_MSG))).toBe(true); - }) + }); - it("returns false", () => { - expect(isLimitError(new Error("File too small"))).toBe(false); + it('returns false', () => { + expect(isLimitError(new Error('File too small'))).toBe(false); expect(isLimitError()).toBe(false); - }) + }); }); diff --git a/src/lib/misc/isLimitError.ts b/src/lib/misc/isLimitError.ts index 997ed767..adbb6b5b 100644 --- a/src/lib/misc/isLimitError.ts +++ b/src/lib/misc/isLimitError.ts @@ -1,4 +1,4 @@ -import { PDF_EXCEEDS_MAX_PAGE_LIMIT } from '../parser/pdf/convertPDFToImages'; +import { PDF_EXCEEDS_MAX_PAGE_LIMIT } from '../pdf/convertPDFToImages'; const LIMIT_MESSAGES = [ 'File too large', diff --git a/src/lib/parser/PrepareDeck.ts b/src/lib/parser/PrepareDeck.ts index 37bff7ed..f46a2da0 100644 --- a/src/lib/parser/PrepareDeck.ts +++ b/src/lib/parser/PrepareDeck.ts @@ -1,9 +1,10 @@ import getDeckFilename from '../anki/getDeckFilename'; import { DeckParser, DeckParserInput } from './DeckParser'; import Deck from './Deck'; -import { isPDFFile } from '../storage/checks'; +import { isPDFFile, isPPTFile } from '../storage/checks'; import { convertPDFToHTML } from './experimental/VertexAPI/convertPDFToHTML'; -import { convertPDFToImages } from './pdf/convertPDFToImages'; +import { convertPDFToImages } from '../pdf/convertPDFToImages'; +import { convertPPTToPDF } from '../pdf/ConvertPPTToPDF'; interface PrepareDeckResult { name: string; @@ -15,11 +16,24 @@ export async function PrepareDeck( input: DeckParserInput ): Promise { for (const file of input.files) { - if (!isPDFFile(file.name) || !file.contents) continue; + if ((!isPDFFile(file.name) && !isPPTFile(file.name)) || !file.contents) + continue; - if (input.noLimits && input.settings.vertexAIPDFQuestions) { + if ( + isPDFFile(file.name) && + input.noLimits && + input.settings.vertexAIPDFQuestions + ) { file.contents = await convertPDFToHTML(file.contents.toString('base64')); } else { + if (isPPTFile(file.name)) { + file.contents = await convertPPTToPDF( + file.name, + file.contents, + input.workspace + ); + } + file.contents = await convertPDFToImages({ name: file.name, workspace: input.workspace, diff --git a/src/lib/parser/pdf/convertPDFToImages.ts b/src/lib/parser/pdf/convertPDFToImages.ts deleted file mode 100644 index c7bd15b9..00000000 --- a/src/lib/parser/pdf/convertPDFToImages.ts +++ /dev/null @@ -1,135 +0,0 @@ -import { writeFile } from 'fs/promises'; -import path from 'path'; -import { execFile } from 'child_process'; -import Workspace from '../WorkSpace'; -import { S3 } from 'aws-sdk'; - -function getPageCount(pdfPath: string): Promise { - return new Promise((resolve, reject) => { - const pdfinfoBin = - process.platform === 'darwin' - ? '/usr/local/bin/pdfinfo' - : '/usr/bin/pdfinfo'; - execFile(pdfinfoBin, [pdfPath], (error, stdout) => { - if (error) { - reject(new Error('Failed to execute pdfinfo')); - return; - } - - const pageCount = parseInt( - stdout - .split('\n') - .find((line) => line.startsWith('Pages:')) - ?.split(/\s+/)[1] || '0' - ); - - if (!pageCount) { - reject(new Error('Failed to get page count')); - return; - } - - resolve(pageCount); - }); - }); -} - -function convertPage( - pdfPath: string, - pageNumber: number, - totalPageCount: number -): Promise { - const outputFileNameBase = `${pdfPath}-page${pageNumber}`; - - const determinePaddingLength = (pageCount: number): number => { - if (pageCount >= 1000) return 4; - if (pageCount >= 100) return 3; - if (pageCount >= 10) return 2; - return 1; - }; - - const paddedPageNumber = String(pageNumber).padStart( - determinePaddingLength(totalPageCount), - '0' - ); - - return new Promise((resolve, reject) => { - execFile( - 'pdftoppm', - [ - '-png', - '-f', - pageNumber.toString(), - '-l', - pageNumber.toString(), - pdfPath, - outputFileNameBase, - ], - (error) => { - if (error) { - return reject( - new Error(`Failed to convert page ${pageNumber} to PNG`) - ); - } - resolve(`${outputFileNameBase}-${paddedPageNumber}.png`); - } - ); - }); -} - -function combineIntoHTML(imagePaths: string[], title: string): string { - const html = ` - -${title} - - ${Array.from({ length: imagePaths.length / 2 }, (_, i) => { - const front = path.basename(imagePaths[i * 2]); - const back = path.basename(imagePaths[i * 2 + 1]); - return `
    -
  • -
    - - - - -
    -
  • -
`; - }).join('\n')} - -`; - - return html; -} - -interface ConvertPDFToImagesInput { - workspace: Workspace; - noLimits: boolean; - contents?: S3.Body; - name?: string; -} - -export const PDF_EXCEEDS_MAX_PAGE_LIMIT = - 'PDF exceeds maximum page limit of 100 for free and anonymous users.'; - -export async function convertPDFToImages( - input: ConvertPDFToImagesInput -): Promise { - const { contents, workspace, noLimits, name } = input; - const pdfPath = path.join(workspace.location, name ?? 'Default.pdf'); - await writeFile(pdfPath, Buffer.from(contents as Buffer)); - - const pageCount = await getPageCount(pdfPath); - const title = path.basename(pdfPath); - if (!noLimits && pageCount > 100) { - throw new Error(PDF_EXCEEDS_MAX_PAGE_LIMIT); - } - - const imagePaths = await Promise.all( - Array.from({ length: pageCount }, (_, i) => - convertPage(pdfPath, i + 1, pageCount) - ) - ); - - const html = combineIntoHTML(imagePaths, title); - return Buffer.from(html); -} diff --git a/src/lib/pdf/ConvertPPTToPDF.ts b/src/lib/pdf/ConvertPPTToPDF.ts new file mode 100644 index 00000000..cf1e8104 --- /dev/null +++ b/src/lib/pdf/ConvertPPTToPDF.ts @@ -0,0 +1,50 @@ +import { S3 } from 'aws-sdk'; +import Workspace from '../parser/WorkSpace'; +import path from 'path'; +import fs from 'fs/promises'; +import { execFile } from 'child_process'; + +export function convertPPTToPDF( + name: string, + contents: S3.Body, + workspace: Workspace +): Promise { + return new Promise((resolve, reject) => { + const sofficeBin = + process.platform === 'darwin' + ? '/Applications/LibreOffice.app/Contents/MacOS/soffice' + : '/usr/bin/soffice'; + const tempFile = path.join(workspace.location, name); + + fs.writeFile(tempFile, Buffer.from(contents as Buffer)) + .then(() => { + const pdfFile = path.join( + workspace.location, + path.basename(name, path.extname(name)) + '.pdf' + ); + + execFile( + sofficeBin, + ['--headless', '--convert-to', 'pdf', tempFile], + { + cwd: workspace.location, + }, + async (error, stdout, stderr) => { + await fs.writeFile( + path.join(workspace.location, 'stdout.log'), + stdout + ); + await fs.writeFile( + path.join(workspace.location, 'stderr.log'), + stderr + ); + if (error) { + reject(new Error(error.message || 'Conversion failed')); + } + resolve(await fs.readFile(pdfFile)); + } + ); + }) + .catch((err) => reject(new Error(err.message || 'File write failed'))); + }); +} diff --git a/src/lib/pdf/combineIntoHTML.ts b/src/lib/pdf/combineIntoHTML.ts new file mode 100644 index 00000000..a8f1d84d --- /dev/null +++ b/src/lib/pdf/combineIntoHTML.ts @@ -0,0 +1,26 @@ +import path from 'path'; + +export function combineIntoHTML(imagePaths: string[], title: string): string { + const html = ` + +${title} + + ${Array.from({ length: imagePaths.length / 2 }, (_, i) => { + const front = path.basename(imagePaths[i * 2]); + const back = path.basename(imagePaths[i * 2 + 1]); + return `
    +
  • +
    + + + + +
    +
  • +
`; + }).join('\n')} + +`; + + return html; +} diff --git a/src/lib/pdf/convertPDFToImages.ts b/src/lib/pdf/convertPDFToImages.ts new file mode 100644 index 00000000..a111bf7e --- /dev/null +++ b/src/lib/pdf/convertPDFToImages.ts @@ -0,0 +1,40 @@ +import { writeFile } from 'fs/promises'; +import path from 'path'; +import Workspace from '../parser/WorkSpace'; +import { S3 } from 'aws-sdk'; +import { getPageCount } from './getPageCount'; +import { convertPage } from './convertPage'; +import { combineIntoHTML } from './combineIntoHTML'; + +interface ConvertPDFToImagesInput { + workspace: Workspace; + noLimits: boolean; + contents?: S3.Body; + name?: string; +} + +export const PDF_EXCEEDS_MAX_PAGE_LIMIT = + 'PDF exceeds maximum page limit of 100 for free and anonymous users.'; + +export async function convertPDFToImages( + input: ConvertPDFToImagesInput +): Promise { + const { contents, workspace, noLimits, name } = input; + const pdfPath = path.join(workspace.location, name ?? 'Default.pdf'); + await writeFile(pdfPath, Buffer.from(contents as Buffer)); + + const pageCount = await getPageCount(pdfPath); + const title = path.basename(pdfPath); + if (!noLimits && pageCount > 100) { + throw new Error(PDF_EXCEEDS_MAX_PAGE_LIMIT); + } + + const imagePaths = await Promise.all( + Array.from({ length: pageCount }, (_, i) => + convertPage(pdfPath, i + 1, pageCount) + ) + ); + + const html = combineIntoHTML(imagePaths, title); + return Buffer.from(html); +} diff --git a/src/lib/pdf/convertPage.ts b/src/lib/pdf/convertPage.ts new file mode 100644 index 00000000..0e1b1ac8 --- /dev/null +++ b/src/lib/pdf/convertPage.ts @@ -0,0 +1,50 @@ +import { execFile } from 'child_process'; +import os from 'os'; + +export function convertPage( + pdfPath: string, + pageNumber: number, + totalPageCount: number +): Promise { + const outputFileNameBase = `${pdfPath}-page${pageNumber}`; + + const determinePaddingLength = (pageCount: number): number => { + if (pageCount >= 1000) return 4; + if (pageCount >= 100) return 3; + if (pageCount >= 10) return 2; + return 1; + }; + + const paddedPageNumber = String(pageNumber).padStart( + determinePaddingLength(totalPageCount), + '0' + ); + + const pdftoppmPath = + os.platform() === 'darwin' + ? '/usr/local/bin/pdftoppm' + : '/usr/bin/pdftoppm'; + + return new Promise((resolve, reject) => { + execFile( + pdftoppmPath, + [ + '-png', + '-f', + pageNumber.toString(), + '-l', + pageNumber.toString(), + pdfPath, + outputFileNameBase, + ], + (error) => { + if (error) { + return reject( + new Error(`Failed to convert page ${pageNumber} to PNG`) + ); + } + resolve(`${outputFileNameBase}-${paddedPageNumber}.png`); + } + ); + }); +} diff --git a/src/lib/pdf/getPageCount.ts b/src/lib/pdf/getPageCount.ts new file mode 100644 index 00000000..67cab91f --- /dev/null +++ b/src/lib/pdf/getPageCount.ts @@ -0,0 +1,30 @@ +import { execFile } from 'child_process'; + +export function getPageCount(pdfPath: string): Promise { + return new Promise((resolve, reject) => { + const pdfinfoBin = + process.platform === 'darwin' + ? '/usr/local/bin/pdfinfo' + : '/usr/bin/pdfinfo'; + execFile(pdfinfoBin, [pdfPath], (error, stdout) => { + if (error) { + reject(new Error('Failed to execute pdfinfo')); + return; + } + + const pageCount = parseInt( + stdout + .split('\n') + .find((line) => line.startsWith('Pages:')) + ?.split(/\s+/)[1] || '0' + ); + + if (!pageCount) { + reject(new Error('Failed to get page count')); + return; + } + + resolve(pageCount); + }); + }); +} diff --git a/src/lib/storage/checks.ts b/src/lib/storage/checks.ts index 5b7aafca..5914fc9e 100644 --- a/src/lib/storage/checks.ts +++ b/src/lib/storage/checks.ts @@ -26,6 +26,8 @@ export const isCSVFile = (fileName: string) => /.csv$/i.exec(fileName); export const isPDFFile = (fileName: string) => /.pdf$/i.exec(fileName); +export const isPPTFile = (fileName: string) => /\.(ppt|pptx)$/i.exec(fileName); + /** * A file is considered a potential zip file if it does not contain a period. * Since zip files are not named with a period, but it is possible to upload such files using drag and drop. diff --git a/src/usecases/uploads/isZipContentFileSupported.ts b/src/usecases/uploads/isZipContentFileSupported.ts index 8f03067c..4312f13b 100644 --- a/src/usecases/uploads/isZipContentFileSupported.ts +++ b/src/usecases/uploads/isZipContentFileSupported.ts @@ -4,6 +4,7 @@ import { isPlainText, isCSVFile, isPDFFile, + isPPTFile, } from '../../lib/storage/checks'; /** @@ -14,4 +15,5 @@ export const isZipContentFileSupported = (filename: string) => isMarkdownFile(filename) ?? isPlainText(filename) ?? isCSVFile(filename) ?? - isPDFFile(filename); + isPDFFile(filename) ?? + isPPTFile(filename);