Skip to content

Commit

Permalink
feat: Support PDF file uploads
Browse files Browse the repository at this point in the history
Closes: #1471
  • Loading branch information
aalemayhu committed Dec 1, 2024
1 parent 2477eab commit c66b02f
Show file tree
Hide file tree
Showing 10 changed files with 182 additions and 45 deletions.
18 changes: 10 additions & 8 deletions src/lib/parser/DeckParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -390,11 +390,12 @@ export class DeckParser {
images.each((_i, elem) => {
const originalName = dom(elem).attr('src');
if (originalName && isImageFileEmbedable(originalName)) {
const newName = embedFile(
const newName = embedFile({
exporter,
this.files,
decodeURIComponent(originalName)
);
files: this.files,
filePath: decodeURIComponent(originalName),
workspace: ws,
});
if (newName) {
dom(elem).attr('src', newName);
card.media.push(newName);
Expand All @@ -418,11 +419,12 @@ export class DeckParser {
''
);
}
const newFileName = embedFile(
const newFileName = embedFile({
exporter,
this.files,
global.decodeURIComponent(audiofile)
);
files: this.files,
filePath: global.decodeURIComponent(audiofile),
workspace: ws,
});
if (newFileName) {
card.back += `[sound:${newFileName}]`;
card.media.push(newFileName);
Expand Down
28 changes: 20 additions & 8 deletions src/lib/parser/PrepareDeck.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
import fs from 'fs';
import path from 'path';

import getDeckFilename from '../anki/getDeckFilename';
import { DeckParser, DeckParserInput } from './DeckParser';
import Deck from './Deck';
import { isPDFFile } from '../storage/checks';
import { convertPDFToHTML } from './experimental/VertexAPI/convertPDFToHTML';
import { convertPDFToImages } from './pdf/convertPDFToImages';

interface PrepareDeckResult {
name: string;
Expand All @@ -13,14 +17,22 @@ interface PrepareDeckResult {
export async function PrepareDeck(
input: DeckParserInput
): Promise<PrepareDeckResult> {
if (input.noLimits && input.settings.vertexAIPDFQuestions) {
// Check for PDF files and convert their contents to HTML
for (const file of input.files) {
if (isPDFFile(file.name) && file.contents) {
file.contents = await convertPDFToHTML(
file.contents.toString('base64')
);
}
for (const file of input.files) {
if (!isPDFFile(file.name) || !file.contents) continue;

if (input.noLimits && input.settings.vertexAIPDFQuestions) {
file.contents = await convertPDFToHTML(file.contents.toString('base64'));
} else {
file.contents = await convertPDFToImages({
name: file.name,
workspace: input.workspace,
noLimits: input.noLimits,
contents: file.contents,
});
fs.writeFileSync(
path.join(input.workspace.location, 'input.html'),
file.contents.toString()
);
}
}

Expand Down
1 change: 1 addition & 0 deletions src/lib/parser/WorkSpace.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ class Workspace {
}

private ensureExists() {
console.log('Ensuring workspace exists', this.location);
if (!fs.existsSync(this.location)) {
fs.mkdirSync(this.location, { recursive: true });
}
Expand Down
33 changes: 26 additions & 7 deletions src/lib/parser/exporters/embedFile.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,27 @@
import fs, { existsSync } from 'fs';
import path from 'path';

import { File } from '../../zip/zip';
import { SuffixFrom } from '../../misc/file';
import getUniqueFileName from '../../misc/getUniqueFileName';
import CustomExporter from './CustomExporter';
import Workspace from '../WorkSpace';

const getFile = (
exporter: CustomExporter,
files: File[],
filePath: string
filePath: string,
workspace: Workspace
): File | undefined => {
const fullPath = path.resolve(workspace.location, filePath);
if (fullPath.startsWith(workspace.location) && existsSync(fullPath)) {
const buffer = fs.readFileSync(fullPath);
return {
name: fullPath,
contents: buffer,
} as File;
}

const asRootFile = files.find((f) => f.name === filePath);
if (asRootFile) {
return asRootFile;
Expand All @@ -34,13 +48,18 @@ const getFile = (
return undefined;
};

export const embedFile = (
exporter: CustomExporter,
files: File[],
filePath: string
): string | null => {
interface EmbedFileInput {
exporter: CustomExporter;
files: File[];
filePath: string;
workspace: Workspace;
}

export const embedFile = (input: EmbedFileInput): string | null => {
const { exporter, files, filePath, workspace } = input;

const suffix = SuffixFrom(filePath);
const file = getFile(exporter, files, filePath);
const file = getFile(exporter, files, filePath, workspace);

if (file) {
const newName = getUniqueFileName(filePath) + suffix;
Expand Down
115 changes: 115 additions & 0 deletions src/lib/parser/pdf/convertPDFToImages.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import { writeFile } from 'fs/promises';
import path from 'path';
import { execFile } from 'child_process';
import Workspace from '../WorkSpace';
import { S3 } from 'aws-sdk';

/**
 * Ask `pdfinfo` how many pages a PDF on disk has.
 *
 * @param pdfPath - Path of the PDF file handed to `pdfinfo`.
 * @returns Promise resolving to the value of the `Pages:` line of the
 *   `pdfinfo` output. Rejects with an `Error` when the command fails or
 *   when no non-zero page count can be read from its output.
 */
function getPageCount(pdfPath: string): Promise<number> {
  // Pull the second whitespace-separated token from the first line that
  // starts with "Pages:"; anything missing or empty is treated as "0".
  const readPagesField = (report: string): number => {
    const pagesLine = report
      .split('\n')
      .find((line) => line.startsWith('Pages:'));
    const rawValue = pagesLine?.split(/\s+/)[1];
    return parseInt(rawValue || '0');
  };

  return new Promise<number>((resolve, reject) => {
    execFile('/usr/local/bin/pdfinfo', [pdfPath], (error, stdout) => {
      if (error) {
        reject(new Error('Failed to execute pdfinfo'));
        return;
      }

      const pages = readPagesField(stdout);
      if (pages) {
        resolve(pages);
      } else {
        // Covers both 0 and NaN.
        reject(new Error('Failed to get page count'));
      }
    });
  });
}

/**
 * Render one page of a PDF to a PNG file by running `pdftoppm`.
 *
 * @param pdfPath - Path of the PDF on disk. It is also used as the prefix of
 *   the output file name (`<pdfPath>-page<page>-<paddedPage>.png`).
 * @param page - 1-based number of the page to render (passed as both `-f`
 *   and `-l`, so exactly one page is converted).
 * @param totalPages - Page count of the whole document. Only used to work
 *   out how the page number is zero-padded in the generated file name.
 * @returns Promise resolving to the expected path of the generated PNG.
 *   Rejects with an `Error` when `pdftoppm` fails.
 */
function convertPage(
  pdfPath: string,
  page: number,
  totalPages: number
): Promise<string> {
  return new Promise((resolve, reject) => {
    const outputBase = `${pdfPath}-page${page}`;
    const args = [
      '-png',
      '-f',
      page.toString(),
      '-l',
      page.toString(),
      pdfPath,
      outputBase,
    ];

    execFile('pdftoppm', args, (error) => {
      if (error) {
        reject(new Error(`Failed to convert page ${page} to PNG`));
        return;
      }
      // pdftoppm zero-pads the page number to the number of digits in the
      // document's total page count (1 digit below 10 pages, 2 below 100,
      // 3 below 1000, ...). Padding to a fixed width of 2 produced paths
      // that do not exist for documents with 100 or more pages.
      const padWidth = String(totalPages).length;
      const pageNum = String(page).padStart(padWidth, '0');
      resolve(`${outputBase}-${pageNum}.png`);
    });
  });
}

/**
 * Build an HTML document of toggle blocks from page images.
 *
 * Images are consumed in consecutive pairs: the first image of a pair goes
 * inside `<summary>` and the second inside the `<details>` body. A trailing
 * unpaired image (odd number of paths) is not included. Only the base name
 * of each path is referenced in the `src` attributes.
 *
 * The title and image names are HTML-escaped because they derive from the
 * name of the uploaded file.
 *
 * @param imagePaths - Paths of the page images, in page order.
 * @param title - Text for the document's `<title>` element.
 * @returns The complete HTML document as a string.
 */
function combineIntoHTML(imagePaths: string[], title: string): string {
  const entities: Record<string, string> = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  const escapeHTML = (text: string): string =>
    text.replace(/[&<>"']/g, (ch) => entities[ch] ?? ch);

  const toggles: string[] = [];
  // Step through complete (front, back) pairs only.
  for (let i = 0; i + 1 < imagePaths.length; i += 2) {
    const front = escapeHTML(path.basename(imagePaths[i]));
    const back = escapeHTML(path.basename(imagePaths[i + 1]));
    toggles.push(
      [
        '<ul class="toggle">',
        '<li>',
        '<details>',
        '<summary>',
        `<img src="${front}" />`,
        '</summary>',
        `<img src="${back}" />`,
        '</details>',
        '</li>',
        '</ul>',
      ].join('\n')
    );
  }

  return [
    '<!DOCTYPE html>',
    '<html>',
    `<head><title>${escapeHTML(title)}</title></head>`,
    '<body>',
    toggles.join('\n'),
    '</body>',
    '</html>',
  ].join('\n');
}

/** Input for {@link convertPDFToImages}. */
interface ConvertPDFToImagesInput {
  /** Workspace whose `location` directory receives the PDF and page images. */
  workspace: Workspace;
  /** When true, the 100-page limit is not enforced. */
  noLimits: boolean;
  /** Raw PDF data. */
  contents?: S3.Body;
  /** Name of the uploaded PDF; defaults to `Default.pdf` when absent. */
  name?: string;
}

/**
 * Write a PDF into the workspace, render every page to a PNG and return an
 * HTML document that pairs consecutive pages into toggle blocks.
 *
 * All pages are converted concurrently (one `convertPage` call per page).
 *
 * @param input - See {@link ConvertPDFToImagesInput}.
 * @returns The generated HTML as a `Buffer`.
 * @throws TypeError when `contents` is missing.
 * @throws Error when the page count cannot be determined, when the PDF has
 *   more than 100 pages and `noLimits` is false, or when a page fails to
 *   convert.
 */
export async function convertPDFToImages(
  input: ConvertPDFToImagesInput
): Promise<Buffer> {
  const { contents, workspace, noLimits, name } = input;

  if (contents == null) {
    throw new TypeError('PDF contents are required');
  }

  // The name comes from the upload (possibly a path inside a zip). Keep only
  // its final segment so the PDF and its page images are always written
  // directly inside the workspace: this blocks `../` traversal and matches
  // the generated HTML, which references images by base name only.
  const requestedName = path.basename(name ?? 'Default.pdf');
  const safeName =
    requestedName === '' || requestedName === '.' || requestedName === '..'
      ? 'Default.pdf'
      : requestedName;
  const pdfPath = path.join(workspace.location, safeName);

  // NOTE(review): S3.Body is wider than Buffer; this cast assumes callers
  // pass data that Buffer.from accepts — confirm against callers.
  await writeFile(pdfPath, Buffer.from(contents as Buffer));

  const pageCount = await getPageCount(pdfPath);
  if (!noLimits && pageCount > 100) {
    throw new Error('PDF exceeds maximum page limit of 100');
  }

  const imagePaths = await Promise.all(
    Array.from({ length: pageCount }, (_, i) =>
      convertPage(pdfPath, i + 1, pageCount)
    )
  );

  const title = path.basename(pdfPath);
  return Buffer.from(combineIntoHTML(imagePaths, title));
}
7 changes: 5 additions & 2 deletions src/lib/storage/checks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,11 @@ export const isTwitterURL = (url: string) => /twitter\.com/.exec(url);

export const isVimeoURL = (url: string) => /vimeo\.com/.exec(url);

export const isImageFileEmbedable = (url: string) =>
!url.startsWith('http') && !url.startsWith('data:image');
/**
 * Decide whether an image reference may be embedded from local files.
 *
 * A reference qualifies when it is not an `http...` URL or a `data:image`
 * URI and contains no parent-directory traversal (`../` or `..\`). The
 * traversal check is applied to the raw value and to its URI-decoded form,
 * because callers decode the reference before resolving it on disk, so an
 * encoded sequence such as `%2e%2e%2f` must be rejected as well.
 *
 * @param url - The `src` value as it appears in the document.
 * @returns `true` when the reference is a local path without traversal.
 */
export const isImageFileEmbedable = (url: string) => {
  const isLocalPath = !url.startsWith('http') && !url.startsWith('data:image');
  if (!isLocalPath) {
    return false;
  }

  const containsTraversal = (candidate: string): boolean =>
    candidate.includes('../') || candidate.includes('..\\');

  let decoded = url;
  try {
    decoded = decodeURIComponent(url);
  } catch {
    // Malformed escape sequence: nothing further to decode, so only the raw
    // value is checked.
  }

  return !containsTraversal(url) && !containsTraversal(decoded);
};

export const isCSVFile = (fileName: string) => /.csv$/i.exec(fileName);

Expand Down
9 changes: 0 additions & 9 deletions src/usecases/uploads/allowPDFUpload.ts

This file was deleted.

6 changes: 1 addition & 5 deletions src/usecases/uploads/getPackagesFromZip.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import Package from '../../lib/parser/Package';
import { checkFlashcardsLimits } from '../../lib/User/checkFlashcardsLimits';
import { PackageResult } from './GeneratePackagesUseCase';
import Workspace from '../../lib/parser/WorkSpace';
import { allowPDFUpload } from './allowPDFUpload';
import { getMaxUploadCount } from '../../lib/misc/getMaxUploadCount';

import { isZipContentFileSupported } from './isZipContentFileSupported';
Expand All @@ -33,10 +32,7 @@ export const getPackagesFromZip = async (
/**
* XXX: Should we also support files without extensions?
*/
if (
isZipContentFileSupported(fileName) ||
allowPDFUpload(fileName, paying, settings.vertexAIPDFQuestions)
) {
if (isZipContentFileSupported(fileName)) {
const deck = await PrepareDeck({
name: fileName,
files: zipHandler.files,
Expand Down
4 changes: 3 additions & 1 deletion src/usecases/uploads/isZipContentFileSupported.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import {
isMarkdownFile,
isPlainText,
isCSVFile,
isPDFFile,
} from '../../lib/storage/checks';

/**
Expand All @@ -12,4 +13,5 @@ export const isZipContentFileSupported = (filename: string) =>
isHTMLFile(filename) ??
isMarkdownFile(filename) ??
isPlainText(filename) ??
isCSVFile(filename);
isCSVFile(filename) ??
isPDFFile(filename);
6 changes: 1 addition & 5 deletions src/usecases/uploads/worker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import { PrepareDeck } from '../../lib/parser/PrepareDeck';
import { isZIPFile } from '../../lib/storage/checks';
import { getPackagesFromZip } from './getPackagesFromZip';
import Workspace from '../../lib/parser/WorkSpace';
import { allowPDFUpload } from './allowPDFUpload';
import { isZipContentFileSupported } from './isZipContentFileSupported';

interface GenerationData {
Expand All @@ -29,10 +28,7 @@ function doGenerationWork(data: GenerationData) {
const filename = file.originalname;
const key = file.key;

if (
isZipContentFileSupported(filename) ||
allowPDFUpload(filename, paying, settings.vertexAIPDFQuestions)
) {
if (isZipContentFileSupported(filename)) {
const d = await PrepareDeck({
name: filename,
files: [{ name: filename, contents: fileContents }],
Expand Down

0 comments on commit c66b02f

Please sign in to comment.