Skip to content

Commit

Permalink
Merge pull request #54 from gdmcdonald/main
Browse files Browse the repository at this point in the history
Pdf extraction adds metadata [Document: <filename.ext>, Page: <page number>] in extracted text
  • Loading branch information
do-me authored Aug 29, 2024
2 parents 824c417 + 840455d commit 2c480e1
Showing 1 changed file with 31 additions and 22 deletions.
53 changes: 31 additions & 22 deletions src/js/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,7 @@ export function removeScatterplot() {

// pdf loading logic for local and remote

function processPdf(pdf, resolve, reject, updateProgress) {
function processPdf(pdf, documentIdentifier, resolve, reject, updateProgress) {
let numPages = pdf.numPages;
let pageTextPromises = [];
for (let i = 1; i <= numPages; i++) {
Expand All @@ -450,35 +450,45 @@ function processPdf(pdf, resolve, reject, updateProgress) {
}));
}
Promise.all(pageTextPromises).then(pagesText => {
// Concatenate text from all pages
let fullText = pagesText.join("\n\n");
resolve(fullText); // Resolve the promise with the full text
// Concatenate text from all pages with metadata including page number
let fullText = pagesText.map((pageText, index) =>
`[Document: ${documentIdentifier}, Page: ${index + 1}]\n${pageText}`
).join("\n\n");
resolve(fullText); // Resolve the promise with the full text including metadata
}).catch(error => {
reject(error); // Reject the promise if there's an error
});
}

function extractTextFromPDF(fileOrDataUri, updateProgress) {
return new Promise((resolve, reject) => {
// Check if the input is a File object or a data URI
let documentIdentifier;
let pdfSource;

if (fileOrDataUri instanceof File) {
// For local files, create a URL to use with pdfjsLib
const fileURL = URL.createObjectURL(fileOrDataUri);
pdfjsLib.getDocument(fileURL).promise.then(pdf => {
processPdf(pdf, resolve, reject, updateProgress);
}).catch(error => {
reject(error); // Reject the promise if there's an error loading the PDF
});
} else if (fileOrDataUri.startsWith('data:')) {
// For data URIs, directly use the data URI with pdfjsLib
pdfjsLib.getDocument(fileOrDataUri).promise.then(pdf => {
processPdf(pdf, resolve, reject, updateProgress);
}).catch(error => {
reject(error); // Reject the promise if there's an error loading the PDF
});
// For local files
documentIdentifier = fileOrDataUri.name;
pdfSource = URL.createObjectURL(fileOrDataUri);
} else if (typeof fileOrDataUri === 'string') {
if (fileOrDataUri.startsWith('data:')) {
// For data URIs (remote PDFs)
documentIdentifier = "RemotePDF";
pdfSource = fileOrDataUri;
} else {
// Assume it's a URL
documentIdentifier = fileOrDataUri;
pdfSource = fileOrDataUri;
}
} else {
reject(new Error('Invalid input type'));
return;
}

pdfjsLib.getDocument(pdfSource).promise.then(pdf => {
processPdf(pdf, documentIdentifier, resolve, reject, updateProgress);
}).catch(error => {
reject(error); // Reject the promise if there's an error loading the PDF
});
});
}

Expand Down Expand Up @@ -543,7 +553,7 @@ export async function handleRemotePdfFileUpload() {

try {
const dataUri = await fetchPdfAsDataUri(url);
const text = await extractTextFromPDF(dataUri);
const text = await extractTextFromPDF(url, null);
texts.push(text);
} catch (error) {
console.error('Error handling remote PDF file upload:', error);
Expand All @@ -554,7 +564,6 @@ export async function handleRemotePdfFileUpload() {
}

export async function handleMultipleRemotePdfFileUploads() {

const urls = document.getElementById("importPdfURL").value.split(" ")
const results = [];

Expand All @@ -563,7 +572,7 @@ export async function handleMultipleRemotePdfFileUploads() {

try {
const dataUri = await fetchPdfAsDataUri(url);
const text = await extractTextFromPDF(dataUri);
const text = await extractTextFromPDF(url, null);
results.push(text);
} catch (error) {
console.error(`Error handling remote PDF file upload for URL ${url}:`, error);
Expand Down

0 comments on commit 2c480e1

Please sign in to comment.