From eb5ecd7d6b3d48992067427bce9e355204b3bead Mon Sep 17 00:00:00 2001
From: Martynas Bagdonas
Date: Wed, 28 Nov 2018 17:13:42 +0200
Subject: [PATCH] Add PDF handling

---
 config/default.json     |   8 ++
 lambda_template.yaml.j2 |  14 +++
 package.json            |   1 +
 src/http.js             |  66 ++++++++-----
 src/lambda.js           |   3 +
 src/recognizer.js       | 213 ++++++++++++++++++++++++++++++++++++++++
 src/server.js           |   3 +
 src/webSession.js       |  37 +++++--
 8 files changed, 314 insertions(+), 31 deletions(-)
 create mode 100644 src/recognizer.js

diff --git a/config/default.json b/config/default.json
index b99b213..c769400 100644
--- a/config/default.json
+++ b/config/default.json
@@ -3,10 +3,18 @@
 	"blacklistedDomains": [],
 	"deproxifyURLs": false, // Automatically try deproxified versions of URLs
 	"identifierSearchLambda": "", // Identifier search Lambda function for text search
+	"recognizerLambda": "", // PDF recognizer Lambda function
 	"port": 1969,
 	"translators": {
 		"CrossrefREST.email": "" // Pass an email to Crossref REST API to utilize the faster servers pool
 	},
+	"s3Upload": {
+		"params": {
+			"Bucket": ""
+		},
+		"accessKeyId": "",
+		"secretAccessKey": ""
+	},
 	"userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0",
 	"translatorsDirectory": "./modules/translators"
 }
diff --git a/lambda_template.yaml.j2 b/lambda_template.yaml.j2
index bdaea51..8cbc49e 100644
--- a/lambda_template.yaml.j2
+++ b/lambda_template.yaml.j2
@@ -14,10 +14,24 @@ Resources:
       MemorySize: 2048
       Timeout: 30
       Policies:
+        - Statement:
+            Action:
+              - s3:PutObject
+              - s3:DeleteObject
+            Effect: Allow
+            Resource: !ImportValue RecognizerUploadBucketArn
         - LambdaInvokePolicy:
             FunctionName: {{ identifier_search_function_name }}
+        - LambdaInvokePolicy:
+            FunctionName: {{ recognizer_function_name }}
       Events:
         # API Gateway
+        GetAPI:
+          Type: Api
+          Properties:
+            # Proxy all GET requests to Lambda function
+            Path: /{proxy+}
+            Method: get
         PostAPI:
           Type: Api
           Properties:
diff --git a/package.json b/package.json
index 9a10e66..7522a23 100644
--- a/package.json
+++ b/package.json
@@ -20,6 +20,7 @@
 		"request-promise-native": "^1.0.5",
 		"serverless-http": "^1.6.0",
 		"w3c-xmlserializer": "0.0.1",
+		"whatwg-mimetype": "^2.3.0",
 		"wicked-good-xpath": "git+https://github.com/adomasven/wicked-good-xpath.git#e84d65d",
 		"xregexp": "^4.2.0",
 		"yargs": "^12.0.2"
diff --git a/src/http.js b/src/http.js
index 187aced..a41912f 100644
--- a/src/http.js
+++ b/src/http.js
@@ -29,6 +29,7 @@ var url = require('url');
 var jsdom = require('jsdom');
 var { JSDOM } = jsdom;
 var wgxpath = require('wicked-good-xpath');
+var MIMEType = require("whatwg-mimetype"); // Use the same MIME type library as JSDOM does
 
 /**
  * Functions for performing HTTP requests
@@ -154,29 +155,22 @@
 			};
 
 			if (options.responseType == 'document') {
-				let dom;
-				try {
+				let mimeType = new MIMEType(response.headers['content-type']);
+
+				// Filter content-type in the same way as JSDOM does
+				if (mimeType.isHTML() || mimeType.isXML()) {
+					result.type = 'document';
 					body = decodeContent(body, response.headers['content-type']);
-					dom = new JSDOM(body, {
+					let dom = new JSDOM(body, {
 						url: result.responseURL,
-						// Inform JSDOM what content type it's parsing,
-						// so it could reject unsupported content types
+						// Inform JSDOM what content type it's parsing
 						contentType: response.headers['content-type']
 					});
-				}
-				catch (e) {
-					if (e.message.includes('not a HTML or XML content type')) {
-						Zotero.debug(e, 1)
-						throw new this.UnsupportedFormatError(result.responseURL, e.message);
-					}
-					throw e;
-				}
-				wgxpath.install(dom.window, true);
-				result.response = dom.window.document;
-
-				// Follow meta redirects
-				if (response.headers['content-type']
-					&& response.headers['content-type'].startsWith('text/html')) {
+
+					wgxpath.install(dom.window, true);
+					result.response = dom.window.document;
+
+					// Follow meta redirects in HTML and XML files
 					let meta = result.response.querySelector('meta[http-equiv=refresh]');
 					if (meta && meta.getAttribute('content')) {
 						let parts = meta.getAttribute('content').split(/;\s*url=/);
@@ -191,6 +185,19 @@
 						}
 					}
 				}
+				else if (
+					options.fallbackToPDF &&
+					mimeType.essence === 'application/pdf'
+				) {
+					result.type = 'pdf';
+					result.response = body;
+				}
+				else {
+					throw new this.UnsupportedFormatError(
+						result.responseURL,
+						response.headers['content-type'] + ' is not supported'
+					);
+				}
 			}
 			else if (options.responseType == 'json') {
 				result.response = JSON.parse(body.toString());
@@ -319,7 +326,6 @@
  *
  * TODO: Remove this code when https://github.com/jsdom/jsdom/issues/2495 will be solved
  */
-const MIMEType = require("whatwg-mimetype");
 const sniffHTMLEncoding = require("html-encoding-sniffer");
 const whatwgEncoding = require("whatwg-encoding");
 
@@ -389,6 +395,22 @@ function customRequest(method, requestURL, options) {
 		.on('response', function (res) {
 			if (returned) return;
 			response = res;
+
+			// Check content-type before starting the download
+			let mimeType = new MIMEType(response.headers['content-type']);
+			if (!(
+				mimeType.isHTML() ||
+				mimeType.isXML() ||
+				options.fallbackToPDF && mimeType.essence === 'application/pdf'
+			)) {
+				req.abort();
+				returned = true;
+				return reject(new Zotero.HTTP.UnsupportedFormatError(
+					requestURL,
+					response.headers['content-type'] + ' is not supported'
+				));
+			}
+
 			// Content-length doesn't always exists or it can be a length of a gzipped content,
 			// but it's still worth to do the initial size check
 			if (
@@ -399,8 +421,6 @@
 				returned = true;
 				reject(new Zotero.HTTP.ResponseSizeError(requestURL));
 			}
-
-			// TODO: Filter content-type too
 		})
 		.on('end', function () {
 			if (returned) return;
@@ -408,6 +428,6 @@
 			resolve({response, body: Buffer.concat(buffers, bufferLength)});
 		});
 	});
-};
+}
 
 module.exports = Zotero.HTTP;
diff --git a/src/lambda.js b/src/lambda.js
index 8fb7162..d311778 100644
--- a/src/lambda.js
+++ b/src/lambda.js
@@ -35,6 +35,7 @@ var Translators; // Translators module is cashed
 const SearchEndpoint = require('./searchEndpoint');
 const WebEndpoint = require('./webEndpoint');
 const ExportEndpoint = require('./exportEndpoint');
+const Recognizer = require('./recognizer');
 
 const app = module.exports = new Koa();
 app.use(cors);
@@ -42,6 +43,8 @@
 app.use(bodyParser({enableTypes: ['text', 'json']}));
 app.use(_.post('/web', WebEndpoint.handle.bind(WebEndpoint)));
 app.use(_.post('/search', SearchEndpoint.handle.bind(SearchEndpoint)));
 app.use(_.post('/export', ExportEndpoint.handle.bind(ExportEndpoint)));
+app.use(_.get('/recognize/getUploadParams', Recognizer.handleUpload.bind(Recognizer)));
+app.use(_.post('/recognize/process', Recognizer.handleProcess.bind(Recognizer)));
 
 Debug.init(1);
diff --git a/src/recognizer.js b/src/recognizer.js
new file mode 100644
index 0000000..723e665
--- /dev/null
+++ b/src/recognizer.js
@@ -0,0 +1,213 @@
+/*
+    ***** BEGIN LICENSE BLOCK *****
+
+    Copyright © 2018 Corporation for Digital Scholarship
+                     Vienna, Virginia, USA
+                     https://www.zotero.org
+
+    This file is part of Zotero.
+
+    Zotero is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Zotero is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with Zotero.  If not, see <http://www.gnu.org/licenses/>.
+
+    ***** END LICENSE BLOCK *****
+*/
+
+// TODO: Move upload logic outside of recognizer.js if uploads will be needed for other purposes
+
+const config = require('config');
+const AWS = require('aws-sdk');
+const crypto = require('crypto');
+const Lambda = new AWS.Lambda({apiVersion: '2015-03-31'});
+const S3 = new AWS.S3(config.get('s3Upload'));
+
+let UPLOAD_EXPIRATION = 1 * 60; // 1 minute to initiate upload
+let MAX_PDF_SIZE = 50 * 1024 * 1024; // 50 MB
+
+let Recognizer = module.exports = {
+	/**
+	 * Directly upload file and get its uploadID
+	 *
+	 * @param buffer
+	 * @return {Promise} uploadID
+	 */
+	upload: async function (buffer) {
+		// Generate UUID
+		let uploadID = crypto.randomBytes(16).toString('hex');
+		await S3.upload({Key: uploadID, Body: buffer}).promise();
+		return uploadID;
+	},
+
+	remove: async function(uploadID) {
+		await S3.deleteObject({Key: uploadID}).promise();
+	},
+
+	/**
+	 * Recognize the uploaded PDF by invoking recognizer Lambda function
+	 *
+	 * @param uploadID
+	 * @return {Promise} Item metadata in translator format
+	 */
+	recognize: async function (uploadID) {
+		let params = {
+			FunctionName: config.get('recognizerLambda'),
+			InvocationType: 'RequestResponse',
+			// Inform recognizer Lambda that we are calling it internally, not over API gateway
+			Payload: JSON.stringify({type: 'INTERNAL', body: {action: "recognizeUpload", uploadID}})
+		};
+
+		let res = await Lambda.invoke(params).promise();
+
+		if (res.FunctionError) {
+			throw new Error('Lambda error: ' + res.Payload);
+		}
+
+		res = JSON.parse(res.Payload);
+
+		// Retrieve metadata by using recognized identifiers
+		let identifiers = [];
+
+		if (res.arxiv) {
+			identifiers.push({arXiv: res.arxiv});
+		}
+
+		if (res.doi) {
+			identifiers.push({DOI: res.doi});
+		}
+
+		if (res.isbn) {
+			identifiers.push({ISBN: res.isbn});
+		}
+
+		for (let identifier of identifiers) {
+			let translate = new Zotero.Translate.Search();
+			translate.setIdentifier(identifier);
+			let translators = await translate.getTranslators();
+			translate.setTranslator(translators);
+
+			try {
+				let items = await translate.translate({libraryID: false});
+
+				if (items.length) {
+					let item = items[0];
+
+					// Add some fields if the translated item doesn't have them
+
+					if (!item.abstractNote && res.abstract) {
+						item.abstractNote = res.abstract;
+					}
+
+					if (!item.language && res.language) {
+						item.language = res.language;
+					}
+					return item;
+				}
+			}
+			catch (e) {
+				Zotero.debug(e);
+			}
+		}
+
+		// Return the extracted metadata
+		if (res.title) {
+			let item = {};
+			item.itemType = 'journalArticle';
+
+			if (res.type === 'book-chapter') {
+				item.itemType = 'bookSection';
+			}
+
+			item.title = res.title;
+
+			item.creators = [];
+			for (let author of res.authors) {
+				item.creators.push({
+					firstName: author.firstName,
+					lastName: author.lastName,
+					creatorType: 'author'
+				});
+			}
+
+			if (res.abstract) item.abstractNote = res.abstract;
+			if (res.year) item.date = res.year;
+			if (res.pages) item.pages = res.pages;
+			if (res.volume) item.volume = res.volume;
+			if (res.url) item.url = res.url;
+			if (res.language) item.language = res.language;
+
+			if (item.itemType === 'journalArticle') {
+				if (res.issue) item.issue = res.issue;
+				if (res.issn) item.issn = res.issn;
+				if (res.container) item.publicationTitle = res.container;
+			}
+			else if (item.itemType === 'bookSection') {
+				if (res.container) item.bookTitle = res.container;
+				if (res.publisher) item.publisher = res.publisher;
+			}
+
+			item.libraryCatalog = 'Zotero';
+			return item;
+		}
+
+		return null;
+	},
+
+	/**
+	 * Generate presigned upload params
+	 *
+	 * @param ctx
+	 * @return {Promise}
+	 */
+	handleUpload: async function (ctx) {
+		// Generate UUID
+		let uploadID = crypto.randomBytes(16).toString('hex');
+		// Generate a presigned POST form, which has to be POSTed from the browser to S3.
+		// createPresignedPost is used instead of getSignedUrl because getSignedUrl
+		// doesn't support limiting the uploaded file size
+		const data = S3.createPresignedPost({
+			Fields: {
+				key: uploadID
+			},
+			Expires: UPLOAD_EXPIRATION,
+			Conditions: [
+				['content-length-range', 0, MAX_PDF_SIZE],
+			]
+		});
+		ctx.body = {uploadID, data};
+	},
+
+	/**
+	 * Recognize the uploaded PDF file
+	 *
+	 * @param ctx
+	 * @return {Promise}
+	 */
+	handleProcess: async function (ctx) {
+		let uploadID = ctx.request.body;
+
+		if (!uploadID) {
+			ctx.throw(400, "uploadID not provided\n");
+		}
+
+		try {
+			let item = await this.recognize(uploadID);
+			ctx.body = Zotero.Utilities.itemToAPIJSON(item);
+		}
+		catch (e) {
+			throw e;
+		}
+		finally {
+			await this.remove(uploadID);
+		}
+	}
+};
diff --git a/src/server.js b/src/server.js
index d69ddcd..2fd33f7 100644
--- a/src/server.js
+++ b/src/server.js
@@ -36,6 +36,7 @@ const Translators = require('./translators');
 const SearchEndpoint = require('./searchEndpoint');
 const WebEndpoint = require('./webEndpoint');
 const ExportEndpoint = require('./exportEndpoint');
+const Recognizer = require('./recognizer');
 
 const app = module.exports = new Koa();
 app.use(cors);
@@ -43,6 +44,8 @@
 app.use(bodyParser({ enableTypes: ['text', 'json']}));
 app.use(_.post('/web', WebEndpoint.handle.bind(WebEndpoint)));
 app.use(_.post('/search', SearchEndpoint.handle.bind(SearchEndpoint)));
 app.use(_.post('/export', ExportEndpoint.handle.bind(ExportEndpoint)));
+app.use(_.get('/recognize/getUploadParams', Recognizer.handleUpload.bind(Recognizer)));
+app.use(_.post('/recognize/process', Recognizer.handleProcess.bind(Recognizer)));
 
 Debug.init(1);
 Translators.init()
diff --git a/src/webSession.js b/src/webSession.js
index 1cf6c77..22be790 100644
--- a/src/webSession.js
+++ b/src/webSession.js
@@ -31,6 +31,7 @@ const HTTP = require('./http');
 const Translators = require('./translators');
 const SearchEndpoint = require('./searchEndpoint');
 const { jar: cookieJar } = require('request');
+const Recognizer = require('./recognizer');
 
 const SERVER_TRANSLATION_TIMEOUT = 30;
 const FORWARDED_HEADERS = ['Accept-Language'];
@@ -153,19 +154,39 @@ WebSession.prototype.handleURL = async function () {
 	translate.setRequestHeaders(headers);
 	
 	try {
-		await HTTP.processDocuments(
-			[url],
-			(doc) => {
-				translate.setDocument(doc);
-				// This could be optimized by only running detect on secondary translators
-				// if the first fails, but for now just run detect on all
-				return translate.getTranslators(true);
-			},
+		let req = await Zotero.HTTP.request(
+			"GET",
+			url,
 			{
+				responseType: 'document',
+				fallbackToPDF: true,
 				cookieSandbox: this._cookieSandbox,
 				headers
 			}
 		);
+
+		if (req.type === 'document') {
+			translate.setDocument(req.response);
+			// This could be optimized by only running detect on secondary translators
+			// if the first fails, but for now just run detect on all
+			translate.getTranslators(true);
+		}
+		// If PDF received
+		else {
+			let uploadID;
+			try {
+				uploadID = await Recognizer.upload(req.response);
+				let item = await Recognizer.recognize(uploadID);
+				this.ctx.response.body = Zotero.Utilities.itemToAPIJSON(item);
+				resolve();
+			}
+			catch (e) {
+				throw e;
+			}
+			finally {
+				if (uploadID) await Recognizer.remove(uploadID);
+			}
+		}
 		return promise;
 	}
 	catch (e) {
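
Usage note (not part of the patch): the two new endpoints are meant to be used together — the client first fetches presigned S3 upload parameters, POSTs the PDF directly to S3, and then asks the server to recognize the upload. Below is a minimal Node.js sketch of that flow; the base URL (a local server on port 1969), the function name recognizePDF, and the reuse of the request-promise-native dependency already listed in package.json are assumptions, and error handling is omitted.

const fs = require('fs');
const request = require('request-promise-native');

const SERVER = 'http://localhost:1969'; // assumed local translation-server instance

async function recognizePDF(pdfPath) {
	// 1. Ask the server for presigned S3 upload parameters (Recognizer.handleUpload)
	let {uploadID, data} = await request.get({
		url: SERVER + '/recognize/getUploadParams',
		json: true
	});

	// 2. POST the PDF straight to S3 using the presigned form;
	//    the presigned fields have to precede the file part
	await request.post({
		url: data.url,
		formData: Object.assign({}, data.fields, {
			file: fs.createReadStream(pdfPath)
		})
	});

	// 3. Ask the server to recognize the uploaded PDF (Recognizer.handleProcess);
	//    the uploadID is sent as a plain-text body and the upload is deleted afterwards
	let body = await request.post({
		url: SERVER + '/recognize/process',
		headers: {'Content-Type': 'text/plain'},
		body: uploadID
	});

	return JSON.parse(body);
}

// recognizePDF('article.pdf').then(console.log).catch(console.error);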