Add PDF handling

zotero · Jan 9, 2019 · d092ecf · d092ecf
1 parent 5848d9e
commit d092ecf
Show file tree

Hide file tree

Showing 8 changed files with 315 additions and 30 deletions.
diff --git a/config/default.json b/config/default.json
@@ -3,10 +3,18 @@
 	"blacklistedDomains": [],
 	"deproxifyURLs": false, // Automatically try deproxified versions of URLs
 	"identifierSearchLambda": "", // Identifier search Lambda function for text search
+	"recognizerLambda": "", // PDF recognizer Lambda function
 	"port": 1969,
 	"translators": {
 		"CrossrefREST.email": "" // Pass an email to Crossref REST API to utilize the faster servers pool
 	},
+	"s3Upload": {
+		"params": {
+			"Bucket": ""
+		},
+		"accessKeyId": "",
+		"secretAccessKey": ""
+	},
 	"userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0",
 	"translatorsDirectory": "./modules/translators"
 }
diff --git a/lambda_template.yaml.j2 b/lambda_template.yaml.j2
@@ -14,10 +14,24 @@ Resources:
       MemorySize: 2048
       Timeout: 30
       Policies:
+        - Statement:
+           Action:
+            - s3:PutObject
+            - s3:DeleteObject
+           Effect: Allow
+           Resource: !ImportValue RecognizerUploadBucketArn
         - LambdaInvokePolicy:
            FunctionName: {{ identifier_search_function_name }}
+        - LambdaInvokePolicy:
+           FunctionName: {{ recognizer_function_name }}
       Events:
         # API Gateway
+        GetAPI:
+          Type: Api
+          Properties:
+            # Proxy all GET requests to Lambda function
+            Path: /{proxy+}
+            Method: get
         PostAPI:
           Type: Api
           Properties:

diff --git a/package.json b/package.json
@@ -20,6 +20,7 @@
     "request-promise-native": "^1.0.5",
     "serverless-http": "^1.6.0",
     "w3c-xmlserializer": "0.0.1",
+    "whatwg-mimetype": "^2.3.0",
     "wicked-good-xpath": "git+https://github.com/adomasven/wicked-good-xpath.git#e84d65d",
     "xregexp": "^4.2.0",
     "yargs": "^12.0.2"

diff --git a/src/http.js b/src/http.js
@@ -29,6 +29,7 @@ var url = require('url');
 var jsdom = require('jsdom');
 var { JSDOM } = jsdom;
 var wgxpath = require('wicked-good-xpath');
+var MIMEType = require("whatwg-mimetype"); // Use the same MIME type library as JSDOM does
 
 /**
  * Functions for performing HTTP requests
@@ -154,28 +155,22 @@ Zotero.HTTP = new function() {
 		};
 
 		if (options.responseType == 'document') {
-			let dom;
-			try {
-				dom = new JSDOM(body, {
+			let mimeType = new MIMEType(response.headers['content-type']);
+
+			// Filter content-type in the same way as JSDOM does
+			if (mimeType.isHTML() || mimeType.isXML()) {
+				result.type = 'document';
+
+				let dom = new JSDOM(body, {
 					url: result.responseURL,
-					// Inform JSDOM what content type it's parsing,
-					// so it could reject unsupported content types
+					// Inform JSDOM what content type it's parsing
 					contentType: response.headers['content-type']
 				});
-			}
-			catch (e) {
-				if (e.message.includes('not a HTML or XML content type')) {
-					Zotero.debug(e, 1)
-					throw new this.UnsupportedFormatError(result.responseURL, e.message);
-				}
-				throw e;
-			}
-			wgxpath.install(dom.window, true);
-			result.response = dom.window.document;
-
-			// Follow meta redirects
-			if (response.headers['content-type']
-					&& response.headers['content-type'].startsWith('text/html')) {
+
+				wgxpath.install(dom.window, true);
+				result.response = dom.window.document;
+
+				// Follow meta redirects in HTML and XML files
 				let meta = result.response.querySelector('meta[http-equiv=refresh]');
 				if (meta && meta.getAttribute('content')) {
 					let parts = meta.getAttribute('content').split(/;\s*url=/);
@@ -190,6 +185,19 @@ Zotero.HTTP = new function() {
 					}
 				}
 			}
+			else if (
+				options.fallbackToPDF &&
+				mimeType.essence === 'application/pdf'
+			) {
+				result.type = 'pdf';
+				result.response = body;
+			}
+			else {
+				throw new this.UnsupportedFormatError(
+					result.responseURL,
+					response.headers['content-type'] + ' is not supported'
+				);
+			}
 		}
 		else if (options.responseType == 'json') {
 			result.response = JSON.parse(body.toString());
@@ -362,6 +370,22 @@ function customRequest(method, requestURL, options) {
 			.on('response', function (res) {
 				if (returned) return;
 				response = res;
+
+				// Check content-type before starting the download
+				let mimeType = new MIMEType(response.headers['content-type']);
+				if (!(
+					mimeType.isHTML() ||
+					mimeType.isXML() ||
+					options.fallbackToPDF && mimeType.essence === 'application/pdf'
+				)) {
+					req.abort();
+					returned = true;
+					return reject(new Zotero.HTTP.UnsupportedFormatError(
+						requestURL,
+						response.headers['content-type'] + ' is not supported'
+					));
+				}
+
 				// Content-length doesn't always exists or it can be a length of a gzipped content,
 				// but it's still worth to do the initial size check
 				if (
@@ -372,15 +396,13 @@ function customRequest(method, requestURL, options) {
 					returned = true;
 					reject(new Zotero.HTTP.ResponseSizeError(requestURL));
 				}
-
-				// TODO: Filter content-type too
 			})
 			.on('end', function () {
 				if (returned) return;
 				returned = true;
 				resolve({response, body: Buffer.concat(buffers, bufferLength)});
 			});
 	});
-};
+}
 
 module.exports = Zotero.HTTP;
diff --git a/src/lambda.js b/src/lambda.js
@@ -35,13 +35,16 @@ var Translators; // Translators module is cashed
 const SearchEndpoint = require('./searchEndpoint');
 const WebEndpoint = require('./webEndpoint');
 const ExportEndpoint = require('./exportEndpoint');
+const Recognizer = require('./recognizer');
 
 const app = module.exports = new Koa();
 app.use(cors);
 app.use(bodyParser({enableTypes: ['text', 'json']}));
 app.use(_.post('/web', WebEndpoint.handle.bind(WebEndpoint)));
 app.use(_.post('/search', SearchEndpoint.handle.bind(SearchEndpoint)));
 app.use(_.post('/export', ExportEndpoint.handle.bind(ExportEndpoint)));
+app.use(_.get('/recognize/getUploadParams', Recognizer.handleUpload.bind(Recognizer)));
+app.use(_.post('/recognize/process', Recognizer.handleProcess.bind(Recognizer)));
 
 Debug.init(1);
 

diff --git a/src/recognizer.js b/src/recognizer.js
@@ -0,0 +1,213 @@
+/*
+    ***** BEGIN LICENSE BLOCK *****
+    
+    Copyright © 2018 Corporation for Digital Scholarship
+                     Vienna, Virginia, USA
+                     https://www.zotero.org
+    
+    This file is part of Zotero.
+    
+    Zotero is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    
+    Zotero is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+    
+    You should have received a copy of the GNU Affero General Public License
+    along with Zotero.  If not, see <http://www.gnu.org/licenses/>.
+    
+    ***** END LICENSE BLOCK *****
+*/
+
+// TODO: Move upload logic outside of recognizer.js if uploads will be needed for other purposes
+
+const config = require('config');
+const AWS = require('aws-sdk');
+const crypto = require('crypto');
+const Lambda = new AWS.Lambda({apiVersion: '2015-03-31'});
+const S3 = new AWS.S3(config.get('s3Upload'));
+
+let UPLOAD_EXPIRATION = 1 * 60; // 1 minute to initiate upload
+let MAX_PDF_SIZE = 50 * 1024 * 1024; // 50 MB
+
+let Recognizer = module.exports = {
+	/**
+	 * Directly upload file and get its uploadID
+	 *
+	 * @param buffer
+	 * @return {Promise<string>} uploadID
+	 */
+	upload: async function (buffer) {
+		// Generate UUID
+		let uploadID = crypto.randomBytes(16).toString('hex');
+		await S3.upload({Key: uploadID, Body: buffer}).promise();
+		return uploadID;
+	},
+
+	remove: async function(uploadID) {
+		await S3.deleteObject({Key: uploadID}).promise();
+	},
+
+	/**
+	 * Recognize the uploaded PDF by invoking recognizer Lambda function
+	 *
+	 * @param uploadID
+	 * @return {Promise<Object|null>} Item metadata in translator format
+	 */
+	recognize: async function (uploadID) {
+		let params = {
+			FunctionName: config.get('recognizerLambda'),
+			InvocationType: 'RequestResponse',
+			// Inform recognizer Lambda that we are calling it internally, not over API gateway
+			Payload: JSON.stringify({type: 'INTERNAL', body: {action: "recognizeUpload", uploadID}})
+		};
+
+		let res = await Lambda.invoke(params).promise();
+
+		if (res.FunctionError) {
+			throw new Error('Lambda error: ' + res.Payload);
+		}
+
+		res = JSON.parse(res.Payload);
+
+		// Retrieve metadata by using recognized identifiers
+		let identifiers = [];
+
+		if (res.arxiv) {
+			identifiers.push({arXiv: res.arxiv});
+		}
+
+		if (res.doi) {
+			identifiers.push({DOI: res.doi});
+		}
+
+		if (res.isbn) {
+			identifiers.push({ISBN: res.isbn});
+		}
+
+		for (let identifier of identifiers) {
+			let translate = new Zotero.Translate.Search();
+			translate.setIdentifier(identifier);
+			let translators = await translate.getTranslators();
+			translate.setTranslator(translators);
+
+			try {
+				let items = await translate.translate({libraryID: false});
+
+				if (items.length) {
+					let item = items[0];
+
+					// Add some fields if the translated item doesn't have them
+
+					if (!item.abstractNote && res.abstract) {
+						item.abstractNote = res.abstract;
+					}
+
+					if (!item.language && res.language) {
+						item.language = res.language;
+					}
+					return item;
+				}
+			}
+			catch (e) {
+				Zotero.debug(e);
+			}
+		}
+
+		// Return the extracted metadata
+		if (res.title) {
+			let item = {};
+			item.itemType = 'journalArticle';
+
+			if (res.type === 'book-chapter') {
+				item.itemType = 'bookSection';
+			}
+
+			item.title = res.title;
+
+			item.creators = [];
+			for (let author of res.authors) {
+				item.creators.push({
+					firstName: author.firstName,
+					lastName: author.lastName,
+					creatorType: 'author'
+				})
+			}
+
+			if (res.abstract) item.abstractNote = res.abstract;
+			if (res.year) item.date = res.year;
+			if (res.pages) item.pages = res.pages;
+			if (res.volume) item.volume = res.volume;
+			if (res.url) item.url = res.url;
+			if (res.language) item.language = res.language;
+
+			if (item.itemType === 'journalArticle') {
+				if (res.issue) item.issue = res.issue;
+				if (res.issn) item.issn = res.issn;
+				if (res.container) item.publicationTitle = res.container;
+			}
+			else if (item.itemType === 'bookSection') {
+				if (res.container) item.bookTitle = res.container;
+				if (res.publisher) item.publisher = res.publisher;
+			}
+
+			item.libraryCatalog = 'Zotero';
+			return item;
+		}
+
+		return null;
+	},
+
+	/**
+	 * Generate presigned upload params
+	 *
+	 * @param ctx
+	 * @return {Promise<void>}
+	 */
+	handleUpload: async function (ctx) {
+		// Generate UUID
+		let uploadID = crypto.randomBytes(16).toString('hex');
+		// Generate a presigned POST form, which have to posted from browser to S3.
+		// createPresignedPost is used instead of getSignedUrl because it
+		// doesn't support file size limiting
+		const data = S3.createPresignedPost({
+			Fields: {
+				key: uploadID
+			},
+			Expires: UPLOAD_EXPIRATION,
+			Conditions: [
+				['content-length-range', 0, MAX_PDF_SIZE],
+			]
+		});
+		ctx.body = {uploadID, data};
+	},
+
+	/**
+	 * Recognize the uploaded PDF file
+	 *
+	 * @param ctx
+	 * @return {Promise<void>}
+	 */
+	handleProcess: async function (ctx) {
+		let uploadID = ctx.request.body;
+
+		if (!uploadID) {
+			ctx.throw(400, "uploadID not provided\n");
+		}
+
+		try {
+			let item = await this.recognize(uploadID);
+			ctx.body = Zotero.Utilities.itemToAPIJSON(item);
+		}
+		catch (e) {
+			throw e;
+		}
+		finally {
+			await this.remove(uploadID);
+		}
+	}
+};