From eb5ecd7d6b3d48992067427bce9e355204b3bead Mon Sep 17 00:00:00 2001
From: Martynas Bagdonas
Date: Wed, 28 Nov 2018 17:13:42 +0200
Subject: [PATCH] Add PDF handling

---
 config/default.json     |   8 ++
 lambda_template.yaml.j2 |  14 +++
 package.json            |   1 +
 src/http.js             |  66 ++++++++-----
 src/lambda.js           |   3 +
 src/recognizer.js       | 213 ++++++++++++++++++++++++++++++++++++++++
 src/server.js           |   3 +
 src/webSession.js       |  37 +++++--
 8 files changed, 314 insertions(+), 31 deletions(-)
 create mode 100644 src/recognizer.js

diff --git a/config/default.json b/config/default.json
index b99b213..c769400 100644
--- a/config/default.json
+++ b/config/default.json
@@ -3,10 +3,18 @@
 	"blacklistedDomains": [],
 	"deproxifyURLs": false, // Automatically try deproxified versions of URLs
 	"identifierSearchLambda": "", // Identifier search Lambda function for text search
+	"recognizerLambda": "", // PDF recognizer Lambda function
 	"port": 1969,
 	"translators": {
 		"CrossrefREST.email": "" // Pass an email to Crossref REST API to utilize the faster servers pool
 	},
+	"s3Upload": {
+		"params": {
+			"Bucket": ""
+		},
+		"accessKeyId": "",
+		"secretAccessKey": ""
+	},
 	"userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0",
 	"translatorsDirectory": "./modules/translators"
 }
diff --git a/lambda_template.yaml.j2 b/lambda_template.yaml.j2
index bdaea51..8cbc49e 100644
--- a/lambda_template.yaml.j2
+++ b/lambda_template.yaml.j2
@@ -14,10 +14,24 @@ Resources:
       MemorySize: 2048
       Timeout: 30
       Policies:
+        - Statement:
+            Action:
+              - s3:PutObject
+              - s3:DeleteObject
+            Effect: Allow
+            Resource: !ImportValue RecognizerUploadBucketArn
         - LambdaInvokePolicy:
             FunctionName: {{ identifier_search_function_name }}
+        - LambdaInvokePolicy:
+            FunctionName: {{ recognizer_function_name }}
       Events:
         # API Gateway
+        GetAPI:
+          Type: Api
+          Properties:
+            # Proxy all GET requests to Lambda function
+            Path: /{proxy+}
+            Method: get
         PostAPI:
           Type: Api
           Properties:
diff --git a/package.json b/package.json
index 9a10e66..7522a23 100644
--- a/package.json
+++ b/package.json
@@ -20,6 +20,7 @@
 		"request-promise-native": "^1.0.5",
 		"serverless-http": "^1.6.0",
 		"w3c-xmlserializer": "0.0.1",
+		"whatwg-mimetype": "^2.3.0",
 		"wicked-good-xpath": "git+https://github.com/adomasven/wicked-good-xpath.git#e84d65d",
 		"xregexp": "^4.2.0",
 		"yargs": "^12.0.2"
diff --git a/src/http.js b/src/http.js
index 187aced..a41912f 100644
--- a/src/http.js
+++ b/src/http.js
@@ -29,6 +29,7 @@ var url = require('url');
 var jsdom = require('jsdom');
 var { JSDOM } = jsdom;
 var wgxpath = require('wicked-good-xpath');
+var MIMEType = require("whatwg-mimetype"); // Use the same MIME type library as JSDOM does
 
 /**
  * Functions for performing HTTP requests
@@ -154,29 +155,22 @@
 			};
 
 			if (options.responseType == 'document') {
-				let dom;
-				try {
+				let mimeType = new MIMEType(response.headers['content-type']);
+
+				// Filter content-type in the same way as JSDOM does
+				if (mimeType.isHTML() || mimeType.isXML()) {
+					result.type = 'document';
 					body = decodeContent(body, response.headers['content-type']);
-					dom = new JSDOM(body, {
+					let dom = new JSDOM(body, {
 						url: result.responseURL,
-						// Inform JSDOM what content type it's parsing,
-						// so it could reject unsupported content types
+						// Inform JSDOM what content type it's parsing
 						contentType: response.headers['content-type']
 					});
-				}
-				catch (e) {
-					if (e.message.includes('not a HTML or XML content type')) {
-						Zotero.debug(e, 1)
-						throw new this.UnsupportedFormatError(result.responseURL, e.message);
-					}
-					throw e;
-				}
-				wgxpath.install(dom.window, true);
-				result.response = dom.window.document;
-
-				// Follow meta redirects
-				if (response.headers['content-type']
-					&& response.headers['content-type'].startsWith('text/html')) {
+
+					wgxpath.install(dom.window, true);
+					result.response = dom.window.document;
+
+					// Follow meta redirects in HTML and XML files
 					let meta = result.response.querySelector('meta[http-equiv=refresh]');
 					if (meta && meta.getAttribute('content')) {
 						let parts = meta.getAttribute('content').split(/;\s*url=/);
@@ -191,6 +185,19 @@
 						}
 					}
 				}
+				else if (
+					options.fallbackToPDF &&
+					mimeType.essence === 'application/pdf'
+				) {
+					result.type = 'pdf';
+					result.response = body;
+				}
+				else {
+					throw new this.UnsupportedFormatError(
+						result.responseURL,
+						response.headers['content-type'] + ' is not supported'
+					);
+				}
 			}
 			else if (options.responseType == 'json') {
 				result.response = JSON.parse(body.toString());
@@ -319,7 +326,6 @@
  *
  * TODO: Remove this code when https://github.com/jsdom/jsdom/issues/2495 will be solved
  */
-const MIMEType = require("whatwg-mimetype");
 const sniffHTMLEncoding = require("html-encoding-sniffer");
 const whatwgEncoding = require("whatwg-encoding");
 
@@ -389,6 +395,22 @@ function customRequest(method, requestURL, options) {
 		.on('response', function (res) {
 			if (returned) return;
 			response = res;
+
+			// Check content-type before starting the download
+			let mimeType = new MIMEType(response.headers['content-type']);
+			if (!(
+				mimeType.isHTML() ||
+				mimeType.isXML() ||
+				options.fallbackToPDF && mimeType.essence === 'application/pdf'
+			)) {
+				req.abort();
+				returned = true;
+				return reject(new Zotero.HTTP.UnsupportedFormatError(
+					requestURL,
+					response.headers['content-type'] + ' is not supported'
+				));
+			}
+
 			// Content-length doesn't always exists or it can be a length of a gzipped content,
 			// but it's still worth to do the initial size check
 			if (
@@ -399,8 +421,6 @@
 				returned = true;
 				reject(new Zotero.HTTP.ResponseSizeError(requestURL));
 			}
-
-			// TODO: Filter content-type too
 		})
 		.on('end', function () {
 			if (returned) return;
@@ -408,6 +428,6 @@
 			resolve({response, body: Buffer.concat(buffers, bufferLength)});
 		});
 	});
-};
+}
 
 module.exports = Zotero.HTTP;
diff --git a/src/lambda.js b/src/lambda.js
index 8fb7162..d311778 100644
--- a/src/lambda.js
+++ b/src/lambda.js
@@ -35,6 +35,7 @@ var Translators; // Translators module is cashed
 const SearchEndpoint = require('./searchEndpoint');
 const WebEndpoint = require('./webEndpoint');
 const ExportEndpoint = require('./exportEndpoint');
+const Recognizer = require('./recognizer');
 
 const app = module.exports = new Koa();
 app.use(cors);
@@ -42,6 +43,8 @@
 app.use(bodyParser({enableTypes: ['text', 'json']}));
 app.use(_.post('/web', WebEndpoint.handle.bind(WebEndpoint)));
 app.use(_.post('/search', SearchEndpoint.handle.bind(SearchEndpoint)));
 app.use(_.post('/export', ExportEndpoint.handle.bind(ExportEndpoint)));
+app.use(_.get('/recognize/getUploadParams', Recognizer.handleUpload.bind(Recognizer)));
+app.use(_.post('/recognize/process', Recognizer.handleProcess.bind(Recognizer)));
 
 Debug.init(1);
diff --git a/src/recognizer.js b/src/recognizer.js
new file mode 100644
index 0000000..723e665
--- /dev/null
+++ b/src/recognizer.js
@@ -0,0 +1,213 @@
+/*
+    ***** BEGIN LICENSE BLOCK *****
+
+    Copyright © 2018 Corporation for Digital Scholarship
+                     Vienna, Virginia, USA
+                     https://www.zotero.org
+
+    This file is part of Zotero.
+
+    Zotero is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Zotero is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with Zotero.  If not, see <http://www.gnu.org/licenses/>.
+
+    ***** END LICENSE BLOCK *****
+*/
+
+// TODO: Move upload logic outside of recognizer.js if uploads will be needed for other purposes
+
+const config = require('config');
+const AWS = require('aws-sdk');
+const crypto = require('crypto');
+const Lambda = new AWS.Lambda({apiVersion: '2015-03-31'});
+const S3 = new AWS.S3(config.get('s3Upload'));
+
+let UPLOAD_EXPIRATION = 1 * 60; // 1 minute to initiate upload
+let MAX_PDF_SIZE = 50 * 1024 * 1024; // 50 MB
+
+let Recognizer = module.exports = {
+	/**
+	 * Directly upload file and get its uploadID
+	 *
+	 * @param buffer
+	 * @return {Promise} uploadID
+	 */
+	upload: async function (buffer) {
+		// Generate UUID
+		let uploadID = crypto.randomBytes(16).toString('hex');
+		await S3.upload({Key: uploadID, Body: buffer}).promise();
+		return uploadID;
+	},
+
+	remove: async function(uploadID) {
+		await S3.deleteObject({Key: uploadID}).promise();
+	},
+
+	/**
+	 * Recognize the uploaded PDF by invoking recognizer Lambda function
+	 *
+	 * @param uploadID
+	 * @return {Promise} Item metadata in translator format
+	 */
+	recognize: async function (uploadID) {
+		let params = {
+			FunctionName: config.get('recognizerLambda'),
+			InvocationType: 'RequestResponse',
+			// Inform recognizer Lambda that we are calling it internally, not over API gateway
+			Payload: JSON.stringify({type: 'INTERNAL', body: {action: "recognizeUpload", uploadID}})
+		};
+
+		let res = await Lambda.invoke(params).promise();
+
+		if (res.FunctionError) {
+			throw new Error('Lambda error: ' + res.Payload);
+		}
+
+		res = JSON.parse(res.Payload);
+
+		// Retrieve metadata by using recognized identifiers
+		let identifiers = [];
+
+		if (res.arxiv) {
+			identifiers.push({arXiv: res.arxiv});
+		}
+
+		if (res.doi) {
+			identifiers.push({DOI: res.doi});
+		}
+
+		if (res.isbn) {
+			identifiers.push({ISBN: res.isbn});
+		}
+
+		for (let identifier of identifiers) {
+			let translate = new Zotero.Translate.Search();
+			translate.setIdentifier(identifier);
+			let translators = await translate.getTranslators();
+			translate.setTranslator(translators);
+
+			try {
+				let items = await translate.translate({libraryID: false});
+
+				if (items.length) {
+					let item = items[0];
+
+					// Add some fields if the translated item doesn't have them
+
+					if (!item.abstractNote && res.abstract) {
+						item.abstractNote = res.abstract;
+					}
+
+					if (!item.language && res.language) {
+						item.language = res.language;
+					}
+					return item;
+				}
+			}
+			catch (e) {
+				Zotero.debug(e);
+			}
+		}
+
+		// Return the extracted metadata
+		if (res.title) {
+			let item = {};
+			item.itemType = 'journalArticle';
+
+			if (res.type === 'book-chapter') {
+				item.itemType = 'bookSection';
+			}
+
+			item.title = res.title;
+
+			item.creators = [];
+			for (let author of res.authors) {
+				item.creators.push({
+					firstName: author.firstName,
+					lastName: author.lastName,
+					creatorType: 'author'
+				});
+			}
+
+			if (res.abstract) item.abstractNote = res.abstract;
+			if (res.year) item.date = res.year;
+			if (res.pages) item.pages = res.pages;
+			if (res.volume) item.volume = res.volume;
+			if (res.url) item.url = res.url;
+			if (res.language) item.language = res.language;
+
+			if (item.itemType === 'journalArticle') {
+				if (res.issue) item.issue = res.issue;
+				if (res.issn) item.issn = res.issn;
+				if (res.container) item.publicationTitle = res.container;
+			}
+			else if (item.itemType === 'bookSection') {
+				if (res.container) item.bookTitle = res.container;
+				if (res.publisher) item.publisher = res.publisher;
+			}
+
+			item.libraryCatalog = 'Zotero';
+			return item;
+		}
+
+		return null;
+	},
+
+	/**
+	 * Generate presigned upload params
+	 *
+	 * @param ctx
+	 * @return {Promise}
+	 */
+	handleUpload: async function (ctx) {
+		// Generate UUID
+		let uploadID = crypto.randomBytes(16).toString('hex');
+		// Generate a presigned POST form, which has to be POSTed from the browser to S3.
+		// createPresignedPost is used instead of getSignedUrl because getSignedUrl
+		// doesn't support limiting the uploaded file size
+		const data = S3.createPresignedPost({
+			Fields: {
+				key: uploadID
+			},
+			Expires: UPLOAD_EXPIRATION,
+			Conditions: [
+				['content-length-range', 0, MAX_PDF_SIZE],
+			]
+		});
+		ctx.body = {uploadID, data};
+	},
+
+	/**
+	 * Recognize the uploaded PDF file
+	 *
+	 * @param ctx
+	 * @return {Promise}
+	 */
+	handleProcess: async function (ctx) {
+		let uploadID = ctx.request.body;
+
+		if (!uploadID) {
+			ctx.throw(400, "uploadID not provided\n");
+		}
+
+		try {
+			let item = await this.recognize(uploadID);
+			ctx.body = Zotero.Utilities.itemToAPIJSON(item);
+		}
+		catch (e) {
+			throw e;
+		}
+		finally {
+			await this.remove(uploadID);
+		}
+	}
+};
diff --git a/src/server.js b/src/server.js
index d69ddcd..2fd33f7 100644
--- a/src/server.js
+++ b/src/server.js
@@ -36,6 +36,7 @@ const Translators = require('./translators');
 const SearchEndpoint = require('./searchEndpoint');
 const WebEndpoint = require('./webEndpoint');
 const ExportEndpoint = require('./exportEndpoint');
+const Recognizer = require('./recognizer');
 
 const app = module.exports = new Koa();
 app.use(cors);
@@ -43,6 +44,8 @@
 app.use(bodyParser({ enableTypes: ['text', 'json']}));
 app.use(_.post('/web', WebEndpoint.handle.bind(WebEndpoint)));
 app.use(_.post('/search', SearchEndpoint.handle.bind(SearchEndpoint)));
 app.use(_.post('/export', ExportEndpoint.handle.bind(ExportEndpoint)));
+app.use(_.get('/recognize/getUploadParams', Recognizer.handleUpload.bind(Recognizer)));
+app.use(_.post('/recognize/process', Recognizer.handleProcess.bind(Recognizer)));
 
 Debug.init(1);
 Translators.init()
diff --git a/src/webSession.js b/src/webSession.js
index 1cf6c77..22be790 100644
--- a/src/webSession.js
+++ b/src/webSession.js
@@ -31,6 +31,7 @@ const HTTP = require('./http');
 const Translators = require('./translators');
 const SearchEndpoint = require('./searchEndpoint');
 const { jar: cookieJar } = require('request');
+const Recognizer = require('./recognizer');
 
 const SERVER_TRANSLATION_TIMEOUT = 30;
 const FORWARDED_HEADERS = ['Accept-Language'];
@@ -153,19 +154,39 @@ WebSession.prototype.handleURL = async function () {
 	translate.setRequestHeaders(headers);
 	
 	try {
-		await HTTP.processDocuments(
-			[url],
-			(doc) => {
-				translate.setDocument(doc);
-				// This could be optimized by only running detect on secondary translators
-				// if the first fails, but for now just run detect on all
-				return translate.getTranslators(true);
-			},
+		let req = await Zotero.HTTP.request(
+			"GET",
+			url,
 			{
+				responseType: 'document',
+				fallbackToPDF: true,
 				cookieSandbox: this._cookieSandbox,
 				headers
 			}
 		);
+
+		if (req.type === 'document') {
+			translate.setDocument(req.response);
+			// This could be optimized by only running detect on secondary translators
+			// if the first fails, but for now just run detect on all
+			translate.getTranslators(true);
+		}
+		// If PDF received
+		else {
+			let uploadID;
+			try {
+				uploadID = await Recognizer.upload(req.response);
+				let item = await Recognizer.recognize(uploadID);
+				this.ctx.response.body = Zotero.Utilities.itemToAPIJSON(item);
+				resolve();
+			}
+			catch (e) {
+				throw e;
+			}
+			finally {
+				if (uploadID) await Recognizer.remove(uploadID);
+			}
+		}
 		return promise;
 	}
 	catch (e) {
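
Usage note (not part of the patch): the two new endpoints are meant to be used together — the client first fetches presigned S3 upload parameters, POSTs the PDF directly to S3, and then asks the server to recognize the upload. Below is a minimal Node.js sketch of that flow; the base URL (a local server on port 1969), the function name recognizePDF, and the reuse of the request-promise-native dependency already listed in package.json are assumptions, and error handling is omitted.

const fs = require('fs');
const request = require('request-promise-native');

const SERVER = 'http://localhost:1969'; // assumed local translation-server instance

async function recognizePDF(pdfPath) {
	// 1. Ask the server for presigned S3 upload parameters (Recognizer.handleUpload)
	let {uploadID, data} = await request.get({
		url: SERVER + '/recognize/getUploadParams',
		json: true
	});

	// 2. POST the PDF straight to S3 using the presigned form;
	//    the presigned fields have to precede the file part
	await request.post({
		url: data.url,
		formData: Object.assign({}, data.fields, {
			file: fs.createReadStream(pdfPath)
		})
	});

	// 3. Ask the server to recognize the uploaded PDF (Recognizer.handleProcess);
	//    the uploadID is sent as a plain-text body and the upload is deleted afterwards
	let body = await request.post({
		url: SERVER + '/recognize/process',
		headers: {'Content-Type': 'text/plain'},
		body: uploadID
	});

	return JSON.parse(body);
}

// recognizePDF('article.pdf').then(console.log).catch(console.error);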