From 2dfd9dcb19ab45323a23eb88277a64c48e99ecd4 Mon Sep 17 00:00:00 2001
From: Dan Stillman <dstillman@zotero.org>
Date: Thu, 4 Apr 2019 12:53:06 -0400
Subject: [PATCH] Translate import formats via /web

If the remote URL returns text/plain or one of the known content types
for the formats we support (e.g., application/x-bibtex), try to handle
it via import translation. This allows, for example, a remote BibTeX
file to be translated.

This adapts some of the code from #59 in a more general way, with a
responseTypeMap property that can be passed to HTTP.request().
---
 package.json                  |   1 +
 src/exportEndpoint.js         |  45 +----------
 src/formats.js                |  43 ++++++++++
 src/http.js                   | 148 +++++++++++++++++++++++-----------
 src/testEndpoint.js           |  10 +++
 src/webSession.js             |  39 +++++++--
 test/data/bibtex_response.xml |   7 ++
 test/setup.js                 |   1 +
 test/web_test.js              |  15 ++++
 9 files changed, 212 insertions(+), 97 deletions(-)
 create mode 100644 src/formats.js
 create mode 100644 test/data/bibtex_response.xml

diff --git a/package.json b/package.json
index 1fc4515..48fe1b7 100644
--- a/package.json
+++ b/package.json
@@ -11,6 +11,7 @@
 	"dependencies": {
 		"aws-sdk": "^2.326.0",
 		"config": "^1.30.0",
+		"iconv-lite": "^0.4.24",
 		"jsdom": "^13.1.0",
 		"koa": "^2.5.1",
 		"koa-bodyparser": "^4.2.1",
diff --git a/src/exportEndpoint.js b/src/exportEndpoint.js
index 893d547..37cff41 100644
--- a/src/exportEndpoint.js
+++ b/src/exportEndpoint.js
@@ -24,48 +24,9 @@
 */
 
 const config = require('config');
+const { FORMATS, CONTENT_TYPES } = require('./formats');
 const Translate = require('./translation/translate');
 
-const SERVER_FORMATS = {
-	bibtex: "9cb70025-a888-4a29-a210-93ec52da40d4",
-	biblatex: "b6e39b57-8942-4d11-8259-342c46ce395f",
-	bookmarks: "4e7119e0-02be-4848-86ef-79a64185aad8",
-	coins: "05d07af9-105a-4572-99f6-a8e231c0daef",
-	csljson: "bc03b4fe-436d-4a1f-ba59-de4d2d7a63f7",
-	csv: "25f4c5e2-d790-4daa-a667-797619c7e2f2",
-	endnote_xml: "eb7059a4-35ec-4961-a915-3cf58eb9784b",
-	evernote: "18dd188a-9afc-4cd6-8775-1980c3ce0fbf",
-	mods: "0e2235e7-babf-413c-9acf-f27cce5f059c",
-	rdf_bibliontology: "14763d25-8ba0-45df-8f52-b8d1108e7ac9",
-	rdf_dc: "6e372642-ed9d-4934-b5d1-c11ac758ebb7",
-	rdf_zotero: "14763d24-8ba0-45df-8f52-b8d1108e7ac9",
-	refer: "881f60f2-0802-411a-9228-ce5f47b64c7d",
-	refworks_tagged: "1a3506da-a303-4b0a-a1cd-f216e6138d86",
-	ris: "32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7",
-	tei: "032ae9b7-ab90-9205-a479-baf81f49184a",
-	wikipedia: "3f50aaac-7acc-4350-acd0-59cb77faf620"
-};
-
-const SERVER_CONTENT_TYPES = {
-	bibtex: "application/x-bibtex",
-	biblatex: "application/x-bibtex",
-	bookmarks: "text/html",
-	coins: "text/html",
-	csljson: "application/json",
-	csv: "text/csv",
-	endnote_xml: "text/xml",
-	evernote: "text/xml",
-	mods: "application/mods+xml",
-	rdf_bibliontology: "application/rdf+xml",
-	rdf_dc: "application/rdf+xml",
-	rdf_zotero: "application/rdf+xml",
-	refer: "application/x-research-info-systems",
-	refworks_tagged: "text/plain",
-	ris: "application/x-research-info-systems",
-	tei: "text/xml",
-	wikipedia: "text/x-wiki"
-};
-
 var ExportEndpoint = module.exports = {
 	handle: async function (ctx, next) {
 		ctx.assert(ctx.is('json'), 415);
@@ -80,7 +41,7 @@ var ExportEndpoint = module.exports = {
 		
 		var translatorID;
 		
-		if (!query.format || !(translatorID = SERVER_FORMATS[query.format])) {
+		if (!query.format || !(translatorID = FORMATS[query.format])) {
 			ctx.throw(400, "Invalid format specified");
 		}
 		
@@ -107,7 +68,7 @@ var ExportEndpoint = module.exports = {
 						reject();
 					}
 					else {
-						ctx.set('Content-Type', SERVER_CONTENT_TYPES[query.format]);
+						ctx.set('Content-Type', CONTENT_TYPES[query.format]);
 						ctx.response.body = translate.string;
 						resolve();
 					}
diff --git a/src/formats.js b/src/formats.js
new file mode 100644
index 0000000..b8999ff
--- /dev/null
+++ b/src/formats.js
@@ -0,0 +1,43 @@
+/* eslint camelcase: "off" */
+
+const FORMATS = {
+	bibtex: "9cb70025-a888-4a29-a210-93ec52da40d4",
+	biblatex: "b6e39b57-8942-4d11-8259-342c46ce395f",
+	bookmarks: "4e7119e0-02be-4848-86ef-79a64185aad8",
+	coins: "05d07af9-105a-4572-99f6-a8e231c0daef",
+	csljson: "bc03b4fe-436d-4a1f-ba59-de4d2d7a63f7",
+	csv: "25f4c5e2-d790-4daa-a667-797619c7e2f2",
+	endnote_xml: "eb7059a4-35ec-4961-a915-3cf58eb9784b",
+	evernote: "18dd188a-9afc-4cd6-8775-1980c3ce0fbf",
+	mods: "0e2235e7-babf-413c-9acf-f27cce5f059c",
+	rdf_bibliontology: "14763d25-8ba0-45df-8f52-b8d1108e7ac9",
+	rdf_dc: "6e372642-ed9d-4934-b5d1-c11ac758ebb7",
+	rdf_zotero: "14763d24-8ba0-45df-8f52-b8d1108e7ac9",
+	refer: "881f60f2-0802-411a-9228-ce5f47b64c7d",
+	refworks_tagged: "1a3506da-a303-4b0a-a1cd-f216e6138d86",
+	ris: "32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7",
+	tei: "032ae9b7-ab90-9205-a479-baf81f49184a",
+	wikipedia: "3f50aaac-7acc-4350-acd0-59cb77faf620"
+};
+
+const CONTENT_TYPES = {
+	bibtex: "application/x-bibtex",
+	biblatex: "application/x-bibtex",
+	bookmarks: "text/html",
+	coins: "text/html",
+	csljson: "application/json",
+	csv: "text/csv",
+	endnote_xml: "text/xml",
+	evernote: "text/xml",
+	mods: "application/mods+xml",
+	rdf_bibliontology: "application/rdf+xml",
+	rdf_dc: "application/rdf+xml",
+	rdf_zotero: "application/rdf+xml",
+	refer: "application/x-research-info-systems",
+	refworks_tagged: "text/plain",
+	ris: "application/x-research-info-systems",
+	tei: "text/xml",
+	wikipedia: "text/x-wiki"
+};
+
+module.exports = { FORMATS, CONTENT_TYPES };
diff --git a/src/http.js b/src/http.js
index 187aced..b6bc116 100644
--- a/src/http.js
+++ b/src/http.js
@@ -25,10 +25,12 @@
 
 var config = require('config');
 var request = require('request');
+var iconv = require('iconv-lite');
 var url = require('url');
 var jsdom = require('jsdom');
 var { JSDOM } = jsdom;
 var wgxpath = require('wicked-good-xpath');
+var MIMEType = require("whatwg-mimetype"); // Use the same MIME type library as JSDOM
 
 /**
  * Functions for performing HTTP requests
@@ -70,6 +72,10 @@ Zotero.HTTP = new function() {
 	 *         <li>logBodyLength - Length of request body to log</li>
 	 *         <li>timeout - Request timeout specified in milliseconds [default 15000]</li>
 	 *         <li>responseType - The response type of the request from the XHR spec</li>
+	 *         <li>responseTypeMap - A Map of remote content type ('application/x-bibtex') to
+	 *              XHR response type ('text'). The keys 'html' and 'xml' match any content type
+	 *              for which whatwg-mimetype's isHTML() or isXML() is true. Include an empty-string
+	 *              key to accept unlisted content types as 'text'; otherwise they are rejected.</li>
 	 *         <li>successCodes - HTTP status codes that are considered successful, or FALSE to allow all</li>
 	 *     </ul>
 	 * @return {Promise<Object>} A promise resolved with a response object containing:
@@ -123,26 +129,6 @@ Zotero.HTTP = new function() {
 		
 		let {response, body} = await customRequest(method, requestURL, options);
 		
-		if (!response.headers['content-type']) {
-			throw new this.UnsupportedFormatError(requestURL, 'Missing Content-Type header');
-		}
-		
-		// Array of success codes given
-		if (options.successCodes) {
-			var success = options.successCodes.includes(response.statusCode);
-		}
-		// Explicit FALSE means allow any status code
-		else if (options.successCodes === false) {
-			var success = true;
-		}
-		// Otherwise, 2xx is success
-		else {
-			var success = response.statusCode >= 200 && response.statusCode < 300;
-		}
-		if (!success) {
-			throw new Zotero.HTTP.StatusError(requestURL, response.statusCode, response.body);
-		}
-
 		if (options.debug) {
 			Zotero.debug(`HTTP ${response.statusCode} response: ${body}`);
 		}
@@ -153,30 +139,24 @@ Zotero.HTTP = new function() {
 			status: response.statusCode
 		};
 		
-		if (options.responseType == 'document') {
-			let dom;
-			try {
-				body = decodeContent(body, response.headers['content-type']);
-				dom = new JSDOM(body, {
-					url: result.responseURL,
-					// Inform JSDOM what content type it's parsing,
-					// so it could reject unsupported content types
-					contentType: response.headers['content-type']
-				});
-			}
-			catch (e) {
-				if (e.message.includes('not a HTML or XML content type')) {
-					Zotero.debug(e, 1)
-					throw new this.UnsupportedFormatError(result.responseURL, e.message);
-				}
-				throw e;
-			}
+		var mimeType = new MIMEType(response.headers['content-type']);
+		var responseType = getResponseType(response.headers['content-type'], options);
+		result.type = responseType;
+		
+		if (responseType == 'document') {
+			body = decodeContent(body, response.headers['content-type']);
+			let dom = new JSDOM(body, {
+				url: result.responseURL,
+				// Inform JSDOM what content type it's parsing,
+				// so that it can reject unsupported content types
+				contentType: response.headers['content-type']
+			});
+			
 			wgxpath.install(dom.window, true);
 			result.response = dom.window.document;
 			
-			// Follow meta redirects
-			if (response.headers['content-type']
-					&& response.headers['content-type'].startsWith('text/html')) {
+			// Follow meta redirects in HTML files
+			if (mimeType.isHTML()) {
 				let meta = result.response.querySelector('meta[http-equiv=refresh]');
 				if (meta && meta.getAttribute('content')) {
 					let parts = meta.getAttribute('content').split(/;\s*url=/);
@@ -192,11 +172,20 @@ Zotero.HTTP = new function() {
 				}
 			}
 		}
-		else if (options.responseType == 'json') {
+		else if (responseType == 'json') {
 			result.response = JSON.parse(body.toString());
 		}
-		else if (!options.responseType || options.responseType == 'text') {
-			body = body.toString();
+		else if (responseType == 'text') {
+			let charset = mimeType.parameters.get('charset');
+			// Treat unknown charset as utf-8
+			if (!charset) {
+				charset = 'utf8';
+			}
+			else if (!iconv.encodingExists(charset)) {
+				Zotero.debug(`Unknown charset ${charset} -- decoding as UTF-8`);
+				charset = 'utf8';
+			}
+			body = iconv.decode(body, charset);
 			result.response = body;
 			result.responseText = body;
 		}
@@ -319,7 +308,6 @@ Zotero.HTTP = new function() {
  *
  * TODO: Remove this code when https://github.com/jsdom/jsdom/issues/2495 will be solved
  */
-const MIMEType = require("whatwg-mimetype");
 const sniffHTMLEncoding = require("html-encoding-sniffer");
 const whatwgEncoding = require("whatwg-encoding");
 
@@ -389,6 +377,54 @@ function customRequest(method, requestURL, options) {
 			.on('response', function (res) {
 				if (returned) return;
 				response = res;
+				
+				if (!response.headers['content-type']) {
+					returned = true;
+					return reject(new Zotero.HTTP.UnsupportedFormatError(requestURL, 'Missing Content-Type header'));
+				}
+				
+				// Check if the status code is allowed
+				// Array of success codes given
+				if (options.successCodes) {
+					var success = options.successCodes.includes(response.statusCode);
+				}
+				// Explicit FALSE means allow any status code
+				else if (options.successCodes === false) {
+					var success = true;
+				}
+				// Otherwise, 2xx is success
+				else {
+					var success = response.statusCode >= 200 && response.statusCode < 300;
+				}
+				if (!success) {
+					returned = true;
+					return reject(new Zotero.HTTP.StatusError(requestURL, response.statusCode, response.body));
+				}
+				
+				// Check Content-Type before starting the download
+				let supported = true;
+				let mimeType = new MIMEType(response.headers['content-type']);
+				if (options.responseType == 'document') {
+					supported = mimeType.isHTML() || mimeType.isXML();
+				}
+				else if (options.responseTypeMap) {
+					let map = options.responseTypeMap;
+					supported = (map.has('html') && mimeType.isHTML())
+						|| (map.has('xml') && mimeType.isXML())
+						|| map.has(mimeType.essence)
+						// An empty string for a key allows unspecified types as text
+						|| map.has('');
+				}
+				
+				if (!supported) {
+					req.abort();
+					returned = true;
+					return reject(new Zotero.HTTP.UnsupportedFormatError(
+						requestURL,
+						response.headers['content-type'] + ' is not supported'
+					));
+				}
+				
 				// Content-length doesn't always exists or it can be a length of a gzipped content,
 				// but it's still worth to do the initial size check
 				if (
@@ -399,8 +435,6 @@ function customRequest(method, requestURL, options) {
 					returned = true;
 					reject(new Zotero.HTTP.ResponseSizeError(requestURL));
 				}
-				
-				// TODO: Filter content-type too
 			})
 			.on('end', function () {
 				if (returned) return;
@@ -410,4 +444,24 @@ function customRequest(method, requestURL, options) {
 	});
 };
 
+function getResponseType(contentType, options) {
+	var mimeType = new MIMEType(contentType);
+	if (options.responseType) {
+		return options.responseType;
+	}
+	if (options.responseTypeMap) {
+		let map = options.responseTypeMap;
+		if (map.has('html') && mimeType.isHTML()) {
+			return map.get('html');
+		}
+		if (map.has('xml') && mimeType.isXML()) {
+			return map.get('xml');
+		}
+		if (map.has(mimeType.essence)) {
+			return map.get(mimeType.essence);
+		}
+	}
+	return 'text';
+}
+
 module.exports = Zotero.HTTP;
diff --git a/src/testEndpoint.js b/src/testEndpoint.js
index e230a13..bfaa5d2 100644
--- a/src/testEndpoint.js
+++ b/src/testEndpoint.js
@@ -1,3 +1,6 @@
+var fs = require('fs');
+var path = require('path');
+
 var TestEndpoint = {
 	handlePlain: async function (ctx, _next) {
 		ctx.response.body = '<html><head><title>Test</title></head><body>Hello</body></html>';
@@ -27,6 +30,13 @@ var TestEndpoint = {
 		ctx.redirect('/test/single');
 	},
 	
+	handleBibTeX: async function (ctx, _next) {
+		ctx.set('Content-Type', 'application/x-bibtex');
+		ctx.response.body = fs
+			.readFileSync(path.join(__dirname, '../test/data/bibtex_response.xml'))
+			.toString();
+	},
+	
 	invalidContentType: async function (ctx, _next) {
 		ctx.set('Content-Type', 'image/jpeg');
 		ctx.response.body = '';
diff --git a/src/webSession.js b/src/webSession.js
index 1cf6c77..c652bad 100644
--- a/src/webSession.js
+++ b/src/webSession.js
@@ -25,10 +25,12 @@
 
 const config = require('config');
 const urlLib = require('url');
+const { CONTENT_TYPES } = require('./formats');
 const Translate = require('./translation/translate');
 const TLDS = Zotero.require('./translation/tlds');
 const HTTP = require('./http');
 const Translators = require('./translators');
+const ImportEndpoint = require('./importEndpoint');
 const SearchEndpoint = require('./searchEndpoint');
 const { jar: cookieJar } = require('request');
 
@@ -85,6 +87,18 @@ WebSession.prototype.handleURL = async function () {
 		}
 	}
 	
+	var responseTypeMap = new Map([
+		['html', 'document'],
+		['text/plain', 'text']
+	]);
+	// Force all import content types to text
+	for (let type in CONTENT_TYPES) {
+		let contentType = CONTENT_TYPES[type];
+		if (contentType == 'text/html') continue;
+		if (responseTypeMap.has(contentType)) continue;
+		responseTypeMap.set(contentType, 'text');
+	}
+	
 	var urlsToTry = config.get('deproxifyURLs') ? this.deproxifyURL(url) : [url];
 	for (let i = 0; i < urlsToTry.length; i++) {
 		let url = urlsToTry[i];
@@ -153,19 +167,28 @@ WebSession.prototype.handleURL = async function () {
 		translate.setRequestHeaders(headers);
 		
 		try {
-			await HTTP.processDocuments(
-				[url],
-				(doc) => {
-					translate.setDocument(doc);
-					// This could be optimized by only running detect on secondary translators
-					// if the first fails, but for now just run detect on all
-					return translate.getTranslators(true);
-				},
+			let req = await Zotero.HTTP.request(
+				"GET",
+				url,
 				{
+					responseTypeMap,
 					cookieSandbox: this._cookieSandbox,
 					headers
 				}
 			);
+			if (req.type === 'document') {
+				translate.setDocument(req.response);
+				// This could be optimized by only running detect on secondary translators
+				// if the first fails, but for now just run detect on all
+				translate.getTranslators(true);
+			}
+			else {
+				Zotero.debug(`Handling ${req.headers['content-type']} as import`);
+				this.ctx.request.body = req.response;
+				await ImportEndpoint.handle(this.ctx);
+				return;
+			}
+			
 			return promise;
 		}
 		catch (e) {
diff --git a/test/data/bibtex_response.xml b/test/data/bibtex_response.xml
new file mode 100644
index 0000000..43ea9f1
--- /dev/null
+++ b/test/data/bibtex_response.xml
@@ -0,0 +1,7 @@
+@article{Bar19,
+  author = "Foo Bar",
+   title = "Title",
+ journal = "Journal",
+  volume = 123,
+    year = 2019,
+   pages = "123--125"}
\ No newline at end of file
diff --git a/test/setup.js b/test/setup.js
index 4dc1b75..b07283d 100644
--- a/test/setup.js
+++ b/test/setup.js
@@ -23,6 +23,7 @@ testApp.use(_.get('/test/plain', TestEndpoint.handlePlain));
 testApp.use(_.get('/test/single', TestEndpoint.handleSingle));
 testApp.use(_.get('/test/multiple', TestEndpoint.handleMultiple));
 testApp.use(_.get('/test/redirect', TestEndpoint.handleRedirect));
+testApp.use(_.get('/test/bibtex', TestEndpoint.handleBibTeX));
 testApp.use(_.get('/test/invalidContentType', TestEndpoint.invalidContentType));
 testApp.use(_.get('/test/missingContentType', TestEndpoint.missingContentType));
 var testServer = testApp.listen(config.get('testPort'));
diff --git a/test/web_test.js b/test/web_test.js
index 4265b9b..73fc82c 100644
--- a/test/web_test.js
+++ b/test/web_test.js
@@ -73,6 +73,21 @@ describe("/web", function () {
 	});
 	
 	
+	it("should translate a remote BibTeX file", async function () {
+		var url = testURL + 'bibtex';
+		var response = await request()
+			.post('/web')
+			.set('Content-Type', 'text/plain')
+			.send(url);
+		assert.equal(response.statusCode, 200);
+		var json = response.body;
+		
+		assert.lengthOf(json, 1);
+		assert.equal(json[0].itemType, 'journalArticle');
+		assert.equal(json[0].title, 'Title');
+	});
+	
+	
 	it("should return 400 if a page returns a 404", async function () {
 		var url = testURL + '404';
 		var response = await request()