From 2dfd9dcb19ab45323a23eb88277a64c48e99ecd4 Mon Sep 17 00:00:00 2001 From: Dan Stillman Date: Thu, 4 Apr 2019 12:53:06 -0400 Subject: [PATCH] Translate import formats via /web If the remote URL is text/plain or one of the known content types for the formats we support (e.g., application/x-bibtex), try to handle it via import translation. This allows, for example, a remote BibTeX file to be translated. This adapts some of the code from #59 in a more general way, with a responseTypeMap property that can be passed to HTTP.request(). --- package.json | 1 + src/exportEndpoint.js | 45 +---------- src/formats.js | 43 ++++++++++ src/http.js | 148 +++++++++++++++++++++++----------- src/testEndpoint.js | 10 +++ src/webSession.js | 39 +++++++-- test/data/bibtex_response.xml | 7 ++ test/setup.js | 1 + test/web_test.js | 15 ++++ 9 files changed, 212 insertions(+), 97 deletions(-) create mode 100644 src/formats.js create mode 100644 test/data/bibtex_response.xml diff --git a/package.json b/package.json index 1fc4515..48fe1b7 100644 --- a/package.json +++ b/package.json @@ -11,6 +11,7 @@ "dependencies": { "aws-sdk": "^2.326.0", "config": "^1.30.0", + "iconv-lite": "^0.4.24", "jsdom": "^13.1.0", "koa": "^2.5.1", "koa-bodyparser": "^4.2.1", diff --git a/src/exportEndpoint.js b/src/exportEndpoint.js index 893d547..37cff41 100644 --- a/src/exportEndpoint.js +++ b/src/exportEndpoint.js @@ -24,48 +24,9 @@ */ const config = require('config'); +const { FORMATS, CONTENT_TYPES } = require('./formats'); const Translate = require('./translation/translate'); -const SERVER_FORMATS = { - bibtex: "9cb70025-a888-4a29-a210-93ec52da40d4", - biblatex: "b6e39b57-8942-4d11-8259-342c46ce395f", - bookmarks: "4e7119e0-02be-4848-86ef-79a64185aad8", - coins: "05d07af9-105a-4572-99f6-a8e231c0daef", - csljson: "bc03b4fe-436d-4a1f-ba59-de4d2d7a63f7", - csv: "25f4c5e2-d790-4daa-a667-797619c7e2f2", - endnote_xml: "eb7059a4-35ec-4961-a915-3cf58eb9784b", - evernote: "18dd188a-9afc-4cd6-8775-1980c3ce0fbf", - 
mods: "0e2235e7-babf-413c-9acf-f27cce5f059c", - rdf_bibliontology: "14763d25-8ba0-45df-8f52-b8d1108e7ac9", - rdf_dc: "6e372642-ed9d-4934-b5d1-c11ac758ebb7", - rdf_zotero: "14763d24-8ba0-45df-8f52-b8d1108e7ac9", - refer: "881f60f2-0802-411a-9228-ce5f47b64c7d", - refworks_tagged: "1a3506da-a303-4b0a-a1cd-f216e6138d86", - ris: "32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7", - tei: "032ae9b7-ab90-9205-a479-baf81f49184a", - wikipedia: "3f50aaac-7acc-4350-acd0-59cb77faf620" -}; - -const SERVER_CONTENT_TYPES = { - bibtex: "application/x-bibtex", - biblatex: "application/x-bibtex", - bookmarks: "text/html", - coins: "text/html", - csljson: "application/json", - csv: "text/csv", - endnote_xml: "text/xml", - evernote: "text/xml", - mods: "application/mods+xml", - rdf_bibliontology: "application/rdf+xml", - rdf_dc: "application/rdf+xml", - rdf_zotero: "application/rdf+xml", - refer: "application/x-research-info-systems", - refworks_tagged: "text/plain", - ris: "application/x-research-info-systems", - tei: "text/xml", - wikipedia: "text/x-wiki" -}; - var ExportEndpoint = module.exports = { handle: async function (ctx, next) { ctx.assert(ctx.is('json'), 415); @@ -80,7 +41,7 @@ var ExportEndpoint = module.exports = { var translatorID; - if (!query.format || !(translatorID = SERVER_FORMATS[query.format])) { + if (!query.format || !(translatorID = FORMATS[query.format])) { ctx.throw(400, "Invalid format specified"); } @@ -107,7 +68,7 @@ var ExportEndpoint = module.exports = { reject(); } else { - ctx.set('Content-Type', SERVER_CONTENT_TYPES[query.format]); + ctx.set('Content-Type', CONTENT_TYPES[query.format]); ctx.response.body = translate.string; resolve(); } diff --git a/src/formats.js b/src/formats.js new file mode 100644 index 0000000..b8999ff --- /dev/null +++ b/src/formats.js @@ -0,0 +1,43 @@ +/* eslint camelcase: "off" */ + +const FORMATS = { + bibtex: "9cb70025-a888-4a29-a210-93ec52da40d4", + biblatex: "b6e39b57-8942-4d11-8259-342c46ce395f", + bookmarks: 
"4e7119e0-02be-4848-86ef-79a64185aad8", + coins: "05d07af9-105a-4572-99f6-a8e231c0daef", + csljson: "bc03b4fe-436d-4a1f-ba59-de4d2d7a63f7", + csv: "25f4c5e2-d790-4daa-a667-797619c7e2f2", + endnote_xml: "eb7059a4-35ec-4961-a915-3cf58eb9784b", + evernote: "18dd188a-9afc-4cd6-8775-1980c3ce0fbf", + mods: "0e2235e7-babf-413c-9acf-f27cce5f059c", + rdf_bibliontology: "14763d25-8ba0-45df-8f52-b8d1108e7ac9", + rdf_dc: "6e372642-ed9d-4934-b5d1-c11ac758ebb7", + rdf_zotero: "14763d24-8ba0-45df-8f52-b8d1108e7ac9", + refer: "881f60f2-0802-411a-9228-ce5f47b64c7d", + refworks_tagged: "1a3506da-a303-4b0a-a1cd-f216e6138d86", + ris: "32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7", + tei: "032ae9b7-ab90-9205-a479-baf81f49184a", + wikipedia: "3f50aaac-7acc-4350-acd0-59cb77faf620" +}; + +const CONTENT_TYPES = { + bibtex: "application/x-bibtex", + biblatex: "application/x-bibtex", + bookmarks: "text/html", + coins: "text/html", + csljson: "application/json", + csv: "text/csv", + endnote_xml: "text/xml", + evernote: "text/xml", + mods: "application/mods+xml", + rdf_bibliontology: "application/rdf+xml", + rdf_dc: "application/rdf+xml", + rdf_zotero: "application/rdf+xml", + refer: "application/x-research-info-systems", + refworks_tagged: "text/plain", + ris: "application/x-research-info-systems", + tei: "text/xml", + wikipedia: "text/x-wiki" +}; + +module.exports = { FORMATS, CONTENT_TYPES }; diff --git a/src/http.js b/src/http.js index 187aced..b6bc116 100644 --- a/src/http.js +++ b/src/http.js @@ -25,10 +25,12 @@ var config = require('config'); var request = require('request'); +var iconv = require('iconv-lite'); var url = require('url'); var jsdom = require('jsdom'); var { JSDOM } = jsdom; var wgxpath = require('wicked-good-xpath'); +var MIMEType = require("whatwg-mimetype"); // Use the same MIME type library as JSDOM /** * Functions for performing HTTP requests @@ -70,6 +72,10 @@ Zotero.HTTP = new function() { *
  • logBodyLength - Length of request body to log
  • *
  • timeout - Request timeout specified in milliseconds [default 15000]
  • *
  • responseType - The response type of the request from the XHR spec
  • + *
  • responseTypeMap - A Map of remote content type ('application/x-bibtex') to + * XHR response type ('text'). The special keys 'html' and 'xml' match any type for which + * whatwg-mimetype's isHTML()/isXML() is true. Add an empty-string key to accept + * unlisted content types (handled as 'text'); otherwise they are rejected.
  • *
  • successCodes - HTTP status codes that are considered successful, or FALSE to allow all
  • * * @return {Promise} A promise resolved with a response object containing: @@ -123,26 +129,6 @@ Zotero.HTTP = new function() { let {response, body} = await customRequest(method, requestURL, options); - if (!response.headers['content-type']) { - throw new this.UnsupportedFormatError(requestURL, 'Missing Content-Type header'); - } - - // Array of success codes given - if (options.successCodes) { - var success = options.successCodes.includes(response.statusCode); - } - // Explicit FALSE means allow any status code - else if (options.successCodes === false) { - var success = true; - } - // Otherwise, 2xx is success - else { - var success = response.statusCode >= 200 && response.statusCode < 300; - } - if (!success) { - throw new Zotero.HTTP.StatusError(requestURL, response.statusCode, response.body); - } - if (options.debug) { Zotero.debug(`HTTP ${response.statusCode} response: ${body}`); } @@ -153,30 +139,24 @@ Zotero.HTTP = new function() { status: response.statusCode }; - if (options.responseType == 'document') { - let dom; - try { - body = decodeContent(body, response.headers['content-type']); - dom = new JSDOM(body, { - url: result.responseURL, - // Inform JSDOM what content type it's parsing, - // so it could reject unsupported content types - contentType: response.headers['content-type'] - }); - } - catch (e) { - if (e.message.includes('not a HTML or XML content type')) { - Zotero.debug(e, 1) - throw new this.UnsupportedFormatError(result.responseURL, e.message); - } - throw e; - } + var mimeType = new MIMEType(response.headers['content-type']); + var responseType = getResponseType(response.headers['content-type'], options); + result.type = responseType; + + if (responseType == 'document') { + body = decodeContent(body, response.headers['content-type']); + let dom = new JSDOM(body, { + url: result.responseURL, + // Inform JSDOM what content type it's parsing, + // so it could reject unsupported content types + contentType: response.headers['content-type'] + 
}); + wgxpath.install(dom.window, true); result.response = dom.window.document; - // Follow meta redirects - if (response.headers['content-type'] - && response.headers['content-type'].startsWith('text/html')) { + // Follow meta redirects in HTML files + if (mimeType.isHTML()) { let meta = result.response.querySelector('meta[http-equiv=refresh]'); if (meta && meta.getAttribute('content')) { let parts = meta.getAttribute('content').split(/;\s*url=/); @@ -192,11 +172,20 @@ Zotero.HTTP = new function() { } } } - else if (options.responseType == 'json') { + else if (responseType == 'json') { result.response = JSON.parse(body.toString()); } - else if (!options.responseType || options.responseType == 'text') { - body = body.toString(); + else if (responseType == 'text') { + let charset = mimeType.parameters.get('charset'); + // Treat unknown charset as utf-8 + if (!charset) { + charset = 'utf8'; + } + else if (!iconv.encodingExists(charset)) { + Zotero.debug(`Unknown charset ${charset} -- decoding as UTF-8`); + charset = 'utf8'; + } + body = iconv.decode(body, charset); result.response = body; result.responseText = body; } @@ -319,7 +308,6 @@ Zotero.HTTP = new function() { * * TODO: Remove this code when https://github.com/jsdom/jsdom/issues/2495 will be solved */ -const MIMEType = require("whatwg-mimetype"); const sniffHTMLEncoding = require("html-encoding-sniffer"); const whatwgEncoding = require("whatwg-encoding"); @@ -389,6 +377,54 @@ function customRequest(method, requestURL, options) { .on('response', function (res) { if (returned) return; response = res; + + if (!response.headers['content-type']) { + returned = true; + return reject(new Zotero.HTTP.UnsupportedFormatError(requestURL, 'Missing Content-Type header')); + } + + // Check if the status code is allowed + // Array of success codes given + if (options.successCodes) { + var success = options.successCodes.includes(response.statusCode); + } + // Explicit FALSE means allow any status code + else if 
(options.successCodes === false) { + var success = true; + } + // Otherwise, 2xx is success + else { + var success = response.statusCode >= 200 && response.statusCode < 300; + } + if (!success) { + returned = true; + return reject(new Zotero.HTTP.StatusError(requestURL, response.statusCode, response.body)); + } + + // Check Content-Type before starting the download + let supported = true; + let mimeType = new MIMEType(response.headers['content-type']); + if (options.responseType == 'document') { + supported = mimeType.isHTML() || mimeType.isXML(); + } + else if (options.responseTypeMap) { + let map = options.responseTypeMap; + supported = (map.has('html') && mimeType.isHTML()) + || (map.has('xml') && mimeType.isXML()) + || map.has(mimeType.essence) + // An empty string for a key allows unspecified types as text + || map.has(''); + } + + if (!supported) { + req.abort(); + returned = true; + return reject(new Zotero.HTTP.UnsupportedFormatError( + requestURL, + response.headers['content-type'] + ' is not supported' + )); + } + // Content-length doesn't always exists or it can be a length of a gzipped content, // but it's still worth to do the initial size check if ( @@ -399,8 +435,6 @@ function customRequest(method, requestURL, options) { returned = true; reject(new Zotero.HTTP.ResponseSizeError(requestURL)); } - - // TODO: Filter content-type too }) .on('end', function () { if (returned) return; @@ -410,4 +444,24 @@ function customRequest(method, requestURL, options) { }); }; +function getResponseType(contentType, options) { + var mimeType = new MIMEType(contentType); + if (options.responseType) { + return options.responseType; + } + if (options.responseTypeMap) { + let map = options.responseTypeMap; + if (map.has('html') && mimeType.isHTML()) { + return map.get('html'); + } + if (map.has('xml') && mimeType.isXML()) { + return map.get('xml'); + } + if (map.has(mimeType.essence)) { + return map.get(mimeType.essence); + } + } + return 'text'; +} + module.exports = 
Zotero.HTTP; diff --git a/src/testEndpoint.js b/src/testEndpoint.js index e230a13..bfaa5d2 100644 --- a/src/testEndpoint.js +++ b/src/testEndpoint.js @@ -1,3 +1,6 @@ +var fs = require('fs'); +var path = require('path'); + var TestEndpoint = { handlePlain: async function (ctx, _next) { ctx.response.body = 'TestHello'; @@ -27,6 +30,13 @@ var TestEndpoint = { ctx.redirect('/test/single'); }, + handleBibTeX: async function (ctx, _next) { + ctx.set('Content-Type', 'application/x-bibtex'); + ctx.response.body = fs + .readFileSync(path.join(__dirname, '../test/data/bibtex_response.xml')) + .toString(); + }, + invalidContentType: async function (ctx, _next) { ctx.set('Content-Type', 'image/jpeg'); ctx.response.body = ''; diff --git a/src/webSession.js b/src/webSession.js index 1cf6c77..c652bad 100644 --- a/src/webSession.js +++ b/src/webSession.js @@ -25,10 +25,12 @@ const config = require('config'); const urlLib = require('url'); +const { CONTENT_TYPES } = require('./formats'); const Translate = require('./translation/translate'); const TLDS = Zotero.require('./translation/tlds'); const HTTP = require('./http'); const Translators = require('./translators'); +const ImportEndpoint = require('./importEndpoint'); const SearchEndpoint = require('./searchEndpoint'); const { jar: cookieJar } = require('request'); @@ -85,6 +87,18 @@ WebSession.prototype.handleURL = async function () { } } + var responseTypeMap = new Map([ + ['html', 'document'], + ['text/plain', 'text'] + ]); + // Force all import content types to text + for (let type in CONTENT_TYPES) { + let contentType = CONTENT_TYPES[type]; + if (contentType == 'text/html') continue; + if (responseTypeMap.has(contentType)) continue; + responseTypeMap.set(contentType, 'text'); + } + var urlsToTry = config.get('deproxifyURLs') ? 
this.deproxifyURL(url) : [url]; for (let i = 0; i < urlsToTry.length; i++) { let url = urlsToTry[i]; @@ -153,19 +167,28 @@ WebSession.prototype.handleURL = async function () { translate.setRequestHeaders(headers); try { - await HTTP.processDocuments( - [url], - (doc) => { - translate.setDocument(doc); - // This could be optimized by only running detect on secondary translators - // if the first fails, but for now just run detect on all - return translate.getTranslators(true); - }, + let req = await Zotero.HTTP.request( + "GET", + url, { + responseTypeMap, cookieSandbox: this._cookieSandbox, headers } ); + if (req.type === 'document') { + translate.setDocument(req.response); + // This could be optimized by only running detect on secondary translators + // if the first fails, but for now just run detect on all + translate.getTranslators(true); + } + else { + Zotero.debug(`Handling ${req.headers['content-type']} as import`); + this.ctx.request.body = req.response; + await ImportEndpoint.handle(this.ctx); + return; + } + return promise; } catch (e) { diff --git a/test/data/bibtex_response.xml b/test/data/bibtex_response.xml new file mode 100644 index 0000000..43ea9f1 --- /dev/null +++ b/test/data/bibtex_response.xml @@ -0,0 +1,7 @@ +@article{Bar19, + author = "Foo Bar", + title = "Title", + journal = "Journal", + volume = 123, + year = 2019, + pages = "123--125"} \ No newline at end of file diff --git a/test/setup.js b/test/setup.js index 4dc1b75..b07283d 100644 --- a/test/setup.js +++ b/test/setup.js @@ -23,6 +23,7 @@ testApp.use(_.get('/test/plain', TestEndpoint.handlePlain)); testApp.use(_.get('/test/single', TestEndpoint.handleSingle)); testApp.use(_.get('/test/multiple', TestEndpoint.handleMultiple)); testApp.use(_.get('/test/redirect', TestEndpoint.handleRedirect)); +testApp.use(_.get('/test/bibtex', TestEndpoint.handleBibTeX)); testApp.use(_.get('/test/invalidContentType', TestEndpoint.invalidContentType)); testApp.use(_.get('/test/missingContentType', 
TestEndpoint.missingContentType)); var testServer = testApp.listen(config.get('testPort')); diff --git a/test/web_test.js b/test/web_test.js index 4265b9b..73fc82c 100644 --- a/test/web_test.js +++ b/test/web_test.js @@ -73,6 +73,21 @@ describe("/web", function () { }); + it("should translate a remote BibTeX file", async function () { + var url = testURL + 'bibtex'; + var response = await request() + .post('/web') + .set('Content-Type', 'text/plain') + .send(url); + assert.equal(response.statusCode, 200); + var json = response.body; + + assert.lengthOf(json, 1); + assert.equal(json[0].itemType, 'journalArticle'); + assert.equal(json[0].title, 'Title'); + }); + + it("should return 400 if a page returns a 404", async function () { var url = testURL + '404'; var response = await request()