Skip to content

Commit

Permalink
Translate import formats via /web
Browse files Browse the repository at this point in the history
If the remote URL is text/plain or one of the known content types for
the formats we support (e.g., application/x-bibtex), try to handle it
via import translation. This allows, for example, a remote BibTeX file
to be translated.

This adapts some of the code from #59 in a more general way, with a
responseTypeMap property that can be passed to HTTP.request().
  • Loading branch information
dstillman committed Apr 4, 2019
1 parent c5dd0ed commit 2dfd9dc
Show file tree
Hide file tree
Showing 9 changed files with 212 additions and 97 deletions.
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
"dependencies": {
"aws-sdk": "^2.326.0",
"config": "^1.30.0",
"iconv-lite": "^0.4.24",
"jsdom": "^13.1.0",
"koa": "^2.5.1",
"koa-bodyparser": "^4.2.1",
Expand Down
45 changes: 3 additions & 42 deletions src/exportEndpoint.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,48 +24,9 @@
*/

const config = require('config');
const { FORMATS, CONTENT_TYPES } = require('./formats');
const Translate = require('./translation/translate');

// Lookup table used by the export endpoint: export format name -> ID of the
// translator to run for that format
const SERVER_FORMATS = {
	'bibtex': '9cb70025-a888-4a29-a210-93ec52da40d4',
	'biblatex': 'b6e39b57-8942-4d11-8259-342c46ce395f',
	'bookmarks': '4e7119e0-02be-4848-86ef-79a64185aad8',
	'coins': '05d07af9-105a-4572-99f6-a8e231c0daef',
	'csljson': 'bc03b4fe-436d-4a1f-ba59-de4d2d7a63f7',
	'csv': '25f4c5e2-d790-4daa-a667-797619c7e2f2',
	'endnote_xml': 'eb7059a4-35ec-4961-a915-3cf58eb9784b',
	'evernote': '18dd188a-9afc-4cd6-8775-1980c3ce0fbf',
	'mods': '0e2235e7-babf-413c-9acf-f27cce5f059c',
	'rdf_bibliontology': '14763d25-8ba0-45df-8f52-b8d1108e7ac9',
	'rdf_dc': '6e372642-ed9d-4934-b5d1-c11ac758ebb7',
	'rdf_zotero': '14763d24-8ba0-45df-8f52-b8d1108e7ac9',
	'refer': '881f60f2-0802-411a-9228-ce5f47b64c7d',
	'refworks_tagged': '1a3506da-a303-4b0a-a1cd-f216e6138d86',
	'ris': '32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7',
	'tei': '032ae9b7-ab90-9205-a479-baf81f49184a',
	'wikipedia': '3f50aaac-7acc-4350-acd0-59cb77faf620',
};

// Lookup table used by the export endpoint: export format name -> value sent
// in the Content-Type header of the exported response
const SERVER_CONTENT_TYPES = {
	'bibtex': 'application/x-bibtex',
	'biblatex': 'application/x-bibtex',
	'bookmarks': 'text/html',
	'coins': 'text/html',
	'csljson': 'application/json',
	'csv': 'text/csv',
	'endnote_xml': 'text/xml',
	'evernote': 'text/xml',
	'mods': 'application/mods+xml',
	'rdf_bibliontology': 'application/rdf+xml',
	'rdf_dc': 'application/rdf+xml',
	'rdf_zotero': 'application/rdf+xml',
	'refer': 'application/x-research-info-systems',
	'refworks_tagged': 'text/plain',
	'ris': 'application/x-research-info-systems',
	'tei': 'text/xml',
	'wikipedia': 'text/x-wiki',
};

var ExportEndpoint = module.exports = {
handle: async function (ctx, next) {
ctx.assert(ctx.is('json'), 415);
Expand All @@ -80,7 +41,7 @@ var ExportEndpoint = module.exports = {

var translatorID;

if (!query.format || !(translatorID = SERVER_FORMATS[query.format])) {
if (!query.format || !(translatorID = FORMATS[query.format])) {
ctx.throw(400, "Invalid format specified");
}

Expand All @@ -107,7 +68,7 @@ var ExportEndpoint = module.exports = {
reject();
}
else {
ctx.set('Content-Type', SERVER_CONTENT_TYPES[query.format]);
ctx.set('Content-Type', CONTENT_TYPES[query.format]);
ctx.response.body = translate.string;
resolve();
}
Expand Down
43 changes: 43 additions & 0 deletions src/formats.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/* eslint camelcase: "off" */

// Format name -> translator ID. The export endpoint uses this table to resolve
// the `format` query parameter to the translator that should run.
const FORMATS = {
	'bibtex': '9cb70025-a888-4a29-a210-93ec52da40d4',
	'biblatex': 'b6e39b57-8942-4d11-8259-342c46ce395f',
	'bookmarks': '4e7119e0-02be-4848-86ef-79a64185aad8',
	'coins': '05d07af9-105a-4572-99f6-a8e231c0daef',
	'csljson': 'bc03b4fe-436d-4a1f-ba59-de4d2d7a63f7',
	'csv': '25f4c5e2-d790-4daa-a667-797619c7e2f2',
	'endnote_xml': 'eb7059a4-35ec-4961-a915-3cf58eb9784b',
	'evernote': '18dd188a-9afc-4cd6-8775-1980c3ce0fbf',
	'mods': '0e2235e7-babf-413c-9acf-f27cce5f059c',
	'rdf_bibliontology': '14763d25-8ba0-45df-8f52-b8d1108e7ac9',
	'rdf_dc': '6e372642-ed9d-4934-b5d1-c11ac758ebb7',
	'rdf_zotero': '14763d24-8ba0-45df-8f52-b8d1108e7ac9',
	'refer': '881f60f2-0802-411a-9228-ce5f47b64c7d',
	'refworks_tagged': '1a3506da-a303-4b0a-a1cd-f216e6138d86',
	'ris': '32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7',
	'tei': '032ae9b7-ab90-9205-a479-baf81f49184a',
	'wikipedia': '3f50aaac-7acc-4350-acd0-59cb77faf620',
};

// Format name -> MIME type. The export endpoint sends the looked-up value as
// the Content-Type header of the exported response.
const CONTENT_TYPES = {
	'bibtex': 'application/x-bibtex',
	'biblatex': 'application/x-bibtex',
	'bookmarks': 'text/html',
	'coins': 'text/html',
	'csljson': 'application/json',
	'csv': 'text/csv',
	'endnote_xml': 'text/xml',
	'evernote': 'text/xml',
	'mods': 'application/mods+xml',
	'rdf_bibliontology': 'application/rdf+xml',
	'rdf_dc': 'application/rdf+xml',
	'rdf_zotero': 'application/rdf+xml',
	'refer': 'application/x-research-info-systems',
	'refworks_tagged': 'text/plain',
	'ris': 'application/x-research-info-systems',
	'tei': 'text/xml',
	'wikipedia': 'text/x-wiki',
};

module.exports = { FORMATS, CONTENT_TYPES };
148 changes: 101 additions & 47 deletions src/http.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,12 @@

var config = require('config');
var request = require('request');
var iconv = require('iconv-lite');
var url = require('url');
var jsdom = require('jsdom');
var { JSDOM } = jsdom;
var wgxpath = require('wicked-good-xpath');
var MIMEType = require("whatwg-mimetype"); // Use the same MIME type library as JSDOM

/**
* Functions for performing HTTP requests
Expand Down Expand Up @@ -70,6 +72,10 @@ Zotero.HTTP = new function() {
* <li>logBodyLength - Length of request body to log</li>
* <li>timeout - Request timeout specified in milliseconds [default 15000]</li>
* <li>responseType - The response type of the request from the XHR spec</li>
* <li>responseTypeMap - A Map of remote content type ('application/x-bibtex') to
* XHR response type ('text'). 'html' and 'xml' imply isHTML() and isXML() from
* whatwg-mimetype. Use an empty string for the key to set a fallback response type;
* otherwise unspecified content types are rejected.</li>
* <li>successCodes - HTTP status codes that are considered successful, or FALSE to allow all</li>
* </ul>
* @return {Promise<Object>} A promise resolved with a response object containing:
Expand Down Expand Up @@ -123,26 +129,6 @@ Zotero.HTTP = new function() {

let {response, body} = await customRequest(method, requestURL, options);

if (!response.headers['content-type']) {
throw new this.UnsupportedFormatError(requestURL, 'Missing Content-Type header');
}

// Array of success codes given
if (options.successCodes) {
var success = options.successCodes.includes(response.statusCode);
}
// Explicit FALSE means allow any status code
else if (options.successCodes === false) {
var success = true;
}
// Otherwise, 2xx is success
else {
var success = response.statusCode >= 200 && response.statusCode < 300;
}
if (!success) {
throw new Zotero.HTTP.StatusError(requestURL, response.statusCode, response.body);
}

if (options.debug) {
Zotero.debug(`HTTP ${response.statusCode} response: ${body}`);
}
Expand All @@ -153,30 +139,24 @@ Zotero.HTTP = new function() {
status: response.statusCode
};

if (options.responseType == 'document') {
let dom;
try {
body = decodeContent(body, response.headers['content-type']);
dom = new JSDOM(body, {
url: result.responseURL,
// Inform JSDOM what content type it's parsing,
// so it could reject unsupported content types
contentType: response.headers['content-type']
});
}
catch (e) {
if (e.message.includes('not a HTML or XML content type')) {
Zotero.debug(e, 1)
throw new this.UnsupportedFormatError(result.responseURL, e.message);
}
throw e;
}
var mimeType = new MIMEType(response.headers['content-type']);
var responseType = getResponseType(response.headers['content-type'], options);
result.type = responseType;

if (responseType == 'document') {
body = decodeContent(body, response.headers['content-type']);
let dom = new JSDOM(body, {
url: result.responseURL,
// Inform JSDOM what content type it's parsing,
// so it could reject unsupported content types
contentType: response.headers['content-type']
});

wgxpath.install(dom.window, true);
result.response = dom.window.document;

// Follow meta redirects
if (response.headers['content-type']
&& response.headers['content-type'].startsWith('text/html')) {
// Follow meta redirects in HTML files
if (mimeType.isHTML()) {
let meta = result.response.querySelector('meta[http-equiv=refresh]');
if (meta && meta.getAttribute('content')) {
let parts = meta.getAttribute('content').split(/;\s*url=/);
Expand All @@ -192,11 +172,20 @@ Zotero.HTTP = new function() {
}
}
}
else if (options.responseType == 'json') {
else if (responseType == 'json') {
result.response = JSON.parse(body.toString());
}
else if (!options.responseType || options.responseType == 'text') {
body = body.toString();
else if (responseType == 'text') {
let charset = mimeType.parameters.get('charset');
// Treat unknown charset as utf-8
if (!charset) {
charset = 'utf8';
}
else if (!iconv.encodingExists(charset)) {
Zotero.debug(`Unknown charset ${charset} -- decoding as UTF-8`);
charset = 'utf8';
}
body = iconv.decode(body, charset);
result.response = body;
result.responseText = body;
}
Expand Down Expand Up @@ -319,7 +308,6 @@ Zotero.HTTP = new function() {
*
* TODO: Remove this code once https://github.com/jsdom/jsdom/issues/2495 is resolved
*/
const MIMEType = require("whatwg-mimetype");
const sniffHTMLEncoding = require("html-encoding-sniffer");
const whatwgEncoding = require("whatwg-encoding");

Expand Down Expand Up @@ -389,6 +377,54 @@ function customRequest(method, requestURL, options) {
.on('response', function (res) {
if (returned) return;
response = res;

if (!response.headers['content-type']) {
returned = true;
return reject(new Zotero.HTTP.UnsupportedFormatError(requestURL, 'Missing Content-Type header'));
}

// Check if the status code is allowed
// Array of success codes given
if (options.successCodes) {
var success = options.successCodes.includes(response.statusCode);
}
// Explicit FALSE means allow any status code
else if (options.successCodes === false) {
var success = true;
}
// Otherwise, 2xx is success
else {
var success = response.statusCode >= 200 && response.statusCode < 300;
}
if (!success) {
returned = true;
return reject(new Zotero.HTTP.StatusError(requestURL, response.statusCode, response.body));
}

// Check Content-Type before starting the download
let supported = true;
let mimeType = new MIMEType(response.headers['content-type']);
if (options.responseType == 'document') {
supported = mimeType.isHTML() || mimeType.isXML();
}
else if (options.responseTypeMap) {
let map = options.responseTypeMap;
supported = (map.has('html') && mimeType.isHTML())
|| (map.has('xml') && mimeType.isXML())
|| map.has(mimeType.essence)
// An empty string for a key allows unspecified types as text
|| map.has('');
}

if (!supported) {
req.abort();
returned = true;
return reject(new Zotero.HTTP.UnsupportedFormatError(
requestURL,
response.headers['content-type'] + ' is not supported'
));
}

// Content-Length doesn't always exist, or it can be the length of gzipped content,
// but it's still worth doing the initial size check
if (
Expand All @@ -399,8 +435,6 @@ function customRequest(method, requestURL, options) {
returned = true;
reject(new Zotero.HTTP.ResponseSizeError(requestURL));
}

// TODO: Filter content-type too
})
.on('end', function () {
if (returned) return;
Expand All @@ -410,4 +444,24 @@ function customRequest(method, requestURL, options) {
});
};

/**
 * Decide which XHR-style response type ('document', 'json', 'text', ...) should be
 * used to process a response body.
 *
 * Precedence:
 *  1. An explicit options.responseType always wins.
 *  2. Otherwise options.responseTypeMap is consulted, in this order: the 'html' key
 *     (if the content type is an HTML type), the 'xml' key (if it is an XML type),
 *     the exact MIME essence (e.g. 'application/x-bibtex'), and finally the
 *     empty-string key, which acts as the fallback for unlisted content types.
 *  3. With no match, the response is treated as 'text'.
 *
 * @param {String} contentType - Value of the response's Content-Type header
 * @param {Object} options - Request options
 * @param {String} [options.responseType] - Explicit response type
 * @param {Map<String, String>} [options.responseTypeMap] - Map of content type (or
 *     'html' / 'xml' / '') to response type
 * @return {String} The response type to use
 */
function getResponseType(contentType, options) {
	var mimeType = new MIMEType(contentType);
	if (options.responseType) {
		return options.responseType;
	}
	var map = options.responseTypeMap;
	if (map) {
		if (map.has('html') && mimeType.isHTML()) {
			return map.get('html');
		}
		if (map.has('xml') && mimeType.isXML()) {
			return map.get('xml');
		}
		if (map.has(mimeType.essence)) {
			return map.get(mimeType.essence);
		}
		// The empty-string key is the fallback for content types that aren't listed.
		// Honor the mapped value instead of silently forcing 'text'; an empty mapped
		// value still means plain text.
		if (map.has('')) {
			return map.get('') || 'text';
		}
	}
	return 'text';
}

module.exports = Zotero.HTTP;
10 changes: 10 additions & 0 deletions src/testEndpoint.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
var fs = require('fs');
var path = require('path');

var TestEndpoint = {
handlePlain: async function (ctx, _next) {
ctx.response.body = '<html><head><title>Test</title></head><body>Hello</body></html>';
Expand Down Expand Up @@ -27,6 +30,13 @@ var TestEndpoint = {
ctx.redirect('/test/single');
},

handleBibTeX: async function (ctx, _next) {
ctx.set('Content-Type', 'application/x-bibtex');
ctx.response.body = fs
.readFileSync(path.join(__dirname, '../test/data/bibtex_response.xml'))
.toString();
},

invalidContentType: async function (ctx, _next) {
ctx.set('Content-Type', 'image/jpeg');
ctx.response.body = '';
Expand Down
Loading

0 comments on commit 2dfd9dc

Please sign in to comment.