Skip to content

Commit

Permalink
Add PDF handling
Browse files Browse the repository at this point in the history
  • Loading branch information
mrtcode committed Jan 31, 2019
1 parent 8b83a47 commit eb5ecd7
Show file tree
Hide file tree
Showing 8 changed files with 314 additions and 31 deletions.
8 changes: 8 additions & 0 deletions config/default.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,18 @@
"blacklistedDomains": [],
"deproxifyURLs": false, // Automatically try deproxified versions of URLs
"identifierSearchLambda": "", // Identifier search Lambda function for text search
"recognizerLambda": "", // PDF recognizer Lambda function
"port": 1969,
"translators": {
"CrossrefREST.email": "" // Pass an email to Crossref REST API to utilize the faster servers pool
},
"s3Upload": {
"params": {
"Bucket": ""
},
"accessKeyId": "",
"secretAccessKey": ""
},
"userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0",
"translatorsDirectory": "./modules/translators"
}
14 changes: 14 additions & 0 deletions lambda_template.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,24 @@ Resources:
MemorySize: 2048
Timeout: 30
Policies:
- Statement:
Action:
- s3:PutObject
- s3:DeleteObject
Effect: Allow
Resource: !ImportValue RecognizerUploadBucketArn
- LambdaInvokePolicy:
FunctionName: {{ identifier_search_function_name }}
- LambdaInvokePolicy:
FunctionName: {{ recognizer_function_name }}
Events:
# API Gateway
GetAPI:
Type: Api
Properties:
# Proxy all GET requests to Lambda function
Path: /{proxy+}
Method: get
PostAPI:
Type: Api
Properties:
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
"request-promise-native": "^1.0.5",
"serverless-http": "^1.6.0",
"w3c-xmlserializer": "0.0.1",
"whatwg-mimetype": "^2.3.0",
"wicked-good-xpath": "git+https://github.com/adomasven/wicked-good-xpath.git#e84d65d",
"xregexp": "^4.2.0",
"yargs": "^12.0.2"
Expand Down
66 changes: 43 additions & 23 deletions src/http.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ var url = require('url');
var jsdom = require('jsdom');
var { JSDOM } = jsdom;
var wgxpath = require('wicked-good-xpath');
var MIMEType = require("whatwg-mimetype"); // Use the same MIME type library as JSDOM does

/**
* Functions for performing HTTP requests
Expand Down Expand Up @@ -154,29 +155,22 @@ Zotero.HTTP = new function() {
};

if (options.responseType == 'document') {
let dom;
try {
let mimeType = new MIMEType(response.headers['content-type']);

// Filter content-type in the same way as JSDOM does
if (mimeType.isHTML() || mimeType.isXML()) {
result.type = 'document';
body = decodeContent(body, response.headers['content-type']);
dom = new JSDOM(body, {
let dom = new JSDOM(body, {
url: result.responseURL,
// Inform JSDOM what content type it's parsing,
// so it could reject unsupported content types
// Inform JSDOM what content type it's parsing
contentType: response.headers['content-type']
});
}
catch (e) {
if (e.message.includes('not a HTML or XML content type')) {
Zotero.debug(e, 1)
throw new this.UnsupportedFormatError(result.responseURL, e.message);
}
throw e;
}
wgxpath.install(dom.window, true);
result.response = dom.window.document;

// Follow meta redirects
if (response.headers['content-type']
&& response.headers['content-type'].startsWith('text/html')) {

wgxpath.install(dom.window, true);
result.response = dom.window.document;

// Follow meta redirects in HTML and XML files
let meta = result.response.querySelector('meta[http-equiv=refresh]');
if (meta && meta.getAttribute('content')) {
let parts = meta.getAttribute('content').split(/;\s*url=/);
Expand All @@ -191,6 +185,19 @@ Zotero.HTTP = new function() {
}
}
}
else if (
options.fallbackToPDF &&
mimeType.essence === 'application/pdf'
) {
result.type = 'pdf';
result.response = body;
}
else {
throw new this.UnsupportedFormatError(
result.responseURL,
response.headers['content-type'] + ' is not supported'
);
}
}
else if (options.responseType == 'json') {
result.response = JSON.parse(body.toString());
Expand Down Expand Up @@ -319,7 +326,6 @@ Zotero.HTTP = new function() {
*
* TODO: Remove this code once https://github.com/jsdom/jsdom/issues/2495 is resolved
*/
const MIMEType = require("whatwg-mimetype");
const sniffHTMLEncoding = require("html-encoding-sniffer");
const whatwgEncoding = require("whatwg-encoding");

Expand Down Expand Up @@ -389,6 +395,22 @@ function customRequest(method, requestURL, options) {
.on('response', function (res) {
if (returned) return;
response = res;

// Check content-type before starting the download
let mimeType = new MIMEType(response.headers['content-type']);
if (!(
mimeType.isHTML() ||
mimeType.isXML() ||
options.fallbackToPDF && mimeType.essence === 'application/pdf'
)) {
req.abort();
returned = true;
return reject(new Zotero.HTTP.UnsupportedFormatError(
requestURL,
response.headers['content-type'] + ' is not supported'
));
}

// Content-length doesn't always exist, or it can be the length of gzipped content,
// but it's still worth doing the initial size check
if (
Expand All @@ -399,15 +421,13 @@ function customRequest(method, requestURL, options) {
returned = true;
reject(new Zotero.HTTP.ResponseSizeError(requestURL));
}

// TODO: Filter content-type too
})
.on('end', function () {
if (returned) return;
returned = true;
resolve({response, body: Buffer.concat(buffers, bufferLength)});
});
});
};
}

module.exports = Zotero.HTTP;
3 changes: 3 additions & 0 deletions src/lambda.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,16 @@ var Translators; // Translators module is cached
const SearchEndpoint = require('./searchEndpoint');
const WebEndpoint = require('./webEndpoint');
const ExportEndpoint = require('./exportEndpoint');
const Recognizer = require('./recognizer');

const app = module.exports = new Koa();
app.use(cors);
app.use(bodyParser({enableTypes: ['text', 'json']}));
app.use(_.post('/web', WebEndpoint.handle.bind(WebEndpoint)));
app.use(_.post('/search', SearchEndpoint.handle.bind(SearchEndpoint)));
app.use(_.post('/export', ExportEndpoint.handle.bind(ExportEndpoint)));
app.use(_.get('/recognize/getUploadParams', Recognizer.handleUpload.bind(Recognizer)));
app.use(_.post('/recognize/process', Recognizer.handleProcess.bind(Recognizer)));

Debug.init(1);

Expand Down
Loading

0 comments on commit eb5ecd7

Please sign in to comment.