Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add PDF handling #59

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions config/default.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,18 @@
"blacklistedDomains": [],
"deproxifyURLs": false, // Automatically try deproxified versions of URLs
"identifierSearchLambda": "", // Identifier search Lambda function for text search
"recognizerLambda": "", // PDF recognizer Lambda function
"port": 1969,
"translators": {
"CrossrefREST.email": "" // Pass an email to Crossref REST API to utilize the faster servers pool
},
"s3Upload": {
"params": {
"Bucket": ""
},
"accessKeyId": "",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Zotero house style: accessKeyID

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are just passing the whole s3Upload object through as the S3 config. That's useful because other configuration options (e.g. region) may also need to be passed, but it means the property naming isn't up to us.

"secretAccessKey": ""
},
"userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0",
"translatorsDirectory": "./modules/translators"
}
1 change: 1 addition & 0 deletions lambda_config.env-sample
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ deployment_bucket_name=deployment
stack_name=TranslationServer
main_function_name=translation-server
identifier_search_function_name=identifier-search
recognizer_function_name=recognizer-lambda
14 changes: 14 additions & 0 deletions lambda_template.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,24 @@ Resources:
MemorySize: 2048
Timeout: 30
Policies:
- Statement:
Action:
- s3:PutObject
- s3:DeleteObject
Effect: Allow
Resource: !ImportValue RecognizerUploadBucketArn
- LambdaInvokePolicy:
FunctionName: {{ identifier_search_function_name }}
- LambdaInvokePolicy:
FunctionName: {{ recognizer_function_name }}
Events:
# API Gateway
GetAPI:
Type: Api
Properties:
# Proxy all GET requests to Lambda function
Path: /{proxy+}
Method: get
PostAPI:
Type: Api
Properties:
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
"request-promise-native": "^1.0.5",
"serverless-http": "^1.6.0",
"w3c-xmlserializer": "0.0.1",
"whatwg-mimetype": "^2.3.0",
"wicked-good-xpath": "git+https://github.com/adomasven/wicked-good-xpath.git#e84d65d",
"xregexp": "^4.2.0",
"yargs": "^12.0.2"
Expand Down
110 changes: 67 additions & 43 deletions src/http.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ var url = require('url');
var jsdom = require('jsdom');
var { JSDOM } = jsdom;
var wgxpath = require('wicked-good-xpath');
var MIMEType = require("whatwg-mimetype"); // Use the same MIME type library as JSDOM does

/**
* Functions for performing HTTP requests
Expand Down Expand Up @@ -123,26 +124,6 @@ Zotero.HTTP = new function() {

let {response, body} = await customRequest(method, requestURL, options);

if (!response.headers['content-type']) {
throw new this.UnsupportedFormatError(requestURL, 'Missing Content-Type header');
}

// Array of success codes given
if (options.successCodes) {
var success = options.successCodes.includes(response.statusCode);
}
// Explicit FALSE means allow any status code
else if (options.successCodes === false) {
var success = true;
}
// Otherwise, 2xx is success
else {
var success = response.statusCode >= 200 && response.statusCode < 300;
}
if (!success) {
throw new Zotero.HTTP.StatusError(requestURL, response.statusCode, response.body);
}

if (options.debug) {
Zotero.debug(`HTTP ${response.statusCode} response: ${body}`);
}
Expand All @@ -154,29 +135,22 @@ Zotero.HTTP = new function() {
};

if (options.responseType == 'document') {
let dom;
try {
let mimeType = new MIMEType(response.headers['content-type']);

// Filter content-type in the same way as JSDOM does
if (mimeType.isHTML() || mimeType.isXML()) {
result.type = 'document';
body = decodeContent(body, response.headers['content-type']);
dom = new JSDOM(body, {
let dom = new JSDOM(body, {
url: result.responseURL,
// Inform JSDOM what content type it's parsing,
// so it could reject unsupported content types
// Inform JSDOM what content type it's parsing
contentType: response.headers['content-type']
});
}
catch (e) {
if (e.message.includes('not a HTML or XML content type')) {
Zotero.debug(e, 1)
throw new this.UnsupportedFormatError(result.responseURL, e.message);
}
throw e;
}
wgxpath.install(dom.window, true);
result.response = dom.window.document;

// Follow meta redirects
if (response.headers['content-type']
&& response.headers['content-type'].startsWith('text/html')) {

wgxpath.install(dom.window, true);
result.response = dom.window.document;

// Follow meta redirects in HTML and XML files
let meta = result.response.querySelector('meta[http-equiv=refresh]');
if (meta && meta.getAttribute('content')) {
let parts = meta.getAttribute('content').split(/;\s*url=/);
Expand All @@ -191,6 +165,19 @@ Zotero.HTTP = new function() {
}
}
}
else if (
options.fallbackToPDF &&
mimeType.essence === 'application/pdf'
) {
result.type = 'pdf';
result.response = body;
}
else {
throw new this.UnsupportedFormatError(
result.responseURL,
response.headers['content-type'] + ' is not supported'
);
}
}
else if (options.responseType == 'json') {
result.response = JSON.parse(body.toString());
Expand Down Expand Up @@ -319,7 +306,6 @@ Zotero.HTTP = new function() {
*
* TODO: Remove this code once https://github.com/jsdom/jsdom/issues/2495 is resolved
*/
const MIMEType = require("whatwg-mimetype");
const sniffHTMLEncoding = require("html-encoding-sniffer");
const whatwgEncoding = require("whatwg-encoding");

Expand Down Expand Up @@ -389,6 +375,46 @@ function customRequest(method, requestURL, options) {
.on('response', function (res) {
if (returned) return;
response = res;

// Check if the content-type header exists
if (!response.headers['content-type']) {
returned = true;
return reject(new Zotero.HTTP.UnsupportedFormatError(requestURL, 'Missing Content-Type header'));
}

// Check if the status code is allowed
// Array of success codes given
if (options.successCodes) {
var success = options.successCodes.includes(response.statusCode);
}
// Explicit FALSE means allow any status code
else if (options.successCodes === false) {
var success = true;
}
// Otherwise, 2xx is success
else {
var success = response.statusCode >= 200 && response.statusCode < 300;
}
if (!success) {
returned = true;
return reject(new Zotero.HTTP.StatusError(requestURL, response.statusCode));
}

// Check content-type before starting the download
let mimeType = new MIMEType(response.headers['content-type']);
if (!(
mimeType.isHTML() ||
mimeType.isXML() ||
options.fallbackToPDF && mimeType.essence === 'application/pdf'
)) {
req.abort();
returned = true;
return reject(new Zotero.HTTP.UnsupportedFormatError(
requestURL,
response.headers['content-type'] + ' is not supported'
));
}

// Content-Length doesn't always exist, or it can be the length of gzipped content,
// but it's still worth doing the initial size check
if (
Expand All @@ -399,15 +425,13 @@ function customRequest(method, requestURL, options) {
returned = true;
reject(new Zotero.HTTP.ResponseSizeError(requestURL));
}

// TODO: Filter content-type too
})
.on('end', function () {
if (returned) return;
returned = true;
resolve({response, body: Buffer.concat(buffers, bufferLength)});
});
});
};
}

module.exports = Zotero.HTTP;
3 changes: 3 additions & 0 deletions src/lambda.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,16 @@ var Translators; // Translators module is cached
const SearchEndpoint = require('./searchEndpoint');
const WebEndpoint = require('./webEndpoint');
const ExportEndpoint = require('./exportEndpoint');
const Recognizer = require('./recognizer');

const app = module.exports = new Koa();
app.use(cors);
app.use(bodyParser({enableTypes: ['text', 'json']}));
app.use(_.post('/web', WebEndpoint.handle.bind(WebEndpoint)));
app.use(_.post('/search', SearchEndpoint.handle.bind(SearchEndpoint)));
app.use(_.post('/export', ExportEndpoint.handle.bind(ExportEndpoint)));
app.use(_.get('/recognize/getUploadParams', Recognizer.handleUpload.bind(Recognizer)));
app.use(_.post('/recognize/process', Recognizer.handleProcess.bind(Recognizer)));

Debug.init(1);

Expand Down
Loading