Skip to content

Commit

Permalink
Add PDF handling
Browse files Browse the repository at this point in the history
  • Loading branch information
mrtcode committed Jan 9, 2019
1 parent 5848d9e commit d092ecf
Show file tree
Hide file tree
Showing 8 changed files with 315 additions and 30 deletions.
8 changes: 8 additions & 0 deletions config/default.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,18 @@
"blacklistedDomains": [],
"deproxifyURLs": false, // Automatically try deproxified versions of URLs
"identifierSearchLambda": "", // Identifier search Lambda function for text search
"recognizerLambda": "", // PDF recognizer Lambda function
"port": 1969,
"translators": {
"CrossrefREST.email": "" // Pass an email to Crossref REST API to utilize the faster servers pool
},
"s3Upload": {
"params": {
"Bucket": ""
},
"accessKeyId": "",
"secretAccessKey": ""
},
"userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0",
"translatorsDirectory": "./modules/translators"
}
14 changes: 14 additions & 0 deletions lambda_template.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,24 @@ Resources:
MemorySize: 2048
Timeout: 30
Policies:
- Statement:
Action:
- s3:PutObject
- s3:DeleteObject
Effect: Allow
Resource: !ImportValue RecognizerUploadBucketArn
- LambdaInvokePolicy:
FunctionName: {{ identifier_search_function_name }}
- LambdaInvokePolicy:
FunctionName: {{ recognizer_function_name }}
Events:
# API Gateway
GetAPI:
Type: Api
Properties:
# Proxy all GET requests to Lambda function
Path: /{proxy+}
Method: get
PostAPI:
Type: Api
Properties:
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
"request-promise-native": "^1.0.5",
"serverless-http": "^1.6.0",
"w3c-xmlserializer": "0.0.1",
"whatwg-mimetype": "^2.3.0",
"wicked-good-xpath": "git+https://github.com/adomasven/wicked-good-xpath.git#e84d65d",
"xregexp": "^4.2.0",
"yargs": "^12.0.2"
Expand Down
66 changes: 44 additions & 22 deletions src/http.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ var url = require('url');
var jsdom = require('jsdom');
var { JSDOM } = jsdom;
var wgxpath = require('wicked-good-xpath');
var MIMEType = require("whatwg-mimetype"); // Use the same MIME type library as JSDOM does

/**
* Functions for performing HTTP requests
Expand Down Expand Up @@ -154,28 +155,22 @@ Zotero.HTTP = new function() {
};

if (options.responseType == 'document') {
let dom;
try {
dom = new JSDOM(body, {
let mimeType = new MIMEType(response.headers['content-type']);

// Filter content-type in the same way as JSDOM does
if (mimeType.isHTML() || mimeType.isXML()) {
result.type = 'document';

let dom = new JSDOM(body, {
url: result.responseURL,
// Inform JSDOM what content type it's parsing,
// so it could reject unsupported content types
// Inform JSDOM what content type it's parsing
contentType: response.headers['content-type']
});
}
catch (e) {
if (e.message.includes('not a HTML or XML content type')) {
Zotero.debug(e, 1)
throw new this.UnsupportedFormatError(result.responseURL, e.message);
}
throw e;
}
wgxpath.install(dom.window, true);
result.response = dom.window.document;

// Follow meta redirects
if (response.headers['content-type']
&& response.headers['content-type'].startsWith('text/html')) {

wgxpath.install(dom.window, true);
result.response = dom.window.document;

// Follow meta redirects in HTML and XML files
let meta = result.response.querySelector('meta[http-equiv=refresh]');
if (meta && meta.getAttribute('content')) {
let parts = meta.getAttribute('content').split(/;\s*url=/);
Expand All @@ -190,6 +185,19 @@ Zotero.HTTP = new function() {
}
}
}
else if (
options.fallbackToPDF &&
mimeType.essence === 'application/pdf'
) {
result.type = 'pdf';
result.response = body;
}
else {
throw new this.UnsupportedFormatError(
result.responseURL,
response.headers['content-type'] + ' is not supported'
);
}
}
else if (options.responseType == 'json') {
result.response = JSON.parse(body.toString());
Expand Down Expand Up @@ -362,6 +370,22 @@ function customRequest(method, requestURL, options) {
.on('response', function (res) {
if (returned) return;
response = res;

// Check content-type before starting the download
let mimeType = new MIMEType(response.headers['content-type']);
if (!(
mimeType.isHTML() ||
mimeType.isXML() ||
options.fallbackToPDF && mimeType.essence === 'application/pdf'
)) {
req.abort();
returned = true;
return reject(new Zotero.HTTP.UnsupportedFormatError(
requestURL,
response.headers['content-type'] + ' is not supported'
));
}

// Content-Length isn't always present, and it may be the length of gzipped content,
// but it's still worth doing an initial size check
if (
Expand All @@ -372,15 +396,13 @@ function customRequest(method, requestURL, options) {
returned = true;
reject(new Zotero.HTTP.ResponseSizeError(requestURL));
}

// TODO: Filter content-type too
})
.on('end', function () {
if (returned) return;
returned = true;
resolve({response, body: Buffer.concat(buffers, bufferLength)});
});
});
};
}

module.exports = Zotero.HTTP;
3 changes: 3 additions & 0 deletions src/lambda.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,16 @@ var Translators; // Translators module is cached
const SearchEndpoint = require('./searchEndpoint');
const WebEndpoint = require('./webEndpoint');
const ExportEndpoint = require('./exportEndpoint');
const Recognizer = require('./recognizer');

// The Koa application is this module's export
const app = module.exports = new Koa();
app.use(cors);
// Only text and JSON request bodies are parsed
app.use(bodyParser({enableTypes: ['text', 'json']}));
// Endpoint routes. `_` is presumably a routing helper such as koa-route
// (its import is not visible here); handlers are bound to their modules
// so `this` inside them refers to the endpoint object.
app.use(_.post('/web', WebEndpoint.handle.bind(WebEndpoint)));
app.use(_.post('/search', SearchEndpoint.handle.bind(SearchEndpoint)));
app.use(_.post('/export', ExportEndpoint.handle.bind(ExportEndpoint)));
// PDF recognition: the GET route returns presigned upload params,
// the POST route recognizes a previously uploaded file
app.use(_.get('/recognize/getUploadParams', Recognizer.handleUpload.bind(Recognizer)));
app.use(_.post('/recognize/process', Recognizer.handleProcess.bind(Recognizer)));

Debug.init(1);

Expand Down
213 changes: 213 additions & 0 deletions src/recognizer.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
/*
***** BEGIN LICENSE BLOCK *****
Copyright © 2018 Corporation for Digital Scholarship
Vienna, Virginia, USA
https://www.zotero.org
This file is part of Zotero.
Zotero is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Zotero is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with Zotero. If not, see <http://www.gnu.org/licenses/>.
***** END LICENSE BLOCK *****
*/

// TODO: Move upload logic outside of recognizer.js if uploads will be needed for other purposes

const config = require('config');
const AWS = require('aws-sdk');
const crypto = require('crypto');
const Lambda = new AWS.Lambda({apiVersion: '2015-03-31'});
const S3 = new AWS.S3(config.get('s3Upload'));

// Lifetime of a presigned upload form, in seconds: 1 minute to initiate upload
const UPLOAD_EXPIRATION = 1 * 60;
// Upper bound for the presigned upload's content-length-range condition: 50 MB
const MAX_PDF_SIZE = 50 * 1024 * 1024;

// Upload IDs are generated below as 16 random bytes in hex, i.e. exactly
// 32 lowercase hex characters. Anything else cannot be one of our S3 keys.
const UPLOAD_ID_PATTERN = /^[0-9a-f]{32}$/;

/**
 * Collect the identifiers found by the recognizer, in lookup priority order
 *
 * @param {Object} res - Parsed recognizer Lambda result
 * @return {Object[]} Identifier objects accepted by Zotero.Translate.Search
 */
function identifiersFromResult(res) {
	let identifiers = [];
	if (res.arxiv) identifiers.push({arXiv: res.arxiv});
	if (res.doi) identifiers.push({DOI: res.doi});
	if (res.isbn) identifiers.push({ISBN: res.isbn});
	return identifiers;
}

/**
 * Build an item directly from the metadata extracted by the recognizer.
 * Used when no identifier lookup produced an item.
 *
 * @param {Object} res - Parsed recognizer Lambda result with a `title`
 * @return {Object} Item metadata in translator format
 */
function itemFromExtractedMetadata(res) {
	let item = {
		itemType: res.type === 'book-chapter' ? 'bookSection' : 'journalArticle',
		title: res.title,
		creators: []
	};

	// The recognizer may return a title without any authors
	for (let author of res.authors || []) {
		item.creators.push({
			firstName: author.firstName,
			lastName: author.lastName,
			creatorType: 'author'
		});
	}

	if (res.abstract) item.abstractNote = res.abstract;
	if (res.year) item.date = res.year;
	if (res.pages) item.pages = res.pages;
	if (res.volume) item.volume = res.volume;
	if (res.url) item.url = res.url;
	if (res.language) item.language = res.language;

	if (item.itemType === 'journalArticle') {
		if (res.issue) item.issue = res.issue;
		// Zotero item field names are case-sensitive; the field is `ISSN`,
		// matching the upper-case DOI/ISBN keys used for identifiers
		if (res.issn) item.ISSN = res.issn;
		if (res.container) item.publicationTitle = res.container;
	}
	else if (item.itemType === 'bookSection') {
		if (res.container) item.bookTitle = res.container;
		if (res.publisher) item.publisher = res.publisher;
	}

	item.libraryCatalog = 'Zotero';
	return item;
}

let Recognizer = module.exports = {
	/**
	 * Directly upload file and get its uploadID
	 *
	 * @param buffer - File contents, passed to S3 as the object body
	 * @return {Promise<string>} uploadID
	 */
	upload: async function (buffer) {
		// 16 random bytes in hex serve as the S3 object key
		let uploadID = crypto.randomBytes(16).toString('hex');
		await S3.upload({Key: uploadID, Body: buffer}).promise();
		return uploadID;
	},

	/**
	 * Delete an uploaded file from the upload bucket
	 *
	 * @param {string} uploadID
	 * @return {Promise<void>}
	 */
	remove: async function (uploadID) {
		await S3.deleteObject({Key: uploadID}).promise();
	},

	/**
	 * Recognize the uploaded PDF by invoking recognizer Lambda function
	 *
	 * Recognized identifiers (arXiv, DOI, ISBN) are tried first through
	 * search translators; if none yields an item, an item is built from the
	 * metadata the recognizer extracted itself.
	 *
	 * @param uploadID
	 * @return {Promise<Object|null>} Item metadata in translator format,
	 *   or null if nothing usable was recognized
	 * @throws {Error} If the recognizer Lambda reports a function error
	 */
	recognize: async function (uploadID) {
		let params = {
			FunctionName: config.get('recognizerLambda'),
			InvocationType: 'RequestResponse',
			// Inform recognizer Lambda that we are calling it internally, not over API gateway
			Payload: JSON.stringify({type: 'INTERNAL', body: {action: "recognizeUpload", uploadID}})
		};

		let res = await Lambda.invoke(params).promise();

		if (res.FunctionError) {
			throw new Error('Lambda error: ' + res.Payload);
		}

		res = JSON.parse(res.Payload);
		// A null payload means the recognizer produced nothing at all
		if (!res) {
			return null;
		}

		// Retrieve metadata by using recognized identifiers
		for (let identifier of identifiersFromResult(res)) {
			let translate = new Zotero.Translate.Search();
			translate.setIdentifier(identifier);
			let translators = await translate.getTranslators();
			translate.setTranslator(translators);

			try {
				let items = await translate.translate({libraryID: false});

				if (items.length) {
					let item = items[0];

					// Add some fields if the translated item doesn't have them
					if (!item.abstractNote && res.abstract) {
						item.abstractNote = res.abstract;
					}
					if (!item.language && res.language) {
						item.language = res.language;
					}
					return item;
				}
			}
			catch (e) {
				// A failed lookup is not fatal: fall through to the next
				// identifier, and finally to the extracted metadata
				Zotero.debug(e);
			}
		}

		// Return the extracted metadata
		return res.title ? itemFromExtractedMetadata(res) : null;
	},

	/**
	 * Generate presigned upload params
	 *
	 * Responds with `{uploadID, data}`, where `data` is the presigned POST
	 * form produced by S3.
	 *
	 * @param ctx
	 * @return {Promise<void>}
	 */
	handleUpload: async function (ctx) {
		// 16 random bytes in hex serve as the S3 object key
		let uploadID = crypto.randomBytes(16).toString('hex');
		// Generate a presigned POST form, which has to be posted from the browser to S3.
		// createPresignedPost is used instead of getSignedUrl because
		// getSignedUrl doesn't support file size limiting
		const data = S3.createPresignedPost({
			Fields: {
				key: uploadID
			},
			Expires: UPLOAD_EXPIRATION,
			Conditions: [
				['content-length-range', 0, MAX_PDF_SIZE],
			]
		});
		ctx.body = {uploadID, data};
	},

	/**
	 * Recognize the uploaded PDF file
	 *
	 * The request body must be the uploadID returned by the upload-params
	 * endpoint. The uploaded file is deleted afterwards whether or not
	 * recognition succeeded.
	 *
	 * @param ctx
	 * @return {Promise<void>}
	 */
	handleProcess: async function (ctx) {
		let uploadID = ctx.request.body;

		if (!uploadID) {
			ctx.throw(400, "uploadID not provided\n");
		}

		if (typeof uploadID === 'string') {
			uploadID = uploadID.trim();
		}
		// The ID is used as an S3 key for both reading and deleting, so only
		// accept the exact format this service generates. The parsed body can
		// also be an object when the client sends JSON.
		if (typeof uploadID !== 'string' || !UPLOAD_ID_PATTERN.test(uploadID)) {
			ctx.throw(400, "Invalid uploadID\n");
		}

		try {
			let item = await this.recognize(uploadID);
			// Nothing was recognized: report it explicitly instead of
			// handing null to the item serializer
			// NOTE(review): status code choice to be confirmed
			if (!item) {
				ctx.throw(404, "The uploaded file could not be recognized\n");
			}
			ctx.body = Zotero.Utilities.itemToAPIJSON(item);
		}
		finally {
			// Cleanup is best-effort: a failed delete must not mask the
			// recognition result or the original error
			try {
				await this.remove(uploadID);
			}
			catch (e) {
				Zotero.debug(e, 1);
			}
		}
	}
};
Loading

0 comments on commit d092ecf

Please sign in to comment.