From b5499e6939d39584caf70f89ac4455d428e8f00e Mon Sep 17 00:00:00 2001
From: developerfromjokela
Date: Sat, 5 Feb 2022 14:18:04 +0200
Subject: [PATCH] - Fixed PDF parsing when an image is present
 - Chrome webdriver performance improvements
 - Bumped up to version v1.0.9

---
 Dockerfile                |  5 ++++-
 package.json              |  9 +++++----
 src/handlers/aromav2.ts   | 32 +++++++++++++++++++++++++++++---
 src/parsers/aromiv2.ts    | 10 +++++++++-
 src/parsers/loviisa_pk.ts | 15 +++++++++++----
 src/utils/pdf.ts          | 15 +++++++++++++++
 6 files changed, 73 insertions(+), 13 deletions(-)
 create mode 100644 src/utils/pdf.ts

diff --git a/Dockerfile b/Dockerfile
index 03e4922..8914db6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -15,6 +15,8 @@ RUN echo "http://dl-cdn.alpinelinux.org/alpine/edge/main" > /etc/apk/repositorie
     ttf-freefont \
     font-noto-emoji \
     wqy-zenhei \
+    ghostscript \
+    qpdf \
     chromium-chromedriver \
     && rm -rf /var/cache/* \
     && mkdir /var/cache/apk
@@ -47,6 +49,7 @@ RUN rm dist.tar.gz
 USER chrome
 RUN npm install
 
+# Enable disk cache to speed up page load times and disable miscellaneous Chrome features that are not needed here
 # Docker in itself is a sandbox, so can't care less about chromium sandbox. Chrome won't be handling any personal data anyways.
-ENV SELENIUM_ARGS="--no-sandbox"
+ENV SELENIUM_ARGS="disk-cache-dir=/tmp/seleniumcache,disable-translate,disable-sync,no-first-run,safebrowsing-disable-auto-update,disable-background-networking,no-sandbox,disable-setuid-sandbox"
 CMD [ "node", "main.js" ]
\ No newline at end of file
diff --git a/package.json b/package.json
index 16c9470..a77ee00 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "foodmenu",
-  "version": "1.0.8",
+  "version": "1.0.9",
   "description": "This API provides additional food menus which are not available as JSON.\nThis middleware converts them to JSON format.",
   "main": "build/main.js",
   "scripts": {
@@ -16,6 +16,7 @@
     "body-parser": "^1.19.0",
     "console-stamp": "^3.0.2",
     "express": "^4.17.1",
+    "ghostscript-node": "github:developerfromjokela/ghostscript-node",
     "he": "^1.2.0",
     "ical-expander": "^3.1.0",
     "moment": "^2.29.1",
@@ -23,20 +24,20 @@
     "node-html-parser": "^3.2.0",
     "node-ts-cache": "^4.2.3",
     "node-ts-cache-storage-memory": "^4.2.3",
-    "pdfreader": "^1.2.8",
+    "pdfreader": "^1.2.14",
     "selenium-webdriver": "^4.0.0-beta.3",
     "uuid": "^8.3.2"
   },
   "devDependencies": {
-    "@types/node-fetch": "^2.5.10",
     "@types/express": "^4.17.11",
+    "@types/he": "^1.1.1",
     "@types/needle": "^2.5.1",
     "@types/node": "^14.14.44",
+    "@types/node-fetch": "^2.5.10",
     "@types/retry": "^0.12.0",
     "@types/selenium-webdriver": "^4.0.12",
     "@types/uuid": "^8.3.0",
     "@types/validator": "^13.1.3",
-    "@types/he": "^1.1.1",
     "env-cmd": "^10.1.0",
     "ts-node": "^9.1.1",
     "typescript": "^4.2.4"
diff --git a/src/handlers/aromav2.ts b/src/handlers/aromav2.ts
index eff8c61..4bd9f40 100644
--- a/src/handlers/aromav2.ts
+++ b/src/handlers/aromav2.ts
@@ -4,7 +4,7 @@
 
 import {Request, Response} from "express";
 import {errorResponse, responseStatus} from "../utils/response_utilities";
-import {Builder, By, ThenableWebDriver, WebElementCondition, Condition, until} from "selenium-webdriver";
+import {Builder, By, ThenableWebDriver, until} from "selenium-webdriver";
 import {Restaurant} from "../models/Restaurant";
 import {AsyncIterator} from "../utils/iterator";
 import {elementLocated} from "selenium-webdriver/lib/until";
@@ -177,7 +177,20 @@ export function getMenuOptions(req: Request, res: Response) {
     if ((global as any).seleniumArgs != null) {
         options.addArguments((global as any).seleniumArgs.split(","));
     }
-    const driver = new Builder().forBrowser("chrome").setChromeOptions(options).build();
+    // Disable image loading to save bandwidth and speed up page loads
+    const chromeConfig = {
+        prefs: {
+            profile: {
+                managed_default_content_settings: {
+                    images: 2
+                },
+                default_content_setting_values: {
+                    images: 2
+                }
+            }
+        }
+    }
+    const driver = new Builder().setChromeOptions(options).withCapabilities(chromeConfig).forBrowser("chrome").build();
     driver.get(url+(fullUrl ? "" : "/Default.aspx")).then(() => {
         getRestaurantList(driver).then(restaurants => {
             userCache.setItem(hashKey, restaurants, {ttl: 3600}).then(() => {
@@ -263,7 +276,20 @@ export function getRestaurantPage(req: Request, res: Response) {
     if ((global as any).seleniumArgs != null) {
         options.addArguments((global as any).seleniumArgs.split(","));
     }
-    const driver = new Builder().forBrowser("chrome").setChromeOptions(options).build();
+    // Disable image loading to save bandwidth and speed up page loads
+    const chromeConfig = {
+        prefs: {
+            profile: {
+                managed_default_content_settings: {
+                    images: 2
+                },
+                default_content_setting_values: {
+                    images: 2
+                }
+            }
+        }
+    }
+    const driver = new Builder().setChromeOptions(options).withCapabilities(chromeConfig).forBrowser("chrome").build();
     driver.get(url+(fullUrl ? "" : "/Default.aspx")).then(() => {
         selectRestaurant(driver, id).then(() => {
             getRestaurantPDFLink(driver).then(pdfUrl => {
diff --git a/src/parsers/aromiv2.ts b/src/parsers/aromiv2.ts
index 5102240..c026612 100644
--- a/src/parsers/aromiv2.ts
+++ b/src/parsers/aromiv2.ts
@@ -9,6 +9,7 @@
 import {Moment} from "moment/moment";
 import {Meal} from "../models/Meal";
 import {HashUtils} from "../crypto/hash";
 import {Diet} from "../models/Diet";
+import {removeImagesFromPDF} from "../utils/pdf";
 
 const pdfParser = require("pdfreader");
@@ -17,10 +18,17 @@
 const dateRegex = /[0-9]+\.[0-9]+\.[0-9]{4}/;
 const type = "aromiv2";
 
-export function parse(content: any, callback: (content: Day[]|undefined, diets: Diet[]|undefined) => void) {
+export async function parse(content: any, callback: (content: Day[]|undefined, diets: Diet[]|undefined) => void) {
     let rows: any = {}; // indexed by y-position
     let days: Day[] = [];
     let diets: Diet[] = [];
+    // Due to a bug in the PDF parser, documents that contain images yield no parsed text.
+    // Removing images before parsing works around this until the library author fixes the bug.
+    try {
+        content = await removeImagesFromPDF(content);
+    } catch (e) {
+        console.error(e);
+    }
     new pdfParser.PdfReader().parseBuffer(content, (pdfError: {parserError: string}, pdf: any) => {
         if (pdfError) {
             // This error occurs when menu is empty
diff --git a/src/parsers/loviisa_pk.ts b/src/parsers/loviisa_pk.ts
index 3981add..613fe4d 100644
--- a/src/parsers/loviisa_pk.ts
+++ b/src/parsers/loviisa_pk.ts
@@ -8,11 +8,11 @@
 import {Day} from "../models/Day";
 import {Meal} from "../models/Meal";
 import {HashUtils} from "../crypto/hash";
 import {Menu} from "../models/Menu";
-import {errorResponse} from "../utils/response_utilities";
+import {removeImagesFromPDF} from "../utils/pdf";
 
 const pdfParser = require("pdfreader");
-const dateRegex = /[0-9]+\.[0-9]+\.[0-9]{4}/;
-const whitespace = " ";
+/*const dateRegex = /[0-9]+\.[0-9]+\.[0-9]{4}/;
+const whitespace = " ";*/
 
 const type = "loviisa_pk";
@@ -23,9 +23,16 @@ export function parsePDFLink(html: string): string|undefined {
     return links.getAttribute("href");
 }
 
-export function parse(content: any, callback: (content: Day[]|undefined) => void) {
+export async function parse(content: any, callback: (content: Day[]|undefined) => void) {
     let rows: any = {}; // indexed by y-position
     let days: Day[] = [];
+    // Due to a bug in the PDF parser, documents that contain images yield no parsed text.
+    // Removing images before parsing works around this until the library author fixes the bug.
+    try {
+        content = await removeImagesFromPDF(content);
+    } catch (e) {
+        console.error(e);
+    }
     new pdfParser.PdfReader().parseBuffer(content, (pdfError: Error, pdf: any) => {
         if (pdfError) {
             callback(undefined);
diff --git a/src/utils/pdf.ts b/src/utils/pdf.ts
new file mode 100644
index 0000000..b705eb7
--- /dev/null
+++ b/src/utils/pdf.ts
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2022 wilmaplus-foodmenu, developed by @developerfromjokela, for Wilma Plus mobile app
+ */
+
+
+import {removeImages} from "ghostscript-node"
+import {Buffer} from "buffer";
+
+/**
+ * Removes images from a PDF document to temporarily work around a PDF parsing bug
+ * @returns {Promise} PDF document with images removed
+ */
+export async function removeImagesFromPDF(pdf: Buffer) {
+    return await removeImages(pdf)
+}
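Editor's note (not part of the patch above): the parser hunks only show the work-around fragment. The sketch below illustrates, in TypeScript and under the same assumptions as src/parsers/aromiv2.ts, how the new removeImagesFromPDF helper is meant to be combined with pdfreader: strip images first (otherwise pdfreader emits no text items for affected documents), then group text items by their y coordinate into rows. The collectRows function and its relative import path are hypothetical, used only for illustration.

import {Buffer} from "buffer";
import {removeImagesFromPDF} from "./utils/pdf";

const pdfParser = require("pdfreader");

// Hypothetical helper (illustration only): parse a PDF buffer into rows of text,
// mirroring the "rows indexed by y-position" pattern used by the parsers.
async function collectRows(content: Buffer): Promise<string[][]> {
    // Work around the pdfreader bug: strip images first, otherwise no text is parsed.
    try {
        content = await removeImagesFromPDF(content);
    } catch (e) {
        console.error(e);
    }
    const rows: {[y: number]: string[]} = {};
    return new Promise<string[][]>(resolve => {
        new pdfParser.PdfReader().parseBuffer(content, (pdfError: Error, item: any) => {
            if (pdfError || !item) {
                // Parse error or end of buffer: return the rows sorted top to bottom.
                resolve(Object.keys(rows).map(Number).sort((a, b) => a - b).map(y => rows[y]));
            } else if (item.text) {
                // Group text items sharing the same y coordinate into one row.
                (rows[item.y] = rows[item.y] || []).push(item.text);
            }
        });
    });
}

The ghostscript and qpdf packages added to the Dockerfile are presumably the system tools ghostscript-node relies on at runtime, so the helper is expected to work inside the container image (or anywhere those tools are installed).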