Skip to content

Commit

Permalink
- Fixed PDF parsing, when an image is present
Browse files Browse the repository at this point in the history
- Chrome webdriver performance improvements
- Bumped up to version v1.0.9
  • Loading branch information
developerfromjokela committed Feb 5, 2022
1 parent 15202e6 commit b5499e6
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 13 deletions.
5 changes: 4 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ RUN echo "http://dl-cdn.alpinelinux.org/alpine/edge/main" > /etc/apk/repositorie
ttf-freefont \
font-noto-emoji \
wqy-zenhei \
ghostscript \
qpdf \
chromium-chromedriver \
&& rm -rf /var/cache/* \
&& mkdir /var/cache/apk
Expand Down Expand Up @@ -47,6 +49,7 @@ RUN rm dist.tar.gz
USER chrome
RUN npm install

# Enable the disk cache to speed up page load times, and disable miscellaneous Chrome features that are not needed here.
# Docker is itself a sandbox, so the Chromium sandbox is unnecessary. Chrome won't be handling any personal data anyway.
ENV SELENIUM_ARGS="--no-sandbox"
ENV SELENIUM_ARGS="disk-cache-dir=/tmp/seleniumcache,disable-translate,disable-sync,no-first-run,safebrowsing-disable-auto-update,disable-background-networking,no-sandbox,disable-setuid-sandbox"
CMD [ "node", "main.js" ]
9 changes: 5 additions & 4 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "foodmenu",
"version": "1.0.8",
"version": "1.0.9",
"description": "This API provides additional food menus which are not available as JSON.\nThis middleware converts them to JSON format.",
"main": "build/main.js",
"scripts": {
Expand All @@ -16,27 +16,28 @@
"body-parser": "^1.19.0",
"console-stamp": "^3.0.2",
"express": "^4.17.1",
"ghostscript-node": "github:developerfromjokela/ghostscript-node",
"he": "^1.2.0",
"ical-expander": "^3.1.0",
"moment": "^2.29.1",
"needle": "^2.6.0",
"node-html-parser": "^3.2.0",
"node-ts-cache": "^4.2.3",
"node-ts-cache-storage-memory": "^4.2.3",
"pdfreader": "^1.2.8",
"pdfreader": "^1.2.14",
"selenium-webdriver": "^4.0.0-beta.3",
"uuid": "^8.3.2"
},
"devDependencies": {
"@types/node-fetch": "^2.5.10",
"@types/express": "^4.17.11",
"@types/he": "^1.1.1",
"@types/needle": "^2.5.1",
"@types/node": "^14.14.44",
"@types/node-fetch": "^2.5.10",
"@types/retry": "^0.12.0",
"@types/selenium-webdriver": "^4.0.12",
"@types/uuid": "^8.3.0",
"@types/validator": "^13.1.3",
"@types/he": "^1.1.1",
"env-cmd": "^10.1.0",
"ts-node": "^9.1.1",
"typescript": "^4.2.4"
Expand Down
32 changes: 29 additions & 3 deletions src/handlers/aromav2.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import {Request, Response} from "express";
import {errorResponse, responseStatus} from "../utils/response_utilities";
import {Builder, By, ThenableWebDriver, WebElementCondition, Condition, until} from "selenium-webdriver";
import {Builder, By, ThenableWebDriver, until} from "selenium-webdriver";
import {Restaurant} from "../models/Restaurant";
import {AsyncIterator} from "../utils/iterator";
import {elementLocated} from "selenium-webdriver/lib/until";
Expand Down Expand Up @@ -177,7 +177,20 @@ export function getMenuOptions(req: Request, res: Response) {
if ((global as any).seleniumArgs != null) {
options.addArguments((global as any).seleniumArgs.split(","));
}
const driver = new Builder().forBrowser("chrome").setChromeOptions(options).build();
// Disable image loading to save bandwidth and page load time
const chromeConfig = {
prefs: {
profile: {
managed_default_content_settings: {
images: 2
},
default_content_setting_values: {
images: 2
}
}
}
}
const driver = new Builder().setChromeOptions(options).withCapabilities(chromeConfig).forBrowser("chrome").build();
driver.get(url+(fullUrl ? "" : "/Default.aspx")).then(() => {
getRestaurantList(driver).then(restaurants => {
userCache.setItem(hashKey, restaurants, {ttl: 3600}).then(() => {
Expand Down Expand Up @@ -263,7 +276,20 @@ export function getRestaurantPage(req: Request, res: Response) {
if ((global as any).seleniumArgs != null) {
options.addArguments((global as any).seleniumArgs.split(","));
}
const driver = new Builder().forBrowser("chrome").setChromeOptions(options).build();
// Disable image loading to save bandwidth and page load time
const chromeConfig = {
prefs: {
profile: {
managed_default_content_settings: {
images: 2
},
default_content_setting_values: {
images: 2
}
}
}
}
const driver = new Builder().setChromeOptions(options).withCapabilities(chromeConfig).forBrowser("chrome").build();
driver.get(url+(fullUrl ? "" : "/Default.aspx")).then(() => {
selectRestaurant(driver, id).then(() => {
getRestaurantPDFLink(driver).then(pdfUrl => {
Expand Down
10 changes: 9 additions & 1 deletion src/parsers/aromiv2.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import {Moment} from "moment/moment";
import {Meal} from "../models/Meal";
import {HashUtils} from "../crypto/hash";
import {Diet} from "../models/Diet";
import {removeImagesFromPDF} from "../utils/pdf";

const pdfParser = require("pdfreader");

Expand All @@ -17,10 +18,17 @@ const dateRegex = /[0-9]+\.[0-9]+\.[0-9]{4}/;
const type = "aromiv2";


export function parse(content: any, callback: (content: Day[]|undefined, diets: Diet[]|undefined) => void) {
export async function parse(content: any, callback: (content: Day[]|undefined, diets: Diet[]|undefined) => void) {
let rows: any = {}; // indexed by y-position
let days: Day[] = [];
let diets: Diet[] = [];
// Due to a bug in the PDF parser, images in a document prevent it from parsing any text.
// Removing the images before parsing works around this until the library's developer fixes the issue.
try {
content = await removeImagesFromPDF(content);
} catch (e) {
console.error(e);
}
new pdfParser.PdfReader().parseBuffer(content, (pdfError: {parserError: string}, pdf: any) => {
if (pdfError) {
// This error occurs when menu is empty
Expand Down
15 changes: 11 additions & 4 deletions src/parsers/loviisa_pk.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@ import {Day} from "../models/Day";
import {Meal} from "../models/Meal";
import {HashUtils} from "../crypto/hash";
import {Menu} from "../models/Menu";
import {errorResponse} from "../utils/response_utilities";
import {removeImagesFromPDF} from "../utils/pdf";
const pdfParser = require("pdfreader");

const dateRegex = /[0-9]+\.[0-9]+\.[0-9]{4}/;
const whitespace = " ";
/*const dateRegex = /[0-9]+\.[0-9]+\.[0-9]{4}/;
const whitespace = " ";*/

const type = "loviisa_pk";

Expand All @@ -23,9 +23,16 @@ export function parsePDFLink(html: string): string|undefined {
return links.getAttribute("href");
}

export function parse(content: any, callback: (content: Day[]|undefined) => void) {
export async function parse(content: any, callback: (content: Day[]|undefined) => void) {
let rows: any = {}; // indexed by y-position
let days: Day[] = [];
// Due to a bug in the PDF parser, images in a document prevent it from parsing any text.
// Removing the images before parsing works around this until the library's developer fixes the issue.
try {
content = await removeImagesFromPDF(content);
} catch (e) {
console.error(e);
}
new pdfParser.PdfReader().parseBuffer(content, (pdfError: Error, pdf: any) => {
if (pdfError) {
callback(undefined);
Expand Down
15 changes: 15 additions & 0 deletions src/utils/pdf.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/*
* Copyright (c) 2022 wilmaplus-foodmenu, developed by @developerfromjokela, for Wilma Plus mobile app
*/


import {removeImages} from "ghostscript-node"
import {Buffer} from "buffer";

/**
 * Strips the images out of a PDF document.
 *
 * This is a temporary workaround for a PDF parsing bug; the actual work is
 * delegated to `removeImages` from the `ghostscript-node` package.
 *
 * @param pdf - Contents of the PDF document to process.
 * @returns A promise settling with whatever `removeImages` produces for `pdf`
 *          (presumably the image-free PDF as a Buffer — confirm against ghostscript-node).
 */
export async function removeImagesFromPDF(pdf: Buffer) {
    const withoutImages = await removeImages(pdf);
    return withoutImages;
}

0 comments on commit b5499e6

Please sign in to comment.