- Fixed PDF parsing when an image is present

- Chrome webdriver performance improvements - Bumped up to version v1.0.9
wilmaplus · Feb 5, 2022 · b5499e6 · b5499e6
1 parent 15202e6
commit b5499e6
Show file tree

Hide file tree

Showing 6 changed files with 73 additions and 13 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -15,6 +15,8 @@ RUN echo "http://dl-cdn.alpinelinux.org/alpine/edge/main" > /etc/apk/repositorie
     ttf-freefont \
     font-noto-emoji \
     wqy-zenhei \
+    ghostscript \
+    qpdf \
     chromium-chromedriver \
     && rm -rf /var/cache/* \
     && mkdir /var/cache/apk
@@ -47,6 +49,7 @@ RUN rm dist.tar.gz
 USER chrome
 RUN npm install
 
+# Enable the disk cache to speed up page load times, and disable miscellaneous features that are not necessary for Chrome
 # Docker in itself is a sandbox, so can't care less about chromium sandbox. Chrome won't be handling any personal data anyways.
-ENV SELENIUM_ARGS="--no-sandbox"
+ENV SELENIUM_ARGS="disk-cache-dir=/tmp/seleniumcache,disable-translate,disable-sync,no-first-run,safebrowsing-disable-auto-update,disable-background-networking,no-sandbox,disable-setuid-sandbox"
 CMD [ "node", "main.js" ]
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "foodmenu",
-  "version": "1.0.8",
+  "version": "1.0.9",
   "description": "This API provides additional food menus which are not available as JSON.\nThis middleware converts them to JSON format.",
   "main": "build/main.js",
   "scripts": {
@@ -16,27 +16,28 @@
     "body-parser": "^1.19.0",
     "console-stamp": "^3.0.2",
     "express": "^4.17.1",
+    "ghostscript-node": "github:developerfromjokela/ghostscript-node",
     "he": "^1.2.0",
     "ical-expander": "^3.1.0",
     "moment": "^2.29.1",
     "needle": "^2.6.0",
     "node-html-parser": "^3.2.0",
     "node-ts-cache": "^4.2.3",
     "node-ts-cache-storage-memory": "^4.2.3",
-    "pdfreader": "^1.2.8",
+    "pdfreader": "^1.2.14",
     "selenium-webdriver": "^4.0.0-beta.3",
     "uuid": "^8.3.2"
   },
   "devDependencies": {
-    "@types/node-fetch": "^2.5.10",
     "@types/express": "^4.17.11",
+    "@types/he": "^1.1.1",
     "@types/needle": "^2.5.1",
     "@types/node": "^14.14.44",
+    "@types/node-fetch": "^2.5.10",
     "@types/retry": "^0.12.0",
     "@types/selenium-webdriver": "^4.0.12",
     "@types/uuid": "^8.3.0",
     "@types/validator": "^13.1.3",
-    "@types/he": "^1.1.1",
     "env-cmd": "^10.1.0",
     "ts-node": "^9.1.1",
     "typescript": "^4.2.4"

diff --git a/src/handlers/aromav2.ts b/src/handlers/aromav2.ts
@@ -4,7 +4,7 @@
 
 import {Request, Response} from "express";
 import {errorResponse, responseStatus} from "../utils/response_utilities";
-import {Builder, By, ThenableWebDriver, WebElementCondition, Condition, until} from "selenium-webdriver";
+import {Builder, By, ThenableWebDriver, until} from "selenium-webdriver";
 import {Restaurant} from "../models/Restaurant";
 import {AsyncIterator} from "../utils/iterator";
 import {elementLocated} from "selenium-webdriver/lib/until";
@@ -177,7 +177,20 @@ export function getMenuOptions(req: Request, res: Response) {
             if ((global as any).seleniumArgs != null) {
                 options.addArguments((global as any).seleniumArgs.split(","));
             }
-            const driver = new Builder().forBrowser("chrome").setChromeOptions(options).build();
+            // Disable image loading to speed up page loads
+            const chromeConfig = {
+                prefs: {
+                    profile: {
+                        managed_default_content_settings: {
+                            images: 2
+                        },
+                        default_content_setting_values: {
+                            images: 2
+                        }
+                    }
+                }
+            }
+            const driver = new Builder().setChromeOptions(options).withCapabilities(chromeConfig).forBrowser("chrome").build();
             driver.get(url+(fullUrl ? "" : "/Default.aspx")).then(() => {
                 getRestaurantList(driver).then(restaurants => {
                     userCache.setItem(hashKey, restaurants, {ttl: 3600}).then(() => {
@@ -263,7 +276,20 @@ export function getRestaurantPage(req: Request, res: Response) {
             if ((global as any).seleniumArgs != null) {
                 options.addArguments((global as any).seleniumArgs.split(","));
             }
-            const driver = new Builder().forBrowser("chrome").setChromeOptions(options).build();
+            // Disable image loading to speed up page loads
+            const chromeConfig = {
+                prefs: {
+                    profile: {
+                        managed_default_content_settings: {
+                            images: 2
+                        },
+                        default_content_setting_values: {
+                            images: 2
+                        }
+                    }
+                }
+            }
+            const driver = new Builder().setChromeOptions(options).withCapabilities(chromeConfig).forBrowser("chrome").build();
             driver.get(url+(fullUrl ? "" : "/Default.aspx")).then(() => {
                 selectRestaurant(driver, id).then(() => {
                     getRestaurantPDFLink(driver).then(pdfUrl => {

diff --git a/src/parsers/aromiv2.ts b/src/parsers/aromiv2.ts
@@ -9,6 +9,7 @@ import {Moment} from "moment/moment";
 import {Meal} from "../models/Meal";
 import {HashUtils} from "../crypto/hash";
 import {Diet} from "../models/Diet";
+import {removeImagesFromPDF} from "../utils/pdf";
 
 const pdfParser = require("pdfreader");
 
@@ -17,10 +18,17 @@ const dateRegex = /[0-9]+\.[0-9]+\.[0-9]{4}/;
 const type = "aromiv2";
 
 
-export function parse(content: any, callback: (content: Day[]|undefined, diets: Diet[]|undefined) => void) {
+export async function parse(content: any, callback: (content: Day[]|undefined, diets: Diet[]|undefined) => void) {
     let rows: any = {}; // indexed by y-position
     let days: Day[] = [];
     let diets: Diet[] = [];
+    // Due to a bug in the PDF parser, images prevent it from parsing any text.
+    // Removing images before parsing works around this issue until the library's developer fixes it.
+    try {
+        content = await removeImagesFromPDF(content);
+    } catch (e) {
+        console.error(e);
+    }
     new pdfParser.PdfReader().parseBuffer(content, (pdfError: {parserError: string}, pdf: any) => {
         if (pdfError) {
             // This error occurs when menu is empty

diff --git a/src/parsers/loviisa_pk.ts b/src/parsers/loviisa_pk.ts
@@ -8,11 +8,11 @@ import {Day} from "../models/Day";
 import {Meal} from "../models/Meal";
 import {HashUtils} from "../crypto/hash";
 import {Menu} from "../models/Menu";
-import {errorResponse} from "../utils/response_utilities";
+import {removeImagesFromPDF} from "../utils/pdf";
 const pdfParser = require("pdfreader");
 
-const dateRegex = /[0-9]+\.[0-9]+\.[0-9]{4}/;
-const whitespace = " ";
+/*const dateRegex = /[0-9]+\.[0-9]+\.[0-9]{4}/;
+const whitespace = " ";*/
 
 const type = "loviisa_pk";
 
@@ -23,9 +23,16 @@ export function parsePDFLink(html: string): string|undefined {
     return links.getAttribute("href");
 }
 
-export function parse(content: any, callback: (content: Day[]|undefined) => void) {
+export async function parse(content: any, callback: (content: Day[]|undefined) => void) {
     let rows: any = {}; // indexed by y-position
     let days: Day[] = [];
+    // Due to a bug in the PDF parser, images prevent it from parsing any text.
+    // Removing images before parsing works around this issue until the library's developer fixes it.
+    try {
+        content = await removeImagesFromPDF(content);
+    } catch (e) {
+        console.error(e);
+    }
     new pdfParser.PdfReader().parseBuffer(content, (pdfError: Error, pdf: any) => {
         if (pdfError) {
             callback(undefined);

diff --git a/src/utils/pdf.ts b/src/utils/pdf.ts
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2022 wilmaplus-foodmenu, developed by @developerfromjokela, for Wilma Plus mobile app
+ */
+
+
+import {removeImages} from "ghostscript-node"
+import {Buffer} from "buffer";
+
+/**
+ * Removes images from PDF document, to temporarily fix a PDF parsing bug
+ *
+ * Thin wrapper: passes `pdf` straight to `removeImages` from `ghostscript-node`
+ * and resolves with whatever that call resolves with. A rejection from
+ * `removeImages` is not handled here and propagates to the caller.
+ *
+ * @param pdf - contents of the PDF document to process
+ * @returns {Promise<Buffer>}
+ */
+export async function removeImagesFromPDF(pdf: Buffer) {
+    return await removeImages(pdf)
+}