Skip to content

Commit

Permalink
Merge pull request #39 from jadkins89/refactor
Browse files Browse the repository at this point in the history
2.0 refactor
  • Loading branch information
jadkins89 authored Jan 25, 2021
2 parents 7077709 + 356dccc commit 81104df
Show file tree
Hide file tree
Showing 147 changed files with 3,053 additions and 3,763 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# recipe-scraper

**A JS package for scraping recipes from the web.**
**A NodeJS package for scraping recipes from the web.**

[![Build Status](https://travis-ci.org/jadkins89/Recipe-Scraper.svg?branch=master)](https://travis-ci.org/jadkins89/Recipe-Scraper)
[![Coverage Status](https://coveralls.io/repos/github/jadkins89/Recipe-Scraper/badge.svg?branch=master)](https://coveralls.io/github/jadkins89/Recipe-Scraper?branch=master)
Expand Down Expand Up @@ -47,13 +47,13 @@ recipeScraper("some.recipe.url").then(recipe => {
- https://cookieandkate.com/
- https://copykat.com/
- https://damndelicious.net/
- http://www.eatingwell.com/
- https://www.eatingwell.com/
- https://www.epicurious.com/
- https://www.food.com/
- https://www.foodandwine.com/
- https://www.foodnetwork.com/
- https://gimmedelicious.com/
- http://www.gimmesomeoven.com/
- https://www.gimmesomeoven.com/
- https://julieblanner.com/
- https://www.kitchenstories.com/
- https://www.melskitchencafe.com/
Expand All @@ -80,7 +80,7 @@ Don't see a website you'd like to scrape? Open an [issue](https://github.com/jad

## Recipe Object

Depending on the recipe, certain fields may be left blank. All fields are represented as strings or arrays of strings.
Depending on the recipe, certain fields may be left blank. All fields are represented as strings or arrays of strings. The name, ingredients, and instructions properties are required for schema validation.

```javascript
{
Expand Down
102 changes: 102 additions & 0 deletions helpers/BaseScraper.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
"use strict";

const fetch = require("node-fetch");
const cheerio = require("cheerio");
const { validate } = require("jsonschema");

const Recipe = require("./Recipe");
const recipeSchema = require("./RecipeSchema.json");

/**
* Abstract Class which all scrapers inherit from
*/
/**
 * Abstract class which all scrapers inherit from.
 * Subclasses must implement scrape($) to populate this.recipe.
 */
class BaseScraper {
  /**
   * @param {string} url - recipe page to scrape
   * @param {string} [subUrl=""] - substring the url must contain for this scraper to apply
   */
  constructor(url, subUrl = "") {
    this.url = url;
    this.subUrl = subUrl;
  }

  /**
   * Checks if the url has the required sub url
   * @throws {Error} if the url does not include this scraper's subUrl
   */
  checkUrl() {
    if (!this.url.includes(this.subUrl)) {
      throw new Error(`url provided must include '${this.subUrl}'`);
    }
  }

  /**
   * Builds a new instance of Recipe and stores it on this scraper
   */
  createRecipeObject() {
    this.recipe = new Recipe();
  }

  /**
   * Uniform failure path used whenever a page yields no usable recipe
   * @throws {Error} always
   */
  defaultError() {
    throw new Error("No recipe found on page");
  }

  /**
   * Sets this.recipe.image from common meta tags, when present
   * @param {object} $ - a cheerio object representing a DOM
   */
  defaultSetImage($) {
    this.recipe.image =
      $("meta[property='og:image']").attr("content") ||
      $("meta[name='og:image']").attr("content") ||
      $("meta[itemprop='image']").attr("content");
  }

  /**
   * Fetches html from url
   * @returns {object} - Cheerio instance loaded with the page html
   * @throws {Error} if the request fails or returns an HTTP error status
   */
  async fetchDOMModel() {
    try {
      const res = await fetch(this.url);
      // node-fetch resolves on HTTP errors (404/500/...); without this
      // check we would happily parse an error page as a recipe
      if (!res.ok) {
        throw new Error(`HTTP ${res.status}`);
      }
      const html = await res.text();
      return cheerio.load(html);
    } catch (err) {
      this.defaultError();
    }
  }

  /**
   * Handles the workflow for fetching a recipe
   * @returns {object} - an object representing the recipe
   */
  async fetchRecipe() {
    this.checkUrl();
    const $ = await this.fetchDOMModel();
    this.createRecipeObject();
    this.scrape($);
    return this.validateRecipe();
  }

  /**
   * Abstract method; subclasses must override
   * @param {object} $ - cheerio instance
   * @returns {object} - an object representing the recipe
   */
  scrape($) {
    throw new Error("scrape is not defined in BaseScraper");
  }

  /**
   * Convenience: the trimmed text content of a cheerio element
   * @param {object} el - cheerio element
   * @returns {string}
   */
  textTrim(el) {
    return el.text().trim();
  }

  /**
   * Validates scraped recipes against defined recipe schema
   * @returns {object} - an object representing the recipe
   * @throws {Error} if the scraped recipe fails schema validation
   */
  validateRecipe() {
    const res = validate(this.recipe, recipeSchema);
    if (!res.valid) {
      this.defaultError();
    }
    return this.recipe;
  }
}

module.exports = BaseScraper;
92 changes: 92 additions & 0 deletions helpers/PuppeteerScraper.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"use strict";

const puppeteer = require("puppeteer");
const cheerio = require("cheerio");

// Resource types puppeteer should abort: recipe scraping only needs the
// document html, so heavy assets are dropped to speed up page loads.
// (Previous version listed "font" twice.)
const blockedResourceTypes = [
  "image",
  "media",
  "font",
  "texttrack",
  "object",
  "beacon",
  "csp_report",
  "imageset",
  "stylesheet"
];

// URL substrings identifying tracking/ad/analytics requests worth skipping;
// matched with indexOf against the request url (so "google" also covers
// google-analytics / googletagmanager).
const skippedResources = [
  "quantserve",
  "adzerk",
  "doubleclick",
  "adition",
  "exelator",
  "sharethrough",
  "cdn.api.twitter",
  "google-analytics",
  "googletagmanager",
  "google",
  "fontawesome",
  "facebook",
  "analytics",
  "optimizely",
  "clicktale",
  "mixpanel",
  "zedo",
  "clicksor",
  "tiqcdn"
];

const BaseScraper = require("./BaseScraper");

/**
* Inheritable class which uses puppeteer instead of a simple http request
*/
/**
 * Inheritable class which uses puppeteer instead of a simple http request,
 * for sites that only render their recipe data client-side
 */
class PuppeteerScraper extends BaseScraper {
  /**
   * Hook for subclasses: wait for some condition on the page (e.g. a
   * selector appearing) before the html is captured. Default is a no-op.
   * @param {object} page - puppeteer Page
   * @returns {Promise<boolean>}
   */
  async customPoll(page) {
    return true;
  }

  /**
   * @override
   * Fetches html from url using puppeteer headless browser
   * @returns {object} - Cheerio instance
   * @throws {Error} if navigation fails or returns an HTTP error status
   */
  async fetchDOMModel() {
    const browser = await puppeteer.launch({
      headless: true
    });
    try {
      const page = await browser.newPage();
      await page.setRequestInterception(true);

      // Abort heavy/tracking requests to keep page loads fast.
      // Use the public request API (req.url(), req.resourceType()) rather
      // than private fields like req._url.
      page.on("request", req => {
        const requestUrl = req.url().split("?")[0].split("#")[0];
        if (
          blockedResourceTypes.indexOf(req.resourceType()) !== -1 ||
          skippedResources.some(resource => requestUrl.indexOf(resource) !== -1)
        ) {
          req.abort();
        } else {
          req.continue();
        }
      });

      const response = await page.goto(this.url);
      // page.goto may resolve with null (e.g. same-url navigation); treat
      // that and HTTP error statuses as "no recipe found"
      if (!response || response.status() >= 400) {
        this.defaultError();
      }
      await this.customPoll(page);
      const html = await page.content();
      return cheerio.load(html);
    } finally {
      // Always release the browser, even when an error was thrown above
      await browser.close().catch(() => {});
    }
  }
}

module.exports = PuppeteerScraper;
20 changes: 20 additions & 0 deletions helpers/Recipe.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/**
 * Plain data holder for a scraped recipe. Every field starts out empty so
 * individual scrapers can fill in whatever their page provides; strings
 * stay "" and list fields stay [] when a value is missing.
 */
class Recipe {
  constructor() {
    this.name = "";
    this.ingredients = [];
    this.instructions = [];
    this.tags = [];
    // timing breakdown; each entry is a free-form string such as "15 minutes"
    const timeKeys = ["prep", "cook", "active", "inactive", "ready", "total"];
    this.time = Object.fromEntries(timeKeys.map(key => [key, ""]));
    this.servings = "";
    this.image = "";
  }
}

module.exports = Recipe;
43 changes: 43 additions & 0 deletions helpers/RecipeSchema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"description": "A recipe scraped from the web",
"type": "object",
"required": ["name", "ingredients", "instructions"],
"properties": {
"name": {
"type": "string",
"minLength": 1
},
"ingredients": {
"type": "array",
"minItems": 1,
"items": { "type": "string" }
},
"instructions": {
"type": "array",
"minItems": 1,
"uniqueItems": true,
"items": { "type": "string" }
},
"tags": {
"type": "array",
"uniqueItems": true,
"items": { "type": "string" }
},
"time": {
"type": "object",
"properties": {
"prep": { "type": "string" },
"cook": { "type": "string" },
"active": { "type": "string" },
"inactive": { "type": "string" },
"ready": { "type": "string" },
"total": { "type": "string" }
}
},
"servings": {
"type": { "type": "string" }
},
"image": { "type": "string" }
}
}
69 changes: 69 additions & 0 deletions helpers/ScraperFactory.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"use strict";

const parseDomain = require("parse-domain");

// Lookup table mapping a site's registrable domain — as reported by the
// parse-domain package's `domain` field (e.g. "allrecipes" for
// www.allrecipes.com) — to the scraper class for that site.
// NOTE(review): keys must match parse-domain output exactly; the scraper
// file names use different casing, so entries cannot be generated
// mechanically from the keys.
const domains = {
  "101cookbooks": require("../scrapers/101CookbooksScraper"),
  allrecipes: require("../scrapers/AllRecipesScraper"),
  ambitiouskitchen: require("../scrapers/AmbitiousKitchenScraper"),
  averiecooks: require("../scrapers/AverieCooksScraper"),
  bbc: require("../scrapers/BbcScraper"),
  bbcgoodfood: require("../scrapers/BbcGoodFoodScraper"),
  bonappetit: require("../scrapers/BonAppetitScraper"),
  budgetbytes: require("../scrapers/BudgetBytesScraper"),
  centraltexasfoodbank: require("../scrapers/CentralTexasFoodBankScraper"),
  closetcooking: require("../scrapers/ClosetCookingScraper"),
  cookieandkate: require("../scrapers/CookieAndKateScraper"),
  copykat: require("../scrapers/CopyKatScraper"),
  damndelicious: require("../scrapers/DamnDeliciousScraper"),
  eatingwell: require("../scrapers/EatingWellScraper"),
  epicurious: require("../scrapers/EpicuriousScraper"),
  food: require("../scrapers/FoodScraper"),
  foodandwine: require("../scrapers/FoodAndWineScraper"),
  foodnetwork: require("../scrapers/FoodNetworkScraper"),
  gimmedelicious: require("../scrapers/GimmeDeliciousScraper"),
  gimmesomeoven: require("../scrapers/GimmeSomeOvenScraper"),
  julieblanner: require("../scrapers/JulieBlannerScraper"),
  kitchenstories: require("../scrapers/KitchenStoriesScraper"),
  melskitchencafe: require("../scrapers/MelsKitchenCafeScraper"),
  minimalistbaker: require("../scrapers/MinimalistBakerScraper"),
  myrecipes: require("../scrapers/MyRecipesScraper"),
  nomnompaleo: require("../scrapers/NomNomPaleoScraper"),
  omnivorescookbook: require("../scrapers/OmnivoresCookbookScraper"),
  pinchofyum: require("../scrapers/PinchOfYumScraper"),
  recipetineats: require("../scrapers/RecipeTinEatsScraper"),
  seriouseats: require("../scrapers/SeriousEatsScraper"),
  simplyrecipes: require("../scrapers/SimplyRecipesScraper"),
  smittenkitchen: require("../scrapers/SmittenKitchenScraper"),
  tastesbetterfromscratch: require("../scrapers/TastesBetterFromScratchScraper"),
  tasteofhome: require("../scrapers/TasteOfHomeScraper"),
  theblackpeppercorn: require("../scrapers/TheBlackPeppercornScraper"),
  thepioneerwoman: require("../scrapers/ThePioneerWomanScraper"),
  therecipecritic: require("../scrapers/TheRecipeCriticScraper"),
  therealfoodrds: require("../scrapers/TheRealFoodDrsScraper"),
  thespruceeats: require("../scrapers/TheSpruceEatsScraper"),
  whatsgabycooking: require("../scrapers/WhatsGabyCookingScraper"),
  woolworths: require("../scrapers/WoolworthsScraper"),
  yummly: require("../scrapers/YummlyScraper")
};

/**
* A Factory that supplies an instance of a scraper based on a given URL
*/
/**
 * A Factory that supplies an instance of a scraper based on a given URL
 */
class ScraperFactory {
  /**
   * Resolves the scraper class registered for the url's domain and
   * returns a new instance of it bound to the url.
   * @param {string} url - recipe page url
   * @returns {object} - scraper instance for the url's site
   * @throws {Error} if the domain cannot be parsed or is unsupported
   */
  getScraper(url) {
    const parsed = parseDomain(url);
    if (!parsed) {
      throw new Error("Failed to parse domain");
    }
    const ScraperClass = domains[parsed.domain];
    if (ScraperClass === undefined) {
      throw new Error("Site not yet supported");
    }
    return new ScraperClass(url);
  }
}

module.exports = ScraperFactory;
Loading

0 comments on commit 81104df

Please sign in to comment.