Skip to content

Commit

Permalink
Merge pull request #39 from jadkins89/refactor
Browse files Browse the repository at this point in the history
2.0 refactor
  • Loading branch information
jadkins89 authored Jan 25, 2021
2 parents 7077709 + 356dccc commit 81104df
Show file tree
Hide file tree
Showing 147 changed files with 3,053 additions and 3,763 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# recipe-scraper

**A JS package for scraping recipes from the web.**
**A NodeJS package for scraping recipes from the web.**

[![Build Status](https://travis-ci.org/jadkins89/Recipe-Scraper.svg?branch=master)](https://travis-ci.org/jadkins89/Recipe-Scraper)
[![Coverage Status](https://coveralls.io/repos/github/jadkins89/Recipe-Scraper/badge.svg?branch=master)](https://coveralls.io/github/jadkins89/Recipe-Scraper?branch=master)
Expand Down Expand Up @@ -47,13 +47,13 @@ recipeScraper("some.recipe.url").then(recipe => {
- https://cookieandkate.com/
- https://copykat.com/
- https://damndelicious.net/
- http://www.eatingwell.com/
- https://www.eatingwell.com/
- https://www.epicurious.com/
- https://www.food.com/
- https://www.foodandwine.com/
- https://www.foodnetwork.com/
- https://gimmedelicious.com/
- http://www.gimmesomeoven.com/
- https://www.gimmesomeoven.com/
- https://julieblanner.com/
- https://www.kitchenstories.com/
- https://www.melskitchencafe.com/
Expand All @@ -80,7 +80,7 @@ Don't see a website you'd like to scrape? Open an [issue](https://github.com/jad

## Recipe Object

Depending on the recipe, certain fields may be left blank. All fields are represented as strings or arrays of strings.
Depending on the recipe, certain fields may be left blank. All fields are represented as strings or arrays of strings. The name, ingredients, and instructions properties are required for schema validation.

```javascript
{
Expand Down
102 changes: 102 additions & 0 deletions helpers/BaseScraper.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
"use strict";

const fetch = require("node-fetch");
const cheerio = require("cheerio");
const { validate } = require("jsonschema");

const Recipe = require("./Recipe");
const recipeSchema = require("./RecipeSchema.json");

/**
* Abstract Class which all scrapers inherit from
*/
/**
 * Abstract class which all scrapers inherit from.
 * Subclasses must implement scrape($) to populate this.recipe.
 */
class BaseScraper {
  /**
   * @param {string} url - recipe page to scrape
   * @param {string} [subUrl=""] - substring the url must contain for this scraper to apply
   */
  constructor(url, subUrl = "") {
    this.url = url;
    this.subUrl = subUrl;
  }

  /**
   * Checks if the url has the required sub url
   * @throws {Error} if the url does not include this scraper's subUrl
   */
  checkUrl() {
    if (!this.url.includes(this.subUrl)) {
      throw new Error(`url provided must include '${this.subUrl}'`);
    }
  }

  /**
   * Builds a new instance of Recipe and stores it on this scraper
   */
  createRecipeObject() {
    this.recipe = new Recipe();
  }

  /**
   * Uniform failure path used whenever a page yields no usable recipe
   * @throws {Error} always
   */
  defaultError() {
    throw new Error("No recipe found on page");
  }

  /**
   * Sets this.recipe.image from common meta tags, when present
   * @param {object} $ - a cheerio object representing a DOM
   */
  defaultSetImage($) {
    this.recipe.image =
      $("meta[property='og:image']").attr("content") ||
      $("meta[name='og:image']").attr("content") ||
      $("meta[itemprop='image']").attr("content");
  }

  /**
   * Fetches html from url
   * @returns {object} - Cheerio instance loaded with the page html
   * @throws {Error} if the request fails or returns an HTTP error status
   */
  async fetchDOMModel() {
    try {
      const res = await fetch(this.url);
      // node-fetch resolves on HTTP errors (404/500/...); without this
      // check we would happily parse an error page as a recipe
      if (!res.ok) {
        throw new Error(`HTTP ${res.status}`);
      }
      const html = await res.text();
      return cheerio.load(html);
    } catch (err) {
      this.defaultError();
    }
  }

  /**
   * Handles the workflow for fetching a recipe
   * @returns {object} - an object representing the recipe
   */
  async fetchRecipe() {
    this.checkUrl();
    const $ = await this.fetchDOMModel();
    this.createRecipeObject();
    this.scrape($);
    return this.validateRecipe();
  }

  /**
   * Abstract method; subclasses must override
   * @param {object} $ - cheerio instance
   * @returns {object} - an object representing the recipe
   */
  scrape($) {
    throw new Error("scrape is not defined in BaseScraper");
  }

  /**
   * Convenience: the trimmed text content of a cheerio element
   * @param {object} el - cheerio element
   * @returns {string}
   */
  textTrim(el) {
    return el.text().trim();
  }

  /**
   * Validates scraped recipes against defined recipe schema
   * @returns {object} - an object representing the recipe
   * @throws {Error} if the scraped recipe fails schema validation
   */
  validateRecipe() {
    const res = validate(this.recipe, recipeSchema);
    if (!res.valid) {
      this.defaultError();
    }
    return this.recipe;
  }
}

module.exports = BaseScraper;
92 changes: 92 additions & 0 deletions helpers/PuppeteerScraper.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"use strict";

const puppeteer = require("puppeteer");
const cheerio = require("cheerio");

// Resource types puppeteer should abort: recipe scraping only needs the
// document html, so heavy assets are dropped to speed up page loads.
// (Previous version listed "font" twice.)
const blockedResourceTypes = [
  "image",
  "media",
  "font",
  "texttrack",
  "object",
  "beacon",
  "csp_report",
  "imageset",
  "stylesheet"
];

// URL substrings identifying tracking/ad/analytics requests worth skipping;
// matched with indexOf against the request url (so "google" also covers
// google-analytics / googletagmanager).
const skippedResources = [
  "quantserve",
  "adzerk",
  "doubleclick",
  "adition",
  "exelator",
  "sharethrough",
  "cdn.api.twitter",
  "google-analytics",
  "googletagmanager",
  "google",
  "fontawesome",
  "facebook",
  "analytics",
  "optimizely",
  "clicktale",
  "mixpanel",
  "zedo",
  "clicksor",
  "tiqcdn"
];

const BaseScraper = require("./BaseScraper");

/**
* Inheritable class which uses puppeteer instead of a simple http request
*/
/**
 * Inheritable class which uses puppeteer instead of a simple http request,
 * for sites that only render their recipe data client-side
 */
class PuppeteerScraper extends BaseScraper {
  /**
   * Hook for subclasses: wait for some condition on the page (e.g. a
   * selector appearing) before the html is captured. Default is a no-op.
   * @param {object} page - puppeteer Page
   * @returns {Promise<boolean>}
   */
  async customPoll(page) {
    return true;
  }

  /**
   * @override
   * Fetches html from url using puppeteer headless browser
   * @returns {object} - Cheerio instance
   * @throws {Error} if navigation fails or returns an HTTP error status
   */
  async fetchDOMModel() {
    const browser = await puppeteer.launch({
      headless: true
    });
    try {
      const page = await browser.newPage();
      await page.setRequestInterception(true);

      // Abort heavy/tracking requests to keep page loads fast.
      // Use the public request API (req.url(), req.resourceType()) rather
      // than private fields like req._url.
      page.on("request", req => {
        const requestUrl = req.url().split("?")[0].split("#")[0];
        if (
          blockedResourceTypes.indexOf(req.resourceType()) !== -1 ||
          skippedResources.some(resource => requestUrl.indexOf(resource) !== -1)
        ) {
          req.abort();
        } else {
          req.continue();
        }
      });

      const response = await page.goto(this.url);
      // page.goto may resolve with null (e.g. same-url navigation); treat
      // that and HTTP error statuses as "no recipe found"
      if (!response || response.status() >= 400) {
        this.defaultError();
      }
      await this.customPoll(page);
      const html = await page.content();
      return cheerio.load(html);
    } finally {
      // Always release the browser, even when an error was thrown above
      await browser.close().catch(() => {});
    }
  }
}

module.exports = PuppeteerScraper;
20 changes: 20 additions & 0 deletions helpers/Recipe.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/**
 * Plain data holder for a scraped recipe. Every field starts out empty so
 * individual scrapers can fill in whatever their page provides; strings
 * stay "" and list fields stay [] when a value is missing.
 */
class Recipe {
  constructor() {
    this.name = "";
    this.ingredients = [];
    this.instructions = [];
    this.tags = [];
    // timing breakdown; each entry is a free-form string such as "15 minutes"
    const timeKeys = ["prep", "cook", "active", "inactive", "ready", "total"];
    this.time = Object.fromEntries(timeKeys.map(key => [key, ""]));
    this.servings = "";
    this.image = "";
  }
}

module.exports = Recipe;
43 changes: 43 additions & 0 deletions helpers/RecipeSchema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"description": "A recipe scraped from the web",
"type": "object",
"required": ["name", "ingredients", "instructions"],
"properties": {
"name": {
"type": "string",
"minLength": 1
},
"ingredients": {
"type": "array",
"minItems": 1,
"items": { "type": "string" }
},
"instructions": {
"type": "array",
"minItems": 1,
"uniqueItems": true,
"items": { "type": "string" }
},
"tags": {
"type": "array",
"uniqueItems": true,
"items": { "type": "string" }
},
"time": {
"type": "object",
"properties": {
"prep": { "type": "string" },
"cook": { "type": "string" },
"active": { "type": "string" },
"inactive": { "type": "string" },
"ready": { "type": "string" },
"total": { "type": "string" }
}
},
"servings": {
"type": { "type": "string" }
},
"image": { "type": "string" }
}
}
69 changes: 69 additions & 0 deletions helpers/ScraperFactory.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"use strict";

const parseDomain = require("parse-domain");

// Lookup table mapping a site's registrable domain — as reported by the
// parse-domain package's `domain` field (e.g. "allrecipes" for
// www.allrecipes.com) — to the scraper class for that site.
// NOTE(review): keys must match parse-domain output exactly; the scraper
// file names use different casing, so entries cannot be generated
// mechanically from the keys.
const domains = {
  "101cookbooks": require("../scrapers/101CookbooksScraper"),
  allrecipes: require("../scrapers/AllRecipesScraper"),
  ambitiouskitchen: require("../scrapers/AmbitiousKitchenScraper"),
  averiecooks: require("../scrapers/AverieCooksScraper"),
  bbc: require("../scrapers/BbcScraper"),
  bbcgoodfood: require("../scrapers/BbcGoodFoodScraper"),
  bonappetit: require("../scrapers/BonAppetitScraper"),
  budgetbytes: require("../scrapers/BudgetBytesScraper"),
  centraltexasfoodbank: require("../scrapers/CentralTexasFoodBankScraper"),
  closetcooking: require("../scrapers/ClosetCookingScraper"),
  cookieandkate: require("../scrapers/CookieAndKateScraper"),
  copykat: require("../scrapers/CopyKatScraper"),
  damndelicious: require("../scrapers/DamnDeliciousScraper"),
  eatingwell: require("../scrapers/EatingWellScraper"),
  epicurious: require("../scrapers/EpicuriousScraper"),
  food: require("../scrapers/FoodScraper"),
  foodandwine: require("../scrapers/FoodAndWineScraper"),
  foodnetwork: require("../scrapers/FoodNetworkScraper"),
  gimmedelicious: require("../scrapers/GimmeDeliciousScraper"),
  gimmesomeoven: require("../scrapers/GimmeSomeOvenScraper"),
  julieblanner: require("../scrapers/JulieBlannerScraper"),
  kitchenstories: require("../scrapers/KitchenStoriesScraper"),
  melskitchencafe: require("../scrapers/MelsKitchenCafeScraper"),
  minimalistbaker: require("../scrapers/MinimalistBakerScraper"),
  myrecipes: require("../scrapers/MyRecipesScraper"),
  nomnompaleo: require("../scrapers/NomNomPaleoScraper"),
  omnivorescookbook: require("../scrapers/OmnivoresCookbookScraper"),
  pinchofyum: require("../scrapers/PinchOfYumScraper"),
  recipetineats: require("../scrapers/RecipeTinEatsScraper"),
  seriouseats: require("../scrapers/SeriousEatsScraper"),
  simplyrecipes: require("../scrapers/SimplyRecipesScraper"),
  smittenkitchen: require("../scrapers/SmittenKitchenScraper"),
  tastesbetterfromscratch: require("../scrapers/TastesBetterFromScratchScraper"),
  tasteofhome: require("../scrapers/TasteOfHomeScraper"),
  theblackpeppercorn: require("../scrapers/TheBlackPeppercornScraper"),
  thepioneerwoman: require("../scrapers/ThePioneerWomanScraper"),
  therecipecritic: require("../scrapers/TheRecipeCriticScraper"),
  therealfoodrds: require("../scrapers/TheRealFoodDrsScraper"),
  thespruceeats: require("../scrapers/TheSpruceEatsScraper"),
  whatsgabycooking: require("../scrapers/WhatsGabyCookingScraper"),
  woolworths: require("../scrapers/WoolworthsScraper"),
  yummly: require("../scrapers/YummlyScraper")
};

/**
* A Factory that supplies an instance of a scraper based on a given URL
*/
/**
 * A Factory that supplies an instance of a scraper based on a given URL
 */
class ScraperFactory {
  /**
   * Resolves the scraper class registered for the url's domain and
   * returns a new instance of it bound to the url.
   * @param {string} url - recipe page url
   * @returns {object} - scraper instance for the url's site
   * @throws {Error} if the domain cannot be parsed or is unsupported
   */
  getScraper(url) {
    const parsed = parseDomain(url);
    if (!parsed) {
      throw new Error("Failed to parse domain");
    }
    const ScraperClass = domains[parsed.domain];
    if (ScraperClass === undefined) {
      throw new Error("Site not yet supported");
    }
    return new ScraperClass(url);
  }
}

module.exports = ScraperFactory;
Loading

0 comments on commit 81104df

Please sign in to comment.