-
Notifications
You must be signed in to change notification settings - Fork 52
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #39 from jadkins89/refactor
2.0 refactor
- Loading branch information
Showing
147 changed files
with
3,053 additions
and
3,763 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
"use strict"; | ||
|
||
const fetch = require("node-fetch"); | ||
const cheerio = require("cheerio"); | ||
const { validate } = require("jsonschema"); | ||
|
||
const Recipe = require("./Recipe"); | ||
const recipeSchema = require("./RecipeSchema.json"); | ||
|
||
/** | ||
* Abstract Class which all scrapers inherit from | ||
*/ | ||
/**
 * Abstract base class that every site-specific scraper inherits from.
 * Subclasses implement scrape($) to populate this.recipe from the DOM.
 */
class BaseScraper {
  /**
   * @param {string} url - page to fetch the recipe from
   * @param {string} [subUrl=""] - substring the url is required to contain
   */
  constructor(url, subUrl = "") {
    this.url = url;
    this.subUrl = subUrl;
  }

  /**
   * Throws unless the url contains the required sub url.
   * @throws {Error} when the url does not include this.subUrl
   */
  checkUrl() {
    const matchesSite = this.url.includes(this.subUrl);
    if (!matchesSite) {
      throw new Error(`url provided must include '${this.subUrl}'`);
    }
  }

  /**
   * Builds a fresh Recipe instance for this scrape run.
   */
  createRecipeObject() {
    this.recipe = new Recipe();
  }

  /**
   * Shared failure path: any fetch/validation problem surfaces as this error.
   * @throws {Error} always
   */
  defaultError() {
    throw new Error("No recipe found on page");
  }

  /**
   * Reads the page image from common meta tags, first match wins.
   * @param {object} $ - a cheerio object representing a DOM
   * @returns {string|undefined} - if found, an image url (stored on recipe)
   */
  defaultSetImage($) {
    const imageSelectors = [
      "meta[property='og:image']",
      "meta[name='og:image']",
      "meta[itemprop='image']"
    ];
    let imageUrl;
    for (const selector of imageSelectors) {
      imageUrl = $(selector).attr("content");
      if (imageUrl) {
        break;
      }
    }
    this.recipe.image = imageUrl;
  }

  /**
   * Fetches html from url.
   * @returns {object} - Cheerio instance loaded with the page html
   * @throws {Error} generic "No recipe found on page" on any fetch failure
   */
  async fetchDOMModel() {
    try {
      const response = await fetch(this.url);
      const body = await response.text();
      return cheerio.load(body);
    } catch (err) {
      // NOTE(review): the underlying error is discarded by design here —
      // every network/parse failure is reported as the generic error.
      this.defaultError();
    }
  }

  /**
   * Handles the workflow for fetching a recipe:
   * url check -> fetch DOM -> scrape -> validate.
   * @returns {object} - an object representing the recipe
   */
  async fetchRecipe() {
    this.checkUrl();
    const dom = await this.fetchDOMModel();
    this.createRecipeObject();
    this.scrape(dom);
    return this.validateRecipe();
  }

  /**
   * Abstract method; subclasses must override.
   * @param {object} $ - cheerio instance
   * @throws {Error} always, in the base class
   */
  scrape($) {
    throw new Error("scrape is not defined in BaseScraper");
  }

  /**
   * Convenience helper: element text with surrounding whitespace removed.
   * @param {object} el - cheerio element wrapper
   * @returns {string}
   */
  textTrim(el) {
    const raw = el.text();
    return raw.trim();
  }

  /**
   * Validates the scraped recipe against the recipe JSON schema.
   * @returns {object} - an object representing the recipe
   * @throws {Error} when the recipe does not satisfy the schema
   */
  validateRecipe() {
    const result = validate(this.recipe, recipeSchema);
    if (!result.valid) {
      this.defaultError();
    }
    return this.recipe;
  }
}
|
||
module.exports = BaseScraper; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
"use strict"; | ||
|
||
const puppeteer = require("puppeteer"); | ||
const cheerio = require("cheerio"); | ||
|
||
const blockedResourceTypes = [ | ||
"image", | ||
"media", | ||
"font", | ||
"texttrack", | ||
"object", | ||
"beacon", | ||
"csp_report", | ||
"imageset", | ||
"stylesheet", | ||
"font" | ||
]; | ||
|
||
const skippedResources = [ | ||
"quantserve", | ||
"adzerk", | ||
"doubleclick", | ||
"adition", | ||
"exelator", | ||
"sharethrough", | ||
"cdn.api.twitter", | ||
"google-analytics", | ||
"googletagmanager", | ||
"google", | ||
"fontawesome", | ||
"facebook", | ||
"analytics", | ||
"optimizely", | ||
"clicktale", | ||
"mixpanel", | ||
"zedo", | ||
"clicksor", | ||
"tiqcdn" | ||
]; | ||
|
||
const BaseScraper = require("./BaseScraper"); | ||
|
||
/** | ||
* Inheritable class which uses puppeteer instead of a simple http request | ||
*/ | ||
/**
 * Inheritable class which uses puppeteer instead of a simple http request,
 * for sites that render their recipe content client side.
 */
class PuppeteerScraper extends BaseScraper {
  /**
   * Hook for subclasses to wait for client-side content (e.g. a selector)
   * before the DOM is captured. Default resolves immediately.
   * @param {object} page - puppeteer Page
   * @returns {Promise<boolean>}
   */
  async customPoll(page) {
    return true;
  }

  /**
   * @override
   * Fetches html from url using puppeteer headless browser.
   * Fixes: uses the public puppeteer API (req.url(), response.status())
   * instead of private _url/_status fields, and closes the browser in a
   * finally block so it is not leaked when navigation or polling throws.
   * @returns {object} - Cheerio instance
   * @throws {Error} via defaultError() when the page responds with >= 400
   */
  async fetchDOMModel() {
    const browser = await puppeteer.launch({
      headless: true
    });
    try {
      const page = await browser.newPage();
      await page.setRequestInterception(true);

      // Abort requests that are irrelevant to scraping (images, trackers,
      // ads, stylesheets, ...) to speed up page loads.
      page.on("request", req => {
        const requestUrl = req.url().split("?")[0].split("#")[0];
        if (
          blockedResourceTypes.indexOf(req.resourceType()) !== -1 ||
          skippedResources.some(resource => requestUrl.indexOf(resource) !== -1)
        ) {
          req.abort();
        } else {
          req.continue();
        }
      });

      const response = await page.goto(this.url);
      if (response.status() >= 400) {
        this.defaultError();
      }
      await this.customPoll(page);
      const html = await page.content();
      return cheerio.load(html);
    } finally {
      // Always release the browser, even on failure; swallow close errors.
      await browser.close().catch(() => {});
    }
  }
}
|
||
module.exports = PuppeteerScraper; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
/**
 * Mutable container for a scraped recipe. Field names and shapes mirror
 * RecipeSchema.json; scrapers fill these in and validation happens later.
 */
class Recipe {
  constructor() {
    this.name = "";
    this.ingredients = [];
    this.instructions = [];
    this.tags = [];
    // Timing fields are free-form strings, one per cooking phase.
    this.time = {};
    for (const phase of ["prep", "cook", "active", "inactive", "ready", "total"]) {
      this.time[phase] = "";
    }
    this.servings = "";
    this.image = "";
  }
}
|
||
module.exports = Recipe; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "description": "A recipe scraped from the web",
  "type": "object",
  "required": ["name", "ingredients", "instructions"],
  "properties": {
    "name": {
      "type": "string",
      "minLength": 1
    },
    "ingredients": {
      "type": "array",
      "minItems": 1,
      "items": { "type": "string" }
    },
    "instructions": {
      "type": "array",
      "minItems": 1,
      "uniqueItems": true,
      "items": { "type": "string" }
    },
    "tags": {
      "type": "array",
      "uniqueItems": true,
      "items": { "type": "string" }
    },
    "time": {
      "type": "object",
      "properties": {
        "prep": { "type": "string" },
        "cook": { "type": "string" },
        "active": { "type": "string" },
        "inactive": { "type": "string" },
        "ready": { "type": "string" },
        "total": { "type": "string" }
      }
    },
    "servings": {
      "type": "string"
    },
    "image": { "type": "string" }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
"use strict"; | ||
|
||
const parseDomain = require("parse-domain"); | ||
|
||
// Lookup table mapping a site's registrable domain (as returned by
// parse-domain, e.g. "allrecipes" for www.allrecipes.com) to the scraper
// class that handles that site. ScraperFactory.getScraper indexes into this.
const domains = {
  "101cookbooks": require("../scrapers/101CookbooksScraper"),
  allrecipes: require("../scrapers/AllRecipesScraper"),
  ambitiouskitchen: require("../scrapers/AmbitiousKitchenScraper"),
  averiecooks: require("../scrapers/AverieCooksScraper"),
  bbc: require("../scrapers/BbcScraper"),
  bbcgoodfood: require("../scrapers/BbcGoodFoodScraper"),
  bonappetit: require("../scrapers/BonAppetitScraper"),
  budgetbytes: require("../scrapers/BudgetBytesScraper"),
  centraltexasfoodbank: require("../scrapers/CentralTexasFoodBankScraper"),
  closetcooking: require("../scrapers/ClosetCookingScraper"),
  cookieandkate: require("../scrapers/CookieAndKateScraper"),
  copykat: require("../scrapers/CopyKatScraper"),
  damndelicious: require("../scrapers/DamnDeliciousScraper"),
  eatingwell: require("../scrapers/EatingWellScraper"),
  epicurious: require("../scrapers/EpicuriousScraper"),
  food: require("../scrapers/FoodScraper"),
  foodandwine: require("../scrapers/FoodAndWineScraper"),
  foodnetwork: require("../scrapers/FoodNetworkScraper"),
  gimmedelicious: require("../scrapers/GimmeDeliciousScraper"),
  gimmesomeoven: require("../scrapers/GimmeSomeOvenScraper"),
  julieblanner: require("../scrapers/JulieBlannerScraper"),
  kitchenstories: require("../scrapers/KitchenStoriesScraper"),
  melskitchencafe: require("../scrapers/MelsKitchenCafeScraper"),
  minimalistbaker: require("../scrapers/MinimalistBakerScraper"),
  myrecipes: require("../scrapers/MyRecipesScraper"),
  nomnompaleo: require("../scrapers/NomNomPaleoScraper"),
  omnivorescookbook: require("../scrapers/OmnivoresCookbookScraper"),
  pinchofyum: require("../scrapers/PinchOfYumScraper"),
  recipetineats: require("../scrapers/RecipeTinEatsScraper"),
  seriouseats: require("../scrapers/SeriousEatsScraper"),
  simplyrecipes: require("../scrapers/SimplyRecipesScraper"),
  smittenkitchen: require("../scrapers/SmittenKitchenScraper"),
  tastesbetterfromscratch: require("../scrapers/TastesBetterFromScratchScraper"),
  tasteofhome: require("../scrapers/TasteOfHomeScraper"),
  theblackpeppercorn: require("../scrapers/TheBlackPeppercornScraper"),
  thepioneerwoman: require("../scrapers/ThePioneerWomanScraper"),
  therecipecritic: require("../scrapers/TheRecipeCriticScraper"),
  therealfoodrds: require("../scrapers/TheRealFoodDrsScraper"),
  thespruceeats: require("../scrapers/TheSpruceEatsScraper"),
  whatsgabycooking: require("../scrapers/WhatsGabyCookingScraper"),
  woolworths: require("../scrapers/WoolworthsScraper"),
  yummly: require("../scrapers/YummlyScraper")
};
|
||
/** | ||
* A Factory that supplies an instance of a scraper based on a given URL | ||
*/ | ||
/**
 * A Factory that supplies an instance of a scraper based on a given URL.
 */
class ScraperFactory {
  /**
   * @param {string} url - full recipe page url
   * @returns {object} - a scraper instance matching the url's domain
   * @throws {Error} when the domain cannot be parsed or is unsupported
   */
  getScraper(url) {
    const parsed = parseDomain(url);
    if (!parsed) {
      throw new Error("Failed to parse domain");
    }
    const ScraperClass = domains[parsed.domain];
    if (ScraperClass === undefined) {
      throw new Error("Site not yet supported");
    }
    return new ScraperClass(url);
  }
}
|
||
module.exports = ScraperFactory; |
Oops, something went wrong.