From 7c2bfb82d02b75ca851bc616231b55c8ad499196 Mon Sep 17 00:00:00 2001 From: Sreedhar M B Date: Sat, 16 Sep 2017 15:38:45 +0530 Subject: [PATCH 1/3] Add files via upload --- crawler.js | 69 +++++++++++++++++++++++++++++++++++++++++++++++----- package.json | 4 ++- 2 files changed, 66 insertions(+), 7 deletions(-) diff --git a/crawler.js b/crawler.js index b248dbc..46cfaaf 100644 --- a/crawler.js +++ b/crawler.js @@ -1,9 +1,64 @@ /** - * Created by tushar on 13/09/17. + * Created by Sreedhar M B on 15/09/17. */ 'use strict' +const request = require('request'); +const cheerio = require('cheerio'); +const fs = require('fs'); + +const siteURL = "http://localhost:8080"; + +let linksMap = {}; +let linksVisitedMap = {}; +let strLexElem; +let siteVisitCount = 0; + +const getSiteDetails = (nextSiteHash, resolveFunc, callback) => { + let nextSite = siteURL + nextSiteHash; + siteVisitCount++; + request(nextSite, function(error, response, body) { + if(error) { + console.log("Error: " + error); + } + + console.log(nextSite); + console.log("Status code: " + response.statusCode); + let $ = cheerio.load(body); + + $('a.link').each(function( index ) { + let linkElement = $(this).attr('href'); + if(!linksVisitedMap[linkElement]) { + linksMap[linkElement] = linkElement; + } + }); + + $('h1').each(function( index ) { + let strElem = $(this).text().trim(); + strLexElem = strLexElem ? (strElem < strLexElem) ? strElem : strLexElem : strElem; + }); + + + while(Object.keys(linksMap).length) { + let nextPageHash = linksMap[Object.keys(linksMap)[0]]; + getSiteDetails(nextPageHash, resolveFunc, (strLexElem) => { + console.log(strLexElem); + console.log(siteVisitCount); + siteVisitCount--; + if(siteVisitCount == 1) { + resolveFunc(strLexElem); + } + }); + linksVisitedMap[nextPageHash] = nextPageHash; + delete linksMap[nextPageHash]; + } + + callback(strLexElem); + }); +}; + + /** * Crawls a website using a start {url}, and returns the lexicographically smallest string. * @param url @@ -11,8 +66,10 @@ */ module.exports = url => new Promise((resolve, reject) => { - /** - * TODO: Write your high performance code here. - */ - reject(new Error('NotImplemented')) - }) + getSiteDetails('', resolve, (strLexElem) => { + console.log(strLexElem); + }); + + + // reject(new Error('NotImplemented')) + }); diff --git a/package.json b/package.json index 4795255..aa340f7 100644 --- a/package.json +++ b/package.json @@ -10,10 +10,12 @@ "author": "", "license": "ISC", "dependencies": { + "cheerio": "^1.0.0-rc.2", "express": "^4.15.4", "express-rate-limit": "^2.9.0", "mocha": "^3.5.3", "nodemon": "^1.12.0", - "pug": "^2.0.0-rc.4" + "pug": "^2.0.0-rc.4", + "request": "^2.81.0" } } From 6bff68d273b2b7a3b44b9835bdeb1409dcfab3f2 Mon Sep 17 00:00:00 2001 From: Sreedhar M B Date: Sat, 16 Sep 2017 16:05:38 +0530 Subject: [PATCH 2/3] Updated --- crawler.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crawler.js b/crawler.js index 46cfaaf..d3dacc3 100644 --- a/crawler.js +++ b/crawler.js @@ -8,7 +8,7 @@ const request = require('request'); const cheerio = require('cheerio'); const fs = require('fs'); -const siteURL = "http://localhost:8080"; +let siteURL = "http://localhost:8080"; let linksMap = {}; let linksVisitedMap = {}; @@ -66,6 +66,7 @@ const getSiteDetails = (nextSiteHash, resolveFunc, callback) => { */ module.exports = url => new Promise((resolve, reject) => { + siteURL = url; getSiteDetails('', resolve, (strLexElem) => { console.log(strLexElem); }); From 9147b95382f9a3b398e9eeb81403329ddb482af3 Mon Sep 17 00:00:00 2001 From: Sreedhar M B Date: Sat, 16 Sep 2017 16:21:17 +0530 Subject: [PATCH 3/3] Remove Console logs --- crawler.js | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/crawler.js b/crawler.js index d3dacc3..8cc9932 100644 --- a/crawler.js +++ b/crawler.js @@ -23,8 +23,8 @@ const getSiteDetails = (nextSiteHash, resolveFunc, callback) => { console.log("Error: " + error); } - console.log(nextSite); - console.log("Status code: " + response.statusCode); + // console.log(nextSite); + // console.log("Status code: " + response.statusCode); let $ = cheerio.load(body); $('a.link').each(function( index ) { @@ -43,8 +43,8 @@ const getSiteDetails = (nextSiteHash, resolveFunc, callback) => { while(Object.keys(linksMap).length) { let nextPageHash = linksMap[Object.keys(linksMap)[0]]; getSiteDetails(nextPageHash, resolveFunc, (strLexElem) => { - console.log(strLexElem); - console.log(siteVisitCount); + // console.log(strLexElem); + // console.log(siteVisitCount); siteVisitCount--; if(siteVisitCount == 1) { resolveFunc(strLexElem); @@ -68,7 +68,7 @@ module.exports = url => new Promise((resolve, reject) => { siteURL = url; getSiteDetails('', resolve, (strLexElem) => { - console.log(strLexElem); + // console.log(strLexElem); });