A web page content extractor for news websites.
npm install html-article-extractor
var htmlArticleExtractor = require("html-article-extractor");
var { JSDOM } = require("jsdom");
var dom = new JSDOM("...");
var body = dom.window.document.body;
var result = htmlArticleExtractor(body);
console.log(result);
Outputs:
{
html: '<div>contents</div>',
text: 'contents'
}
git clone https://github.com/jungyoun/html-article-extractor
cd html-article-extractor
npm install
node example/crawler.js