const _ = require('lodash')
const Url = require('url')
const pdfParse = require('pdf-parse')
const requestPage = require('./lib/request')
const renderPageChrome = require('./lib/render-chrome')
const cookie = require('./lib/cookie')
const analyze = require('./lib/analyze')
// Extensions treated as downloadable media rather than renderable HTML
const imageExtensions = ['gif', 'png', 'svg', 'ico', 'jpg', 'jpeg']
const musicExtensions = ['mp3', 'wav', 'aiff']
const videoExtensions = ['avi', 'mpg', 'mpeg', 'mp4']
const pdfExtensions = ['pdf']
// Fallback values for per-call request options
const defaultReqOptions = {
timeout: 6000,
maxRedirects: 30,
userAgent: 'Schenkerianbot/1.0 (+https://github.com/mix/schenkerian)',
ignoreHTTPSErrors: false
}
// Collapses runs of whitespace (used to flatten extracted PDF text)
const RE_SPACES = /\s+/g
/**
 * schenkerian scrapes and analyzes URLs
 * Acceptable options:
 * - url: a string with full protocol and domain
 * - body (optional): HTML text; when provided it is analyzed directly and no request is made
 * - tokens (optional): map of cookie names to values, applied to both the request jar and Chrome cookies
 * - returnSource (optional): include the boilerplate-free HTML in the result
 * - forceRequest (optional): fetch with a plain request and return the raw body without analysis
 * - fallbackRequest (optional): if Chrome rendering fails, retry with a plain request
 * - agent (optional):
 *   - agentClass: agent class for use by the request library
 *   - socksHost: SOCKS proxy host
 *   - socksPort: SOCKS proxy port
 *
 * @param options
 * @returns {Promise<Object>}
 */
module.exports = function (options) {
const { url, tokens, body, returnSource } = options
if (tokens) {
_.merge(options, {
jar: cookie.jarForRequest(tokens, url),
cookies: cookie.chromeCookies(tokens, url)
})
}
if (body) {
return analyze(url, body, returnSource)
}
return retrieveContent(url, options)
}
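// Usage sketch (hypothetical caller code, not part of this module), assuming
// the package is required as `schenkerian`; the promise resolves with fields
// such as title, image, url, and (with returnSource) source:
//
//   const schenkerian = require('schenkerian')
//   schenkerian({ url: 'https://example.com', returnSource: true })
//     .then(res => console.log(res.title, res.url))
//     .catch(console.error)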
function retrieveContent(url, options) {
const { forceRequest, agent } = options
let requestOptions = _.defaults(
_.pick(options, ['url', 'timeout', 'userAgent', 'jar', 'cookies', 'ignoreHTTPSErrors']),
defaultReqOptions
)
if (agent) {
const { agentClass, socksHost, socksPort } = agent
_.merge(requestOptions, {
agentClass,
agentOptions: {
socksHost,
socksPort
}
})
}
if (isMedia(url) || forceRequest) {
return requestPage(_.merge({
url
}, requestOptions))
.then(results => {
// Media content can't be analyzed
// It does not contain html to be processed so just return the source
// Treat forceRequest flags the same way
return {
title: url,
image: url,
source: results.body,
url: results.url
}
})
}
if (isPDF(url)) {
  // encoding: null asks for the raw bytes so pdf-parse receives a Buffer
  return requestPage(_.merge({
url,
encoding: null
}, requestOptions))
.then(results => {
return pdfParse(Buffer.from(results.body, 'utf8'))
.then(pdfData => {
const { text, info } = pdfData
const source = text ? text.replace(RE_SPACES, ' ') : ''
const title = _.get(info, 'Title', url)
const description = _.get(info, 'Subject', url)
return {
source,
title,
description,
image: url,
url: results.url
}
})
})
}
return renderAndAnalyze(url, options, requestOptions)
}
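// Sketch of the agent option for routing requests through a SOCKS proxy.
// The values below are illustrative; `socks5-http-client/lib/Agent` is one
// agent class the request library accepts via agentClass, but any compatible
// class should work:
//
//   schenkerian({
//     url: 'https://example.com',
//     agent: {
//       agentClass: require('socks5-http-client/lib/Agent'),
//       socksHost: '127.0.0.1',
//       socksPort: 1080
//     }
//   })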
function renderAndAnalyze(url, options, requestOptions) {
const { fallbackRequest, returnSource } = options
  // Render with headless Chrome; on failure, optionally fall back to a plain request
  return renderPageChrome(url, requestOptions)
.catch(err => {
if (fallbackRequest) {
return requestPage(_.merge({
url
}, requestOptions))
}
throw err
})
.then(results => {
return analyze(results.url, results.body, returnSource)
.then(res =>
_.merge({
url: results.url
}, res)
)
})
}
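// A render failure normally rejects; with fallbackRequest set, the URL is
// re-fetched with a plain HTTP request and that body is analyzed instead
// (sketch, hypothetical caller):
//
//   schenkerian({ url: 'https://example.com', fallbackRequest: true })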
// Extracts the lowercased file extension from a URL's path, so matching is
// case-insensitive (e.g. '.PNG' is treated like '.png')
function extensionOf(url) {
  return Url.parse(url).pathname.split('.').pop().toLowerCase()
}
function isMedia(url) {
  const extension = extensionOf(url)
  return imageExtensions.includes(extension) || musicExtensions.includes(extension)
    || videoExtensions.includes(extension)
}
function isPDF(url) {
  return pdfExtensions.includes(extensionOf(url))
}
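// Extension matching is based on the last dot segment of the URL path, e.g.:
//   isMedia('https://example.com/logo.png')  // true  -> returned as-is
//   isMedia('https://example.com/page.html') // false -> rendered and analyzed
//   isPDF('https://example.com/paper.pdf')   // true  -> parsed with pdf-parse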