-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.js
55 lines (50 loc) · 1.47 KB
/
utils.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
const cheerio = require('cheerio');
/**
 * Convert an HTML page with lyrics to plain text.
 *
 * Every `div` whose class attribute starts with "Lyrics__Container" and
 * that has non-empty text contributes one block. Each block is trimmed and
 * followed by "\n\n" (including the last one).
 * @param {string} content HTML markup of the lyrics page
 * @return {string} plain-text lyrics; '' when no container has any text
 */
module.exports.getLyrics = function(content) {
  const $ = cheerio.load(content);
  let lyrics = '';
  $('div[class^="Lyrics__Container"]').each((i, elem) => {
    const container = $(elem);
    if (container.text().length === 0) {
      // Nothing to extract from this container; move on to the next one.
      return;
    }
    const snippet = container
        .html()
        // Turn every form of line break (<br>, <br/>, <br />, <BR>,
        // <br class="...">) into a newline. Matching only the literal
        // "<br>" would let the other spellings leak into the output.
        .replace(/<\s*br\b[^>]*>/gi, '\n')
        // All line breaks are gone by now, so every remaining tag is dropped.
        .replace(/<[^>]+>/g, '');
    // Load the tag-free markup into a detached <textarea> and read it back
    // as text (presumably to decode HTML entities such as &amp;).
    const plain = $('<textarea/>').html(snippet).text().trim();
    lyrics += `${plain}\n\n`;
  });
  return lyrics;
};
// Runs of Unicode punctuation, plus the symbols $ + < = > ^ ` | ~ and the
// em dash.
const PUNCTUATION_REGEX = /[\p{P}$+<=>^`|~—]+/gu;
// Runs of one or more whitespace characters.
const SPACE_REGEX = /\s+/g;

/**
 * Normalise a string: remove punctuation symbols, collapse each run of
 * whitespace into a single space, convert to lower case (locale-aware)
 * and trim both ends.
 * @param {string} str
 * @return {string}
 */
function sanitizeString(str) {
  const withoutPunctuation = str.replace(PUNCTUATION_REGEX, '');
  const singleSpaced = withoutPunctuation.replace(SPACE_REGEX, ' ');
  const lowered = singleSpaced.toLocaleLowerCase();
  return lowered.trim();
}
module.exports.sanitizeString = sanitizeString;
/**
 * Convert plain lyrics to rows.
 *
 * Blank (or whitespace-only) lines and title lines (lines whose first
 * non-space character is "[") are dropped. Each remaining line is returned
 * together with its sanitized form and its zero-based position among the
 * kept rows.
 * @param {string} lyrics
 * @return {Array<{row: string, sanitizeRow: string, index: number}>}
 */
module.exports.getLyricsRows = function(lyrics) {
  return lyrics
      // Accept both LF and CRLF endings so no row keeps a stray '\r'.
      .split(/\r?\n/)
      .filter((row) => {
        // Decide on the trimmed text so whitespace-only lines and indented
        // title lines are skipped as well.
        const trimmed = row.trim();
        return trimmed.length !== 0 && !trimmed.startsWith('[');
      })
      // index is assigned after filtering, so it counts kept rows only.
      .map((row, index) => ({row, sanitizeRow: sanitizeString(row), index}));
};