-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* dependencies * updated web-index * web-index * bug fix * bug fix * verify hashes * add web-index to CI * no echos for verify.sh * change hashing method to md5sum * web_index python script to shell * verify.sh --------- Co-authored-by: Zhuoxuan Zhang <[email protected]> Co-authored-by: Evangelos Lamprou <[email protected]>
- Loading branch information
1 parent
9fb3b2c
commit 938769f
Showing
29 changed files
with
1,912 additions
and
1,956 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# Pair every stdin line with the line that follows it.
# stdin is copied into FIFO s2 by tee; sed drops the LAST line of that copy
# and feeds FIFO s3, while the main pipeline drops the FIRST line. paste then
# joins the two streams line by line (s3 on the left, the shifted stream on
# the right).
( mkfifo s2 > /dev/null )
( mkfifo s3 > /dev/null )

# Background reader: must be started before tee opens s2, otherwise the
# FIFO open would block the pipeline.
sed '$d' s2 > s3 &
tee s2 |
    # `tail +2` is the obsolete offset syntax; `-n +2` is the portable form
    # for "start output at line 2".
    tail -n +2 |
    paste s3 -
rm s2
rm s3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
#!/bin/bash

# Remove generated artifacts from the current directory: the tmp, articles,
# node_modules and output trees plus the n-gram, index and tarball files.
# `--` ends option parsing so a glob match beginning with '-' is treated as
# a file name rather than an option to rm.
rm -rf -- tmp articles node_modules output *grams *grams.txt *index*.txt *tar.gz
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
#!/usr/bin/env bash
set -e

# 7zip and the other command-line tools, installed through apt when any of
# them is not yet registered with dpkg.
required_pkgs=(p7zip-full curl wget nodejs unzip npm)
if ! dpkg -s "${required_pkgs[@]}" >/dev/null 2>&1; then
    sudo apt-get install "${required_pkgs[@]}" -y
    echo 'Packages Installed'
fi

if ! dpkg -s pandoc >/dev/null 2>&1; then
    # since pandoc v.2.2.1 does not support arm64, we use v.3.5
    # The release asset name embeds the dpkg architecture of this machine.
    pandoc_deb="pandoc-3.5-1-$(dpkg --print-architecture).deb"
    wget "https://github.com/jgm/pandoc/releases/download/3.5/${pandoc_deb}"
    sudo dpkg -i "./${pandoc_deb}"
    rm "./${pandoc_deb}"
fi

# NOTE(review): nodejs is already part of the package list above, so this
# branch looks unreachable once that step has succeeded - confirm intent.
if ! dpkg -s nodejs >/dev/null 2>&1; then
    # node version 18+ does not need external npm
    curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash -
    sudo apt-get install -y nodejs
fi

npm install
# Install the npm packages
npm install natural
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# Read one HTML file path per line from stdin and write each file's
# plain-text rendering to stdout.
while read -r line
do
    # The path is quoted so names containing spaces or glob characters
    # (*, ?, [) reach cat as a single literal argument.
    cat "$line" |
        # Transliterate to ASCII; -c drops characters that cannot be converted.
        iconv -c -t ascii//TRANSLIT |
        # +RTS -K64m -RTS are runtime-system options passed to pandoc itself.
        pandoc +RTS -K64m -RTS --from html --to plain --quiet
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
#!/usr/bin/env node | ||
// TODO: use node's URL to parse and emit a URL in normal form | ||
// URL validation as a stream transformer | ||
// | ||
// Contains code by Diego Perini, as compared in | ||
// http://mathiasbynens.be/demo/url-regex | ||
// | ||
// Notes on possible differences from a standard/generic validation: | ||
// | ||
// - the UTF-8 character class takes the full Unicode range into consideration | ||
// - TLDs have been made mandatory, so single names like "localhost" fail | ||
// - protocols have been restricted to ftp, http and https only as requested | ||
|
||
// Anchored, case-insensitive validator for a complete web URL string.
// The pattern is assembled from commented fragments and joined without a
// separator, so the resulting source is the plain concatenation of the parts.
// NOTE(review): nothing else in this script appears to reference re_weburl;
// the stdin line handler uses nregex() instead - confirm before removing.
const re_weburl = new RegExp(
  [
    "^",
    // protocol identifier (optional)
    // short syntax // still required
    "(?:(?:(?:https?|ftp):)?\\/\\/)",
    // user:pass BasicAuth (optional)
    "(?:\\S+(?::\\S*)?@)?",
    "(?:",
    // IP address exclusion
    // private & local networks
    "(?!(?:10|127)(?:\\.\\d{1,3}){3})",
    "(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})",
    "(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})",
    // IP address dotted notation octets
    // excludes loopback network 0.0.0.0
    // excludes reserved space >= 224.0.0.0
    // excludes network & broadcast addresses
    // (first & last IP address of each class)
    "(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])",
    "(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}",
    "(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))",
    "|",
    // host & domain names, may end with dot
    // can be replaced by a shortest alternative
    // (?![-_])(?:[-\\w\\u00a1-\\uffff]{0,63}[^-_]\\.)+
    "(?:",
    "(?:",
    "[a-z0-9\\u00a1-\\uffff]",
    "[a-z0-9\\u00a1-\\uffff_-]{0,62}",
    ")?",
    "[a-z0-9\\u00a1-\\uffff]\\.",
    ")+",
    // TLD identifier name, may end with dot
    "(?:[a-z\\u00a1-\\uffff]{2,}\\.?)",
    ")",
    // port number (optional)
    "(?::\\d{2,5})?",
    // resource path (optional)
    "(?:[/?#]\\S*)?",
    "$",
  ].join(""),
  "i"
);
|
||
/**
 * Build a regular expression that matches URLs.
 *
 * @param {object} [options]
 * @param {boolean} [options.strict=true] When true the protocol (or a
 *   leading "www.") is required and any alphabetic TLD of two or more
 *   characters is accepted. When false the protocol is optional and the TLD
 *   must be one of the entries exported by ./tlds.
 * @param {boolean} [options.exact] When truthy the expression is anchored
 *   with ^...$ and has only the "i" flag; otherwise it is global ("ig").
 * @returns {RegExp} A freshly constructed expression.
 */
const nregex = (options) => {
  // Merge into a new object instead of reassigning the parameter.
  const opts = {
    strict: true,
    ...options,
  };

  const v4 = '(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)(?:\\.(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)){3}';
  const protocol = `(?:(?:[a-z]+:)?//)${opts.strict ? '' : '?'}`;
  const auth = '(?:\\S+(?::\\S*)?@)?';
  const ip = v4;
  const host = '(?:(?:[a-z\\u00a1-\\uffff0-9][-_]*)*[a-z\\u00a1-\\uffff0-9]+)';
  const domain = '(?:\\.(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)*';

  let tldNames;
  if (opts.strict) {
    tldNames = '(?:[a-z\\u00a1-\\uffff]{2,})';
  } else {
    // The TLD list is only needed in non-strict mode, so it is loaded lazily.
    // Sort a copy (longest first, so longer TLDs win the alternation) rather
    // than mutating the array shared through the module cache.
    const tlds = require('./tlds');
    tldNames = `(?:${[...tlds].sort((a, b) => b.length - a.length).join('|')})`;
  }
  const tld = `(?:\\.${tldNames})\\.?`;

  const port = '(?::\\d{2,5})?';
  const path = '(?:[/?#][^\\s"]*)?';
  const regex = `(?:${protocol}|www\\.)${auth}(?:localhost|${ip}|${host}${domain}${tld})${port}${path}`;

  return opts.exact ? new RegExp(`(?:^${regex}$)`, 'i') : new RegExp(regex, 'ig');
};
|
||
var readline = require('readline'); | ||
|
||
var rl = readline.createInterface({ | ||
input: process.stdin, | ||
output: process.stdout, | ||
terminal: false | ||
}); | ||
|
||
rl.on('line', function (line) { | ||
let r = line.match(nregex()); | ||
if (r) { | ||
for (let i = 0; i < r.length; i++) { | ||
//console.error(i);// (r[i]); | ||
console.log(r[i]); | ||
}; | ||
} else { | ||
console.log("pizza"); | ||
} | ||
// if (r) { | ||
// console.log(r.join('\n')); | ||
// } | ||
}); | ||
|
||
// console.log('foo http://github.com bar //google.com'.match(nregex())); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
b7006f6d425233137811f16eeb6ca668 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
a48e86700b02c50651e8d4b09a73170c |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
73310ad60a0d2d50d805901c481a5dbc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
#!/bin/bash

# Download and unpack the Wikipedia article dataset.
# Usage: pass --small as the first argument for the 1000-entry dataset;
# any other invocation selects the full dataset.
# Environment: BENCH_TOP (defaults to the git top-level directory) and
# RESOURCES_DIR (defaults to $BENCH_TOP/web-index) select the target directory.

BENCH_TOP=${BENCH_TOP:-$(git rev-parse --show-toplevel)}
RESOURCES_DIR=${RESOURCES_DIR:-$BENCH_TOP/web-index}

mkdir -p "$RESOURCES_DIR"

if [ "$1" = "--small" ]; then
    # 1000 entries
    dataset_desc="small dataset."
    base_url="https://atlas-group.cs.brown.edu/data/wikipedia/input_small"
    archive="$RESOURCES_DIR/wikipedia-small.tar.gz"
    index="$RESOURCES_DIR/index_small.txt"
else
    # full dataset
    dataset_desc="full dataset. Caution!! Extracted size >200GB"
    base_url="https://atlas-group.cs.brown.edu/data/wikipedia/input"
    archive="$RESOURCES_DIR/wikipedia.tar.gz"
    index="$RESOURCES_DIR/index.txt"
fi

# fetch URL DEST: download URL to DEST unless DEST already exists.
# The download goes to DEST.part first and is renamed only on success, so an
# interrupted transfer is never mistaken for a complete file on the next run.
fetch() {
    local url=$1 dest=$2
    [[ -f "$dest" ]] && return 0
    # NOTE(review): --no-check-certificate disables TLS verification; kept
    # from the original script, but it should be dropped if the host's
    # certificate validates.
    if ! wget -O "$dest.part" "$url" --no-check-certificate; then
        rm -f "$dest.part"
        echo "Download failed: $url" >&2
        exit 1
    fi
    mv "$dest.part" "$dest"
}

if [[ ! -f "$archive" || ! -f "$index" ]]; then
    echo "Downloading the $dataset_desc"
    fetch "$base_url/articles.tar.gz" "$archive"
    fetch "$base_url/index.txt" "$index"
fi

if [[ ! -d "$RESOURCES_DIR/articles" ]]; then
    echo "Extracting the $dataset_desc"
    # Stop here on failure instead of reporting that the data is ready.
    tar -xf "$archive" -C "$RESOURCES_DIR" || exit 1
else
    echo "Did not extract data because of existing data."
    echo "Please rm -r $RESOURCES_DIR/articles manually and rerun this script."
fi

echo "Data is ready."
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
#!/bin/bash

# Pack the en/articles tree found under the base directory into
# en_articles.tar.gz, written inside that same base directory.

# Define the base directory
base_directory="input1000"

# Check if the base directory exists
if [ ! -d "$base_directory" ]; then
    echo "Base directory does not exist: $base_directory"
    exit 1
fi

# Navigate to the base directory; abort rather than archive from the wrong
# place if the directory cannot be entered.
cd "$base_directory" || exit 1

# Create a tar archive of the en/articles directory and report success only
# when tar actually succeeded.
if ! tar -czvf en_articles.tar.gz en/articles; then
    echo "Failed to create archive: $(pwd)/en_articles.tar.gz" >&2
    exit 1
fi

echo "Archive created: $(pwd)/en_articles.tar.gz"
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
File renamed without changes.
Oops, something went wrong.