just web-index (#31)
* dependencies

* updated web-index

* web-index

* bug fix

* bug fix

* verify hashes

* add web-index to CI

* no echoes in verify.sh

* change hashing method to md5sum

* port the web_index Python script to shell

* verify.sh

---------

Co-authored-by: Zhuoxuan Zhang <[email protected]>
Co-authored-by: Evangelos Lamprou <[email protected]>
3 people authored Nov 11, 2024
1 parent 9fb3b2c commit 938769f
Showing 29 changed files with 1,912 additions and 1,956 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -12,7 +12,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- benchmark: [nlp, file-enc, unix50, log-analysis, max-temp, uniq-ips, media-conv, sklearn, covid-mts, riker, oneliners]
+ benchmark: [nlp, file-enc, unix50, log-analysis, max-temp, uniq-ips, media-conv, sklearn, covid-mts, riker, oneliners, web-index]

steps:
- name: Checkout code
9 changes: 9 additions & 0 deletions web-index/bigrams_aux.sh
@@ -0,0 +1,9 @@
# Emit bigrams: duplicate the word stream through FIFOs, offset one copy
# by a line, and paste the two copies side by side.
( mkfifo s2 2> /dev/null ) ;
( mkfifo s3 2> /dev/null ) ;

sed '$d' s2 > s3 &   # s3 carries words 1..n-1
tee s2 |             # full stream goes into s2; stdout continues below
tail -n +2 |         # words 2..n
paste s3 -           # pair word i with word i+1
rm s2
rm s3
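
For a sense of what this helper does, here is a minimal run (assuming the snippet is saved as bigrams_aux.sh and fed one word per line; output columns are tab-separated):

$ printf 'hello\nworld\nagain\n' | bash bigrams_aux.sh
hello	world
world	again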
3 changes: 3 additions & 0 deletions web-index/cleanup.sh
@@ -0,0 +1,3 @@
#!/bin/bash

rm -rf tmp articles node_modules output *grams *grams.txt *index*.txt *tar.gz
25 changes: 25 additions & 0 deletions web-index/deps.sh
@@ -0,0 +1,25 @@
#!/usr/bin/env bash
set -e

# Core packages: 7zip plus the download and Node.js tooling
pkgs='p7zip-full curl wget nodejs unzip npm'
if ! dpkg -s $pkgs >/dev/null 2>&1 ; then
  sudo apt-get install $pkgs -y
  echo 'Packages Installed'
fi

if ! dpkg -s pandoc > /dev/null 2>&1 ; then
  # since pandoc v2.2.1 does not support arm64, we use v3.5
  wget https://github.com/jgm/pandoc/releases/download/3.5/pandoc-3.5-1-$(dpkg --print-architecture).deb
  sudo dpkg -i ./pandoc-3.5-1-$(dpkg --print-architecture).deb
  rm ./pandoc-3.5-1-$(dpkg --print-architecture).deb
fi

if ! dpkg -s nodejs > /dev/null 2>&1 ; then
  # Node.js 18+ bundles npm, so no separate npm package is needed
  curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash -
  sudo apt-get install -y nodejs
fi

# Install the npm packages
npm install
npm install natural
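
After the script completes, a plain sanity check of the installed toolchain (standard version flags only, nothing benchmark-specific) can look like:

$ pandoc --version | head -n 1
$ node --version
$ npm ls natural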
6 changes: 6 additions & 0 deletions web-index/extract_text.sh
@@ -0,0 +1,6 @@
# Read one HTML file path per line, strip non-ASCII, and render to plain text.
while read -r line
do
  cat "$line" |
    iconv -c -t ascii//TRANSLIT |
    pandoc +RTS -K64m -RTS --from html --to plain --quiet
done
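
As a usage sketch, piping a list of HTML paths into the loop yields one plain-text rendering per file (the path below is illustrative):

$ echo articles/page1.html | bash extract_text.sh > page1.txt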
100 changes: 100 additions & 0 deletions web-index/grep-url.js
@@ -0,0 +1,100 @@
#!/usr/bin/env node
// TODO: use node's URL to parse and emit a URL in normal form
// URL validation as a stream transformer
//
// Contains code by Diego Perini, as compared in
// http://mathiasbynens.be/demo/url-regex
//
// Notes on possible differences from a standard/generic validation:
//
// - the utf-8 char class takes the full Unicode range into consideration
// - TLDs have been made mandatory, so single names like "localhost" fail
// - protocols have been restricted to ftp, http and https only, as requested

var re_weburl = new RegExp(
  "^" +
    // protocol identifier (optional)
    // short syntax // still required
    "(?:(?:(?:https?|ftp):)?\\/\\/)" +
    // user:pass BasicAuth (optional)
    "(?:\\S+(?::\\S*)?@)?" +
    "(?:" +
      // IP address exclusion
      // private & local networks
      "(?!(?:10|127)(?:\\.\\d{1,3}){3})" +
      "(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})" +
      "(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})" +
      // IP address dotted notation octets
      // excludes loopback network 0.0.0.0
      // excludes reserved space >= 224.0.0.0
      // excludes network & broadcast addresses
      // (first & last IP address of each class)
      "(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])" +
      "(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}" +
      "(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))" +
    "|" +
      // host & domain names, may end with dot
      // can be replaced by a shortest alternative
      // (?![-_])(?:[-\\w\\u00a1-\\uffff]{0,63}[^-_]\\.)+
      "(?:" +
        "(?:" +
          "[a-z0-9\\u00a1-\\uffff]" +
          "[a-z0-9\\u00a1-\\uffff_-]{0,62}" +
        ")?" +
        "[a-z0-9\\u00a1-\\uffff]\\." +
      ")+" +
      // TLD identifier name, may end with dot
      "(?:[a-z\\u00a1-\\uffff]{2,}\\.?)" +
    ")" +
    // port number (optional)
    "(?::\\d{2,5})?" +
    // resource path (optional)
    "(?:[/?#]\\S*)?" +
    "$", "i"
);

let nregex = options => {
  options = {
    strict: true,
    ...options
  };

  const tlds = require('./tlds');
  const v4 = '(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)(?:\\.(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)){3}';
  const protocol = `(?:(?:[a-z]+:)?//)${options.strict ? '' : '?'}`;
  const auth = '(?:\\S+(?::\\S*)?@)?';
  const ip = v4;
  const host = '(?:(?:[a-z\\u00a1-\\uffff0-9][-_]*)*[a-z\\u00a1-\\uffff0-9]+)';
  const domain = '(?:\\.(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)*';
  const tld = `(?:\\.${options.strict ? '(?:[a-z\\u00a1-\\uffff]{2,})' : `(?:${tlds.sort((a, b) => b.length - a.length).join('|')})`})\\.?`;
  const port = '(?::\\d{2,5})?';
  const path = '(?:[/?#][^\\s"]*)?';
  const regex = `(?:${protocol}|www\\.)${auth}(?:localhost|${ip}|${host}${domain}${tld})${port}${path}`;

  return options.exact ? new RegExp(`(?:^${regex}$)`, 'i') : new RegExp(regex, 'ig');
};

var readline = require('readline');

var rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout,
  terminal: false
});

rl.on('line', function (line) {
  let r = line.match(nregex());
  if (r) {
    for (let i = 0; i < r.length; i++) {
      console.log(r[i]);
    }
  } else {
    // Debug placeholder carried over in this commit: lines without a URL
    // emit the literal string "pizza".
    console.log("pizza");
  }
});

// console.log('foo http://github.com bar //google.com'.match(nregex()));
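
A quick way to exercise the matcher (assuming the script is saved as grep-url.js alongside the tlds module it requires):

$ echo 'see https://example.com/page and http://github.com' | node grep-url.js
https://example.com/page
http://github.com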
1 change: 1 addition & 0 deletions web-index/hashes/1-grams.txt.small.hash
@@ -0,0 +1 @@
b7006f6d425233137811f16eeb6ca668
1 change: 1 addition & 0 deletions web-index/hashes/2-grams.txt.small.hash
@@ -0,0 +1 @@
a48e86700b02c50651e8d4b09a73170c
1 change: 1 addition & 0 deletions web-index/hashes/3-grams.txt.small.hash
@@ -0,0 +1 @@
73310ad60a0d2d50d805901c481a5dbc
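
These files hold the md5 digests of the expected small-input outputs (the commit message notes the switch to md5sum). The repository's verify.sh is not shown in this diff, but a check in the same spirit would be:

$ md5sum 1-grams.txt | awk '{print $1}' | diff - hashes/1-grams.txt.small.hash && echo '1-grams OK'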
39 changes: 39 additions & 0 deletions web-index/input.sh
@@ -0,0 +1,39 @@
#!/bin/bash

BENCH_TOP=${BENCH_TOP:-$(git rev-parse --show-toplevel)}
RESOURCES_DIR=${RESOURCES_DIR:-$BENCH_TOP/web-index}

mkdir -p "$RESOURCES_DIR"

if [ "$1" = "--small" ]; then
  if [[ ! -f "$RESOURCES_DIR/wikipedia-small.tar.gz" ]]; then
    # small dataset: 1000 entries
    echo "Downloading the small dataset."
    wget -O "$RESOURCES_DIR/wikipedia-small.tar.gz" https://atlas-group.cs.brown.edu/data/wikipedia/input_small/articles.tar.gz --no-check-certificate
    wget -O "$RESOURCES_DIR/index_small.txt" https://atlas-group.cs.brown.edu/data/wikipedia/input_small/index.txt --no-check-certificate
  fi
else
  if [[ ! -f "$RESOURCES_DIR/wikipedia.tar.gz" ]]; then
    # full dataset
    echo "Downloading the full dataset. Caution: extracted size is >200GB."
    wget -O "$RESOURCES_DIR/wikipedia.tar.gz" https://atlas-group.cs.brown.edu/data/wikipedia/input/articles.tar.gz --no-check-certificate
    wget -O "$RESOURCES_DIR/index.txt" https://atlas-group.cs.brown.edu/data/wikipedia/input/index.txt --no-check-certificate
  fi
fi

if [[ ! -d "$RESOURCES_DIR/articles" ]]; then
  if [ "$1" = "--small" ]; then
    echo "Extracting the small dataset."
    tar -xf "$RESOURCES_DIR/wikipedia-small.tar.gz" -C "$RESOURCES_DIR"
  else
    echo "Extracting the full dataset. Caution: extracted size is >200GB."
    tar -xf "$RESOURCES_DIR/wikipedia.tar.gz" -C "$RESOURCES_DIR"
  fi
else
  echo "Did not extract data because of existing data."
  echo "Please rm -r $RESOURCES_DIR/articles manually and rerun this script."
fi

echo "Data is ready."
25 changes: 0 additions & 25 deletions web-index/input/dependencies.sh

This file was deleted.

24 changes: 0 additions & 24 deletions web-index/input/generte_index.sh

This file was deleted.

41 changes: 0 additions & 41 deletions web-index/input/input.sh

This file was deleted.

Empty file removed web-index/inputs/cleanup.sh
Empty file.
Empty file removed web-index/inputs/dependencies.sh
Empty file.
Empty file removed web-index/inputs/input.sh
Empty file.
Empty file removed web-index/inputs/run.sh
Empty file.
Empty file removed web-index/inputs/verify.sh
Empty file.
18 changes: 18 additions & 0 deletions web-index/move_articles.sh
@@ -0,0 +1,18 @@
#!/bin/bash

# Define the base directory
base_directory="input1000"

# Check if the base directory exists
if [ ! -d "$base_directory" ]; then
  echo "Base directory does not exist: $base_directory"
  exit 1
fi

# Navigate to the base directory, aborting if the cd fails
cd "$base_directory" || exit 1

# Create a tar archive of the en/articles directory
tar -czvf en_articles.tar.gz en/articles

echo "Archive created: $(pwd)/en_articles.tar.gz"
18 changes: 0 additions & 18 deletions web-index/p1.sh

This file was deleted.

13 changes: 0 additions & 13 deletions web-index/p2.sh

This file was deleted.

File renamed without changes.