just web-index (#31)
* dependencies

* updated web-index

* web-index

* bug fix

* bug fix

* verify hashes

* add web-index to CI

* no echoes in verify.sh

* change hashing method to md5sum

* port the web_index Python script to shell

* verify.sh

---------

Co-authored-by: Zhuoxuan Zhang <[email protected]>
Co-authored-by: Evangelos Lamprou <[email protected]>
3 people authored Nov 11, 2024
1 parent 9fb3b2c commit 938769f
Showing 29 changed files with 1,912 additions and 1,956 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -12,7 +12,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- benchmark: [nlp, file-enc, unix50, log-analysis, max-temp, uniq-ips, media-conv, sklearn, covid-mts, riker, oneliners]
+ benchmark: [nlp, file-enc, unix50, log-analysis, max-temp, uniq-ips, media-conv, sklearn, covid-mts, riker, oneliners, web-index]

steps:
- name: Checkout code
9 changes: 9 additions & 0 deletions web-index/bigrams_aux.sh
@@ -0,0 +1,9 @@
# Emit bigrams: duplicate the word stream through FIFOs, offset one copy
# by a line, and paste the two copies side by side.
( mkfifo s2 2> /dev/null ) ;
( mkfifo s3 2> /dev/null ) ;

sed '$d' s2 > s3 &   # s3 carries words 1..n-1
tee s2 |             # full stream goes into s2; stdout continues below
tail -n +2 |         # words 2..n
paste s3 -           # pair word i with word i+1
rm s2
rm s3
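
For a sense of what this helper does, here is a minimal run (assuming the snippet is saved as bigrams_aux.sh and fed one word per line; output columns are tab-separated):

$ printf 'hello\nworld\nagain\n' | bash bigrams_aux.sh
hello	world
world	again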
3 changes: 3 additions & 0 deletions web-index/cleanup.sh
@@ -0,0 +1,3 @@
#!/bin/bash

rm -rf tmp articles node_modules output *grams *grams.txt *index*.txt *tar.gz
25 changes: 25 additions & 0 deletions web-index/deps.sh
@@ -0,0 +1,25 @@
#!/usr/bin/env bash
set -e

# Core packages: 7zip plus the download and Node.js tooling
pkgs='p7zip-full curl wget nodejs unzip npm'
if ! dpkg -s $pkgs >/dev/null 2>&1 ; then
  sudo apt-get install $pkgs -y
  echo 'Packages Installed'
fi

if ! dpkg -s pandoc > /dev/null 2>&1 ; then
  # since pandoc v2.2.1 does not support arm64, we use v3.5
  wget https://github.com/jgm/pandoc/releases/download/3.5/pandoc-3.5-1-$(dpkg --print-architecture).deb
  sudo dpkg -i ./pandoc-3.5-1-$(dpkg --print-architecture).deb
  rm ./pandoc-3.5-1-$(dpkg --print-architecture).deb
fi

if ! dpkg -s nodejs > /dev/null 2>&1 ; then
  # Node.js 18+ bundles npm, so no separate npm package is needed
  curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash -
  sudo apt-get install -y nodejs
fi

# Install the npm packages
npm install
npm install natural
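
After the script completes, a plain sanity check of the installed toolchain (standard version flags only, nothing benchmark-specific) can look like:

$ pandoc --version | head -n 1
$ node --version
$ npm ls natural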
6 changes: 6 additions & 0 deletions web-index/extract_text.sh
@@ -0,0 +1,6 @@
# Read one HTML file path per line, strip non-ASCII, and render to plain text.
while read -r line
do
  cat "$line" |
    iconv -c -t ascii//TRANSLIT |
    pandoc +RTS -K64m -RTS --from html --to plain --quiet
done
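
As a usage sketch, piping a list of HTML paths into the loop yields one plain-text rendering per file (the path below is illustrative):

$ echo articles/page1.html | bash extract_text.sh > page1.txt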
100 changes: 100 additions & 0 deletions web-index/grep-url.js
@@ -0,0 +1,100 @@
#!/usr/bin/env node
// TODO: use node's URL to parse and emit a URL in normal form
// URL validation as a stream transformer
//
// Contains code by Diego Perini, as compared in
// http://mathiasbynens.be/demo/url-regex
//
// Notes on possible differences from a standard/generic validation:
//
// - the utf-8 char class takes the full Unicode range into consideration
// - TLDs have been made mandatory, so single names like "localhost" fail
// - protocols have been restricted to ftp, http and https only, as requested

var re_weburl = new RegExp(
  "^" +
    // protocol identifier (optional)
    // short syntax // still required
    "(?:(?:(?:https?|ftp):)?\\/\\/)" +
    // user:pass BasicAuth (optional)
    "(?:\\S+(?::\\S*)?@)?" +
    "(?:" +
      // IP address exclusion
      // private & local networks
      "(?!(?:10|127)(?:\\.\\d{1,3}){3})" +
      "(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})" +
      "(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})" +
      // IP address dotted notation octets
      // excludes loopback network 0.0.0.0
      // excludes reserved space >= 224.0.0.0
      // excludes network & broadcast addresses
      // (first & last IP address of each class)
      "(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])" +
      "(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}" +
      "(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))" +
    "|" +
      // host & domain names, may end with dot
      // can be replaced by a shortest alternative
      // (?![-_])(?:[-\\w\\u00a1-\\uffff]{0,63}[^-_]\\.)+
      "(?:" +
        "(?:" +
          "[a-z0-9\\u00a1-\\uffff]" +
          "[a-z0-9\\u00a1-\\uffff_-]{0,62}" +
        ")?" +
        "[a-z0-9\\u00a1-\\uffff]\\." +
      ")+" +
      // TLD identifier name, may end with dot
      "(?:[a-z\\u00a1-\\uffff]{2,}\\.?)" +
    ")" +
    // port number (optional)
    "(?::\\d{2,5})?" +
    // resource path (optional)
    "(?:[/?#]\\S*)?" +
    "$", "i"
);

let nregex = options => {
  options = {
    strict: true,
    ...options
  };

  const tlds = require('./tlds');
  const v4 = '(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)(?:\\.(?:25[0-5]|2[0-4]\\d|1\\d\\d|[1-9]\\d|\\d)){3}';
  const protocol = `(?:(?:[a-z]+:)?//)${options.strict ? '' : '?'}`;
  const auth = '(?:\\S+(?::\\S*)?@)?';
  const ip = v4;
  const host = '(?:(?:[a-z\\u00a1-\\uffff0-9][-_]*)*[a-z\\u00a1-\\uffff0-9]+)';
  const domain = '(?:\\.(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)*';
  const tld = `(?:\\.${options.strict ? '(?:[a-z\\u00a1-\\uffff]{2,})' : `(?:${tlds.sort((a, b) => b.length - a.length).join('|')})`})\\.?`;
  const port = '(?::\\d{2,5})?';
  const path = '(?:[/?#][^\\s"]*)?';
  const regex = `(?:${protocol}|www\\.)${auth}(?:localhost|${ip}|${host}${domain}${tld})${port}${path}`;

  return options.exact ? new RegExp(`(?:^${regex}$)`, 'i') : new RegExp(regex, 'ig');
};

var readline = require('readline');

var rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout,
  terminal: false
});

rl.on('line', function (line) {
  let r = line.match(nregex());
  if (r) {
    for (let i = 0; i < r.length; i++) {
      console.log(r[i]);
    }
  } else {
    // Debug placeholder carried over in this commit: lines without a URL
    // emit the literal string "pizza".
    console.log("pizza");
  }
});

// console.log('foo http://github.com bar //google.com'.match(nregex()));
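
A quick way to exercise the matcher (assuming the script is saved as grep-url.js alongside the tlds module it requires):

$ echo 'see https://example.com/page and http://github.com' | node grep-url.js
https://example.com/page
http://github.com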
1 change: 1 addition & 0 deletions web-index/hashes/1-grams.txt.small.hash
@@ -0,0 +1 @@
b7006f6d425233137811f16eeb6ca668
1 change: 1 addition & 0 deletions web-index/hashes/2-grams.txt.small.hash
@@ -0,0 +1 @@
a48e86700b02c50651e8d4b09a73170c
1 change: 1 addition & 0 deletions web-index/hashes/3-grams.txt.small.hash
@@ -0,0 +1 @@
73310ad60a0d2d50d805901c481a5dbc
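
These files hold the md5 digests of the expected small-input outputs (the commit message notes the switch to md5sum). The repository's verify.sh is not shown in this diff, but a check in the same spirit would be:

$ md5sum 1-grams.txt | awk '{print $1}' | diff - hashes/1-grams.txt.small.hash && echo '1-grams OK'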
39 changes: 39 additions & 0 deletions web-index/input.sh
@@ -0,0 +1,39 @@
#!/bin/bash

BENCH_TOP=${BENCH_TOP:-$(git rev-parse --show-toplevel)}
RESOURCES_DIR=${RESOURCES_DIR:-$BENCH_TOP/web-index}

mkdir -p "$RESOURCES_DIR"

if [ "$1" = "--small" ]; then
  if [[ ! -f "$RESOURCES_DIR/wikipedia-small.tar.gz" ]]; then
    # small dataset: 1000 entries
    echo "Downloading the small dataset."
    wget -O "$RESOURCES_DIR/wikipedia-small.tar.gz" https://atlas-group.cs.brown.edu/data/wikipedia/input_small/articles.tar.gz --no-check-certificate
    wget -O "$RESOURCES_DIR/index_small.txt" https://atlas-group.cs.brown.edu/data/wikipedia/input_small/index.txt --no-check-certificate
  fi
else
  if [[ ! -f "$RESOURCES_DIR/wikipedia.tar.gz" ]]; then
    # full dataset
    echo "Downloading the full dataset. Caution: extracted size is >200GB."
    wget -O "$RESOURCES_DIR/wikipedia.tar.gz" https://atlas-group.cs.brown.edu/data/wikipedia/input/articles.tar.gz --no-check-certificate
    wget -O "$RESOURCES_DIR/index.txt" https://atlas-group.cs.brown.edu/data/wikipedia/input/index.txt --no-check-certificate
  fi
fi

if [[ ! -d "$RESOURCES_DIR/articles" ]]; then
  if [ "$1" = "--small" ]; then
    echo "Extracting the small dataset."
    tar -xf "$RESOURCES_DIR/wikipedia-small.tar.gz" -C "$RESOURCES_DIR"
  else
    echo "Extracting the full dataset. Caution: extracted size is >200GB."
    tar -xf "$RESOURCES_DIR/wikipedia.tar.gz" -C "$RESOURCES_DIR"
  fi
else
  echo "Did not extract data because of existing data."
  echo "Please rm -r $RESOURCES_DIR/articles manually and rerun this script."
fi

echo "Data is ready."
25 changes: 0 additions & 25 deletions web-index/input/dependencies.sh

This file was deleted.

24 changes: 0 additions & 24 deletions web-index/input/generte_index.sh

This file was deleted.

41 changes: 0 additions & 41 deletions web-index/input/input.sh

This file was deleted.

Empty file removed web-index/inputs/cleanup.sh
Empty file.
Empty file removed web-index/inputs/dependencies.sh
Empty file.
Empty file removed web-index/inputs/input.sh
Empty file.
Empty file removed web-index/inputs/run.sh
Empty file.
Empty file removed web-index/inputs/verify.sh
Empty file.
18 changes: 18 additions & 0 deletions web-index/move_articles.sh
@@ -0,0 +1,18 @@
#!/bin/bash

# Define the base directory
base_directory="input1000"

# Check if the base directory exists
if [ ! -d "$base_directory" ]; then
  echo "Base directory does not exist: $base_directory"
  exit 1
fi

# Navigate to the base directory, aborting if the cd fails
cd "$base_directory" || exit 1

# Create a tar archive of the en/articles directory
tar -czvf en_articles.tar.gz en/articles

echo "Archive created: $(pwd)/en_articles.tar.gz"
18 changes: 0 additions & 18 deletions web-index/p1.sh

This file was deleted.

13 changes: 0 additions & 13 deletions web-index/p2.sh

This file was deleted.

File renamed without changes.