Skip to content

Commit

Permalink
web-index
Browse files (browse the repository at this point in the history)
  • Loading branch information
Zhuoxuan Zhang committed Oct 28, 2024
1 parent 32d1c4b commit ded97fe
Show file tree
Hide file tree
Showing 6 changed files with 22 additions and 10 deletions.
3 changes: 3 additions & 0 deletions web-index/cleanup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Delete the files and directories named below (tmp, articles, node_modules,
# output, plus anything matching *grams*, *index*.txt or *tar.gz) from the
# directory that contains this script.

# Work from the script's own directory so the relative names and globs below
# can never match files in whatever directory the caller happened to be in.
# Abort if the directory cannot be entered instead of deleting from elsewhere.
cd "$(dirname "$0")" || exit 1

# `--` ends option parsing, so a glob match that starts with `-` is treated as
# a file name. Globs that match nothing stay literal; `rm -f` ignores those.
rm -rf -- tmp articles node_modules output *grams* *index*.txt *tar.gz
10 changes: 5 additions & 5 deletions web-index/setup.sh → web-index/deps.sh
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
#!/usr/bin/env bash
set -e
# 7zip
pkgs='p7zip-full curl wget nodejs unzip'
pkgs='p7zip-full curl wget nodejs unzip npm'
if ! dpkg -s $pkgs >/dev/null 2>&1 ; then
sudo apt-get install $pkgs -y
echo 'Packages Installed'
fi

if ! dpkg -s pandoc > /dev/null 2>&1 ; then
# pandoc v.2.2.1
wget https://github.com/jgm/pandoc/releases/download/2.2.1/pandoc-2.2.1-1-$(dpkg --print-architecture).deb
sudo dpkg -i ./pandoc-2.2.1-1-$(dpkg --print-architecture).deb
rm ./pandoc-2.2.1-1-$(dpkg --print-architecture).deb
# since pandoc v.2.2.1 does not support arm64, we use v.3.5
wget https://github.com/jgm/pandoc/releases/download/3.5/pandoc-3.5-1-$(dpkg --print-architecture).deb
sudo dpkg -i ./pandoc-3.5-1-$(dpkg --print-architecture).deb
rm ./pandoc-3.5-1-$(dpkg --print-architecture).deb
fi

if ! dpkg -s nodejs > /dev/null 2>&1 ; then
Expand Down
File renamed without changes.
12 changes: 7 additions & 5 deletions web-index/input.sh
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
#!/bin/bash

BENCH_TOP=${BENCH_TOP:-$(git rev-parse --show-toplevel)}
RESOURCES_DIR=${RESOURCES_DIR:-$BENCH_TOP/report/resources/web-index/}
RESOURCES_DIR=${RESOURCES_DIR:-$BENCH_TOP/web-index}

mkdir -p $RESOURCES_DIR

if [ "$1" = "--small" ]; then
if [[ ! -f "$RESOURCES_DIR/wikipedia-small.tar.gz" ]]; then
# 1000 entries
echo "Downloading the small dataset."
wget -O $RESOURCES_DIR/wikipedia-small.tar.gz https://atlas-group.cs.brown.edu/data/wikipedia/input_small/articles.tar.gz
wget -O $RESOURCES_DIR/index_small.txt https://atlas-group.cs.brown.edu/data/wikipedia/input_small/index.txt
wget -O $RESOURCES_DIR/wikipedia-small.tar.gz https://atlas-group.cs.brown.edu/data/wikipedia/input_small/articles.tar.gz --no-check-certificate
wget -O $RESOURCES_DIR/index_small.txt https://atlas-group.cs.brown.edu/data/wikipedia/input_small/index.txt --no-check-certificate
fi
else
if [[ ! -f "$RESOURCES_DIR/wikipedia.tar.gz" ]]; then
# full dataset
echo "Downloading the full dataset. Caution!! Extracted size >200GB"
wget -O $RESOURCES_DIR/wikipedia.tar.gz https://atlas-group.cs.brown.edu/data/wikipedia/input/articles.tar.gz
wget -O $RESOURCES_DIR/index.txt https://atlas-group.cs.brown.edu/data/wikipedia/input/index.txt
wget -O $RESOURCES_DIR/wikipedia.tar.gz https://atlas-group.cs.brown.edu/data/wikipedia/input/articles.tar.gz --no-check-certificate
wget -O $RESOURCES_DIR/index.txt https://atlas-group.cs.brown.edu/data/wikipedia/input/index.txt --no-check-certificate
fi
fi

Expand All @@ -35,3 +35,5 @@ else
echo "Did not extract data because of existing data."
echo "Please rm -r $RESOURCES_DIR/articles manually and rerun this script."
fi

echo "Data is ready."
7 changes: 7 additions & 0 deletions web-index/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash
# Runs ./generate_index.sh on `articles`, then ./web-index/run with
# `--target sh-only`, both resolved relative to this script's directory.

# Enter the script's directory so the relative paths below resolve the same
# way regardless of the caller's working directory. Stop if that fails rather
# than running the steps below from the wrong place.
cd "$(dirname "$0")" || exit 1

./generate_index.sh articles

# NOTE(review): this path is resolved after the cd above, i.e. relative to the
# script's own directory - confirm a nested web-index/run really exists there.
./web-index/run --target sh-only
Empty file added web-index/verify.sh
Empty file.

0 comments on commit ded97fe

Please sign in to comment.