Add How to Reproduce the Result in README #2

Open · wants to merge 6 commits into base: master
63 changes: 63 additions & 0 deletions Makefile
@@ -0,0 +1,63 @@
fork-setup:
	git remote add upstream https://github.com/indobenchmark/indonlu.git
	git remote -v
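
Once the upstream remote is set, syncing a fork with the base branch typically looks like this (standard git workflow, not part of this Makefile):

```
git fetch upstream
git merge upstream/master
```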

HYPERPARAMETER ?= default
EARLY_STOP ?= 15
BATCH_SIZE ?= 16

.PHONY: reproduce

reproduce:
	python3 scripts/reproducer.py $(DATASET) $(EARLY_STOP) $(BATCH_SIZE) $(HYPERPARAMETER)

reproduce_all:
	python3 scripts/reproducer.py absa-airy 15 $(BATCH_SIZE) $(HYPERPARAMETER)
	python3 scripts/reproducer.py absa-prosa 15 $(BATCH_SIZE) $(HYPERPARAMETER)
	python3 scripts/reproducer.py doc-sentiment-prosa 15 $(BATCH_SIZE) $(HYPERPARAMETER)
	python3 scripts/reproducer.py emotion-twitter 15 $(BATCH_SIZE) $(HYPERPARAMETER)
	python3 scripts/reproducer.py entailment-ui 15 $(BATCH_SIZE) $(HYPERPARAMETER)
	python3 scripts/reproducer.py keyword-extraction-prosa 15 $(BATCH_SIZE) $(HYPERPARAMETER)
	python3 scripts/reproducer.py qa-factoid-itb 15 $(BATCH_SIZE) $(HYPERPARAMETER)
	python3 scripts/reproducer.py ner-grit 15 $(BATCH_SIZE) $(HYPERPARAMETER)
	python3 scripts/reproducer.py ner-prosa 15 $(BATCH_SIZE) $(HYPERPARAMETER)
	python3 scripts/reproducer.py pos-idn 15 $(BATCH_SIZE) $(HYPERPARAMETER)
	python3 scripts/reproducer.py term-extraction-airy 15 $(BATCH_SIZE) $(HYPERPARAMETER)
	python3 scripts/reproducer.py pos-prosa 15 $(BATCH_SIZE) $(HYPERPARAMETER)

reproduce_all_1:
[Member comment]: We can remove the reproduce_all_* targets; they are already covered by reproduce and reproduce_all.

	python3 scripts/reproducer.py absa-airy 15 $(BATCH_SIZE) $(HYPERPARAMETER)
	python3 scripts/reproducer.py absa-prosa 15 $(BATCH_SIZE) $(HYPERPARAMETER)
	python3 scripts/reproducer.py doc-sentiment-prosa 15 $(BATCH_SIZE) $(HYPERPARAMETER)

reproduce_all_2:
	python3 scripts/reproducer.py emotion-twitter 15 $(BATCH_SIZE) $(HYPERPARAMETER)
	python3 scripts/reproducer.py entailment-ui 15 $(BATCH_SIZE) $(HYPERPARAMETER)
	python3 scripts/reproducer.py keyword-extraction-prosa 15 $(BATCH_SIZE) $(HYPERPARAMETER)

reproduce_all_3:
	python3 scripts/reproducer.py qa-factoid-itb 15 $(BATCH_SIZE) $(HYPERPARAMETER)

reproduce_all_4:
	python3 scripts/reproducer.py ner-grit 15 $(BATCH_SIZE) $(HYPERPARAMETER)
	python3 scripts/reproducer.py ner-prosa 15 $(BATCH_SIZE) $(HYPERPARAMETER)

reproduce_all_5:
	python3 scripts/reproducer.py pos-idn 15 $(BATCH_SIZE) $(HYPERPARAMETER)

reproduce_all_6:
	python3 scripts/reproducer.py term-extraction-airy 15 $(BATCH_SIZE) $(HYPERPARAMETER)
	python3 scripts/reproducer.py pos-prosa 15 $(BATCH_SIZE) $(HYPERPARAMETER)

run_non_pretrained_no_special_token:
	python3 scripts/reproducer_non_pretrained.py $(DATASET) $(EARLY_STOP) $(BATCH_SIZE)

run_non_pretrained_no_special_token_all:
[Member comment]: There are 8 tasks in here; can you please help add the other 4, similar to the list in reproduce_all?

	python3 scripts/reproducer_non_pretrained.py emotion-twitter 10 16
	python3 scripts/reproducer_non_pretrained.py pos-idn 10 16
	python3 scripts/reproducer_non_pretrained.py ner-grit 10 16
	python3 scripts/reproducer_non_pretrained.py absa-airy 10 16
	python3 scripts/reproducer_non_pretrained.py term-extraction-airy 10 16
	python3 scripts/reproducer_non_pretrained.py entailment-ui 10 16
	python3 scripts/reproducer_non_pretrained.py doc-sentiment-prosa 10 16
	python3 scripts/reproducer_non_pretrained.py keyword-extraction-prosa 10 16
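
Following the review comment above: comparing this list against `reproduce_all`, the four missing tasks appear to be `absa-prosa`, `qa-factoid-itb`, `ner-prosa`, and `pos-prosa`. A sketch of the additional recipe lines, assuming they reuse the same early-stop and batch-size values as the existing eight:

```
	python3 scripts/reproducer_non_pretrained.py absa-prosa 10 16
	python3 scripts/reproducer_non_pretrained.py qa-factoid-itb 10 16
	python3 scripts/reproducer_non_pretrained.py ner-prosa 10 16
	python3 scripts/reproducer_non_pretrained.py pos-prosa 10 16
```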
42 changes: 42 additions & 0 deletions README.md
@@ -53,3 +53,45 @@ We provide the access to our large pretraining dataset. In this version, we excl
## Leaderboard
- Community Portal and Public Leaderboard [[Link]](https://www.indobenchmark.com/leaderboard.html)
- Submission Portal https://competitions.codalab.org/competitions/26537

## Quick Start

### Predict
_TBD_

### Train
_TBD_

### Reproduce the Results

1. Set the `CUDA_VISIBLE_DEVICES` environment variable first
```
export CUDA_VISIBLE_DEVICES=0
```
2. Then, execute the following command to start training
```
make reproduce DATASET=<dataset>
```
It will train all of the models for the specified _\<dataset\>_ with the default parameters
3. Check the available datasets in the `datasets/` directory
4. All of the models used are listed in `scripts/config/model/train.yaml` \
Feel free to add entries or comment them out as you see fit
5. To use a different set of hyperparameters, create a new file in `scripts/config/hyperparameter/` \
Then specify it in the command like this
```
make reproduce DATASET=<dataset> HYPERPARAMETER=<hyperparameter_filename_without_the_extension>
```
6. There are 2 more parameters that can be specified in the command:
- EARLY_STOP
- BATCH_SIZE

Use the following command to set them (a worked example follows this list)
```
make reproduce DATASET=<dataset> EARLY_STOP=<early_stop> BATCH_SIZE=<batch_size>
```
7. There are also grouped commands that run specific subsets of the tasks for easy access, e.g.
```
make reproduce_all_1
make reproduce_all_2
etc
```
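
For example, to reproduce the results on the `emotion-twitter` dataset with a batch size of 32 and early stopping after 10 epochs (the values here are illustrative; any dataset from `datasets/` works):

```
export CUDA_VISIBLE_DEVICES=0
make reproduce DATASET=emotion-twitter BATCH_SIZE=32 EARLY_STOP=10
```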
File renamed without changes.
File renamed without changes.
11 changes: 11 additions & 0 deletions requirements.txt
@@ -0,0 +1,11 @@
PyYAML==5.3.1
numpy
pandas
torch
tqdm
transformers
nltk
scikit-learn
matplotlib
seaborn
ipywidgets
6 changes: 6 additions & 0 deletions scripts/config/hyperparameter/default.yaml
@@ -0,0 +1,6 @@
n_epochs: 100
step_size: 1
gamma: 0.9
lr: 1e-5
options:
- --force
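
A custom hyperparameter file, as described in step 5 of the README, follows this same schema. For instance, a hypothetical `scripts/config/hyperparameter/quick_test.yaml` (the filename and values are illustrative, not part of this PR) might look like:

```
n_epochs: 10
step_size: 1
gamma: 0.9
lr: 2e-5
options:
- --force
```

It would then be selected with `make reproduce DATASET=<dataset> HYPERPARAMETER=quick_test`.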
6 changes: 6 additions & 0 deletions scripts/config/hyperparameter/no_special_token_1.yaml
@@ -0,0 +1,6 @@
n_epochs: 100
step_size: 1
gamma: 0.5
lr: 6.25e-5
options:
- --no_special_token
6 changes: 6 additions & 0 deletions scripts/config/hyperparameter/no_special_token_2.yaml
@@ -0,0 +1,6 @@
n_epochs: 100
step_size: 1
gamma: 0.8
lr: 6.25e-5
options:
- --no_special_token
10 changes: 10 additions & 0 deletions scripts/config/model/non_pretrained.yaml
@@ -0,0 +1,10 @@
- model_checkpoint: scratch
hyperparameter_config: no_special_token_1.yaml
- model_checkpoint: word2vec
hyperparameter_config: no_special_token_1.yaml
- model_checkpoint: fasttext-twitter
hyperparameter_config: no_special_token_2.yaml
- model_checkpoint: fasttext-cc-id
hyperparameter_config: no_special_token_2.yaml
- model_checkpoint: fasttext-cc-id-no-oov
hyperparameter_config: no_special_token_2.yaml
146 changes: 146 additions & 0 deletions scripts/config/model/train.yaml
@@ -0,0 +1,146 @@
# list of used configuration
# model_checkpoint:
# lower:
# num_layers:

# # albert-base-uncased-96000
[Member comment]: This model can be removed.

# - model_checkpoint: albert-base-uncased-96000
# lower: True
# num_layers:
# - 12

# # albert-base-uncased-96000-spm
[Member comment]: This model can be removed.

# - model_checkpoint: albert-base-uncased-96000-spm
# lower: True
# num_layers:
# - 12

# # albert-base-uncased-112500-spm
[Member comment]: This model can be removed.

# - model_checkpoint: albert-base-uncased-112500-spm
# lower: True
# num_layers:
# - 12

# scratch
- model_checkpoint: scratch
[Member comment]: This model can be removed.
  lower: True
  num_layers:
  - 2
  - 4
  - 6

# fasttext-cc-id-300-no-oov-uncased
- model_checkpoint: fasttext-cc-id-300-no-oov-uncased
  lower: True
  num_layers:
  - 2
  - 4
  - 6

# fasttext-4B-id-300-no-oov-uncased
- model_checkpoint: fasttext-4B-id-300-no-oov-uncased
  lower: True
  num_layers:
  - 2
  - 4
  - 6

# babert-base-512
- model_checkpoint: babert-base-512
[Member comment]: This model can be removed.
  lower: True
  num_layers:
  - 12

# babert-bpe-mlm-large-512
- model_checkpoint: babert-bpe-mlm-large-512
[Member comment]: This model can be removed.
  lower: True
  num_layers:
  - 24

# mbert
- model_checkpoint: bert-base-multilingual-uncased
  lower: False
  num_layers:
  - 12

# xlm-roberta
- model_checkpoint: xlm-roberta-base
  lower: False
  num_layers:
  - 12

# babert-opensubtitle
- model_checkpoint: babert-opensubtitle
[Member comment]: This model can be removed.
  lower: False
  num_layers:
  - 12

# xlm
- model_checkpoint: xlm-mlm-100-1280
  lower: False
  num_layers:
  - 16

# albert-large-wwmlm-128
- model_checkpoint: albert-large-wwmlm-128
[Member comment]: This model can be removed.
  lower: True
  num_layers:
  - 24

# albert-base-wwmlm-512
- model_checkpoint: albert-base-wwmlm-512
[Member comment]: This model can be removed.
  lower: True
  num_layers:
  - 12

# albert-large-wwmlm-512
- model_checkpoint: albert-large-wwmlm-512
[Member comment]: This model can be removed.
  lower: True
  num_layers:
  - 24

# albert-base-uncased-112500
- model_checkpoint: albert-base-uncased-112500
[Member comment]: This model can be removed.
  lower: True
  num_layers:
  - 12

# albert-base-uncased-191k
- model_checkpoint: albert-base-uncased-191k
[Member comment]: This model can be removed.
  lower: True
  num_layers:
  - 12

# cartobert
- model_checkpoint: cartobert
[Member comment]: This model can be removed.
  lower: True
  num_layers:
  - 12

# babert-bpe-mlm-large-uncased
- model_checkpoint: babert-bpe-mlm-large-uncased
[Member comment]: This model can be removed.
  lower: True
  num_layers:
  - 24

# babert-bpe-mlm-large-uncased-1m
- model_checkpoint: babert-bpe-mlm-large-uncased-1m
[Member comment]: This model can be removed.
  lower: True
  num_layers:
  - 24

# babert-bpe-mlm-large-uncased-1100k
- model_checkpoint: babert-bpe-mlm-large-uncased-1100k
[Member comment]: This model can be removed.
  lower: True
  num_layers:
  - 24

# babert-bpe-mlm-uncased-128-dup10-5
- model_checkpoint: babert-bpe-mlm-uncased-128-dup10-5
[Member comment]: This model can be removed.
  lower: True
  num_layers:
  - 12
[Member comment]: Can you help add the 8 IndoBERT models to this file? The model checkpoints and num_layers would be as follows:

  • indobenchmark/indobert-base-p1 | 12 layers
  • indobenchmark/indobert-base-p2 | 12 layers
  • indobenchmark/indobert-large-p1 | 24 layers
  • indobenchmark/indobert-large-p2 | 24 layers
  • indobenchmark/indobert-lite-base-p1 | 12 layers
  • indobenchmark/indobert-lite-base-p2 | 12 layers
  • indobenchmark/indobert-lite-large-p1 | 24 layers
  • indobenchmark/indobert-lite-large-p2 | 24 layers
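
A sketch of the requested entries in the existing format; the checkpoints and layer counts come from the comment above, while the `lower` values are an assumption (the comment does not specify them):

```
# indobert
- model_checkpoint: indobenchmark/indobert-base-p1
  lower: False
  num_layers:
  - 12
- model_checkpoint: indobenchmark/indobert-base-p2
  lower: False
  num_layers:
  - 12
- model_checkpoint: indobenchmark/indobert-large-p1
  lower: False
  num_layers:
  - 24
- model_checkpoint: indobenchmark/indobert-large-p2
  lower: False
  num_layers:
  - 24

# indobert-lite
- model_checkpoint: indobenchmark/indobert-lite-base-p1
  lower: False
  num_layers:
  - 12
- model_checkpoint: indobenchmark/indobert-lite-base-p2
  lower: False
  num_layers:
  - 12
- model_checkpoint: indobenchmark/indobert-lite-large-p1
  lower: False
  num_layers:
  - 24
- model_checkpoint: indobenchmark/indobert-lite-large-p2
  lower: False
  num_layers:
  - 24
```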

61 changes: 61 additions & 0 deletions scripts/reproducer.py
@@ -0,0 +1,61 @@
import os
import sys
# import subprocess
import yaml

# GPU selection is forwarded to each training run
CUDA = os.getenv("CUDA_VISIBLE_DEVICES", "0")

# load the list of model configurations to reproduce
path = "scripts/config/model/train.yaml"
with open(path, "r") as f:
    model_configs = yaml.safe_load(f)

# load the hyperparameter config named on the command line:
# argv = [script, dataset, early_stop, batch_size, hyperparameter]
hyperparams_config = sys.argv[4]
path = f"scripts/config/hyperparameter/{hyperparams_config}.yaml"
with open(path, "r") as f:
    hyperparams = yaml.safe_load(f)
hyperparams["dataset"] = sys.argv[1]
hyperparams["early_stop"] = sys.argv[2]
hyperparams["train_batch_size"] = sys.argv[3]

# hyperparameters forwarded to main.py as command-line flags
hyp_list = [
    "n_epochs",
    "train_batch_size",
    "model_checkpoint",
    "step_size",
    "gamma",
    "experiment_name",
    "lr",
    "early_stop",
    "dataset",
]

# run every model at every configured depth
for m in model_configs:
    hyperparams["model_checkpoint"] = m["model_checkpoint"]
    for layer in m["num_layers"]:
        exp = [
            hyperparams["model_checkpoint"],
            f"b{hyperparams['train_batch_size']}",
            f"step{hyperparams['step_size']}",
            f"gamma{hyperparams['gamma']}",
            f"lr{hyperparams['lr']}",
            f"early{hyperparams['early_stop']}",
            f"layer{layer}",
            f"lower{m['lower']}",
        ]
        hyperparams["experiment_name"] = "_".join(exp)

        cmd = f"CUDA_VISIBLE_DEVICES={CUDA} python3 main.py"
        for hl in hyp_list:
            cmd += f" --{hl} {hyperparams[hl]}"
        if m["lower"]:
            cmd += " --lower"
        cmd += f" --num_layers {layer}"
        for o in hyperparams["options"]:
            cmd += f" {o}"

        print(f"Running: {cmd}")

        os.system(cmd)

        # # to run in parallel instead, comment out the os.system call above
        # # and uncomment the subprocess import at the top
        # results = subprocess.run(
        #     cmd, shell=True, universal_newlines=True, check=True, text=True)
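
For reference, the Makefile's `reproduce` target invokes this script with positional arguments in the order dataset, early stop, batch size, hyperparameter config name; with the defaults substituted, a direct call looks like:

```
python3 scripts/reproducer.py emotion-twitter 15 16 default
```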