Merge pull request #36 from BaiBlanc/master
Bugs Fixed and optional parameters added
BaiBlanc authored Aug 31, 2020
2 parents 44f679d + 382f4a0 commit 3288dbf
Showing 7 changed files with 64 additions and 19 deletions.
10 changes: 8 additions & 2 deletions generator.py
@@ -45,8 +45,8 @@
'dbo:Athlete': ['dbo:LacrossePlayer'],
'dbo:SportsTeam': ['dboBasketballTeam']
}
-EXAMPLES_PER_TEMPLATE = 600
+# EXAMPLES_PER_TEMPLATE = 600

def extract_bindings(data, template):
matches = list()
@@ -316,12 +316,18 @@ def normalize(ontology_class):
metavar='templateFile', help='templates', required=True)
requiredNamed.add_argument(
'--output', dest='output', metavar='outputDirectory', help='dataset directory', required=True)
+requiredNamed.add_argument(
+    '--examples', dest='examples', metavar='examples per template', help='examples per template', required=False)
args = parser.parse_args()

template_file = args.templates
output_dir = args.output
use_resources_dump = args.continue_generation

+examples = args.examples
+if examples:
+    EXAMPLES_PER_TEMPLATE = int(examples)
+else:
+    EXAMPLES_PER_TEMPLATE = 600
# print use_resources_dump => False

time = datetime.datetime.today()
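With the module-level constant commented out, the example count now comes from the new optional flag, falling back to 600 when the flag is omitted. A minimal standalone sketch of the same pattern (argparse only, nothing else from generator.py):

```python
import argparse

parser = argparse.ArgumentParser()
# Optional flag; argparse yields None when it is omitted.
parser.add_argument('--examples', dest='examples',
                    metavar='examples per template', required=False)
args = parser.parse_args()

# Truthiness check: None (flag absent) and "" both fall back to 600.
EXAMPLES_PER_TEMPLATE = int(args.examples) if args.examples else 600
print(EXAMPLES_PER_TEMPLATE)
```

The same effect could arguably be had with `type=int` and `default=600` on `add_argument`, which would drop the manual branch and reject non-numeric input at parse time.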
Empty file removed: gsoc/zheyuan/README.md
7 changes: 4 additions & 3 deletions gsoc/zheyuan/pipeline/README.md
@@ -3,19 +3,20 @@
To run the complete pipeline, please use the command:

```bash
-./pipeline.sh [$1 Project's name] [$2 Integer] [$3 Integer]
+./pipeline.sh [$1 Project's name] [$2 Integer] [$3 Integer] [$4 Integer] [$5 Integer]
```
$1 -- The project's name -- String -- Required
$2 -- Dimension of the GloVe embeddings -- Integer [50|100|200|300] -- Optional, 300 by default
$3 -- Number of units in the LSTM cells -- Integer -- Optional, 512 by default
+$4 -- Training steps -- Integer -- Optional, 60000 by default
+$5 -- EXAMPLES_PER_TEMPLATE -- Integer -- Optional, 600 by default
Examples

```bash
./pipeline.sh Project1
```
```bash
-./pipeline.sh Project2 300 512
+./pipeline.sh Project2 300 512 60000 600
```
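Because the parameters are positional, overriding a later one means spelling out everything before it; to change only EXAMPLES_PER_TEMPLATE you would still pass the defaults for $2 through $4 (the project name below is a placeholder):

```bash
./pipeline.sh Project3 300 512 60000 1200
```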


10 changes: 8 additions & 2 deletions gsoc/zheyuan/pipeline/batch_paraphrase.py
@@ -1,4 +1,5 @@
import argparse
+import os
import tensorflow as tf
tf.compat.v1.enable_eager_execution()
from paraphrase_questions import paraphrase_questions,get_pretrained_model,prepare_model,set_seed,pick_final_sentence, pick_final_sentence_advanced
@@ -13,15 +14,17 @@ def batch_paraphrase(templates_path, model_dir):
folder_path = get_pretrained_model(const.URL)
set_seed(42)
tokenizer, device, model = prepare_model(folder_path)
-with open(templates_path, "r") as lines:
-with open(templates_path + "_paraphrased", "w") as w:
+dir = os.path.realpath(templates_path)
+with open(dir, "r") as lines:
+with open(dir + "_paraphrased", "w") as w:
for line in lines:
prop = line.strip("\n").split(seperator)
question = prop[3]
paraphrased_candidates = paraphrase_questions(tokenizer, device, model, question)
paraphrased = pick_final_sentence(question, paraphrased_candidates)
advanced = pick_final_sentence_advanced(device, question, paraphrased_candidates, model_dir)
w.write(line)
print("Original", line)
# for i, candidate in enumerate(paraphrased_candidates):
# new_prop = prop[:-1]
# new_prop[3] = candidate
@@ -35,11 +38,14 @@ def batch_paraphrase(templates_path, model_dir):
new_prop.append("Paraphrased \n")
new_line = seperator.join(new_prop)
w.write(new_line)
print("Paraphrase", new_line)

new_prop = prop[:-1]
new_prop[3] = advanced
new_prop.append("Paraphrased advanced\n")
new_line = seperator.join(new_prop)
w.write(new_line)
print("Advanced", new_line)


if __name__=="__main__":
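The `realpath` change resolves symlinks and relative segments before `_paraphrased` is appended, so the output file lands next to the real templates file rather than next to a symlink or a dot-relative path. A minimal sketch of the pattern (the input path here is hypothetical):

```python
import os

templates_path = "./templates/basic_sentences"  # hypothetical input path

# Resolve "./", "../" and symlinks first; suffixing the raw argument could
# otherwise place the output somewhere other than beside the real file.
real_path = os.path.realpath(templates_path)
with open(real_path, "r") as lines, open(real_path + "_paraphrased", "w") as w:
    for line in lines:
        w.write(line)  # the commit also echoes each original and paraphrased line
```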
45 changes: 37 additions & 8 deletions gsoc/zheyuan/pipeline/pipeline.sh
@@ -3,11 +3,14 @@
# $1 -- The project's name -- String -- Required
# $2 -- Dimension of the GloVe embeddings -- Integer [50|100|200|300] -- Optional, 300 by default
# $3 -- Number of unit in the LSTM cells -- Integer -- Optional, 512 by default
+# $4 -- Training steps -- Integer -- Optional, 60000 by default
+# $5 -- EXAMPLES_PER_TEMPLATE -- Integer -- Optional, 600 by default

if [ ! -n "$1" ] ;then
echo "you have not input a project name!"
else
echo "The project name will be set to $1"
fi
if [ ! -n "$2" ] ;then
dimension=300
elif [[ ! $2 =~ ^[0-9]*$ ]]; then
@@ -27,18 +30,42 @@ else
fi
if [ ! -n "$3" ] ;then
num_units=512
-elif [[ ! $2 =~ ^[0-9]*$ ]]; then
+elif [[ ! $3 =~ ^[0-9]*$ ]]; then
echo "Please enter an integer [ >=512 recommended ] to the third parameter to set the number of units of LSTM cells"

else
num_units=$3
echo "The number of units of LSTM cells is set to $num_units"
fi
if [ ! -n "$4" ] ;then
training_steps=60000
elif [[ ! $4 =~ ^[0-9]*$ ]]; then
echo "Please enter an integer [ >=60000 recommended ] to the fourth parameter to set the number of training steps for Learner"

else
training_steps=$4
echo "The number of training steps for Learner is set to $training_steps"
fi
if [ ! -n "$5" ] ;then
examples_per_template=600
elif [[ ! $5 =~ ^[0-9]*$ ]]; then
echo "Please enter an integer [ >=600 recommended ] to the fifth parameter to set the number of examples per template"

else
examples_per_template=$5
echo "The number of examples per template is set to $examples_per_template"
fi


# 1. Generate templates
-python multi_generate_templates.py --label '['Colour', 'Organisation', 'Person', 'Software', 'Artwork', 'Place', 'Work', 'Bird']' --project_name $1 --depth 1 --multi True
+partr="../utility/part-r-00000"

+if [ ! -d $partr ]; then
+wget https://s3.amazonaws.com/subjectiveEye/0.9/subjectiveEye3D/part-r-00000.gz
+gzip -d part-r-00000.gz
+fi
+python multi_generate_templates.py --label '['Agent', 'Place', 'Work', 'Species', 'TopicalConcept', 'MeanOfTransportation', 'Event', 'AnatomicalStructure', 'Device', 'TimePeriod', 'Activity']' --project_name $1 --depth 1 --multi True
+#'['Agent', 'Place', 'Work', 'Species', 'TopicalConcept', 'MeanOfTransportation', 'Event', 'Algorithm', 'Altitude', 'AnatomicalStructure', 'Area', 'Award', 'Biomolecule', 'Blazon', 'Browser', 'ChartsPlacements', 'ChemicalSubstance', 'Cipher', 'Colour', 'Currency', 'Demographics', 'Depth', 'Device', 'Diploma', 'Disease', 'ElectionDiagram', 'ElectricalSubstation', 'EthnicGroup', 'FileSystem', 'Flag', 'Food', 'GeneLocation', 'GrossDomesticProduct', 'Holiday', 'Identifier', 'Language', 'List', 'Media', 'MedicalSpecialty', 'Medicine', 'Name', 'PersonFunction', 'Population', 'Protocol', 'PublicService', 'Relationship', 'PersonFunction', 'SportsSeason', 'Spreadsheet', 'StarCluster', 'Statistic', 'Tank', 'TimePeriod', 'UnitOfWork', 'Unknown']'
# 2. Batch Paraphrasing
# 2.1 Download BERT-Classifier

@@ -58,7 +85,7 @@ fi
cd ../../../ # [neural-qa]/gsoc/

mkdir ./data/$1
-python generator.py --templates ./gsoc/zheyuan/pipeline/$1/basic_sentence_and_template_generator_paraphrased --output ./data/$1
+python generator.py --templates ./gsoc/zheyuan/pipeline/$1/basic_sentence_and_template_generator_paraphrased --output ./data/$1 --examples $examples_per_template
# 3.2 Generate vocab (simple tokenizing and normalization)
cd ./gsoc/zheyuan/utility # [neural-qa]/gsoc/zheyuan/utility
python vocab_creator.py --path ../../../data/$1
@@ -69,7 +96,7 @@ fi
if [ ! -d ./GloVe/glove.6B ]; then
curl --output ./GloVe/glove.6B.zip http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip

-unzip ./GloVe/glove.6B.zip -d ./Glove/glove.6B
+unzip ./GloVe/glove.6B.zip -d ./GloVe/glove.6B

else
ls ./GloVe/glove.6b
@@ -79,14 +106,17 @@ fi
cd ./GloVe
python glove_finetune.py --path ../../../../data/$1
cd ../../../../GloVe
if [ "$(uname)"=="Darwin" ]; then
if [ "$(uname)" == "Darwin" ]; then
# Mac OS X
echo "This is a Mac OSX environment"
sed -i "" "s/CORPUS=.*/CORPUS=data_s.sparql/" demo.sh
sed -i "" "s/SAVE_FILE=.*/SAVE_FILE=embed/" demo.sh
sed -i "" "s/VECTOR_SIZE=.*/VECTOR_SIZE=$dimension/" demo.sh
sed -i "" "s/VOCAB_MIN_COUNT=.*/VOCAB_MIN_COUNT=1/" demo.sh
elif [ "$(expr substr $(uname -s) 1 5)"=="Linux" ]; then
elif [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then

# GNU/Linux
echo "This is a Linux environment"
sed -i "s/CORPUS=.*/CORPUS=data_s.sparql/" demo.sh
sed -i "s/SAVE_FILE=.*/SAVE_FILE=embed/" demo.sh
sed -i "s/VECTOR_SIZE=.*/VECTOR_SIZE=$dimension/" demo.sh
@@ -104,8 +134,7 @@ fi
cd ../../
# 4.2 Training with embedding
cd nmt
-python -m nmt.nmt --src=en --tgt=sparql --embed_prefix=../data/$1/embed --vocab_prefix=../data/$1/vocab --dev_prefix=../data/$1/dev --test_prefix=../data/$1/test --train_prefix=../data/$1/train --out_dir=../data/$1"_"$dimension"d_model" --num_train_steps=60000 --steps_per_stats=100 --num_layers=2 --num_units=$num_units --dropout=0.2 --metrics=bleu,accuracy
+python -m nmt.nmt --src=en --tgt=sparql --embed_prefix=../data/$1/embed --vocab_prefix=../data/$1/vocab --dev_prefix=../data/$1/dev --test_prefix=../data/$1/test --train_prefix=../data/$1/train --out_dir=../data/$1"_"$dimension"d_model" --num_train_steps=$training_steps --steps_per_stats=100 --num_layers=2 --num_units=$num_units --dropout=0.2 --metrics=bleu,accuracy
cd ..


fi
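Two of the fixes here are easy to miss: the `$2` to `$3` correction in the third parameter's regex check, and the spaces added around `==`. Inside `[ ... ]`, an expression written without spaces, such as `"$(uname)"=="Darwin"`, is parsed as a single word, and `test` treats any non-empty word as true, so the macOS branch was taken on every platform. A quick demonstration:

```bash
#!/usr/bin/env bash
# One word, always non-empty, so the test is always true -- even on Linux:
if [ "$(uname)"=="Darwin" ]; then echo "always taken"; fi

# With spaces, == compares the two operands (a bash extension inside [ ];
# strictly portable scripts would use = or the [[ ... ]] builtin):
if [ "$(uname)" == "Darwin" ]; then echo "only on macOS"; fi
```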
4 changes: 2 additions & 2 deletions gsoc/zheyuan/utility/GloVe/glove_finetune.py
@@ -51,7 +51,7 @@ def finetune_glove(project_path, glove_path="glove.6B.300d.txt", dimension=300):
sentence = sentence.strip("\n")
sentence = "<s> " + sentence + " </s>"
for word in sentence.split():
word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?").replace("i̇", ""))
word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?"))
print(len(word_en), word_en[:20])

vocab_en = list(set(word_en) - set(["<s>", "</s>"]))
@@ -67,7 +67,7 @@ def finetune_glove(project_path, glove_path="glove.6B.300d.txt", dimension=300):
finetune_glove = batch_finetune(finetune_glove, word_split, dimension)
start = end
end = start + stride
-finetune_glove = batch_finetune(finetune_glove, word_en[start:])
+finetune_glove = batch_finetune(finetune_glove, word_en[start:], dimension)
unk = calculate_unknown(finetune_glove, dimension)
finetune_glove["<UNK>"] = unk
with open(project_path+"/embed.en", "w") as w:
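The final `batch_finetune` call was the only one not forwarding `dimension`, so the tail batch fell back to the function's default of 300 and produced wrong-sized vectors whenever the pipeline ran at 50, 100, or 200 dimensions. A toy reproduction of that failure mode (the signature below is a hypothetical stand-in, not the real helper):

```python
def batch_finetune(vectors, words, dimension=300):
    # Hypothetical stand-in: assign each new word a vector of `dimension`.
    for word in words:
        vectors.setdefault(word, [0.0] * dimension)
    return vectors

vectors = batch_finetune({}, ["alpha"], 50)       # a 50-dimensional run
vectors = batch_finetune(vectors, ["beta"])       # the bug: "beta" silently gets 300-d
vectors = batch_finetune(vectors, ["gamma"], 50)  # the fix: dimension forwarded
print({w: len(v) for w, v in vectors.items()})    # {'alpha': 50, 'beta': 300, 'gamma': 50}
```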
7 changes: 5 additions & 2 deletions gsoc/zheyuan/utility/vocab_creator.py
@@ -12,7 +12,10 @@ def english_vocab(project_path):
word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?"))

vocab_en = list(set(word_en))
vocab_en.remove("")
try:
vocab_en.remove("")
except:
print("There is no \'\' in vocab_en")
with open(project_path+"/vocab.en", "w") as w:
for vocab in vocab_en:

@@ -36,7 +39,7 @@ def sparql_vocab(project_path):

def add_s_tokens(path):
with open(path+"/data.sparql", "r") as lines:
with open("./GloVe/GloVe-master/data_s.sparql", "w") as w:
with open(path+"/../../GloVe/data_s.sparql", "w") as w:
for line in lines:
new_line = "<s> " + line.strip() + " </s>\n"
w.write(new_line)
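`list.remove` raises `ValueError` when the element is absent, which is what the new guard absorbs. The commit's bare `except:` also swallows `KeyboardInterrupt` and everything else, though; two narrower sketches of the same guard (the vocabulary below is hypothetical):

```python
vocab_en = ["who", "", "is"]  # hypothetical vocabulary with a stray empty token

# Catch only the error that remove() can actually raise:
try:
    vocab_en.remove("")
except ValueError:
    print("There is no '' in vocab_en")

# Or sidestep exceptions entirely with a membership test:
if "" in vocab_en:
    vocab_en.remove("")
```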
