diff --git a/generator.py b/generator.py
index e110477..06ca52e 100755
--- a/generator.py
+++ b/generator.py
@@ -45,8 +45,8 @@
'dbo:Athlete': ['dbo:LacrossePlayer'],
'dbo:SportsTeam': ['dbo:BasketballTeam']
}
-EXAMPLES_PER_TEMPLATE = 600
+# EXAMPLES_PER_TEMPLATE = 600
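+# Kept for reference: EXAMPLES_PER_TEMPLATE is now set from the --examples
+# command-line flag parsed further below.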
def extract_bindings(data, template):
matches = list()
@@ -316,12 +316,18 @@ def normalize(ontology_class):
metavar='templateFile', help='templates', required=True)
requiredNamed.add_argument(
'--output', dest='output', metavar='outputDirectory', help='dataset directory', required=True)
+ parser.add_argument(
+ '--examples', dest='examples', metavar='examplesPerTemplate', type=int, default=600, help='number of examples to generate per template')
args = parser.parse_args()
template_file = args.templates
output_dir = args.output
use_resources_dump = args.continue_generation
-
+ EXAMPLES_PER_TEMPLATE = args.examples
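+ # argparse supplies the former default of 600 when --examples is omitted.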
# print use_resources_dump => False
time = datetime.datetime.today()
diff --git a/gsoc/zheyuan/README.md b/gsoc/zheyuan/README.md
deleted file mode 100644
index e69de29..0000000
diff --git a/gsoc/zheyuan/pipeline/README.md b/gsoc/zheyuan/pipeline/README.md
index 72606ac..e56ce76 100644
--- a/gsoc/zheyuan/pipeline/README.md
+++ b/gsoc/zheyuan/pipeline/README.md
@@ -3,19 +3,20 @@
To run the complete pipeline, please use the command:
```bash
-./pipeline.sh [$1 Project's name] [$2 Integer] [$3 Integer]
+./pipeline.sh [$1 Project's name] [$2 Integer] [$3 Integer] [$4 Integer] [$5 Integer]
```
$1 -- The project's name -- String -- Required
$2 -- Dimension of the GloVe embeddings -- Integer [50|100|200|300] -- Optional, 300 by default
$3 -- Number of units in the LSTM cells -- Integer -- Optional, 512 by default
-
+ $4 -- Training steps -- Integer -- Optional, 60000 by default
+ $5 -- Examples per template (EXAMPLES_PER_TEMPLATE) -- Integer -- Optional, 600 by default
Examples
```bash
./pipeline.sh Project1
```
```bash
-./pipeline.sh Project2 300 512
+./pipeline.sh Project2 300 512 60000 600
```
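+Because the parameters are positional, overriding a later one requires passing
+all earlier ones as well, e.g. (hypothetical project name):
+```bash
+./pipeline.sh Project3 100 512 30000 200
+```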
diff --git a/gsoc/zheyuan/pipeline/batch_paraphrase.py b/gsoc/zheyuan/pipeline/batch_paraphrase.py
index 0afe08a..d1c12a5 100644
--- a/gsoc/zheyuan/pipeline/batch_paraphrase.py
+++ b/gsoc/zheyuan/pipeline/batch_paraphrase.py
@@ -1,4 +1,5 @@
import argparse
+import os
import tensorflow as tf
tf.compat.v1.enable_eager_execution()
from paraphrase_questions import paraphrase_questions,get_pretrained_model,prepare_model,set_seed,pick_final_sentence, pick_final_sentence_advanced
@@ -13,8 +14,9 @@ def batch_paraphrase(templates_path, model_dir):
folder_path = get_pretrained_model(const.URL)
set_seed(42)
tokenizer, device, model = prepare_model(folder_path)
- with open(templates_path, "r") as lines:
- with open(templates_path + "_paraphrased", "w") as w:
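+ # Resolve the templates path (following symlinks) so the "_paraphrased" copy
+ # is written next to the actual file.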
+ real_path = os.path.realpath(templates_path)
+ with open(real_path, "r") as lines:
+ with open(real_path + "_paraphrased", "w") as w:
for line in lines:
prop = line.strip("\n").split(seperator)
question = prop[3]
@@ -22,6 +24,7 @@ def batch_paraphrase(templates_path, model_dir):
paraphrased = pick_final_sentence(question, paraphrased_candidates)
advanced = pick_final_sentence_advanced(device, question, paraphrased_candidates, model_dir)
w.write(line)
+ print("Original", line)
# for i, candidate in enumerate(paraphrased_candidates):
# new_prop = prop[:-1]
# new_prop[3] = candidate
@@ -35,11 +38,14 @@ def batch_paraphrase(templates_path, model_dir):
new_prop.append("Paraphrased \n")
new_line = seperator.join(new_prop)
w.write(new_line)
+ print("Paraphrase", new_line)
+
new_prop = prop[:-1]
new_prop[3] = advanced
new_prop.append("Paraphrased advanced\n")
new_line = seperator.join(new_prop)
w.write(new_line)
+ print("Advanced", new_line)
if __name__=="__main__":
diff --git a/gsoc/zheyuan/pipeline/pipeline.sh b/gsoc/zheyuan/pipeline/pipeline.sh
index cd17c6c..051de28 100755
--- a/gsoc/zheyuan/pipeline/pipeline.sh
+++ b/gsoc/zheyuan/pipeline/pipeline.sh
@@ -3,11 +3,14 @@
# $1 -- The project's name -- String -- Required
# $2 -- Dimension of the GloVe embeddings -- Integer [50|100|200|300] -- Optional, 300 by default
# $3 -- Number of units in the LSTM cells -- Integer -- Optional, 512 by default
+# $4 -- Training steps -- Integer -- Optional, 60000 by default
+# $5 -- Examples per template (EXAMPLES_PER_TEMPLATE) -- Integer -- Optional, 600 by default
if [ ! -n "$1" ] ;then
echo "you have not input a project name!"
else
echo "The project name will be set to $1"
+fi
if [ ! -n "$2" ] ;then
dimension=300
elif [[ ! $2 =~ ^[0-9]*$ ]]; then
@@ -27,18 +30,42 @@ else
fi
if [ ! -n "$3" ] ;then
num_units=512
-elif [[ ! $2 =~ ^[0-9]*$ ]]; then
+elif [[ ! $3 =~ ^[0-9]*$ ]]; then
echo "Please enter an integer [ >=512 recommended ] to the third parameter to set the number of units of LSTM cells"
else
num_units=$3
echo "The number of units of LSTM cells is set to $num_units"
fi
+if [ ! -n "$4" ] ;then
+ training_steps=60000
+elif [[ ! $4 =~ ^[0-9]*$ ]]; then
+ echo "Please enter an integer [ >=60000 recommended ] to the fourth parameter to set the number of training steps for Learner"
+
+else
+ training_steps=$4
+ echo "The number of training steps for Learner is set to $training_steps"
+fi
+if [ ! -n "$5" ] ;then
+ examples_per_template=600
+elif [[ ! $5 =~ ^[0-9]*$ ]]; then
+ echo "Please enter an integer [ >=600 recommended ] to the fifth parameter to set the number of examples per template"
+
+else
+ examples_per_template=$5
+ echo "The number of examples per template is set to $examples_per_template"
+fi
# 1. Generate templates
- python multi_generate_templates.py --label '['Colour', 'Organisation', 'Person', 'Software', 'Artwork', 'Place', 'Work', 'Bird']' --project_name $1 --depth 1 --multi True
+ partr="../utility/part-r-00000"
+ if [ ! -f "$partr" ]; then
+ wget -O "$partr.gz" https://s3.amazonaws.com/subjectiveEye/0.9/subjectiveEye3D/part-r-00000.gz
+ gzip -d "$partr.gz"
+ fi
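+ # part-r-00000 (subjectiveEye3D) appears to hold Wikipedia page-view based
+ # relevance scores, presumably consumed by the template-generation utilities.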
+ python multi_generate_templates.py --label '['Agent', 'Place', 'Work', 'Species', 'TopicalConcept', 'MeanOfTransportation', 'Event', 'AnatomicalStructure', 'Device', 'TimePeriod', 'Activity']' --project_name $1 --depth 1 --multi True
+#'['Agent', 'Place', 'Work', 'Species', 'TopicalConcept', 'MeanOfTransportation', 'Event', 'Algorithm', 'Altitude', 'AnatomicalStructure', 'Area', 'Award', 'Biomolecule', 'Blazon', 'Browser', 'ChartsPlacements', 'ChemicalSubstance', 'Cipher', 'Colour', 'Currency', 'Demographics', 'Depth', 'Device', 'Diploma', 'Disease', 'ElectionDiagram', 'ElectricalSubstation', 'EthnicGroup', 'FileSystem', 'Flag', 'Food', 'GeneLocation', 'GrossDomesticProduct', 'Holiday', 'Identifier', 'Language', 'List', 'Media', 'MedicalSpecialty', 'Medicine', 'Name', 'PersonFunction', 'Population', 'Protocol', 'PublicService', 'Relationship', 'PersonFunction', 'SportsSeason', 'Spreadsheet', 'StarCluster', 'Statistic', 'Tank', 'TimePeriod', 'UnitOfWork', 'Unknown']'
# 2. Batch Paraphrasing
# 2.1 Download BERT-Classifier
@@ -58,7 +85,7 @@ fi
cd ../../../ # [neural-qa]/gsoc/
mkdir ./data/$1
- python generator.py --templates ./gsoc/zheyuan/pipeline/$1/basic_sentence_and_template_generator_paraphrased --output ./data/$1
+ python generator.py --templates ./gsoc/zheyuan/pipeline/$1/basic_sentence_and_template_generator_paraphrased --output ./data/$1 --examples $examples_per_template
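+ # $examples_per_template ($5) replaces the EXAMPLES_PER_TEMPLATE constant formerly hard-coded in generator.py.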
# 3.2 Generate vocab (simple tokenizing and normalization)
cd ./gsoc/zheyuan/utility # [neural-qa]/gsoc/zheyuan/utility
python vocab_creator.py --path ../../../data/$1
@@ -69,7 +96,7 @@ fi
if [ ! -d ./GloVe/glove.6B ]; then
curl --output ./GloVe/glove.6B.zip http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
- unzip ./GloVe/glove.6B.zip -d ./Glove/glove.6B
+ unzip ./GloVe/glove.6B.zip -d ./GloVe/glove.6B
else
ls ./GloVe/glove.6B
@@ -79,14 +106,17 @@ fi
cd ./GloVe
python glove_finetune.py --path ../../../../data/$1
cd ../../../../GloVe
- if [ "$(uname)"=="Darwin" ]; then
+ if [ "$(uname)" == "Darwin" ]; then
# Mac OS X
+ echo "This is a Mac OSX environment"
sed -i "" "s/CORPUS=.*/CORPUS=data_s.sparql/" demo.sh
sed -i "" "s/SAVE_FILE=.*/SAVE_FILE=embed/" demo.sh
sed -i "" "s/VECTOR_SIZE=.*/VECTOR_SIZE=$dimension/" demo.sh
sed -i "" "s/VOCAB_MIN_COUNT=.*/VOCAB_MIN_COUNT=1/" demo.sh
- elif [ "$(expr substr $(uname -s) 1 5)"=="Linux" ]; then
+ elif [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then
# GNU/Linux
+ echo "This is a Linux environment"
sed -i "s/CORPUS=.*/CORPUS=data_s.sparql/" demo.sh
sed -i "s/SAVE_FILE=.*/SAVE_FILE=embed/" demo.sh
sed -i "s/VECTOR_SIZE=.*/VECTOR_SIZE=$dimension/" demo.sh
@@ -104,8 +134,7 @@ fi
cd ../../
# 4.2 Training with embedding
cd nmt
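+ # $training_steps ($4, default 60000) and $num_units ($3, default 512) are passed through below.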
- python -m nmt.nmt --src=en --tgt=sparql --embed_prefix=../data/$1/embed --vocab_prefix=../data/$1/vocab --dev_prefix=../data/$1/dev --test_prefix=../data/$1/test --train_prefix=../data/$1/train --out_dir=../data/$1"_"$dimension"d_model" --num_train_steps=60000 --steps_per_stats=100 --num_layers=2 --num_units=$num_units --dropout=0.2 --metrics=bleu,accuracy
+ python -m nmt.nmt --src=en --tgt=sparql --embed_prefix=../data/$1/embed --vocab_prefix=../data/$1/vocab --dev_prefix=../data/$1/dev --test_prefix=../data/$1/test --train_prefix=../data/$1/train --out_dir=../data/$1"_"$dimension"d_model" --num_train_steps=$training_steps --steps_per_stats=100 --num_layers=2 --num_units=$num_units --dropout=0.2 --metrics=bleu,accuracy
cd ..
-fi
\ No newline at end of file
diff --git a/gsoc/zheyuan/utility/GloVe/glove_finetune.py b/gsoc/zheyuan/utility/GloVe/glove_finetune.py
index aa452c8..54da0c3 100644
--- a/gsoc/zheyuan/utility/GloVe/glove_finetune.py
+++ b/gsoc/zheyuan/utility/GloVe/glove_finetune.py
@@ -51,7 +51,7 @@ def finetune_glove(project_path, glove_path="glove.6B.300d.txt", dimension=300):
sentence = sentence.strip("\n")
sentence = " " + sentence + " "
for word in sentence.split():
- word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?").replace("i̇", ""))
+ word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?"))
print(len(word_en), word_en[:20])
vocab_en = list(set(word_en) - set(["<s>", "</s>"]))
@@ -67,7 +67,7 @@ def finetune_glove(project_path, glove_path="glove.6B.300d.txt", dimension=300):
finetune_glove = batch_finetune(finetune_glove, word_split, dimension)
start = end
end = start + stride
- finetune_glove = batch_finetune(finetune_glove, word_en[start:])
+ finetune_glove = batch_finetune(finetune_glove, word_en[start:], dimension)
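+ # The explicit dimension argument keeps the final partial batch consistent with
+ # 50/100/200-d runs; previously batch_finetune presumably fell back to its own default.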
unk = calculate_unknown(finetune_glove, dimension)
finetune_glove[""] = unk
with open(project_path+"/embed.en", "w") as w:
diff --git a/gsoc/zheyuan/utility/vocab_creator.py b/gsoc/zheyuan/utility/vocab_creator.py
index cc30994..2519853 100644
--- a/gsoc/zheyuan/utility/vocab_creator.py
+++ b/gsoc/zheyuan/utility/vocab_creator.py
@@ -12,7 +12,10 @@ def english_vocab(project_path):
word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?"))
vocab_en = list(set(word_en))
- vocab_en.remove("")
+ try:
+ vocab_en.remove("")
+ except ValueError:
+ print("There is no '' in vocab_en")
with open(project_path+"/vocab.en", "w") as w:
for vocab in vocab_en:
@@ -36,7 +39,7 @@ def sparql_vocab(project_path):
def add_s_tokens(path):
with open(path+"/data.sparql", "r") as lines:
- with open("./GloVe/GloVe-master/data_s.sparql", "w") as w:
+ with open(path+"/../../GloVe/data_s.sparql", "w") as w:
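+ # Written into the top-level GloVe directory so demo.sh (CORPUS=data_s.sparql) can find the corpus.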
for line in lines:
new_line = " " + line.strip() + " \n"
w.write(new_line)