From 24e370109e10e6018b08e1d9d3da746d945f7ebe Mon Sep 17 00:00:00 2001 From: BaiBlanc <1458491606@qq.com> Date: Sat, 29 Aug 2020 02:03:36 +0200 Subject: [PATCH 1/5] Added all root ontology classes --- gsoc/zheyuan/pipeline/pipeline.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gsoc/zheyuan/pipeline/pipeline.sh b/gsoc/zheyuan/pipeline/pipeline.sh index cd17c6c..c4b9a89 100755 --- a/gsoc/zheyuan/pipeline/pipeline.sh +++ b/gsoc/zheyuan/pipeline/pipeline.sh @@ -37,7 +37,7 @@ fi # 1. Generate templates - python multi_generate_templates.py --label '['Colour', 'Organisation', 'Person', 'Software', 'Artwork', 'Place', 'Work', 'Bird']' --project_name $1 --depth 1 --multi True + python multi_generate_templates.py --label '['Agent', 'Place', 'Work', 'Species', 'TopicalConcept', 'MeanOfTransportation', 'Event', 'Algorithm', 'Altitude', 'AnatomicalStructure', 'Area', 'Award', 'Biomolecule', 'Blazon', 'Browser', 'ChartsPlacements', 'ChemicalSubstance', 'Cipher', 'Colour', 'Currency', 'Demographics', 'Depth', 'Device', 'Diploma', 'Disease', 'ElectionDiagram', 'ElectricalSubstation', 'EthnicGroup', 'FileSystem', 'Flag', 'Food', 'GeneLocation', 'GrossDomesticProduct', 'Holiday', 'Identifier', 'Language', 'List', 'Media', 'MedicalSpecialty', 'Medicine', 'Name', 'PersonFunction', 'Population', 'Protocol', 'PublicService', 'Relationship', 'PersonFunction', 'SportsSeason', 'Spreadsheet', 'StarCluster', 'Statistic', 'Tank', 'TimePeriod', 'UnitOfWork', 'Unknown']' --project_name $1 --depth 1 --multi True # 2. Batch Paraphrasing # 2.1 Download BERT-Classifier From bd8c2fe6161488a336fc8c529e99c583d4baecf6 Mon Sep 17 00:00:00 2001 From: BaiBlanc <1458491606@qq.com> Date: Sat, 29 Aug 2020 13:01:59 +0200 Subject: [PATCH 2/5] EXAMPLES_PER_TEMPLATE and training_steps are added as optional parameters of the pipeline --- generator.py | 10 +++++++-- gsoc/zheyuan/pipeline/pipeline.sh | 36 +++++++++++++++++++++++++------ 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/generator.py b/generator.py index e110477..96424bc 100755 --- a/generator.py +++ b/generator.py @@ -45,8 +45,8 @@ 'dbo:Athlete': ['dbo:LacrossePlayer'], 'dbo:SportsTeam': ['dboBasketballTeam'] } -EXAMPLES_PER_TEMPLATE = 600 +# EXAMPLES_PER_TEMPLATE = 600 def extract_bindings(data, template): matches = list() @@ -316,12 +316,18 @@ def normalize(ontology_class): metavar='templateFile', help='templates', required=True) requiredNamed.add_argument( '--output', dest='output', metavar='outputDirectory', help='dataset directory', required=True) + requiredNamed.add_argument( + '--examples', dest='examples', metavar='examples per template', help='dataset directory', required=False) args = parser.parse_args() template_file = args.templates output_dir = args.output use_resources_dump = args.continue_generation - + examples = args.examples + if examples: + EXAMPLES_PER_TEMPLATE = examples + else: + EXAMPLES_PER_TEMPLATE = 600 # print use_resources_dump => False time = datetime.datetime.today() diff --git a/gsoc/zheyuan/pipeline/pipeline.sh b/gsoc/zheyuan/pipeline/pipeline.sh index c4b9a89..07226fe 100755 --- a/gsoc/zheyuan/pipeline/pipeline.sh +++ b/gsoc/zheyuan/pipeline/pipeline.sh @@ -3,11 +3,14 @@ # $1 -- The project's name -- String -- Required # $2 -- Dimension of the GloVe embeddings -- Integer [50|100|200|300] -- Optional, 300 by default # $3 -- Number of unit in the LSTM cells -- Integer -- Optional, 512 by default +# $4 -- Training steps +# $5 -- EXAMPLES_PER_TEMPLATE -- Integer -- Optional, 600 by default if [ ! -n "$1" ] ;then echo "you have not input a project name!" else echo "The project name will be set to $1" +fi if [ ! -n "$2" ] ;then dimension=300 elif [[ ! $2 =~ ^[0-9]*$ ]]; then @@ -27,13 +30,31 @@ else fi if [ ! -n "$3" ] ;then num_units=512 -elif [[ ! $2 =~ ^[0-9]*$ ]]; then +elif [[ ! $3 =~ ^[0-9]*$ ]]; then echo "Please enter an integer [ >=512 recommended ] to the third parameter to set the number of units of LSTM cells" else num_units=$3 echo "The number of units of LSTM cells is set to $num_units" fi +if [ ! -n "$4" ] ;then + training_steps=60000 +elif [[ ! $4 =~ ^[0-9]*$ ]]; then + echo "Please enter an integer [ >=60000 recommended ] to the fourth parameter to set the number of training steps for Learner" + +else + training_steps=$4 + echo "The number of training steps for Learner is set to $training_steps" +fi +if [ ! -n "$5" ] ;then + examples_per_template=600 +elif [[ ! $5 =~ ^[0-9]*$ ]]; then + echo "Please enter an integer [ >=600 recommended ] to the fifth parameter to set the number of examples per template" + +else + examples_per_template=$5 + echo "The number of examples per template is set to $examples_per_template" +fi # 1. Generate templates @@ -58,7 +79,7 @@ fi cd ../../../ # [neural-qa]/gsoc/ mkdir ./data/$1 - python generator.py --templates ./gsoc/zheyuan/pipeline/$1/basic_sentence_and_template_generator_paraphrased --output ./data/$1 + python generator.py --templates ./gsoc/zheyuan/pipeline/$1/basic_sentence_and_template_generator_paraphrased --output ./data/$1 --examples $examples_per_template # 3.2 Generate vocab (simple tokenizing and normalization) cd ./gsoc/zheyuan/utility # [neural-qa]/gsoc/zheyuan/utility python vocab_creator.py --path ../../../data/$1 @@ -69,7 +90,7 @@ fi if [ ! -d ./GloVe/glove.6B ]; then curl --output ./GloVe/glove.6B.zip http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip - unzip ./GloVe/glove.6B.zip -d ./Glove/glove.6B + unzip ./GloVe/glove.6B.zip -d ./GloVe/glove.6B else ls ./GloVe/glove.6b @@ -79,14 +100,17 @@ fi cd ./GloVe python glove_finetune.py --path ../../../../data/$1 cd ../../../../GloVe - if [ "$(uname)"=="Darwin" ]; then + if [ "$(uname)" == "Darwin" ]; then # Mac OS X + echo "This is a Mac OSX environment" sed -i "" "s/CORPUS=.*/CORPUS=data_s.sparql/" demo.sh sed -i "" "s/SAVE_FILE=.*/SAVE_FILE=embed/" demo.sh sed -i "" "s/VECTOR_SIZE=.*/VECTOR_SIZE=$dimension/" demo.sh sed -i "" "s/VOCAB_MIN_COUNT=.*/VOCAB_MIN_COUNT=1/" demo.sh - elif [ "$(expr substr $(uname -s) 1 5)"=="Linux" ]; then + elif [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then + # GNU/Linux + echo "This is a Linux environment" sed -i "s/CORPUS=.*/CORPUS=data_s.sparql/" demo.sh sed -i "s/SAVE_FILE=.*/SAVE_FILE=embed/" demo.sh sed -i "s/VECTOR_SIZE=.*/VECTOR_SIZE=$dimension/" demo.sh @@ -104,7 +128,7 @@ fi cd ../../ # 4.2 Training with embedding cd nmt - python -m nmt.nmt --src=en --tgt=sparql --embed_prefix=../data/$1/embed --vocab_prefix=../data/$1/vocab --dev_prefix=../data/$1/dev --test_prefix=../data/$1/test --train_prefix=../data/$1/train --out_dir=../data/$1"_"$dimension"d_model" --num_train_steps=60000 --steps_per_stats=100 --num_layers=2 --num_units=$num_units --dropout=0.2 --metrics=bleu,accuracy + python -m nmt.nmt --src=en --tgt=sparql --embed_prefix=../data/$1/embed --vocab_prefix=../data/$1/vocab --dev_prefix=../data/$1/dev --test_prefix=../data/$1/test --train_prefix=../data/$1/train --out_dir=../data/$1"_"$dimension"d_model" --num_train_steps=$training_steps --steps_per_stats=100 --num_layers=2 --num_units=$num_units --dropout=0.2 --metrics=bleu,accuracy cd .. From 44e27fb94b9c8d68a24d1fd47d3cdd417eaf37c3 Mon Sep 17 00:00:00 2001 From: BaiBlanc <1458491606@qq.com> Date: Sat, 29 Aug 2020 21:13:26 +0200 Subject: [PATCH 3/5] Fixing bugs --- gsoc/zheyuan/README.md | 0 gsoc/zheyuan/pipeline/README.md | 7 ++++--- gsoc/zheyuan/pipeline/pipeline.sh | 2 +- gsoc/zheyuan/utility/GloVe/glove_finetune.py | 2 +- gsoc/zheyuan/utility/vocab_creator.py | 5 ++++- 5 files changed, 10 insertions(+), 6 deletions(-) delete mode 100644 gsoc/zheyuan/README.md diff --git a/gsoc/zheyuan/README.md b/gsoc/zheyuan/README.md deleted file mode 100644 index e69de29..0000000 diff --git a/gsoc/zheyuan/pipeline/README.md b/gsoc/zheyuan/pipeline/README.md index 72606ac..e56ce76 100644 --- a/gsoc/zheyuan/pipeline/README.md +++ b/gsoc/zheyuan/pipeline/README.md @@ -3,19 +3,20 @@ To run the complete pipeline, please use the command: ```bash -./pipeline.sh [$1 Project's name] [$2 Integer] [$3 Integer] +./pipeline.sh [$1 Project's name] [$2 Integer] [$3 Integer] [$4 Integer] [$5 Integer] ``` $1 -- The project's name -- String -- Required $2 -- Dimension of the GloVe embeddings -- Integer [50|100|200|300] -- Optional, 300 by default $3 -- Number of unit in the LSTM cells -- Integer -- Optional, 512 by default - + $4 -- Training steps -- Integer -- Optional, 60000 by default + $5 -- EXAMPLES_PER_TEMPLATE -- Integer -- Optional, 600 by default Examples ```bash ./pipeline.sh Project1 ``` ```bash -./pipeline.sh Project2 300 512 +./pipeline.sh Project2 300 512 60000 600 ``` diff --git a/gsoc/zheyuan/pipeline/pipeline.sh b/gsoc/zheyuan/pipeline/pipeline.sh index 07226fe..d5b5e5b 100755 --- a/gsoc/zheyuan/pipeline/pipeline.sh +++ b/gsoc/zheyuan/pipeline/pipeline.sh @@ -3,7 +3,7 @@ # $1 -- The project's name -- String -- Required # $2 -- Dimension of the GloVe embeddings -- Integer [50|100|200|300] -- Optional, 300 by default # $3 -- Number of unit in the LSTM cells -- Integer -- Optional, 512 by default -# $4 -- Training steps +# $4 -- Training steps -- Integer -- Optional, 60000 by default # $5 -- EXAMPLES_PER_TEMPLATE -- Integer -- Optional, 600 by default if [ ! -n "$1" ] ;then diff --git a/gsoc/zheyuan/utility/GloVe/glove_finetune.py b/gsoc/zheyuan/utility/GloVe/glove_finetune.py index aa452c8..14a2629 100644 --- a/gsoc/zheyuan/utility/GloVe/glove_finetune.py +++ b/gsoc/zheyuan/utility/GloVe/glove_finetune.py @@ -67,7 +67,7 @@ def finetune_glove(project_path, glove_path="glove.6B.300d.txt", dimension=300): finetune_glove = batch_finetune(finetune_glove, word_split, dimension) start = end end = start + stride - finetune_glove = batch_finetune(finetune_glove, word_en[start:]) + finetune_glove = batch_finetune(finetune_glove, word_en[start:], dimension) unk = calculate_unknown(finetune_glove, dimension) finetune_glove[""] = unk with open(project_path+"/embed.en", "w") as w: diff --git a/gsoc/zheyuan/utility/vocab_creator.py b/gsoc/zheyuan/utility/vocab_creator.py index cc30994..4755f5a 100644 --- a/gsoc/zheyuan/utility/vocab_creator.py +++ b/gsoc/zheyuan/utility/vocab_creator.py @@ -12,7 +12,10 @@ def english_vocab(project_path): word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?")) vocab_en = list(set(word_en)) - vocab_en.remove("") + try: + vocab_en.remove("") + except: + print("There is no \'\' in vocab_en") with open(project_path+"/vocab.en", "w") as w: for vocab in vocab_en: From 82105bdef65319422edeed4fed07b1236e824b93 Mon Sep 17 00:00:00 2001 From: BaiBlanc <1458491606@qq.com> Date: Sat, 29 Aug 2020 21:29:48 +0200 Subject: [PATCH 4/5] Fixing bugs --- generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generator.py b/generator.py index 96424bc..06ca52e 100755 --- a/generator.py +++ b/generator.py @@ -325,7 +325,7 @@ def normalize(ontology_class): use_resources_dump = args.continue_generation examples = args.examples if examples: - EXAMPLES_PER_TEMPLATE = examples + EXAMPLES_PER_TEMPLATE = int(examples) else: EXAMPLES_PER_TEMPLATE = 600 # print use_resources_dump => False From 382f4a0c7cf43eb2b5359fbea4bc6d718ded9b61 Mon Sep 17 00:00:00 2001 From: BaiBlanc <1458491606@qq.com> Date: Mon, 31 Aug 2020 01:06:29 +0200 Subject: [PATCH 5/5] Fixing bugs --- gsoc/zheyuan/pipeline/batch_paraphrase.py | 10 ++++++++-- gsoc/zheyuan/pipeline/pipeline.sh | 9 +++++++-- gsoc/zheyuan/utility/GloVe/glove_finetune.py | 2 +- gsoc/zheyuan/utility/vocab_creator.py | 2 +- 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/gsoc/zheyuan/pipeline/batch_paraphrase.py b/gsoc/zheyuan/pipeline/batch_paraphrase.py index 0afe08a..d1c12a5 100644 --- a/gsoc/zheyuan/pipeline/batch_paraphrase.py +++ b/gsoc/zheyuan/pipeline/batch_paraphrase.py @@ -1,4 +1,5 @@ import argparse +import os import tensorflow as tf tf.compat.v1.enable_eager_execution() from paraphrase_questions import paraphrase_questions,get_pretrained_model,prepare_model,set_seed,pick_final_sentence, pick_final_sentence_advanced @@ -13,8 +14,9 @@ def batch_paraphrase(templates_path, model_dir): folder_path = get_pretrained_model(const.URL) set_seed(42) tokenizer, device, model = prepare_model(folder_path) - with open(templates_path, "r") as lines: - with open(templates_path + "_paraphrased", "w") as w: + dir = os.path.realpath(templates_path) + with open(dir, "r") as lines: + with open(dir + "_paraphrased", "w") as w: for line in lines: prop = line.strip("\n").split(seperator) question = prop[3] @@ -22,6 +24,7 @@ def batch_paraphrase(templates_path, model_dir): paraphrased = pick_final_sentence(question, paraphrased_candidates) advanced = pick_final_sentence_advanced(device, question, paraphrased_candidates, model_dir) w.write(line) + print("Original", line) # for i, candidate in enumerate(paraphrased_candidates): # new_prop = prop[:-1] # new_prop[3] = candidate @@ -35,11 +38,14 @@ def batch_paraphrase(templates_path, model_dir): new_prop.append("Paraphrased \n") new_line = seperator.join(new_prop) w.write(new_line) + print("Paraphrase", new_line) + new_prop = prop[:-1] new_prop[3] = advanced new_prop.append("Paraphrased advanced\n") new_line = seperator.join(new_prop) w.write(new_line) + print("Advanced", new_line) if __name__=="__main__": diff --git a/gsoc/zheyuan/pipeline/pipeline.sh b/gsoc/zheyuan/pipeline/pipeline.sh index d5b5e5b..051de28 100755 --- a/gsoc/zheyuan/pipeline/pipeline.sh +++ b/gsoc/zheyuan/pipeline/pipeline.sh @@ -58,8 +58,14 @@ fi # 1. Generate templates - python multi_generate_templates.py --label '['Agent', 'Place', 'Work', 'Species', 'TopicalConcept', 'MeanOfTransportation', 'Event', 'Algorithm', 'Altitude', 'AnatomicalStructure', 'Area', 'Award', 'Biomolecule', 'Blazon', 'Browser', 'ChartsPlacements', 'ChemicalSubstance', 'Cipher', 'Colour', 'Currency', 'Demographics', 'Depth', 'Device', 'Diploma', 'Disease', 'ElectionDiagram', 'ElectricalSubstation', 'EthnicGroup', 'FileSystem', 'Flag', 'Food', 'GeneLocation', 'GrossDomesticProduct', 'Holiday', 'Identifier', 'Language', 'List', 'Media', 'MedicalSpecialty', 'Medicine', 'Name', 'PersonFunction', 'Population', 'Protocol', 'PublicService', 'Relationship', 'PersonFunction', 'SportsSeason', 'Spreadsheet', 'StarCluster', 'Statistic', 'Tank', 'TimePeriod', 'UnitOfWork', 'Unknown']' --project_name $1 --depth 1 --multi True + partr="../utility/part-r-00000" + if [ ! -d $partr ]; then + wget https://s3.amazonaws.com/subjectiveEye/0.9/subjectiveEye3D/part-r-00000.gz + gzip -d part-r-00000.gz + fi + python multi_generate_templates.py --label '['Agent', 'Place', 'Work', 'Species', 'TopicalConcept', 'MeanOfTransportation', 'Event', 'AnatomicalStructure', 'Device', 'TimePeriod', 'Activity']' --project_name $1 --depth 1 --multi True +#'['Agent', 'Place', 'Work', 'Species', 'TopicalConcept', 'MeanOfTransportation', 'Event', 'Algorithm', 'Altitude', 'AnatomicalStructure', 'Area', 'Award', 'Biomolecule', 'Blazon', 'Browser', 'ChartsPlacements', 'ChemicalSubstance', 'Cipher', 'Colour', 'Currency', 'Demographics', 'Depth', 'Device', 'Diploma', 'Disease', 'ElectionDiagram', 'ElectricalSubstation', 'EthnicGroup', 'FileSystem', 'Flag', 'Food', 'GeneLocation', 'GrossDomesticProduct', 'Holiday', 'Identifier', 'Language', 'List', 'Media', 'MedicalSpecialty', 'Medicine', 'Name', 'PersonFunction', 'Population', 'Protocol', 'PublicService', 'Relationship', 'PersonFunction', 'SportsSeason', 'Spreadsheet', 'StarCluster', 'Statistic', 'Tank', 'TimePeriod', 'UnitOfWork', 'Unknown']' # 2. Batch Paraphrasing # 2.1 Download BERT-Classifier @@ -132,4 +138,3 @@ fi cd .. -fi \ No newline at end of file diff --git a/gsoc/zheyuan/utility/GloVe/glove_finetune.py b/gsoc/zheyuan/utility/GloVe/glove_finetune.py index 14a2629..54da0c3 100644 --- a/gsoc/zheyuan/utility/GloVe/glove_finetune.py +++ b/gsoc/zheyuan/utility/GloVe/glove_finetune.py @@ -51,7 +51,7 @@ def finetune_glove(project_path, glove_path="glove.6B.300d.txt", dimension=300): sentence = sentence.strip("\n") sentence = " " + sentence + " " for word in sentence.split(): - word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?").replace("i̇", "")) + word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?")) print(len(word_en), word_en[:20]) vocab_en = list(set(word_en) - set(["", ""])) diff --git a/gsoc/zheyuan/utility/vocab_creator.py b/gsoc/zheyuan/utility/vocab_creator.py index 4755f5a..2519853 100644 --- a/gsoc/zheyuan/utility/vocab_creator.py +++ b/gsoc/zheyuan/utility/vocab_creator.py @@ -39,7 +39,7 @@ def sparql_vocab(project_path): def add_s_tokens(path): with open(path+"/data.sparql", "r") as lines: - with open("./GloVe/GloVe-master/data_s.sparql", "w") as w: + with open(path+"/../../GloVe/data_s.sparql", "w") as w: for line in lines: new_line = " " + line.strip() + " \n" w.write(new_line)