diff --git a/generator.py b/generator.py
index e110477..06ca52e 100755
--- a/generator.py
+++ b/generator.py
@@ -45,8 +45,8 @@
     'dbo:Athlete': ['dbo:LacrossePlayer'],
     'dbo:SportsTeam': ['dboBasketballTeam']
 }
-EXAMPLES_PER_TEMPLATE = 600
+# EXAMPLES_PER_TEMPLATE = 600


 def extract_bindings(data, template):
     matches = list()
@@ -316,12 +316,18 @@ def normalize(ontology_class):
         metavar='templateFile', help='templates', required=True)
     requiredNamed.add_argument(
         '--output', dest='output', metavar='outputDirectory',
         help='dataset directory', required=True)
+    requiredNamed.add_argument(
+        '--examples', dest='examples', metavar='examplesPerTemplate', help='examples per template', required=False)
     args = parser.parse_args()
     template_file = args.templates
     output_dir = args.output
     use_resources_dump = args.continue_generation
-
+    examples = args.examples
+    if examples:
+        EXAMPLES_PER_TEMPLATE = int(examples)
+    else:
+        EXAMPLES_PER_TEMPLATE = 600
     # print use_resources_dump => False

     time = datetime.datetime.today()
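Note: the new flag re-creates the old module constant from the command line. For reference, the same default-and-override behavior can be expressed entirely through argparse's own type/default machinery; a minimal standalone sketch, not part of this patch (flag names taken from the hunk above):

```python
import argparse

# Sketch: let argparse coerce the value and supply the default, so no
# manual int()/if-else rebinding of EXAMPLES_PER_TEMPLATE is needed.
parser = argparse.ArgumentParser()
parser.add_argument('--examples', dest='examples', metavar='examplesPerTemplate',
                    type=int, default=600, help='examples per template')
args = parser.parse_args()
EXAMPLES_PER_TEMPLATE = args.examples  # 600 unless --examples is given
print(EXAMPLES_PER_TEMPLATE)
```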
diff --git a/gsoc/zheyuan/README.md b/gsoc/zheyuan/README.md
deleted file mode 100644
index e69de29..0000000
diff --git a/gsoc/zheyuan/pipeline/README.md b/gsoc/zheyuan/pipeline/README.md
index 72606ac..e56ce76 100644
--- a/gsoc/zheyuan/pipeline/README.md
+++ b/gsoc/zheyuan/pipeline/README.md
@@ -3,19 +3,20 @@
 To run the complete pipeline, please use the command:

 ```bash
-./pipeline.sh [$1 Project's name] [$2 Integer] [$3 Integer]
+./pipeline.sh [$1 Project's name] [$2 Integer] [$3 Integer] [$4 Integer] [$5 Integer]
 ```
 $1 -- The project's name -- String -- Required
 $2 -- Dimension of the GloVe embeddings -- Integer [50|100|200|300] -- Optional, 300 by default
 $3 -- Number of unit in the LSTM cells -- Integer -- Optional, 512 by default
-
+ $4 -- Training steps -- Integer -- Optional, 60000 by default
+ $5 -- EXAMPLES_PER_TEMPLATE -- Integer -- Optional, 600 by default

 Examples
 ```bash
 ./pipeline.sh Project1
 ```
 ```bash
-./pipeline.sh Project2 300 512
+./pipeline.sh Project2 300 512 60000 600
 ```
diff --git a/gsoc/zheyuan/pipeline/batch_paraphrase.py b/gsoc/zheyuan/pipeline/batch_paraphrase.py
index 0afe08a..d1c12a5 100644
--- a/gsoc/zheyuan/pipeline/batch_paraphrase.py
+++ b/gsoc/zheyuan/pipeline/batch_paraphrase.py
@@ -1,4 +1,5 @@
 import argparse
+import os
 import tensorflow as tf
 tf.compat.v1.enable_eager_execution()
 from paraphrase_questions import paraphrase_questions,get_pretrained_model,prepare_model,set_seed,pick_final_sentence, pick_final_sentence_advanced
@@ -13,8 +14,9 @@ def batch_paraphrase(templates_path, model_dir):
     folder_path = get_pretrained_model(const.URL)
     set_seed(42)
     tokenizer, device, model = prepare_model(folder_path)
-    with open(templates_path, "r") as lines:
-        with open(templates_path + "_paraphrased", "w") as w:
+    dir = os.path.realpath(templates_path)
+    with open(dir, "r") as lines:
+        with open(dir + "_paraphrased", "w") as w:
             for line in lines:
                 prop = line.strip("\n").split(seperator)
                 question = prop[3]
@@ -22,6 +24,7 @@
                 paraphrased = pick_final_sentence(question, paraphrased_candidates)
                 advanced = pick_final_sentence_advanced(device, question, paraphrased_candidates, model_dir)
                 w.write(line)
+                print("Original", line)
                 # for i, candidate in enumerate(paraphrased_candidates):
                 #     new_prop = prop[:-1]
                 #     new_prop[3] = candidate
@@ -35,11 +38,14 @@
                 new_prop.append("Paraphrased \n")
                 new_line = seperator.join(new_prop)
                 w.write(new_line)
+                print("Paraphrase", new_line)
+                new_prop = prop[:-1]
                 new_prop[3] = advanced
                 new_prop.append("Paraphrased advanced\n")
                 new_line = seperator.join(new_prop)
                 w.write(new_line)
+                print("Advanced", new_line)


 if __name__=="__main__":
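Note: after this change, each template line produces three rows in the `*_paraphrased` file: the original, the selected paraphrase, and the advanced selection, each tagged in a trailing field. A minimal sketch of that row layout (the separator value and the 5-field line format are assumptions for illustration):

```python
seperator = ";"  # assumed for the demo; the real module defines its own
line = "a;b;c;who is the mayor of <A>;query\n"
prop = line.strip("\n").split(seperator)

rows = [line]  # the original line is written first
for question, tag in [("paraphrased question", "Paraphrased \n"),
                      ("advanced pick", "Paraphrased advanced\n")]:
    new_prop = prop[:-1]    # fresh copy without the last field
    new_prop[3] = question  # swap the question in place
    new_prop.append(tag)    # tag the row's origin
    rows.append(seperator.join(new_prop))

print("".join(rows), end="")  # three rows per input line
```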
diff --git a/gsoc/zheyuan/pipeline/pipeline.sh b/gsoc/zheyuan/pipeline/pipeline.sh
index cd17c6c..051de28 100755
--- a/gsoc/zheyuan/pipeline/pipeline.sh
+++ b/gsoc/zheyuan/pipeline/pipeline.sh
@@ -3,11 +3,14 @@
 # $1 -- The project's name -- String -- Required
 # $2 -- Dimension of the GloVe embeddings -- Integer [50|100|200|300] -- Optional, 300 by default
 # $3 -- Number of unit in the LSTM cells -- Integer -- Optional, 512 by default
+# $4 -- Training steps -- Integer -- Optional, 60000 by default
+# $5 -- EXAMPLES_PER_TEMPLATE -- Integer -- Optional, 600 by default
 if [ ! -n "$1" ] ;then
     echo "you have not input a project name!"
 else
     echo "The project name will be set to $1"
+fi
 if [ ! -n "$2" ] ;then
     dimension=300
 elif [[ ! $2 =~ ^[0-9]*$ ]]; then
@@ -27,18 +30,42 @@ else
 fi
 if [ ! -n "$3" ] ;then
     num_units=512
-elif [[ ! $2 =~ ^[0-9]*$ ]]; then
+elif [[ ! $3 =~ ^[0-9]*$ ]]; then
     echo "Please enter an integer [ >=512 recommended ] to the third parameter to set the number of units of LSTM cells"
 else
     num_units=$3
     echo "The number of units of LSTM cells is set to $num_units"
 fi
+if [ ! -n "$4" ] ;then
+    training_steps=60000
+elif [[ ! $4 =~ ^[0-9]*$ ]]; then
+    echo "Please enter an integer [ >=60000 recommended ] to the fourth parameter to set the number of training steps for Learner"
+
+else
+    training_steps=$4
+    echo "The number of training steps for Learner is set to $training_steps"
+fi
+if [ ! -n "$5" ] ;then
+    examples_per_template=600
+elif [[ ! $5 =~ ^[0-9]*$ ]]; then
+    echo "Please enter an integer [ >=600 recommended ] to the fifth parameter to set the number of examples per template"
+
+else
+    examples_per_template=$5
+    echo "The number of examples per template is set to $examples_per_template"
+fi
 # 1. Generate templates
-    python multi_generate_templates.py --label '['Colour', 'Organisation', 'Person', 'Software', 'Artwork', 'Place', 'Work', 'Bird']' --project_name $1 --depth 1 --multi True
+    partr="../utility/part-r-00000"
+    if [ ! -f "$partr" ]; then
+        wget -P ../utility https://s3.amazonaws.com/subjectiveEye/0.9/subjectiveEye3D/part-r-00000.gz
+        gzip -d ../utility/part-r-00000.gz
+    fi
+    python multi_generate_templates.py --label '['Agent', 'Place', 'Work', 'Species', 'TopicalConcept', 'MeanOfTransportation', 'Event', 'AnatomicalStructure', 'Device', 'TimePeriod', 'Activity']' --project_name $1 --depth 1 --multi True
+#'['Agent', 'Place', 'Work', 'Species', 'TopicalConcept', 'MeanOfTransportation', 'Event', 'Algorithm', 'Altitude', 'AnatomicalStructure', 'Area', 'Award', 'Biomolecule', 'Blazon', 'Browser', 'ChartsPlacements', 'ChemicalSubstance', 'Cipher', 'Colour', 'Currency', 'Demographics', 'Depth', 'Device', 'Diploma', 'Disease', 'ElectionDiagram', 'ElectricalSubstation', 'EthnicGroup', 'FileSystem', 'Flag', 'Food', 'GeneLocation', 'GrossDomesticProduct', 'Holiday', 'Identifier', 'Language', 'List', 'Media', 'MedicalSpecialty', 'Medicine', 'Name', 'PersonFunction', 'Population', 'Protocol', 'PublicService', 'Relationship', 'PersonFunction', 'SportsSeason', 'Spreadsheet', 'StarCluster', 'Statistic', 'Tank', 'TimePeriod', 'UnitOfWork', 'Unknown']'
 # 2. Batch Paraphrasing
 # 2.1 Download BERT-Classifier
@@ -58,7 +85,7 @@ fi
     cd ../../../ # [neural-qa]/gsoc/
     mkdir ./data/$1
-    python generator.py --templates ./gsoc/zheyuan/pipeline/$1/basic_sentence_and_template_generator_paraphrased --output ./data/$1
+    python generator.py --templates ./gsoc/zheyuan/pipeline/$1/basic_sentence_and_template_generator_paraphrased --output ./data/$1 --examples $examples_per_template
 # 3.2 Generate vocab (simple tokenizing and normalization)
     cd ./gsoc/zheyuan/utility # [neural-qa]/gsoc/zheyuan/utility
     python vocab_creator.py --path ../../../data/$1
@@ -69,7 +96,7 @@ fi
     if [ ! -d ./GloVe/glove.6B ]; then
         curl --output ./GloVe/glove.6B.zip http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
-        unzip ./GloVe/glove.6B.zip -d ./Glove/glove.6B
+        unzip ./GloVe/glove.6B.zip -d ./GloVe/glove.6B
     else
         ls ./GloVe/glove.6b
@@ -79,14 +106,17 @@
     cd ./GloVe
     python glove_finetune.py --path ../../../../data/$1
     cd ../../../../GloVe
-    if [ "$(uname)"=="Darwin" ]; then
+    if [ "$(uname)" == "Darwin" ]; then
     # Mac OS X
+        echo "This is a Mac OSX environment"
         sed -i "" "s/CORPUS=.*/CORPUS=data_s.sparql/" demo.sh
         sed -i "" "s/SAVE_FILE=.*/SAVE_FILE=embed/" demo.sh
         sed -i "" "s/VECTOR_SIZE=.*/VECTOR_SIZE=$dimension/" demo.sh
         sed -i "" "s/VOCAB_MIN_COUNT=.*/VOCAB_MIN_COUNT=1/" demo.sh
-    elif [ "$(expr substr $(uname -s) 1 5)"=="Linux" ]; then
+    elif [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then
+    # GNU/Linux
+        echo "This is a Linux environment"
         sed -i "s/CORPUS=.*/CORPUS=data_s.sparql/" demo.sh
         sed -i "s/SAVE_FILE=.*/SAVE_FILE=embed/" demo.sh
         sed -i "s/VECTOR_SIZE=.*/VECTOR_SIZE=$dimension/" demo.sh
@@ -104,8 +134,7 @@
     cd ../../
 # 4.2 Training with embedding
     cd nmt
-    python -m nmt.nmt --src=en --tgt=sparql --embed_prefix=../data/$1/embed --vocab_prefix=../data/$1/vocab --dev_prefix=../data/$1/dev --test_prefix=../data/$1/test --train_prefix=../data/$1/train --out_dir=../data/$1"_"$dimension"d_model" --num_train_steps=60000 --steps_per_stats=100 --num_layers=2 --num_units=$num_units --dropout=0.2 --metrics=bleu,accuracy
+    python -m nmt.nmt --src=en --tgt=sparql --embed_prefix=../data/$1/embed --vocab_prefix=../data/$1/vocab --dev_prefix=../data/$1/dev --test_prefix=../data/$1/test --train_prefix=../data/$1/train --out_dir=../data/$1"_"$dimension"d_model" --num_train_steps=$training_steps --steps_per_stats=100 --num_layers=2 --num_units=$num_units --dropout=0.2 --metrics=bleu,accuracy
     cd ..
-fi
\ No newline at end of file
diff --git a/gsoc/zheyuan/utility/GloVe/glove_finetune.py b/gsoc/zheyuan/utility/GloVe/glove_finetune.py
index aa452c8..54da0c3 100644
--- a/gsoc/zheyuan/utility/GloVe/glove_finetune.py
+++ b/gsoc/zheyuan/utility/GloVe/glove_finetune.py
@@ -51,7 +51,7 @@ def finetune_glove(project_path, glove_path="glove.6B.300d.txt", dimension=300):
         sentence = sentence.strip("\n")
         sentence = "<s> " + sentence + " </s>"
         for word in sentence.split():
-            word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?").replace("i̇", ""))
+            word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?"))
     print(len(word_en), word_en[:20])
     vocab_en = list(set(word_en) - set(["<s>", "</s>"]))
@@ -67,7 +67,7 @@ def finetune_glove(project_path, glove_path="glove.6B.300d.txt", dimension=300):
         finetune_glove = batch_finetune(finetune_glove, word_split, dimension)
         start = end
         end = start + stride
-    finetune_glove = batch_finetune(finetune_glove, word_en[start:])
+    finetune_glove = batch_finetune(finetune_glove, word_en[start:], dimension)
     unk = calculate_unknown(finetune_glove, dimension)
     finetune_glove["<unk>"] = unk
     with open(project_path+"/embed.en", "w") as w:
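Note: the second glove_finetune.py hunk matters when the pipeline runs with a non-default dimension. If batch_finetune falls back to a 300-d default for its missing third argument (an assumption; the signature is not shown in this diff), the final partial batch is built at the wrong width and cannot be combined with the rest of the embedding matrix. A self-contained illustration of that failure mode (numpy and all data here are hypothetical):

```python
import numpy as np

dimension = 100                                    # e.g. a 100-d GloVe run
batches = [np.zeros(dimension) for _ in range(3)]  # vectors finetuned in full batches
tail = np.zeros(300)                               # tail batch built with a 300-d default

try:
    np.stack(batches + [tail])                     # all rows must share one width
except ValueError as err:
    print("dimension mismatch:", err)
```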
diff --git a/gsoc/zheyuan/utility/vocab_creator.py b/gsoc/zheyuan/utility/vocab_creator.py
index cc30994..2519853 100644
--- a/gsoc/zheyuan/utility/vocab_creator.py
+++ b/gsoc/zheyuan/utility/vocab_creator.py
@@ -12,7 +12,10 @@ def english_vocab(project_path):
                 word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?"))

     vocab_en = list(set(word_en))
-    vocab_en.remove("")
+    try:
+        vocab_en.remove("")
+    except ValueError:
+        print("There is no \'\' in vocab_en")

     with open(project_path+"/vocab.en", "w") as w:
         for vocab in vocab_en:
@@ -36,7 +39,7 @@ def sparql_vocab(project_path):

 def add_s_tokens(path):
     with open(path+"/data.sparql", "r") as lines:
-        with open("./GloVe/GloVe-master/data_s.sparql", "w") as w:
+        with open(path+"/../../GloVe/data_s.sparql", "w") as w:
             for line in lines:
                 new_line = "<s> " + line.strip() + " </s>\n"
                 w.write(new_line)
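Note: the try/except above only guards against the single ValueError that list.remove raises when "" is absent. An exception-free alternative is set.discard, which is a no-op for missing elements; a sketch (the sample tokens are hypothetical):

```python
word_en = ["who", "", "is", "", "the", "mayor"]  # hypothetical token stream

vocab = set(word_en)
vocab.discard("")        # unlike list.remove, never raises if "" is absent
vocab_en = sorted(vocab)
print(vocab_en)          # ['is', 'mayor', 'the', 'who']
```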