Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

python 3 create_data script #4

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 28 additions & 25 deletions create_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import os
import re
import shutil
import urllib
import six
from six.moves.urllib.request import urlopen
from collections import OrderedDict
from io import BytesIO
from zipfile import ZipFile
Expand All @@ -26,11 +27,11 @@
MAX_LENGTH = 50
IGNORE_KEYS_IN_GOAL = ['eod', 'topic', 'messageLen', 'message']

fin = file('mapping.pair')
replacements = []
for line in fin.readlines():
tok_from, tok_to = line.replace('\n', '').split('\t')
replacements.append((' ' + tok_from + ' ', ' ' + tok_to + ' '))
with open('mapping.pair') as fin:
replacements = []
for line in fin:
tok_from, tok_to = line.replace('\n', '').split('\t')
replacements.append((' ' + tok_from + ' ', ' ' + tok_to + ' '))


def is_ascii(s):
Expand Down Expand Up @@ -139,14 +140,15 @@ def normalize(text, clean_value=True):

return text


def fixDelex(filename, data, data2, idx, idx_acts):
"""Given system dialogue acts fix automatic delexicalization."""
try:
turn = data2[filename.strip('.json')][str(idx_acts)]
except:
return data

if not isinstance(turn, str) and not isinstance(turn, unicode):
if not isinstance(turn, str) and not isinstance(turn, six.text_type):
for k, act in turn.items():
if 'Attraction' in k:
if 'restaurant_' in data['log'][idx]['text']:
Expand Down Expand Up @@ -175,7 +177,7 @@ def getDialogueAct(filename, data, data2, idx, idx_acts):
except:
return acts

if not isinstance(turn, str) and not isinstance(turn, unicode):
if not isinstance(turn, str) and not isinstance(turn, six.text_type):
for k in turn.keys():
# temp = [k.split('-')[0].lower(), k.split('-')[1].lower()]
# for a in turn[k]:
Expand All @@ -190,7 +192,6 @@ def getDialogueAct(filename, data, data2, idx, idx_acts):

return acts


def get_summary_bstate(bstate, get_domain=False):
"""Based on the mturk annotations we form multi-domain belief state"""
domains = [u'taxi',u'restaurant', u'hospital', u'hotel',u'attraction', u'train', u'police']
Expand Down Expand Up @@ -258,7 +259,7 @@ def analyze_dialogue(dialogue, maxlen):
#print path
print('odd # of turns')
return None # odd number of turns, wrong dialogue
d_pp = {}
d_pp = OrderedDict()
d_pp['goal'] = d['goal'] # for now we just copy the goal
usr_turns = []
sys_turns = []
Expand Down Expand Up @@ -304,6 +305,7 @@ def get_dial(dialogue):
return dial



def loadData():
data_url = "data/multi-woz/data.json"
dataset_url = "https://www.repository.cam.ac.uk/bitstream/handle/1810/280608/MULTIWOZ2.zip?sequence=3&isAllowed=y"
Expand All @@ -313,7 +315,7 @@ def loadData():

if not os.path.exists(data_url):
print("Downloading and unzipping the MultiWOZ dataset")
resp = urllib.urlopen(dataset_url)
resp = urlopen(dataset_url)
zip_ref = ZipFile(BytesIO(resp.read()))
zip_ref.extractall("data/multi-woz")
zip_ref.close()
Expand All @@ -323,22 +325,23 @@ def loadData():
shutil.copy('data/multi-woz/MULTIWOZ2 2/dialogue_acts.json', 'data/multi-woz/')



def getDomain(idx, log, domains, last_domain):
if idx == 1:
active_domains = get_summary_bstate(log[idx]["metadata"], True)
active_domains = get_summary_bstate(log[idx]["metadata"], True)
crnt_doms = active_domains[0] if len(active_domains)!=0 else domains[0]
return crnt_doms
else:
ds_diff = get_ds_diff(log[idx-2]["metadata"], log[idx]["metadata"])
if len(ds_diff.keys()) == 0: # no clues from dialog states
crnt_doms = last_domain
else:
crnt_doms = ds_diff.keys()
crnt_doms = list(ds_diff.keys())
return crnt_doms[0] # How about multiple domains in one sentence senario ?


def get_ds_diff(prev_d, crnt_d):
diff = {}
diff = OrderedDict()
# Sometimes, metadata is an empty dictionary, bug?
if not prev_d or not crnt_d:
return diff
Expand All @@ -356,12 +359,12 @@ def createData():

# create dictionary of delexicalized values that we will then search against, order matters here!
# dic = delexicalize.prepareSlotValuesIndependent()
delex_data = {}
delex_data = OrderedDict()

fin1 = file('data/multi-woz/data.json')
fin1 = open('data/multi-woz/data.json', 'r')
data = json.load(fin1)

fin2 = file('data/multi-woz/dialogue_acts.json')
fin2 = open('data/multi-woz/dialogue_acts.json', 'r')
data2 = json.load(fin2)

for didx, dialogue_name in enumerate(data):
Expand Down Expand Up @@ -405,7 +408,7 @@ def createData():


def buildDelexDict(origin_sent, delex_sent):
dictionary = {}
dictionary = OrderedDict()
s = difflib.SequenceMatcher(None, delex_sent.split(), origin_sent.split())
bs = s.get_matching_blocks()
for i, b in enumerate(bs):
Expand All @@ -421,13 +424,13 @@ def divideData(data):
"""Given test and validation sets, divide
the data for three different sets"""
testListFile = []
fin = file('data/multi-woz/testListFile.json')
fin = open('data/multi-woz/testListFile.json', 'r')
for line in fin:
testListFile.append(line[:-1])
fin.close()

valListFile = []
fin = file('data/multi-woz/valListFile.json')
fin = open('data/multi-woz/valListFile.json', 'r')
for line in fin:
valListFile.append(line[:-1])
fin.close()
Expand All @@ -454,15 +457,15 @@ def divideData(data):

dial = get_dial(data[dialogue_name])
if dial:
dialogue = {}
dialogue = OrderedDict()
dialogue['dialogue_idx'] = dialogue_name
dialogue['domains'] = list(set(domains)) #list(set([d['domain'] for d in dial]))
last_bs = []
dialogue['dialogue'] = []

for turn_i, turn in enumerate(dial):
# usr, usr_o, sys, sys_o, sys_a, domain
turn_dialog = {}
turn_dialog = OrderedDict()
turn_dialog['system_transcript'] = dial[turn_i-1]['sys'] if turn_i > 0 else ""
turn_dialog['turn_idx'] = turn_i
turn_dialog['belief_state'] = [{"slots": [s], "act": "inform"} for s in turn['bvs']]
Expand All @@ -487,13 +490,13 @@ def divideData(data):
print("# of dialogues: Train {}, Val {}, Test {}".format(count_train, count_val, count_test))

# save all dialogues
with open('data/dev_dials.json', 'wb') as f:
with open('data/dev_dials.json', 'w') as f:
json.dump(val_dials, f, indent=4)

with open('data/test_dials.json', 'wb') as f:
with open('data/test_dials.json', 'w') as f:
json.dump(test_dials, f, indent=4)

with open('data/train_dials.json', 'wb') as f:
with open('data/train_dials.json', 'w') as f:
json.dump(train_dials, f, indent=4)

# return word_freqs_usr, word_freqs_sys
Expand Down