#### Master Script 9: Train APM dictionaries and convert tokens to embedding layer indices ####
#
# Shubhayu Bhattacharyay
# University of Cambridge
# email address: [email protected]
#
### Contents:
# I. Initialisation
# II. Train APM dictionaries per repeated cross-validation partition and convert tokens to indices
### I. Initialisation
# Fundamental libraries
import os
import re
import sys
import time
import glob
import random
import datetime
import warnings
import itertools
import numpy as np
import pandas as pd
import pickle as cp
from tqdm import tqdm
import seaborn as sns
import multiprocessing
from scipy import stats
from pathlib import Path
from ast import literal_eval
import matplotlib.pyplot as plt
from collections import Counter
from argparse import ArgumentParser
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
warnings.filterwarnings(action="ignore")
from collections import Counter, OrderedDict
# PyTorch and PyTorch.Text methods
from torchtext.vocab import vocab, Vocab
# Scikit-Learn methods
from sklearn.preprocessing import KBinsDiscretizer
# Custom token preparation functions
from functions.token_preparation import no_digit_strings_to_na, categorizer, tokenize_categoricals, get_ts_event_tokens, get_date_event_tokens, load_tokens
# Set directory from which to load the APM tokens
token_dir = '/home/sb2406/rds/hpc-work/APM_tokens'
# Load cross-validation splits
cv_splits = pd.read_csv('../cross_validation_splits.csv')
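# Note: based on its use below, cross_validation_splits.csv is expected to contain (at least)
# the columns 'repeat', 'fold', 'GUPI', and 'test_or_train' (with values 'train'/'test')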
# Set the number of cores for parallel processing
NUM_CORES = multiprocessing.cpu_count()
### II. Train APM dictionaries per repeated cross-validation partition and convert tokens to indices
# Iterate through cross-validation splits
for curr_repeat in cv_splits.repeat.unique():
    for curr_fold in cv_splits.fold.unique():
        # Define the token directory of the current repeat-fold split
        fold_dir = os.path.join(token_dir,'repeat'+str(curr_repeat).zfill(2),'fold'+str(curr_fold).zfill(1))
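        # For example, repeat 1 / fold 1 resolves to <token_dir>/repeat01/fold1
        # given the zero-padding above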
        # Extract current training and testing GUPIs
        train_GUPIs = cv_splits.GUPI[(cv_splits.repeat == curr_repeat) & (cv_splits.fold == curr_fold) & (cv_splits.test_or_train == 'train')].values
        test_GUPIs = cv_splits.GUPI[(cv_splits.repeat == curr_repeat) & (cv_splits.fold == curr_fold) & (cv_splits.test_or_train == 'test')].values
        # Partition the training and testing GUPIs into contiguous, near-equal chunks across cores for parallel token loading
        train_s = [train_GUPIs.shape[0] // NUM_CORES for _ in range(NUM_CORES)]
        train_s[:(train_GUPIs.shape[0] - sum(train_s))] = [val+1 for val in train_s[:(train_GUPIs.shape[0] - sum(train_s))]]
        train_end_indices = np.cumsum(train_s)
        train_start_indices = np.insert(train_end_indices[:-1],0,0)
        train_files_per_core = [(train_GUPIs[train_start_indices[idx]:train_end_indices[idx]],fold_dir,True,'Collecting training tokens') for idx in range(len(train_start_indices))]
        test_s = [test_GUPIs.shape[0] // NUM_CORES for _ in range(NUM_CORES)]
        test_s[:(test_GUPIs.shape[0] - sum(test_s))] = [val+1 for val in test_s[:(test_GUPIs.shape[0] - sum(test_s))]]
        test_end_indices = np.cumsum(test_s)
        test_start_indices = np.insert(test_end_indices[:-1],0,0)
        test_files_per_core = [(test_GUPIs[test_start_indices[idx]:test_end_indices[idx]],fold_dir,True,'Collecting testing tokens') for idx in range(len(test_start_indices))]
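        # Illustrative example of the chunking above (not executed): with 10 training GUPIs
        # and 4 cores, train_s starts as [2, 2, 2, 2]; the remainder of 2 is added to the
        # first entries to give [3, 3, 2, 2], so the cumulative sums yield end indices
        # [3, 6, 8, 10] and start indices [0, 3, 6, 8], i.e. contiguous, near-equal chunks.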
        # Compile training set tokens
        with multiprocessing.Pool(NUM_CORES) as pool:
            train_tokens = pd.concat(pool.starmap(load_tokens, train_files_per_core),ignore_index=True)
        # Compile testing set tokens
        with multiprocessing.Pool(NUM_CORES) as pool:
            test_tokens = pd.concat(pool.starmap(load_tokens, test_files_per_core),ignore_index=True)
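        # Note: starmap unpacks each per-core tuple as the positional arguments of
        # load_tokens; judging from the tuples built above, these appear to be
        # (GUPI array, fold directory, progress flag, progress-bar label), with the
        # exact signature defined in functions/token_preparation.py.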
        # Create an ordered dictionary of training-set token frequencies from which to build the vocabulary
        train_token_freqs = train_tokens.Tokens.value_counts().to_dict(into=OrderedDict)
        # Build vocabulary (PyTorch Text)
        curr_vocab = vocab(train_token_freqs, min_freq=1)
        unk_token = '<unk>'
        if unk_token not in curr_vocab: curr_vocab.insert_token(unk_token, 0)
        curr_vocab.set_default_index(curr_vocab[unk_token])
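        # Index 0 is reserved for the '<unk>' token, and set_default_index makes any
        # lookup of a token absent from the training vocabulary (e.g. a token that only
        # appears in the testing set) return that '<unk>' index.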
        # Convert training and testing tokens to indices
        train_tokens['Index'] = [curr_vocab[t] for t in train_tokens.Tokens]
        test_tokens['Index'] = [curr_vocab[t] for t in test_tokens.Tokens]
        # Group indices by GUPI
        train_indices = train_tokens.groupby('GUPI',as_index=False)['Index'].aggregate(lambda col: col.tolist()).reset_index(drop=True)
        test_indices = test_tokens.groupby('GUPI',as_index=False)['Index'].aggregate(lambda col: col.tolist()).reset_index(drop=True)
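        # After grouping, each row pairs one GUPI with the full list of its token indices,
        # i.e. the sequence that is later fed to the embedding layer.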
        # Store training and testing indices in the current fold directory
        train_indices.to_pickle(os.path.join(fold_dir,'training_indices.pkl'))
        test_indices.to_pickle(os.path.join(fold_dir,'testing_indices.pkl'))
        # Store the trained torchtext Vocab object
        with open(os.path.join(fold_dir,'token_dictionary.pkl'), "wb") as f:
            cp.dump(curr_vocab, f)
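# For downstream scripts, these per-fold artefacts can be reloaded with, for example,
# pd.read_pickle(os.path.join(fold_dir,'training_indices.pkl')) for the index lists and
# cp.load(open(os.path.join(fold_dir,'token_dictionary.pkl'),'rb')) for the Vocab object
# (illustrative only; this script does not perform the reload).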