"""
Executable to run AAE on the PubMed (CITREC) and Econis datasets
- For PubMed models can use no metadata, just titles and titles + more metadata
- For Econis models can use no metadata and titles
- To run the models on Econis using titles + more metadata see the separate script /eval/econis.py
"""
import argparse
import os
from collections import OrderedDict
from aaerec.datasets import Bags
from aaerec.evaluation import Evaluation
from aaerec.aae import AAERecommender, DecodingRecommender
from aaerec.baselines import RandomBaseline, Countbased, MostPopular
from aaerec.svd import SVDRecommender
from aaerec.vae import VAERecommender
from aaerec.dae import DAERecommender
from gensim.models.keyedvectors import KeyedVectors
from aaerec.condition import ConditionList, PretrainedWordEmbeddingCondition, CategoricalCondition
# Set this to the word2vec Google News corpus file
W2V_PATH = "./vectors/GoogleNews-vectors-negative300.bin.gz"
W2V_IS_BINARY = True
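# (The GoogleNews vectors are not bundled with this repository; they are
#  distributed via the word2vec project page, e.g.
#  https://code.google.com/archive/p/word2vec/)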
# Command line arguments
PARSER = argparse.ArgumentParser()
PARSER.add_argument('dataset', type=str,
                    help='Path to the dataset')
PARSER.add_argument('year', type=int,
                    help='First year of the test set')
PARSER.add_argument('-m', '--min-count', type=int, default=50,
                    help='Pruning parameter: minimum occurrence count for items')
PARSER.add_argument('-o', '--outfile', type=str, default=None,
                    help='Path of a logfile to append results to')
PARSER.add_argument('-dr', '--drop', type=str, default="1",
                    help='Drop parameter, parsed as int or float')
ARGS = PARSER.parse_args()
# Drop could also be a callable according to evaluation.py,
# but that is not supported as a command-line parameter.
try:
    drop = int(ARGS.drop)
except ValueError:
    drop = float(ARGS.drop)
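# An int drops that many items per citation set at test time; a float is
# presumably interpreted as a fraction (see the drop handling in evaluation.py).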
# Metadata (generic conditions) is configured here only for PubMed
# (Econis is handled through the separate script /eval/econis.py).
# key: name of a table
# owner_id: ID of the citing paper
# fields: list of column names in the table
# target_names: keys under which these data appear in the owner_attributes dictionary
# path: absolute path to the csv file
PMC_DATA_PATH = "/media/nvme1n1/lgalke/datasets/AAEREC/pmc_final_data"
mtdt_dic = OrderedDict()
mtdt_dic["author"] = {"owner_id": "pmId", "fields": ["name"],"target_names": ["author"],
"path": os.path.join(PMC_DATA_PATH, "author.csv")}
# No need to even load those
# mtdt_dic["mesh"] = {"owner_id": "document", "fields": ["descriptor"], "target_names": ["mesh"],
# "path": os.path.join(PMC_DATA_PATH, "mesh.csv")}
# With no metadata or just titles:
# DATASET = Bags.load_tabcomma_format(ARGS.dataset, unique=True)
# With more metadata for PubMed (generic conditions):
DATASET = Bags.load_tabcomma_format(ARGS.dataset, unique=True, owner_str="pmId",
                                    set_str="cited", meta_data_dic=mtdt_dic)
EVAL = Evaluation(DATASET, ARGS.year, logfile=ARGS.outfile)
EVAL.setup(min_count=ARGS.min_count, min_elements=2, drop=drop)
print("Loading pre-trained embedding", W2V_PATH)
VECTORS = KeyedVectors.load_word2vec_format(W2V_PATH, binary=W2V_IS_BINARY)
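# NOTE: the full GoogleNews model is large (~1.6 GB compressed) and loading it
# needs several GB of RAM. For quick smoke tests, any smaller file in word2vec
# format can be substituted (the file name below is illustrative only):
# VECTORS = KeyedVectors.load_word2vec_format("./vectors/small-vectors.txt", binary=False)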
# Hyperparameters
ae_params = {
    'n_code': 50,
    'n_epochs': 100,
    'batch_size': 500,
    'n_hidden': 100,
    'normalize_inputs': True,
}
vae_params = {
    'n_code': 50,
    # VAE results got worse with more epochs in preliminary optimization
    # (PubMed with threshold 50)
    'n_epochs': 50,
    'batch_size': 500,
    'n_hidden': 100,
    'normalize_inputs': True,
}
# Models without metadata
BASELINES = [
    # RandomBaseline(),
    # MostPopular(),
    Countbased(),
    SVDRecommender(100, use_title=False)
]
RECOMMENDERS = [
    AAERecommender(adversarial=False, lr=0.001, **ae_params),
    AAERecommender(gen_lr=0.001, reg_lr=0.001, **ae_params),
    VAERecommender(conditions=None, **vae_params),
    DAERecommender(conditions=None, **ae_params)
]
# Metadata to use (apart from SVD, which uses only titles)
CONDITIONS = ConditionList([
    ('title', PretrainedWordEmbeddingCondition(VECTORS)),
    ('journal', CategoricalCondition(embedding_dim=32, reduce="sum",
                                     sparse=False, embedding_on_gpu=True)),
    ('author', CategoricalCondition(embedding_dim=32, reduce="sum",
                                    sparse=False, embedding_on_gpu=True)),
    # ('mesh', CategoricalCondition(embedding_dim=32, reduce="sum",
    #                               sparse=True, embedding_on_gpu=True))
])
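# Condition keys index the papers' attributes: 'title' and 'journal' presumably
# come from the columns of the main dataset file, while 'author' is supplied
# via the target_names entry of mtdt_dic above.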
# Models with metadata (metadata used as set in CONDITIONS above)
CONDITIONED_MODELS = [
    # SVD can use only titles, not generic conditions
    # SVDRecommender(1000, use_title=True),
    AAERecommender(adversarial=False, conditions=CONDITIONS, **ae_params),
    AAERecommender(adversarial=True, conditions=CONDITIONS, **ae_params),
    DecodingRecommender(conditions=CONDITIONS, n_epochs=100, batch_size=500,
                        optimizer='adam', n_hidden=100, lr=0.001, verbose=True),
    VAERecommender(conditions=CONDITIONS, **vae_params),
    DAERecommender(conditions=CONDITIONS, **ae_params)
]
# Use only the partial citations/labels list (no additional metadata)
# with open(ARGS.outfile, 'a') as fh:
#     print("~ Partial List", "~" * 42, file=fh)
# EVAL(BASELINES + RECOMMENDERS)
# Use additional metadata (as defined in CONDITIONS for all models but SVD, which uses only titles).
# The guard avoids open(None, 'a') crashing when no outfile is given.
if ARGS.outfile is not None:
    with open(ARGS.outfile, 'a') as fh:
        print("~ Conditioned Models", "~" * 42, file=fh)
EVAL(CONDITIONED_MODELS)