-
Notifications
You must be signed in to change notification settings - Fork 0
/
workflow.sql
356 lines (280 loc) · 14.9 KB
/
workflow.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
# Some basic parameter tuning
SET max_heap_table_size = 4294967295;
SET tmp_table_size = 4294967295;
SET bulk_insert_buffer_size = 256217728;
# Tables to import data
DROP TABLE IF EXISTS `EVAL_categories_clear`;
CREATE TABLE `EVAL_categories_clear` (
`resource` varchar(1000) NOT NULL,
`category` varchar(1000) NOT NULL) ENGINE=InnoDB DEFAULT CHARSET=utf8;
DROP TABLE IF EXISTS `EVAL_statements_clear`;
CREATE TABLE `EVAL_statements_clear` (
`subject` varchar(1000) NOT NULL,
`predicate` varchar(1000) NOT NULL,
`object` varchar(1000) NOT NULL) ENGINE=InnoDB DEFAULT CHARSET=utf8;
# Tables for suggestion cleansing
DROP TABLE IF EXISTS `EVAL_redirects_clear`;
CREATE TABLE `EVAL_redirects_clear` (
`resource` varchar(1000) NOT NULL,
`redirect` varchar(1000) NOT NULL) ENGINE=InnoDB DEFAULT CHARSET=utf8;
DROP TABLE IF EXISTS `EVAL_defined_functional_properties`;
CREATE TABLE `EVAL_defined_functional_properties` (
`predicate` varchar(1000) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
LOAD DATA LOCAL INFILE '~/KnowMin/repo/data/test_categories.csv' INTO TABLE EVAL_categories_clear FIELDS TERMINATED BY ',' ENCLOSED BY '\'' ESCAPED BY '\\' LINES TERMINATED BY '\n';
LOAD DATA LOCAL INFILE '~/KnowMin/repo/data/test_statements.csv' INTO TABLE EVAL_statements_clear FIELDS TERMINATED BY ',' ENCLOSED BY '\'' ESCAPED BY '\\' LINES TERMINATED BY '\n';
LOAD DATA LOCAL INFILE '~/KnowMin/repo/data/redirects.csv' INTO TABLE EVAL_redirects_clear FIELDS TERMINATED BY ',' ENCLOSED BY '\'' ESCAPED BY '\\' LINES TERMINATED BY '\n';
LOAD DATA LOCAL INFILE '~/KnowMin/repo/data/func_prop.csv' INTO TABLE EVAL_defined_functional_properties FIELDS TERMINATED BY ',' ENCLOSED BY '\"' ESCAPED BY '\\' LINES TERMINATED BY '\n';
#################################################################################################################################################################################
# Clean data from lists
DELETE FROM EVAL_categories_clear WHERE resource LIKE 'http://dbpedia.org/resource/List\_of\_%';
DELETE FROM EVAL_statements_clear WHERE subject LIKE 'http://dbpedia.org/resource/List\_of\_%';
#################################################################################################################################################################################
# Translate tables to md5 and create tables for re-translation
DROP TABLE IF EXISTS `EVAL_categories_md5`;
CREATE TABLE `EVAL_categories_md5` (
`resource_md5` char(32) NOT NULL,
`category_md5` char(32) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
INSERT INTO EVAL_categories_md5 SELECT md5(resource),md5(category) FROM EVAL_categories_clear;
ALTER TABLE `EVAL_categories_md5`
ADD INDEX `idx_categories_md5_resource` (`resource_md5` ASC),
ADD INDEX `idx_categories_md5_category` (`category_md5` ASC);
DROP TABLE IF EXISTS `EVAL_statements_md5`;
CREATE TABLE `EVAL_statements_md5` (
`subject_md5` char(32) NOT NULL,
`predicate_md5` char(32) NOT NULL,
`object_md5` char(32) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
INSERT INTO EVAL_statements_md5 SELECT md5(subject), md5(predicate), md5(object) FROM EVAL_statements_clear;
ALTER TABLE `EVAL_statements_md5`
ADD INDEX `idx_statements_md5_subject` (`subject_md5` ASC),
ADD INDEX `idx_statements_md5_predicate` (`predicate_md5` ASC),
ADD INDEX `idx_statements_md5_object` (`object_md5` ASC);
DROP TABLE IF EXISTS `EVAL_redirects_md5`;
CREATE TABLE `EVAL_redirects_md5` (
`resource_md5` char(32) NOT NULL,
`redirect_md5` char(32) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
INSERT INTO EVAL_redirects_md5 SELECT md5(resource),md5(redirect) FROM EVAL_redirects_clear;
ALTER TABLE `EVAL_redirects_md5`
ADD INDEX `idx_redirects_md5_resource` (`resource_md5` ASC),
ADD INDEX `idx_redirects_md5_redirect` (`redirect_md5` ASC);
DROP TABLE IF EXISTS `EVAL_category_translation`;
CREATE TABLE `EVAL_category_translation` (
`category` varchar(1000) NOT NULL,
`category_md5` char(32) NOT NULL,
PRIMARY KEY (`category_md5`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
INSERT IGNORE INTO EVAL_category_translation SELECT category,md5(category) FROM EVAL_categories_clear;
DROP TABLE IF EXISTS `EVAL_predicate_translation`;
CREATE TABLE `EVAL_predicate_translation` (
`predicate` varchar(1000) NOT NULL,
`predicate_md5` char(32) NOT NULL,
PRIMARY KEY (`predicate_md5`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
INSERT IGNORE INTO EVAL_predicate_translation SELECT predicate,md5(predicate) FROM EVAL_statements_clear;
DROP TABLE IF EXISTS `EVAL_resource_translation`;
CREATE TABLE `EVAL_resource_translation` (
`resource` varchar(1000) NOT NULL,
`resource_md5` char(32) NOT NULL,
PRIMARY KEY (`resource_md5`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
INSERT IGNORE INTO EVAL_resource_translation SELECT resource,md5(resource) FROM EVAL_categories_clear;
INSERT IGNORE INTO EVAL_resource_translation SELECT subject,md5(subject) FROM EVAL_statements_clear;
INSERT IGNORE INTO EVAL_resource_translation SELECT object,md5(object) FROM EVAL_statements_clear;
INSERT IGNORE INTO EVAL_resource_translation SELECT resource,md5(resource) FROM EVAL_redirects_clear;
INSERT IGNORE INTO EVAL_resource_translation SELECT redirect,md5(redirect) FROM EVAL_redirects_clear;
#################################################################################################################################################################################
# Clean data from redirects
DELETE FROM EVAL_statements_md5 WHERE subject_md5 IN(SELECT resource_md5 FROM EVAL_redirects_md5);
DELETE FROM EVAL_categories_md5 WHERE resource_md5 IN(SELECT resource_md5 FROM EVAL_redirects_md5);
#################################################################################################################################################################################
# Join categories and statements with inverted statements
DROP TABLE IF EXISTS `EVAL_cs_join_md5`;
CREATE TABLE `EVAL_cs_join_md5` (
`category_md5` CHAR(32),
`subject_md5` CHAR(32),
`predicate_md5` CHAR(32),
`object_md5` CHAR(32),
`inverted` tinyint(1)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
INSERT INTO EVAL_cs_join_md5
SELECT c.category_md5, st.subject_md5, st.predicate_md5, st.object_md5, FALSE
FROM EVAL_categories_md5 AS c
INNER JOIN EVAL_statements_md5 AS st
ON c.resource_md5 = st.subject_md5;
DROP TABLE IF EXISTS `EVAL_cat_wo_stat_md5`;
CREATE TABLE `EVAL_cat_wo_stat_md5` (
`resource_md5` char(32) NOT NULL,
`category_md5` char(32) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
INSERT INTO EVAL_cat_wo_stat_md5
SELECT c.resource_md5, c.category_md5
FROM EVAL_categories_md5 AS c
LEFT JOIN EVAL_statements_md5 AS st
ON c.resource_md5 = st.subject_md5
WHERE st.subject_md5 IS NULL;
CREATE INDEX `cws_md5_resource` on `EVAL_cat_wo_stat_md5`(`resource_md5`);
CREATE INDEX `cws_md5_category` on `EVAL_cat_wo_stat_md5`(`category_md5`);
INSERT INTO EVAL_cs_join_md5
SELECT cwo.category_md5, st.object_md5, st.predicate_md5, st.subject_md5, TRUE
FROM EVAL_cat_wo_stat_md5 AS cwo
INNER JOIN EVAL_statements_md5 AS st
ON cwo.resource_md5 = st.object_md5;
CREATE INDEX `idx_cs_join_md5_category` on `EVAL_cs_join_md5`(`category_md5`);
CREATE INDEX `idx_cs_join_md5_cpo` on `EVAL_cs_join_md5`(`category_md5`, `predicate_md5`, `object_md5`);
CREATE INDEX `idx_cs_join_md5_subject` on `EVAL_cs_join_md5`(`subject_md5`);
#################################################################################################################################################################################
# Precomputation of intermediate results
DROP TABLE IF EXISTS `EVAL_subjects_per_category_md5`;
CREATE TABLE `EVAL_subjects_per_category_md5`(
`category_md5` CHAR(32),
`subject_count` INT
)ENGINE=InnoDB DEFAULT CHARSET=utf8;
INSERT INTO EVAL_subjects_per_category_md5
SELECT distinct category_md5, COUNT(distinct resource_md5)
FROM EVAL_categories_md5
GROUP BY category_md5;
CREATE INDEX `idx_spc_category` ON `EVAL_subjects_per_category_md5`(`category_md5`);
DROP TABLE IF EXISTS `EVAL_predicate_object_count_md5`;
CREATE TABLE `EVAL_predicate_object_count_md5`(
`category_md5` CHAR(32) NOT NULL,
`predicate_md5` CHAR(32) NOT NULL,
`object_md5` CHAR(32) NOT NULL,
`count` INT NOT NULL,
`inverted` TINYINT(1) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
INSERT INTO EVAL_predicate_object_count_md5
SELECT cs.category_md5, cs.predicate_md5, cs.object_md5, COUNT(DISTINCT cs.subject_md5), cs.inverted
FROM EVAL_cs_join_md5 AS cs
LEFT JOIN EVAL_subjects_per_category_md5 AS spc
ON cs.category_md5 = spc.category_md5
WHERE spc.subject_count >2
GROUP BY cs.category_md5, cs.predicate_md5, cs.object_md5;
CREATE INDEX `idx_poc_md5_category` ON `EVAL_predicate_object_count_md5`(`category_md5`);
CREATE INDEX `idx_poc_md5_count` ON `EVAL_predicate_object_count_md5`(`count`);
DROP TABLE IF EXISTS `EVAL_relative_frequencies_md5`;
CREATE TABLE `EVAL_relative_frequencies_md5`(
`category_md5` CHAR(32) NOT NULL,
`predicate_md5` CHAR(32) NOT NULL,
`object_md5` CHAR(32) NOT NULL,
`probability` float NOT NULL,
`inverted` TINYINT(1) NOT NULL
)ENGINE=InnoDB DEFAULT CHARSET=utf8;
INSERT INTO EVAL_relative_frequencies_md5
SELECT poc.category_md5, poc.predicate_md5, poc.object_md5, (poc.count/spc.subject_count), poc.inverted
FROM EVAL_predicate_object_count_md5 AS poc
LEFT JOIN EVAL_subjects_per_category_md5 AS spc
ON poc.category_md5 = spc.category_md5;
CREATE INDEX `idx_pp_category` ON `EVAL_relative_frequencies_md5`(`category_md5`);
CREATE INDEX `idx_pp_cpo` ON `EVAL_relative_frequencies_md5`(`category_md5`, `predicate_md5`, `object_md5`);
#################################################################################################################################################################################
# Precomputation of functional properties
DROP TABLE IF EXISTS `EVAL_tmp_functional_properties_md5`;
CREATE TABLE `EVAL_tmp_functional_properties_md5` (
`predicate_md5` char(32) NOT NULL,
`defined_functional` tinyint(1) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
INSERT INTO EVAL_tmp_functional_properties_md5
SELECT md5(predicate), 1
FROM EVAL_defined_functional_properties;
INSERT INTO EVAL_tmp_functional_properties_md5
SELECT DISTINCT predicate_md5, 0
FROM EVAL_predicate_translation
WHERE predicate_md5
NOT IN (SELECT DISTINCT predicate_md5 FROM EVAL_tmp_functional_properties_md5);
DROP TABLE IF EXISTS `EVAL_functional_properties_md5`;
CREATE TABLE `EVAL_functional_properties_md5` (
`predicate_md5` varchar(1000) NOT NULL,
`avg_values` double NOT NULL,
`defined_functional` tinyint(1) NOT NULL,
`considered_functional` tinyint(1) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
INSERT INTO EVAL_functional_properties_md5
SELECT avgs.predicate_md5, avg, defined_functional, 0
FROM
(SELECT predicate_md5, AVG(count) AS avg
FROM
(SELECT subject_md5, predicate_md5, COUNT(predicate_md5) AS count
FROM EVAL_statements_md5
GROUP BY subject_md5, predicate_md5) counts
GROUP BY predicate_md5) avgs
INNER JOIN EVAL_tmp_functional_properties_md5 fp
ON avgs.predicate_md5 = fp.predicate_md5
ORDER BY avg;
UPDATE EVAL_functional_properties_md5
SET considered_functional = 1
WHERE defined_functional = 1
OR avg_values = 1;
#################################################################################################################################################################################
# Create suggestions
DROP TABLE IF EXISTS `EVAL_suggestions_md5`;
CREATE TABLE `EVAL_suggestions_md5` (
`status` varchar(7),
`subject_md5` CHAR(32),
`predicate_md5` CHAR(32),
`object_md5` CHAR(32),
`probability` float,
`category_md5` CHAR(32),
`inverted` tinyint(1)
)ENGINE=InnoDB DEFAULT CHARSET=utf8;
INSERT INTO EVAL_suggestions_md5
SELECT "A" AS status, ca.resource_md5 AS subject_md5, pp.predicate_md5, pp.object_md5, pp.probability, pp.category_md5, pp.inverted
FROM
(SELECT pp.predicate_md5, pp.object_md5, pp.probability, pp.category_md5, pp.inverted
FROM EVAL_relative_frequencies_md5 AS pp
WHERE pp.probability >= 0.9
AND pp.probability < 1) AS pp
JOIN EVAL_categories_md5 AS ca ON pp.category_md5 = ca.category_md5
LEFT JOIN EVAL_cs_join_md5 AS st ON st.subject_md5 = ca.resource_md5
AND st.predicate_md5 = pp.predicate_md5
AND st.object_md5 = pp.object_md5
WHERE st.predicate_md5 IS NULL
AND st.object_md5 IS NULL;
#################################################################################################################################################################################
# Clean suggestions from selflinks
DELETE FROM EVAL_suggestions_md5 WHERE subject_md5 = object_md5;
#################################################################################################################################################################################
# Clean suggestions from functional properties
DROP TABLE IF EXISTS `EVAL_functional_prop_suggestions_md5`;
CREATE TABLE `EVAL_functional_prop_suggestions_md5` (
`status` varchar(7),
`subject_md5` CHAR(32),
`predicate_md5` CHAR(32),
`object_md5` CHAR(32),
`probability` float,
`category_md5` CHAR(32),
`inverted` tinyint(1)
)ENGINE=InnoDB DEFAULT CHARSET=utf8;
INSERT INTO EVAL_functional_prop_suggestions_md5
SELECT *
FROM EVAL_suggestions_md5
WHERE EXISTS (SELECT * FROM EVAL_statements_md5 st
WHERE EVAL_suggestions_md5.subject_md5 = st.subject_md5
AND EVAL_suggestions_md5.predicate_md5 = st.predicate_md5)
AND EVAL_suggestions_md5.predicate_md5 IN (SELECT predicate_md5 FROM EVAL_functional_properties_md5 WHERE considered_functional = 1);
DELETE FROM EVAL_suggestions_md5
WHERE EXISTS (SELECT * FROM EVAL_statements_md5 st
WHERE EVAL_suggestions_md5.subject_md5 = st.subject_md5
AND EVAL_suggestions_md5.predicate_md5 = st.predicate_md5)
AND EVAL_suggestions_md5.predicate_md5 IN (SELECT predicate_md5 FROM EVAL_functional_properties_md5 WHERE considered_functional = 1);
#################################################################################################################################################################################
# Re-translate suggestions
DROP TABLE IF EXISTS `EVAL_suggestions_clear`;
CREATE TABLE `EVAL_suggestions_clear` (
`status` VARCHAR(7),
`subject` VARCHAR(1000),
`predicate` VARCHAR(1000),
`object` VARCHAR(1000),
`probability` FLOAT,
`category` VARCHAR(1000),
`inverted` tinyint(1)
)ENGINE=InnoDB DEFAULT CHARSET=utf8;
INSERT INTO EVAL_suggestions_clear
SELECT sug.status, s2md5.resource, p2md5.predicate, o2md5.resource, sug.probability, c2md5.category, sug.inverted
FROM EVAL_suggestions_md5 AS sug
LEFT JOIN EVAL_resource_translation AS s2md5 ON sug.subject_md5 = s2md5.resource_md5
LEFT JOIN EVAL_predicate_translation AS p2md5 ON sug.predicate_md5 = p2md5.predicate_md5
LEFT JOIN EVAL_resource_translation AS o2md5 ON sug.object_md5 = o2md5.resource_md5
LEFT JOIN EVAL_category_translation AS c2md5 ON sug.category_md5 = c2md5.category_md5;