# Workflow of phases and steps as described in Pham et al. Text mining to support abstract screening for systematic reviews: A workflow approach
# The R code was adapted from Jelena Jovanovic, LASI'18 Workshop on Text mining for learning content analysis
# https://github.com/jeljov/Text_Mining_at_LASI18
# The code was written in R using RStudio. The latest versions of R and RStudio are recommended.
# See "Humans and computing resources required for implementing the workflow" in the online Appendix of the article.
# Required packages for Figure 1 in the main paper.
# Figure.1.packages = c("dplyr", "tidyr", "caret", "rpart", "e1071", "DMwR", "quanteda", "kernlab", "randomForest", "nnet",
#                       "rpart.plot", "klaR", "irlba", "doSNOW", "text2vec", "glmnet", "udpipe", "topicmodels", "purrr")
# install.packages(Figure.1.packages) # install once per R installation (and after updates); attach the package libraries in each R session
# rm(list = ls()) # clean up all previous objects
# Step 0. Attach the library
library(readr) # tools to read rectangular data (like 'csv', 'tsv', and 'fwf')
library(dplyr) # tools for working with R data frames
library(tidyr) # tools for tidying data
library(caret) # Classification And REgression Training
library(rpart) # Recursive PARTitioning for building Classification and Regression Trees
library(e1071) # tools for support vector machines and many other topics
library(DMwR) # data mining tools
library(kernlab) # tools for kernel-based machine learning methods for classification and regression
library(quanteda) # tools for managing and analyzing textual data
library(randomForest) # tools for building random forest for classification
library(irlba) # tools for Fast Truncated Singular Value Decomposition and Principal Components Analysis
library(stringr) # tools for character manipulation
library(doSNOW) # parallel processing
library(text2vec) # text analysis and natural language processing (e.g., dist2)
# library(nnet) # neural network model
library(udpipe) # NLP language models
library(topicmodels) # topic modeling, including Latent Dirichlet Allocation model
library(purrr) # tools for working with functions and vectors
# Initialization
# Note: the helper functions used below (e.g., read_folder, extr_cuis, get_word_vectors_df, rf.model, unregister)
# are defined in the "Functions called by the Workflow functions" section near the end of this script;
# source those definitions (or the whole script) before executing the steps.
pos.label="INCLUDE" # coding of screening results
neg.label="EXCLUDE"
seed=123 # fix the pseudo-random number stream for reproducible cross-validation (used by set.seed(seed) in rf.model below); any fixed value works
###################################################################################
# Step 1. Import citations (Figure 1)
## Import input data of abstract text and results of abstract screening by human reviewers into R
## Input data: Comma-separated values (CSV) file with column names: id, title, abstract, status. See the file sample and some explanations below.
## column "id": numerical id of abstracts. Each abstract appears twice, as it was reviewed by pairs of reviewers.
## columns "title", "abstract": title and abstract of each abstract
## column "status": results of abstract screening by human reviewers, such as "INCLUDED" or "EXCLUDED"
## For status, use uppercase in the coding of "INCLUDED" or "EXCLUDED"; otherwise, please find and replace the hard coding categories above.
## In our practice, we put the protocol as the first record in the corpus as it contains all the relevant terms for abstract screening
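## Illustrative sample of the expected CSV layout (values are made up; only the column structure matters):
## id,title,abstract,status
## 1,"Systematic review protocol ...","Background ... Methods ...",INCLUDE
## 1,"Systematic review protocol ...","Background ... Methods ...",INCLUDE
## 2,"A randomized trial of ...","Objective ... Results ...",EXCLUDE
## 2,"A randomized trial of ...","Objective ... Results ...",EXCLUDE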
raw.file="/home/ba/ranking/csvfiles/sr t2d final.csv"
raw.ta=read.csv(file=raw.file,header=TRUE,stringsAsFactors=FALSE)
## check and remove abstracts with missing status
jid=which(raw.ta$status=="") # retrieve row numbers of abstracts with missing screening results
if(length(jid)>0) raw.ta=raw.ta[-jid,] # remove these abstracts (guard against jid being empty, which would otherwise drop all rows)
## The input data is organized as screening results (after reconciliation) from two reviewers
## Each abstract is represented by two rows, for the first and second reviewers, but the screening results are the same after reconciliation
selected.ta= seq(from=1, to=nrow(raw.ta) - 1, by=2)
## keep data from one reviewer only in the working TA's database
wkta=raw.ta[selected.ta,]
rm(raw.ta) # remove unused R datasets from the R space
# coding the text categories of the screening results into R categories
wkta[,4] = factor(wkta[,4],levels=c(pos.label,neg.label)) # put the positive class (INCLUDE) first so the machine-learning models treat it as the class to predict
summary(wkta[,4])
### merge titles and abstracts into text for analysis, retain variables in wkta
wkta=wkta %>% mutate(text=paste(title,abstract,sep=". ")) %>% dplyr::select(id,title,abstract,text,status)
dim(wkta)
###################################################################################
# Step 2. Pre-process text (Figure 1)
###################################################################################
## Take a look at a specific abstract and observe the "copyright" info
display_id=662067 # this abstract contains copyright info
wkta$text[wkta$id==display_id] # display the text; note the copyright info at the end
## A regular expression is a special text string for describing a search pattern.
## See for example a quick reference and an online testing tool for building your regular expressions
## https://www.regexbuddy.com/regex.html
## https://regex101.com/
## Regular expression to locate "Copyright" information in an abstract.
## Note: in R, backslashes in a regular expression must be escaped, i.e., written as "\\".
copyright.pattern = "\\bCopyright\\b(.)*" # match the word "Copyright" and everything after it, to the end of the abstract
## remove all "copyright" info from text
jx = wkta$text %>% purrr::map(sub,pattern=copyright.pattern,replacement="",ignore.case=TRUE) # tidyverse style (returns a list)
jy = sub(pattern=copyright.pattern,replacement="",x=wkta$text,ignore.case=TRUE) # equivalent base-R style (kept for comparison; not used further)
jx[wkta$id==display_id] # check that the copyright info in abstract 662067 is gone
wkta$text=unlist(jx) # flatten the list back to a character vector; all copyright info has now been removed
###############################################################################################
# Step 3. Construct features for singular value decomposition: 1-, 2, 3-gram phrases (Figure 1)
###############################################################################################
## Tokenize the text while removing numbers, punctuation marks, symbols, and so on
svd.tokens <- quanteda::tokens(x = wkta$text, what = "word", remove_numbers = TRUE, remove_punct = TRUE,
                               remove_symbols = TRUE, remove_hyphens = FALSE, ngrams = 1:3,
                               concatenator = "_") # include single words plus two- and three-word phrases
## if needed, see "tokenization" in the Glossary, online Appendix
str(svd.tokens) # check the volume of tokens
## remove tokens with 1 or 2 characters only as they rarely bear any meaning
svd.tokens <- tokens_keep(x = svd.tokens, min_nchar = 3)
## to lower letter
svd.tokens <- tokens_tolower(svd.tokens)
## remove stopwords (if needed, see "pre-process text" in Glossary, online Appendix)
head(stopwords(), n = 20) # display the first 20 stopwords in a total of 175 words
svd.tokens <- tokens_remove(svd.tokens, stopwords())
## Perform stemming on the tokens (if needed, see "pre-process text" in Glossary, online Appendix)
svd.tokens <- tokens_wordstem(svd.tokens, language = "english")
## create Document-Feature Matrix with Term frequencies (TF) - Inverse Document Frequencies (IDF) as feature weights
## if needed, see "feature construction" and related terms in Glossary, online Appendix
svd.dfm <- dfm(x = svd.tokens, tolower = FALSE) %>% dfm_tfidf(scheme_tf = "prop")
dim(svd.dfm) # number of terms from the corpus, typically a few million terms
##################################################################################
## Step 4. Select features (Figure 1)
## Step 4a. Select features based upon selected threshold of feature weights
## Step 4b. Use singular value decomposition for further dimension reduction (see Glossary, online Appendix)
## Step 4c. Retain 300 features which are linear combinations of the selected features
##################################################################################
## Step 4a. Select features based upon selected threshold of feature weights
### Examine the distribution of the feature weights.
features.wts=colSums(svd.dfm) # each feature weight is the sum of the weights of the feature across abstracts
summary(features.wts) # inspect the distribution of weights of features in the corpus
### set threshold to keep only features with significant feature weights in the corpus (see Table 2 in the main paper)
## keep features with weight above the quantile of 0.7, 0.8, 0.9 of the feature weight distribution (Table 2)
threshold.p = 0.90
threshold.wts <- quantile(features.wts, probs = threshold.p)
threshold.wts
to_keep <- which(features.wts > threshold.wts) # indices of selected features
### keep features with weights above threshold of weight distribution in the document-feature matrix
svd.dfm1 <- dfm_keep(svd.dfm, pattern = names(to_keep), valuetype = "fixed", verbose = TRUE) # matrix with TF-IDF for SVD
### inspect the size of the retained vocabulary
nc.retained.features=dim(svd.dfm1) # ~702K terms retained for case study 1
topfeatures(svd.dfm1,n=50) # inspect top features with highest feature weights
jx=tail(topfeatures(svd.dfm1, n =ncol(svd.dfm1)), n=100) # inspect the terms that are at the tail of the distribution
attributes(jx)$names
## Step 4b. Use Singular Value Decomposition (SVD, if needed, see Glossary, online Appendix) for dimension reduction
### Obtain 300 linear combinations of selected features (see supporting evidence for the value of 300 in Table 2)
start.time <- Sys.time() # this step is time-consuming, up to 8 hours for a threshold of 70%
# cat(" HERE IS WHERE WE START: ", format(Sys.time(), "%a %b %d %X %Y"), "\n")
my.cluster <- makeCluster(spec=6, type = "SOCK") # parallel processing
registerDoSNOW(my.cluster)
sr.svd <- irlba(t(svd.dfm1), # it is transposed as SVD requires Term-Document Matrix as an input
nv = 300, # number of singular vectors to estimate
maxit = 600) # maxit is recommended to be at least twice nv
stopCluster(my.cluster)
svd.time=Sys.time() - start.time ## keep track of this.
## cat("Computing time for SVD: ", Sys.time() - start.time, "\n")
## system("echo \"All done\" | mailx -s \'Complete SVD \' [email protected]")
### Reduced Document-Features Matrix for ML classification
sr.svd.X = sr.svd$v # Reduced DFM of 'n' abstracts x 300 features
dim(sr.svd.X)
## X matrix from SVD at different cutoffs of the feature weights
# saveRDS(sr.svd.X, "sr_svd70.RData") # threshold of 70%
# saveRDS(sr.svd.X, "sr_svd80.RData") # threshold of 80%
# saveRDS(sr.svd.X, "sr_svd90.RData") # threshold of 90%
# sr.svd.X <- readRDS("sr_svd70.RData")
###########################################################################################################################
## Steps 2-4. Build additional feature matrix to supplement the feature matrix from Singular Value Decomposition (Figure 1)
## Step 2. Natural language processing of abstracts to extract nouns and verbs
## Step 3. Build document-feature matrix of lemmatized nouns and verbs
## Step 4a. Conduct topic modeling with Latent Dirichlet Allocation
## Step 4b. Obtain the posterior topic distributions of abstracts to be used as feature matrix with 300 topics
###########################################################################################################################
## Step 2 - use the *udpipe* R package
## https://github.com/bnosac/udpipe
my.dir="/home/ba/ranking/csvfiles" # storage
language.model="/home/ba/ranking/csvfiles/english-ud-2.0-170801.udpipe"
## Download (once) and load the appropriate language model (the one for the English language)
tagger <- udpipe_download_model("english", model_dir = my.dir) # download once; skip this line if the model file already exists
tagger <- udpipe_load_model(file = language.model)
## Annotate the text of the abstracts using the loaded model (tagger).
## This will produce several linguistic annotations for each word, including the appropriate POS tags and lemmas
start.time <- Sys.time()
my.cluster <- makeCluster(spec=6, type = "SOCK") # parallel processing
registerDoSNOW(my.cluster)
abstract_annotated <- udpipe_annotate(tagger, wkta$text, doc_id =wkta$id) # this step is time consuming, about 2 hours
stopCluster(my.cluster)
## To be able to use the udpipe object easily, we'll transform it into a data frame, see Jelena Jovanovic, LASI'18
abst.ann.df <- as.data.frame(abstract_annotated) # Notes: udpipe provides words and their lemmas
## Save the object to have it available for later
saveRDS(abst.ann.df, "abst_ann_df.RData")
# abst.ann.df <- readRDS("abst_ann_df.RData")
## and remove the large udpipe object, to release memory
remove(abstract_annotated)
abstract.annotated.time=Sys.time() - start.time ## keep track of computing time
## check the total numbers of nouns and verbs in the annotated data
summary(factor(abst.ann.df$upos))
## use the annotated df of the abstracts to build topic modeling at the document level
## See instructions at https://bnosac.github.io/udpipe/docs/doc6.html
## Build Latent Dirichlet Allocation model with nouns and verbs - see
## Fiona Martin and Mark Johnson. More Efficient Topic Modelling Through a Noun Only Approach.
## Proceedings of Australasian Language Technology Association Workshop 2015.
dtf <- subset(abst.ann.df, upos %in% c("NOUN", "VERB"))
dtf$lemma=tolower(dtf$lemma) # fix instances where lemmas are not in lower case
dtf1 <- document_term_frequencies(dtf, document = "doc_id", term = "lemma") # topic modeling is at the abstract level "doc_id"
## Create a document/term/matrix for building a topic model
lda.dtm <- document_term_matrix(x = dtf1)
jterms=colnames(lda.dtm)
deleted.terms=grep("[-0-9<>/#%.=\\*\\?]",jterms,value=TRUE) # remove terms with digits or special characters that udpipe mis-tagged as nouns
lda.dtm1= dtm_remove_terms(lda.dtm, terms=deleted.terms)
jterms=colnames(lda.dtm1)
jterms.len = nchar(jterms)
jterms.short.terms=jterms.len > 3
deleted.terms=jterms[jterms.short.terms==FALSE]
lda.dtm2= dtm_remove_terms(lda.dtm1, terms=deleted.terms) # delete nouns or verbs with 1-3 characters with little meaning
## Remove nouns with low frequencies and remove abstracts without nouns or verbs
lda.dtm3 <- dtm_remove_lowfreq(lda.dtm2, minfreq = 5)
dim(lda.dtm)
dim(lda.dtm1)
dim(lda.dtm2)
dim(lda.dtm3)
head(dtm_colsums(lda.dtm3))
## identify removed abstracts through the steps above
jj = wkta$id %in% as.integer(rownames(lda.dtm3))
lda.deleted.abstract.ids=wkta$id[jj==FALSE] # abstracts with limited content are not used in ML training
length(lda.deleted.abstract.ids)
head(lda.deleted.abstract.ids)
lda.dtm=lda.dtm3
rm(lda.dtm1,lda.dtm2,lda.dtm3) # remove unused document-term matrices
# Estimation method: "VEM". The Bayesian (Gibbs) method ran for over 4 days without convergence, versus about 14 hours for VEM.
lda.control.lst.test=list(verbose=5000) # verbose reporting is mainly useful for the Bayesian method; with VEM this is a placeholder only
start.time <- Sys.time()
my.cluster <- makeCluster(spec=4, type = "SOCK") # SOCK stands for socket cluster, parallel processing
registerDoSNOW(my.cluster)
tm.lda.test <- LDA(x= lda.dtm, k = 300, method = "VEM", control = lda.control.lst.test) # set 300 topics
# Notes: the Bayesian approach to estimation ran for 4 days without convergence - we used the VEM estimation
stopCluster(my.cluster)
lda.time.test= Sys.time() - start.time # this step is time consuming, about 14 hours of run-time
# tm.lda <- readRDS("tm_lda.RData")
saveRDS(tm.lda.test, "tm_lda_test.RData")
## extract the posterior distributions of topics for each abstract in the corpus
sr.lda.X.test <- posterior(tm.lda.test)$topics
dim(sr.lda.X.test)
sr.lda.X=sr.lda.X.test # 300 topics
saveRDS(sr.lda.X.test, "sr_lda_X_test.RData") #
# sr.lda.X.test <- readRDS("sr_lda_X_test.RData")
##################################################################################################################################
## Steps 2-5. Build additional feature matrix and distance matrix to supplement those from Singular Value Decomposition (Figure 1)
## Clinical Concept Embeddings
## Run this section for Systematic review of clinical studies
## Step 2. Pre-process text - Annotate abstracts using the biomedical semantic annotator RysannMD
## Obtain Unified Medical Language System (UMLS) Concept Unique Identifiers (CUIs) for each abstract
## See John Cuzzola, Jelena Jovanovic, Ebrahim Bagheri. RysannMD: A biomedical semantic annotator balancing speed and accuracy.
## Journal of Biomedical Informatics 71 (2017) 91–109
## Step 3. Construct features as UMLS CUI's and obtain vector representations of CUI's using pre-trained word vectors for CUI's
## See Beam et al. Clinical Concept Embeddings Learned from Massive Sources of Multimodal Medical Data.
## Step 4. Obtain vector representations of abstracts as weighted average of vector representations of CUI's, weighting on CUI frequency
## Step 5. Calculate the distance between abstracts as the minimum distance that the embedded CUIs of one abstract need to
## “travel” to reach the embedded CUIs of another abstract.
## Kusner M, Sun Y, Kolkin N, Weinberger K. From Word Embeddings To Document Distances.
## Proceedings of the 32nd International Conference on Machine Learning, Lille, France, 2015.
######################################################################################################################################
## Step 2. Pre-process text - Annotate abstracts using the biomedical semantic annotator RysannMD
#### Create Dataset from the directory containing RysannMD outputs, each abstract is an annotated file
rysannmd_folder<-"/home/ba/ranking/rysannmd/MD.TA.all" # each abstract is a text file in this folder
rys_data <- data.frame(read_folder(rysannmd_folder)) %>% transmute(id, text)
# remove ".MD.title.abstract.txt" from id's in the first column of the dataset
rys_data[,1]=rys_data[,1] %>% map(sub,pattern=".MD.title.abstract.txt",replacement="") %>% unlist() %>% as.integer() # unlist before coercing the IDs to integer
# extract cui's and replace the text in each abstract in the dataset rys_data by a list of CUI's
start.time=Sys.time()
cui_dd <- rys_data %>% extr_cuis(certainty.threshold=0.5) # keep CUI's with high certainty of correct annotation - see function "extr_cuis" below
cui.time=Sys.time() - start.time # keep track of computing time
rm(rys_data)
## Step 3. Construct features as CUI's and obtain vector representations of CUI's using pre-trained CUI vectors from project cui2vec
## Create dtm
cui.dtm <- dfm(cui_dd$text, tolower = FALSE)
rownames(cui.dtm)=cui_dd$id
abstract.cuis=colnames(cui.dtm) # CUI's in abstracts
## Load the pre-trained vector representation of CUI from project cui2vec with 500 dimensions
cui2vec.file <- "/home/ba/ranking/csvfiles/cui2vec_pretrained.csv" # each row consists of a CUI and 500 variables
cui2vec.dd=read.csv(file=cui2vec.file,header=TRUE)
colnames(cui2vec.dd)[1]="cui" # name the column of CUI's as "cui"
## Match CUI's from the abstracts to CUI's from the pretrained dataset cui2vec
cuis.to.keep <- intersect(abstract.cuis, cui2vec.dd$cui)
## check the 'level' of matching
length(cuis.to.keep)/length(abstract.cuis)
## 44% of CUIs from our abstracts have their vectors in cui2vec - low coverage
## Create a new DTM that will keep only those common CUI's - TF
cui.dtm1 <- dfm_keep(cui.dtm, pattern=cuis.to.keep, valuetype="fixed", verbose=TRUE)
abstract.cuis=colnames(cui.dtm1) # CUI's in abstracts and in pre-trained cui2vec dataset
## Likewise, from cui2vec, select CUI's that are in the corpus abstracts
cuis.to.keep.indices <- which(cui2vec.dd$cui %in% cuis.to.keep) # rows in the cui2vec dataset
cui2vec.dd1=cui2vec.dd[cuis.to.keep.indices,]
dim(cui2vec.dd1)
## Order the columns in the cui2vec.dd1 dataset to be the same as in the columns of the cui.dtm1
jj = cui2vec.dd1[,-1] # drop the column containing CUI's names
jj = t(jj)
colnames(jj)=cui2vec.dd1$cui
jj = jj[,abstract.cuis] # ordering
jj[1:5,1:5] # check
abstract.cuis[1:5] # check
cui2vec.dd1=jj;
rm(jj);
## Step 4. Obtain vector representations of abstracts as weighted average of vector representations of CUI's, weighting on CUI frequency
start.time <- Sys.time()
sr.cui.X <- data.frame()
for(i in 1:nrow(cui.dtm1)) { # this loop is time-consuming, about 10 hours
abst.tf <- as.matrix(cui.dtm1)[i,] # DTM
abst.matrix <- abst.tf * t(cui2vec.dd1) # weighted cloud of CUIs in 500d space
abst.mapped <- apply(abst.matrix, 2, mean) # the central point of the cloud
sr.cui.X <- as.data.frame(rbind(sr.cui.X, abst.mapped)) # store abstract representation in 500d
}
colnames(sr.cui.X) <- paste0("V",1:ncol(sr.cui.X)) # V1:V500
rownames(sr.cui.X) <- rownames(cui.dtm1) # abstract names
cui.time=Sys.time() - start.time # Time difference
sr.cui.X[1:5,1:5] # check
dim(sr.cui.X) #### matrix of n abstracts and 500 vectors
saveRDS(sr.cui.X, "sr_cui_X.RData")
# Load the saved object
# sr.cui.X <- readRDS("sr_cui_X.RData")
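## Optional check (not part of the original workflow): the 500-dimensional abstract
## representations above can also be obtained with a single matrix product, which is
## usually far faster than the loop. The name sr.cui.X.fast is hypothetical, and this
## assumes the dense version of cui.dtm1 fits in memory.
# sr.cui.X.fast <- as.data.frame((as.matrix(cui.dtm1) %*% t(cui2vec.dd1)) / ncol(cui.dtm1))
# colnames(sr.cui.X.fast) <- paste0("V", 1:ncol(sr.cui.X.fast))
# rownames(sr.cui.X.fast) <- rownames(cui.dtm1)
# all.equal(sr.cui.X, sr.cui.X.fast) # should agree with the loop result up to numerical precision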
## Step 5. Calculate the distance between abstracts as the minimum amount of distance that the embedded words of one abstract need to
## “travel” to reach the embedded words of another abstract.
## Create a Relaxed Word Mover Distance (RWMD) object by specifying 2 input parameters:
## - word vector matrix with words given in rows and dimensions of the embedding space in columns; rows should have word names.
## - the method to be used for computing the distance between word vectors
cui.rwmd.model = RWMD$new(wv = t(cui2vec.dd1), method = "cosine") # Notes: transpose is needed here
## Now, we use the RWMD object and our DTM to compute WMD distances between
# each document pair. However, before that, we need to normalize TFs in
# the DTM matrix (required by the WMD algorithm; see the original paper)
start.time <- Sys.time() # this step is time consuming, about 23 hours
cui.dtm1.norm <- dfm_weight(cui.dtm1, scheme = "prop")
sr.cui.dist = dist2(x = cui.dtm1.norm, method = cui.rwmd.model, norm = 'none')
dim(sr.cui.dist) ##### WMD matrix for abstracts
cui.rwmd.time=Sys.time()-start.time
saveRDS(sr.cui.dist, "sr_cui_dist.RData")
# sr.cui.dist <- readRDS("sr_cui_dist.RData")
###############################################################################################################
## Steps 2-5. Build additional feature matrix and distance matrix to supplement those from Singular Value Decomposition (Figure 1)
## Word Embeddings: Global Vectors for Word Representation
## Run this section for Systematic review of non-clinical studies (e.g., SRs of health services research methods, SLR of computing topics)
## Step 2. Pre-process text to extract words from abstracts
## Step 3. Construct features as words and obtain vector representations of words using pre-trained word vectors from Glove
## Jeffrey Pennington, Richard Socher, Christopher D. Manning. GloVe: Global Vectors for Word Representation
## https://nlp.stanford.edu/projects/glove/
## Step 4. Obtain vector representations of abstracts as weighted average of vector representations of words, weighting on word frequency
## Step 5. Calculate the distance between abstracts as the minimum amount of distance that the embedded words of one abstract need to
## “travel” to reach the embedded words of another abstract.
## Kusner M, Sun Y, Kolkin N, Weinberger K. From Word Embeddings To Document Distances.
## Proceedings of the 32nd International Conference on Machine Learning, Lille, France, 2015.
###############################################################################################################
## Step 2. Pre-process text to extract words from abstracts
glove.tokens <- tokens(x = wkta$text, what = "word", remove_numbers = TRUE,remove_punct = TRUE,
remove_symbols = TRUE, remove_hyphens= FALSE, ngrams=1) # words
## remove tokens with 1 or 2 characters only as they rarely bear any meaning
glove.tokens <- tokens_keep(x = glove.tokens, min_nchar = 3)
## to lower letter
glove.tokens <- tokens_tolower(glove.tokens)
## remove stopwords.
glove.tokens <- tokens_remove(glove.tokens, stopwords())
## Note that we are not stemming the tokens since words in the GloVe model were not stemmed, and we need to match against those words.
## Create DTM
glove.dtm <- dfm(glove.tokens, tolower=FALSE)
## Extract words (features) from the DTM since we need to match these against the words in the pre-trained GloVe model
abstract.words <- colnames(glove.dtm)
## ... and examine them
head(abstract.words, n = 20)
tail(abstract.words, n = 20)
## Notice the presence of words ending with "'s" (such as "kaiser's"). Replace such words with their version without "'s"
end.with.s <- str_detect(abstract.words, "(\\w+)'s$")
end.with.s <- abstract.words[which(end.with.s)]
words.no.s <- str_replace(end.with.s, "(\\w+)'s", "\\1")
## Replace, in the tokens object, tokens that end with "'s" with their 'cleaned' version
glove.tokens <- tokens_replace(glove.tokens, pattern = end.with.s, replacement = words.no.s)
## Step 3. Construct features as words and obtain vector representations of words using pre-trained word vectors from Glove
## Create again dtm
glove.dtm <- dfm(glove.tokens, tolower = FALSE)
## Create again a vector of vocabulary terms
abstract.words <- colnames(glove.dtm)
## Load the pre-trained GloVe word vectors (trained on 840 billion tokens; 300-dimensional vector space)
glove.840B.300d.file <- "/home/ba/ranking/csvfiles/glove.840B.300d.txt"
start.time <- Sys.time()
g840B.300d <- scan(file = glove.840B.300d.file, what="", sep="\n")
glove.load.time=Sys.time() - start.time
## What we have read - g840B.300d - is in fact a huge character vector, consisting of millions of entries
## Each entry is given as a string that consists of 301 items delimited by a space:
## the 1st item is a word and the rest (300 items) are the estimated values of the 300 dimensions of that word
## Create a data frame out of the large vector read from the file
## (get_word_vectors_df() is defined below, in the utility-function section, and also in the UtilityFunctions.R script)
g840B.300d.df <- get_word_vectors_df(g840B.300d, verbose = TRUE)
dim(g840B.300d.df)
# save this large object
saveRDS(g840B.300d.df, "g840B_300d_df.RData")
# Load the saved object
# g840B.300d.df <- readRDS("g840B_300d_df.RData")
## Remove unused objects to release memory
remove(g840B.300d)
## Take the words from the GloVe model - we need these words to match them against the features (words) from the corpus DTM
glove.words <- colnames(g840B.300d.df)
## Match words from the abstracts to words from Glove
words.to.keep <- intersect(abstract.words, glove.words)
## check the 'level' of matching
length(words.to.keep)/length(abstract.words)
## 71% of words from our DTM have their vectors in GloVe
## Let's briefly inspect words from abstracts that are not in GloVe
setdiff(abstract.words, glove.words)[1:50]
## about 30% of the words are missing, mainly words pertaining to KS methods
## Create a new DTM that will keep only those words (columns) - TF
glove.dtm1 <- dfm_keep(glove.dtm, pattern=words.to.keep, valuetype="fixed", verbose=TRUE)
## Likewise, from GloVe, select word vectors that will be used for building a feature set, that is, words present in abstracts
glove.to.keep.indices <- which(glove.words %in% words.to.keep)
g840B.300d.df1 <- g840B.300d.df[,glove.to.keep.indices]
## Order the columns (words) in the reduced g840B_300d_df1, to be the same as in the reduced glove_dtm1
g840B.300d.df1 <- g840B.300d.df1[,colnames(glove.dtm1)]
## Before proceeding, remove large objects that are no longer needed
remove(g840B.300d.df, glove.tokens, glove.words, abstract.words, glove.dtm)
## Step 4. Obtain vector representations of abstracts as weighted average of vector representations of words, weighting on word frequency
## Compute feature values for each abstract as the (coordinate-wise) TF-weighted mean value across all the word vectors.
##
## Note that after the above reduction of DTM and GloVe to the common set of
## features (words), the two matrices have the same number of columns.
## Now, we will take each abstract (row) from the DTM and multiply it with the transposed
## GloVe matrix, thus, in fact weighting word vectors in GloVe with the abstract-specific
## TF weights of the corresponding words. As the result, we will get a matrix of
## TF-weighted word vectors (words in rows, dimensions in columns) for each abstract.
## Next, we take the mean value (across words) for each dimension (columns), to obtain
## a new feature vector for each abstract; these vectors have the same number of features
## as there are dimensions in the GloVe model (300). This way, we are, in fact,
## translating the existing feature space (words in DTM) into a new feature space
## (dimensions of the GloVe word vectors).
start.time <- Sys.time() # this step is time consuming
sr.glove.X <- data.frame()
for(i in 1:nrow(glove.dtm1)) {
abst.tf <- as.matrix(glove.dtm1)[i,] # DTM
abst.matrix <- abst.tf * t(g840B.300d.df1) # weighted cloud of words in 300d space
abst.mapped <- apply(abst.matrix, 2, mean) # the central point of the cloud
sr.glove.X <- as.data.frame(rbind(sr.glove.X, abst.mapped)) # store document representation in 300d
}
colnames(sr.glove.X) <- paste0("V",1:ncol(sr.glove.X))
dim(sr.glove.X) #### matrix of n abstracts and 300 vectors
glove.time=Sys.time() - start.time # Time difference of 4.600928 mins
saveRDS(sr.glove.X, "sr_glove_X.RData")
# Load the saved object
# sr.glove.X <- readRDS("sr_glove_X.RData")
## Step 5. Calculate the distance between abstracts as the minimum amount of distance that the embedded words of one abstract need to
## “travel” to reach the embedded words of another abstract.
## Create a Relaxed Word Mover Distance (RWMD) object by specifying 2 input parameters:
## - word vector matrix with words given in rows and dimensions of the embedding space in columns; rows should have word names.
## - the method to be used for computing the distance between word vectors
rwmd.model = RWMD$new(wv = t(g840B.300d.df1), method = "cosine")
## Now, we use the RWMD object and our DTM to compute WMD distances between
# each document pair. However, before that, we need to normalize TFs in
# the DTM matrix (required by the WMD algorithm; see the original paper)
start.time <- Sys.time() # this step is time consuming
glove.dtm1.norm <- dfm_weight(glove.dtm1, scheme = "prop")
sr.glove.dist = dist2(x = glove.dtm1.norm, method = rwmd.model, norm = 'none')
dim(sr.glove.dist) ##### WMD matrix for abstracts
glove.rwmd.time=Sys.time()-start.time
saveRDS(sr.glove.dist, "sr_glove_dist.RData")
#############################################################
# Step 5. Quantify citation similarity (Figure 1)
#############################################################
start.time <- Sys.time()
my.cluster <- makeCluster(spec=6, type = "SOCK") # parallel processing
registerDoSNOW(my.cluster)
sr.svd.dist = dist2(x=sr.svd.X, method="cosine") # symmetric matrix of pairwise distances between pairs of citations # SA - Threshold - Repeat
sr.lda.dist = dist2(x=sr.lda.X, method="cosine") # pairwise distances based on the LDA topic distributions (needed for dim(sr.lda.dist) below)
stopCluster(my.cluster)
Sys.time() - start.time
dim(sr.svd.dist) # Distance matrix to be used for candidate selection for screening in the workflow steps
dim(sr.lda.dist)
##################################################################################
# Workflow function - Phases 1 and 2 (Figure 1)
##################################################################################
## inputs: (see Table 2 in the main paper)
## m.x - Document-feature matrix from SVD
## m.distance: List of 3 matrices of dimension n x n of pairwise distances between n abstracts: svd, lda, and text2vec
## dd - dataset with columns "id" and "status" of abstracts. Specify status categories in pos.label and neg.label variables below.
## l.seeds: initial list of ID's of seed studies for prioritizing abstracts for screening by human reviewers
## n.rounds: maximum number of rounds of screening by human reviewers (e.g., 20 rounds, see Table 2 in the paper)
## n.initial: Minimum sample size of the initial train dataset, such as 600 abstracts when the corpus of abstracts is represented by 300 features (see Table 2)
## pick.init: The size k of k nearest-neighbors of an eligible abstract used to build the initial train dataset (e.g., k=8, see Table 2)
## pick.ml: The size k of k nearest-neighbors of an eligible abstract used to iterate the loop in the Workflow Phase 2 (e.g., k=15, see Table 2)
## outputs: (see Table 2A in the online Appendix)
## ll.candidates: list of lists of rows of abstracts in the corpus to be screened by human reviewers
## d.results: Step-specific results of the execution of the workflow (see examples in the Appendix of the paper)
## rf.svd: final random forest model with feature representations from singular value decomposition SVD
workflow.phase1and2 <- function(m.x, m.distance, dd, l.seeds, n.rounds, n.initial=600, pick.init=8, pick.ml=15) {
system("echo \"Initialization\" | mailx -s \'Workflow starts \' [email protected]") # tracking workflow steps over some running hours
## initialization
ll.candidates=NULL # list of lists of candidate abstracts for screening by human reviewers
d.results= data.frame(matrix(NA,nrow=4*n.rounds,ncol=14)) # panel data to record step-specific results from the workflow history
colnames(d.results)=c("phase","round","n.seeds","n.candidates","n.eligibles","percent",
"precision","recall","f1","accuracy","tp","fp","fn","tn") # see Table 2A of online Appendix
cum.candidate.rows=NULL # row indices of abstracts that have been screened by human reviewers. The indices refer to the rows of the corpus dataset
initial.phase.done = FALSE # flag to indicate the initial phase that accumulates the training dataset is done
train.level.done=FALSE # flag to indicate the two iterations of ML phase is done
metric="Sens" # metric to maximize in cross-validation of ML models: 1) maximizing sensitivity for all cross-validated models and 2) maximizing ROC for the final model
curr.eligibles=unlist(lapply(l.seeds,indx.lkup.all,dd=dd)) # look up the rows in the corpus of the initially eligible abstracts, see function "indx.lkup.all" below
cum.eligibles.rows=NULL # cumulative list of all seeds across iterations
c.round = 1 # round denotes the number of times the workflow interacts with human reviewers for abstract screening
while(c.round<n.rounds) { # rounds denote the number of times the workflow asks human reviewers to screen batches of selected abstracts
if(!initial.phase.done) { # Start Phase 1 to gather the training data by iterative steps 6-9, Figure 1)
## Step 6 - Prioritize citations using 3 distance matrices: SVD, feature embeddings, and topic modeling LDA (Figure 1) ##################
curr.candidate.rows = similar.abstracts(distance.matrices=m.distance,seed.id=curr.eligibles,pick=pick.init) # Step 6 - identify abstracts similar to an eligible abstract
curr.candidate.rows=c(curr.candidate.rows,curr.eligibles) # include the seeds into current candidates
curr.candidate.rows=unique(curr.candidate.rows) # remove duplicates
duplicates = curr.candidate.rows %in% cum.candidate.rows # identify duplications with already screened abstracts
curr.candidate.rows =curr.candidate.rows[duplicates==FALSE] # only candidates that are yet to be screened
if(length(curr.candidate.rows)==0) {
cat("Initial phase to generate training data: cannot generate new candidates \n")
initial.phase.done=TRUE
} # no new candidates
else {
cum.candidate.rows=c(cum.candidate.rows,curr.candidate.rows) # update the list of all candidates
cum.candidate.rows=unique(cum.candidate.rows) # remove duplicates
cum.candidate.rows=na.omit(cum.candidate.rows) # remove missing row numbers, if necessary
cum.eligibles.rows=c(cum.eligibles.rows,curr.eligibles) # update the list of all eligible abstracts
cum.eligibles.rows=unique(cum.eligibles.rows) # remove duplicates
## recording step-specific results of steps 6-7 (Figure 1)
n.seeds=length(curr.eligibles) # record the number of eligible abstracts in this round of human screening
curr.status=dd$status[cum.candidate.rows] # results of screening by human reviewers
numerator.prevalence=length(cum.candidate.rows[curr.status==pos.label]) # record the number of eligible abstracts among the screened candidates
denomerator.prevalence=length(cum.candidate.rows) # record the number of screened abstracts
recording(d.results,"initial",c.round,c.round,n.seeds,denomerator.prevalence,numerator.prevalence,
rep(NA,4),rep(NA,4)) # recording the step-specific results into the workflow panel data, see sample of the panel in the Appendix
## Step 7 - Screen citations (Figure 1) #####################
ll.candidates[[c.round]]=curr.candidate.rows # record the set of abstracts to be screened by human reviewers
curr.status=dd$status[curr.candidate.rows] # results of screening by human reviewers
curr.eligibles=curr.candidate.rows[curr.status==pos.label] # identify eligible citations as seeds for the next round of iteration
duplicates=curr.eligibles %in% cum.eligibles.rows # identify duplications in the cumulative list of eligible abstracts
curr.eligibles=curr.eligibles[duplicates==FALSE] # Step 9 - remove duplicates. Newly identified eligible abstracts are used in the next iteration of steps 6-9
if(length(curr.eligibles)==0) {
cat("Initial phase to generate training data: cannot generate new seeds \n")
initial.phase.done=TRUE
} # no new eligible abstracts
if(length(cum.candidate.rows)>n.initial) { # step 8 (Figure 1)
cat("Initial phase to generate training data: completed \n")
initial.phase.done=TRUE
k=c.round+1 # set up for fitting ML models - index to the row of the workflow panel
} # accumulate enough training data
}
## prepare for another round of human screening, if needed
c.round=c.round+1
} # end if(!initial.phase.done)
if(initial.phase.done && !train.level.done) { # start the Workflow Phase 2 in Figure 1 ##################
system("echo \"ML modeling\" | mailx -s \'Workflow in ML phase \' [email protected]") # tracking workflow steps over some running hours
## Step 7. Screen citations by human reviewers (Figure 1) ######################################
curr.dd = dd[cum.candidate.rows,] # current training dataset, given the screening results of the list of cumulative candidates
### collect step-specific statistics from the training dataset
numerator.prevalence=length(curr.dd$status[curr.dd$status==pos.label]) # record the number of eligible abstracts
denomerator.prevalence=nrow(curr.dd) # record the number of screened abstracts in the training dataset
## Step 10. Assemble training data (Figure 1) ####################################################
curr.m.x=m.x[cum.candidate.rows,] # assemble the X matrix of features from the SVD method
curr.m.xy=data.frame(curr.m.x) %>% mutate(status=curr.dd$status) # assemble training dataset, including features and screening results
## Step 11. Training random forest models (Figure 1) #################################################
rf.svd=rf.model(mdata=curr.m.xy, metric=metric)
## Step 12. Predict eligible abstracts (Figure 1) #################################################
pred.rf.raw.svd.corpus <- predict(rf.svd, newdata = m.x, type="raw") # corpus level
rf.confusion.m <- confusionMatrix(data = pred.rf.raw.svd.corpus, reference = dd$status, positive=pos.label) # evaluate model performance
m.rf.eval <- get_eval_measures(rf.confusion.m) # calculate performance measures
jtp = rf.confusion.m$table
m.rf.abcd <- c(jtp[1,1],jtp[1,2],jtp[2,1],jtp[2,2]) # Extract TP, FP, FN, TN
### recording the results of the RF classifier into workflow panel data
m.round=d.results[k-1,"round"]+2 # each fitted RF involves 2 sets of candidates for screening - see below
recording(d.results,"rf.svd",k,m.round,NA,denomerator.prevalence,numerator.prevalence, m.rf.eval,m.rf.abcd)
k= k + 1 # next rows
cat("fitting rf.svd \n")
## Step 12 - Prepare predicted eligible abstracts for screening (Figure 1) #####################
curr.candidate.rows = which(pred.rf.raw.svd.corpus == pos.label) # identify rows with predicted eligibles in the corpus
duplicates=curr.candidate.rows %in% cum.candidate.rows # identify duplications with already screened abstracts
curr.candidate.rows = curr.candidate.rows[duplicates==FALSE] # keep only candidates that are yet to be screened
## Step 12. Do we have predicted eligible abstracts to be screened by human reviewers?
if(length(curr.candidate.rows)==0) {
train.level.done = TRUE # iterate until no predicted eligible abstracts are possible
c.round = n.rounds # no more iteration
}
else {
ll.candidates.index=length(ll.candidates) + 1 # Increment the list of screened abstracts
ll.candidates[[ll.candidates.index]]=curr.candidate.rows # Step 6 - record the set of predicted eligible abstracts to be screened by human reviewers
# Step 7 - use of the screened results to identify eligible abstracts and look for similar abstracts to those newly identified eligible abstracts
curr.dd=dd[curr.candidate.rows,] # screening results for the current candidates
j.status=curr.dd$status # get the screened results from the screening candidates
j.eligibles <- j.status == pos.label # identify eligible abstracts among the screening candidates
j.id <- curr.dd$id[j.eligibles == TRUE] # obtain abstract ID's of the newly identified eligible abstracts
curr.eligibles=which(dd$id %in% j.id) # look up the rows in the corpus - this set becomes the subjects for the next iteration of steps 10-14 and 6-7
duplicates=curr.eligibles %in% cum.eligibles.rows # identify duplications in eligible abstracts
curr.eligibles=curr.eligibles[duplicates==FALSE] # remove duplicates
if(length(curr.eligibles)==0) { # Step 14 - Were any new eligible abstracts identified?
train.level.done = TRUE # iterate until no new eligible abstracts are possible
c.round = n.rounds # no more iteration
}
else {
# update the list of screened abstracts
cum.candidate.rows=c(cum.candidate.rows,curr.candidate.rows) # update the list of all candidates in terms of rows in the corpus dataset
cum.candidate.rows=unique(cum.candidate.rows) # remove duplicates
cum.candidate.rows=na.omit(cum.candidate.rows) # remove missing row numbers
cum.eligibles.rows=c(cum.eligibles.rows,curr.eligibles) # update the list of all eligibles that have been identified
cum.eligibles.rows=unique(cum.eligibles.rows) # remove duplicates
# Step 6 - Identify abstracts similar to the newly identified eligibles
curr.candidate.rows = similar.abstracts(distance.matrices=m.distance,seed.id=curr.eligibles,pick=pick.ml) # identify abstracts similar to newly identified eligibles
curr.candidate.rows=c(curr.candidate.rows,curr.eligibles) # include the seeds into current candidates
curr.candidate.rows=unique(curr.candidate.rows) # remove duplicates
cum.candidate.rows=na.omit(cum.candidate.rows) # remove missing row numbers
duplicates = curr.candidate.rows %in% cum.candidate.rows # identify duplications with already screened citations
curr.candidate.rows=curr.candidate.rows[duplicates==FALSE] # remove duplicates
## Step 7 - abstracts to be manually screened
ll.candidates.index=length(ll.candidates) + 1 # index to the next set of abstracts for screening by human reviewers
ll.candidates[[ll.candidates.index]]=curr.candidate.rows # record the set of abstracts to be screened by human reviewers
## prepare for another round of human screening, if needed
c.round=c.round+1 # each Workflow Phase 2 involves two rounds of manual screening
# Step 10 - Prepare the screening results for updating the training dataset
curr.dd=dd[curr.candidate.rows,] # screening results of the candidates just sent for screening
j.status=curr.dd$status # get the screened results from the screening candidates
j.eligibles <- j.status == pos.label # identify eligible abstracts among the screening candidates
j.id <- curr.dd$id[j.eligibles == TRUE] # obtain abstract ID's of predicted eligibles
curr.eligibles=which(dd$id %in% j.id) # look up the rows in the corpus dataframe - this set becomes the new seeds
duplicates=curr.eligibles %in% cum.eligibles.rows # identify duplications in seeds
curr.eligibles=curr.eligibles[duplicates==FALSE] # remove duplicates
# Step 10 - Update the list of screened abstracts and the list of eligible abstracts identified up to this point
cum.candidate.rows=c(cum.candidate.rows,curr.candidate.rows) # update the list of all candidates in terms of rows in the corpus dataset
cum.candidate.rows=unique(cum.candidate.rows) # remove duplicates
cum.candidate.rows=na.omit(cum.candidate.rows) # remove missing row numbers
cum.eligibles.rows=c(cum.eligibles.rows,curr.eligibles) # update the list of all eligibles that have been identified
cum.eligibles.rows=unique(cum.eligibles.rows) # remove duplicates
## prepare for another round of human screening, if needed
c.round=c.round+1 # each ML iteration involves two rounds of manual screening
} # end if(length(curr.eligibles)==0)
} # end of if(length(curr.candidate.rows)==0)
} # end if(initial.phase.done && !train.level.done)
} # end while(c.round<n.rounds)
# tracking messages
if(!initial.phase.done) {cat("Initial phase to generate training data: Not completed \n")}
if(!train.level.done) {cat("ML phase - training level: Not completed \n")}
d.results=d.results %>% dplyr::filter(!is.na(round)) # remove blank rows in the workflow panel data
output=list(candidates=ll.candidates, results=d.results, rf.svd=rf.svd) # compile outputs, the final prediction model is the Random Forest with SVD feature representation
return(output)
} # end function
### Functions called by the Workflow functions
## select abstracts similar to the seed abstracts for screening, given a distance matrix
## return the rows of abstracts in the corpus dataset for human screening
## requires a distance matrix, a list of seed studies, and the number k of nearest neighbors
similar.abstracts.one = function(distance.matrix=NULL,seed.id=seed.id,pick=25) {
# cat("In similar abstracts - seed.id: ",seed.id,"\n" )
m.colnames=colnames(distance.matrix) # row and column names of the distance matrix must be sequenced from 1 to number of abstracts
list.pick=NULL
nn=length(seed.id)
pick = pick + 1 # skip the first match (the seed itself); take positions 2 to pick+1, the abstracts closest to the seed
for(i in c(1:nn)) {
jtemp = distance.matrix[seed.id[i],] # take distances of each seed abstract relative to others
jord=order(jtemp,decreasing=FALSE) # line up the more similar abstracts to the seed
jextract.colnames=m.colnames[jord]
current.pick=jextract.colnames[2:pick]
list.pick=c(list.pick,current.pick)
}
list.pick=unique(list.pick)
list.pick=as.integer(list.pick)
return(list.pick) # return a list of rows in the main corpus database
}
## select abstracts similar to the seed abstracts for screening, given "n" distance matrices
similar.abstracts = function(distance.matrices=distance.matrices,seed.id=seed.id,pick=25) {
nn = length(distance.matrices) # expected distance matrices from SVD, LDA and feature embeddings
list.pick=NULL # list of candidate abstracts for human screening
for (i in c(1:nn)) {
curr.matrix=distance.matrices[[i]]
curr.pick = similar.abstracts.one(distance.matrix=curr.matrix,seed.id=seed.id,pick=pick)
# cat("in here ", i, "candidates", curr.pick, "\n")
list.pick=c(list.pick,curr.pick)
}
return(unique(list.pick)) # return a list of rows in the main corpus database
}
## look up the row of an abstract ID
indx.lkup.all=function(x,dd) {which(dd$id==x)} # look up the row of an abstract ID from the input dataframe
### Fit the Random forest model through Cross-Validation (CV)
### Input data is a dataframe with 300 predictors, the last column is labeled as "status", denoting the screening results, and is the response variable
### Output is the fitted RF model - Specify "ROC" or "Sens"
rf.model <- function(mdata, metric="ROC") { # or metric="Sens"
cv.cntrl.rf <- trainControl(method = "cv", number = 10, sampling='smote', search = "grid",
summaryFunction=twoClassSummary, classProbs = TRUE,verboseIter=FALSE) # set up cross-validation parameters, see package 'caret'
max.n.leaves = as.integer(ncol(mdata)/10) # preferably a number much smaller than the number of features (300 for SVD and LDA, 500 for cui2vec)
rf.grid <- expand.grid(mtry = as.integer(seq(from = 1, to = max.n.leaves, length.out = 20))) # possible values for the # of predictors of decision trees in the RF
set.seed(seed) # fix value for the stream of pseudo-random number generator
## Create a cluster to work on logical cores;
assign("last.warning", NULL, envir = baseenv()) # clear messages
unregister() # clear any remaining registered data on parallel processing
my.cluster <- makeCluster(spec=6, type = "SOCK") # specify the number of clusters for parallel processing
registerDoSNOW(my.cluster) # start the parallel processing
m.rf.cv <- train(status ~ ., data=mdata, method = "rf", ntree=500, metric=metric,
tuneGrid = rf.grid, trControl=cv.cntrl.rf, maximize=TRUE) # fit the RF model through CV
stopCluster(my.cluster) # end parallel processing
assign("last.warning", NULL, envir = baseenv()) # clear messages
unregister() # clear any remaining registered data on parallel processing
return(m.rf.cv)
}
## Recording specific results of the workflow in a dataframe (see Table 2A for sample output)
recording = function(d.results,phase,k,round,n.seeds,n.candidates,n.eligibles,eval,table) {
jj=d.results # obtain a copy of the current dataframe of step-specific results
jj[k,"phase"]=phase # worflow phases: initial phase and ML phase, which specifies RF, SVM, Ensemble of the two
jj[k,"round"]=round # the number of times the Workflow function interacts with human reviewers to help screening citations for the training dataset
jj[k,"n.seeds"]=n.seeds # number of seeds used in the near-neighboring procedure to identify candidates for screening by human reviewers
jj[k,"n.candidates"]=n.candidates # number of candidate citations required screening by human reviewers
jj[k,"n.eligibles"]=n.eligibles # number of eligible citations from the current round of screening
jj[k,"percent"]=round(100*n.eligibles/n.candidates) # percent eligible citation from the current round of screening
jj[k,c("precision","recall","f1","accuracy")]=eval # RF or SVM performance measures
jj[k,c("tp","fp","fn","tn")]=table # breakdown of cells in the 2x2 tables
eval.parent(substitute(d.results<-jj)) # call by reference https://www.r-bloggers.com/call-by-reference-in-r/ to put the updated results in the d.results
}
### The function extracts some basic evaluation metrics from the model evaluation object
### produced by the confusionMatrix() function of the caret package
get_eval_measures <- function(model_eval) {
metrics <- c("Precision", "Recall", "F1", "Accuracy")
eval_measures <- model_eval$byClass[metrics[1:3]]
eval_measures <- c(eval_measures, model_eval$overall[metrics[4]])
eval_measures
}
### Ensemble classification: an abstract is labeled positive if at least one classifier labels it positive
ensemble23 = function(l1,l2,l3=NULL,positives=pos.label,negatives=neg.label) { # l1, l2, l3 are the raw predictions from the classifiers
jout = rep(negatives,length(l1)) # start with all negatives; switch to positive below if any classifier predicts "INCLUDE"
ja=l1 == positives # e.g., RF
jb=l2 == positives # SVM
if(length(l3)==0) {
jc=ja | jb
}
else {
jc=l3 == positives # XGBT
jc=ja|jb|jc
}
jout[jc]=positives # if any classifier is "INCLUDE"
jout=factor(jout,levels=c(positives,negatives))
return(jout)
}
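## Usage sketch (hypothetical objects, not created in this script): given raw class
## predictions from two classifiers fitted with caret, e.g. pred.rf and pred.svm from
## predict(..., type = "raw") over the corpus, an ensemble prediction and its evaluation would be:
# pred.ensemble = ensemble23(l1 = pred.rf, l2 = pred.svm)
# confusionMatrix(data = pred.ensemble, reference = wkta$status, positive = pos.label)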
## function to convert uncertainty values (some in scientific format e.g., 0.76E-4) into numeric values
scientific_format_to_decimal=function(x) {
indx=regexpr("[E]",x,ignore.case=TRUE)[1]
if(indx>=1) {
num=substr(x,start=1,stop=indx-1);
n.exp=substr(x,start=indx+1, stop=nchar(x));
y=as.numeric(num)*10^as.numeric(n.exp);
} else y=as.numeric(x)
return(y)
}
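## Examples (illustrative): scientific_format_to_decimal("6.1185281268738E-5") returns 6.1185281268738e-05;
## plain decimals pass through unchanged, e.g. scientific_format_to_decimal("0.5") returns 0.5.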
### Extract Concept Unique Identifiers (CUI) from the output of the Annotator RysannMD for each abstract
### See John Cuzzola, Jelena Jovanovic, Ebrahim Bagheri. RysannMD: A biomedical semantic annotator balancing speed and accuracy.
### Journal of Biomedical Informatics 71 (2017) 91–109
### Input: annotated text from RysannMD and a threshold of uncertainty
extr_cuis <- function(rys_data,certainty.threshold=0.5) {
### UMLS CUI coding pattern: UMLS(cui)\":\"C0035647;
cui.pattern="UMLS\\(cui\\)\\\":\\\"C[0-9]{7}" # search patterns - noted the "\\(" for the escape of the special character "("
partial.cui.pattern=16 # number of trailing characters that need to be removed after matching pattern
### Uncertainty estimate of CUI - Pattern: "uncertainty\":6.1185281268738E-5
uncertainty.pattern="uncertainty\\\":[0-9\\.E-]*"
partial.uncertainty.pattern=14
op=rys_data
for(i in c(1:nrow(op))) {
# extract cui's
abst.cuis= rys_data$text[i] %>% str_extract_all(cui.pattern) %>% unlist() %>% str_remove(substr(cui.pattern,start=1,stop=partial.cui.pattern))
len1=length(abst.cuis)
# extract uncertainty and convert to numeric using scientific_format_to_decimal() defined above
abst.uncertainty=rys_data$text[i] %>% str_extract_all(uncertainty.pattern) %>% unlist() %>% str_remove(substr(uncertainty.pattern,start=1,stop=partial.uncertainty.pattern))
len2=length(abst.uncertainty)
uncertainty=abst.uncertainty %>% map(scientific_format_to_decimal) %>% unlist()
certainty = 1-uncertainty # vector of certainty estimates
if(len1!=len2) {
cat("Not matching for list of CUI's and list of uncertainty estimates for abstract: ",i,"\n")
absts="NA"
} else {
jindx=certainty>=certainty.threshold
absts=abst.cuis[jindx==TRUE] %>% str_c(collapse=" ") # keep only the CUIs with high certainty
}
op$text[i]= absts
}
return(op)
}
## The function creates a data frame out of the word vectors
## that originate from a pre-trained GloVe model (m_glove)
get_word_vectors_df <- function(m_glove, verbose = FALSE) {
# initialize space for values and the names of each word in the model
n_words <- length(m_glove)
vals <- list()
names <- character(n_words)
# loop through to gather values and names of each word
for(i in 1:n_words) {
if (verbose) {
if(i %% 5000 == 0) {print(i)}
}
this_vec <- m_glove[i]
this_vec_unlisted <- unlist(strsplit(this_vec, " "))
this_vec_values <- as.numeric(this_vec_unlisted[-1])
this_vec_name <- this_vec_unlisted[1]
vals[[i]] <- this_vec_values
names[i] <- this_vec_name
}
# convert the list to a data frame and attach the names
glove_df <- data.frame(vals)
names(glove_df) <- names
glove_df
}
## The function computes harmonic mean for the given input vector
harmonicMean <- function(values, precision=2000L) {
require("Rmpfr")
valMed <- median(values)
as.double(valMed - log(mean(exp(-mpfr(values, prec = precision) + valMed))))
}
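## harmonicMean() is not called elsewhere in this script. A common use (a hedged sketch,
## not from the original workflow) is to approximate the marginal log-likelihood of LDA
## fits when choosing the number of topics k; the objects below are hypothetical and the
## snippet assumes method = "Gibbs" fits from the topicmodels package with keep > 0:
# k.candidates = c(100, 200, 300)
# lda.fits = lapply(k.candidates, function(k)
#   LDA(lda.dtm, k = k, method = "Gibbs", control = list(burnin = 1000, iter = 1000, keep = 50)))
# log.liks = lapply(lda.fits, function(f) f@logLiks[-(1:(1000/50))]) # drop the burn-in samples
# sapply(log.liks, harmonicMean) # the k with the largest value is preferred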
## The function reads all files from the given folder (infolder)
## into a data frame and returns the created data frame
read_folder <- function(infolder) {
data_frame(file = dir(infolder, full.names = TRUE)) %>%
mutate(text = map(file, read_lines)) %>%
transmute(id = basename(file), text) %>%
unnest(text) # text is a list-column; unnest transforms each element of the list into a row
}
# clearance of previous parallel cluster data in the system
# https://stackoverflow.com/questions/25097729/un-register-a-doparallel-cluster
unregister <- function() {
env <- foreach:::.foreachGlobals
rm(list=ls(name=env), pos=env)
}
### End of the Workflow algorithm with Steps 1-11
# Set up for workflow execution -
# NOTES: These steps are messy right now because they were modified as I tested things out. It will be a few lines once the testing is done
# abstracts with limited content are not used in ML training, for SR type 1 diabetes, n=68
dim(wkta) # 16375 x 5, 16375 - 68 = 16307
length(lda.deleted.abstract.ids)
lda.deleted.abstract.rows=unlist(lapply(lda.deleted.abstract.ids,indx.lkup.all,dd=wkta)) # look up the rows of the deleted abstract ID's
length(lda.deleted.abstract.rows)
# corpus data frame
jj.dd = wkta %>% dplyr::select(id, abstract, status)
dim(jj.dd) # 16375 x 3
jj.dd1= jj.dd[-lda.deleted.abstract.rows,]
dim(jj.dd1) # 16307 x 3
# X matrices
dim(sr.svd.X) # 16375 x 300
length(wkta$id) # matching 16375
sr.svd.X1=sr.svd.X[-lda.deleted.abstract.rows,] # SA - Threshold - Repeat
dim(sr.svd.X1) # 16307 x 300
sr.lda.X1=sr.lda.X.test # LDA
dim(sr.lda.X1) # 16307 x 300
dim(sr.cui.X) # 16376 500
j.sr.cui.X.ids=as.integer(rownames(sr.cui.X))
jjone=j.sr.cui.X.ids %in% wkta$id
summary(jjone)
j.sr.cui.X.ids[jjone==FALSE] # ID is 662239
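## ---------------------------------------------------------------------------------
## Hedged sketch (not part of the original script): one possible way to assemble the
## inputs above and launch the workflow. The names j.svd.dist1, j.lda.dist1 and
## m.distance.list are hypothetical; the embedding-based distance matrix (sr.cui.dist or
## sr.glove.dist) can be added to the list once its rows are aligned with jj.dd1.
## Parameter values follow Table 2 of the main paper (n.rounds=20, n.initial=600,
## pick.init=8, pick.ml=15); the first record of the corpus is used as the initial seed,
## assuming the protocol record (placed first, see Step 1) was retained.
# j.svd.dist1 = dist2(x = sr.svd.X1, method = "cosine") # distances on the reduced corpus
# j.lda.dist1 = dist2(x = sr.lda.X1, method = "cosine")
# # similar.abstracts.one() expects row/column names sequenced from 1 to the number of abstracts
# rownames(j.svd.dist1) = colnames(j.svd.dist1) = as.character(1:nrow(j.svd.dist1))
# rownames(j.lda.dist1) = colnames(j.lda.dist1) = as.character(1:nrow(j.lda.dist1))
# m.distance.list = list(svd = j.svd.dist1, lda = j.lda.dist1)
# wf.out = workflow.phase1and2(m.x = sr.svd.X1, m.distance = m.distance.list,
#                              dd = jj.dd1, l.seeds = jj.dd1$id[1],
#                              n.rounds = 20, n.initial = 600, pick.init = 8, pick.ml = 15)
# wf.out$results    # step-specific workflow results (see Table 2A of the online Appendix)
# wf.out$candidates # lists of corpus rows sent for screening at each round
# wf.out$rf.svd     # final random forest model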