-
Notifications
You must be signed in to change notification settings - Fork 0
/
FUN_TFIDF.R
71 lines (60 loc) · 1.99 KB
/
FUN_TFIDF.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
## Adapted from bind_tf_idf {tidytext}
#' Add tf, idf and tf-idf columns to a tidy document-term table
#'
#' Extends the `tidytext::bind_tf_idf()` computation with a `mode` switch that
#' selects alternative term-frequency / inverse-document-frequency weightings.
#'
#' Notation used below: `rel_tf` is a row's count divided by the summed count
#' of its document, `N` is the number of distinct documents and `df` is the
#' number of rows in which a term occurs (equal to its document frequency when
#' `tbl` has exactly one row per document-term combination).
#'
#' @param tbl A data frame (or tibble) with one row per document-term
#'   combination.
#' @param term Column of `tbl` holding the terms, as a bare name or a string.
#' @param document Column of `tbl` holding the document ids, as a bare name or
#'   a string.
#' @param n Column of `tbl` holding the counts, as a bare name or a string.
#' @param mode Weighting scheme, one of:
#'   * `"Basic"`: tf = `rel_tf`; idf = `log(N / df)`.
#'   * `"A"`: tf = 1 if `rel_tf > 0`, otherwise 0; idf = `log(N / df)`.
#'   * `"B"`: tf = `0.5 + 0.5 * rel_tf / max(rel_tf)`; idf = `log(N / df)`.
#'   * `"C"`: tf as in `"A"`; idf = `log((N - df) / df)`, floored at 0.
#'   * `"D"`: tf as in `"B"`; idf = `log((N - df) / df)`, floored at 0.
#' @return `tbl` with the numeric columns `tf`, `idf` and `tf_idf` added (or
#'   overwritten). Rows whose document total is 0 get a missing `tf`.
TF_IDF <- function(tbl, term, document, n, mode = "Basic") {
  # Fail early with a readable message instead of falling through every branch
  # and dying later on a zero-length column assignment.
  valid_modes <- c("Basic", "A", "B", "C", "D")
  if (length(mode) != 1L || !mode %in% valid_modes) {
    stop(
      "`mode` must be one of ",
      paste0("\"", valid_modes, "\"", collapse = ", "),
      ".",
      call. = FALSE
    )
  }

  term_col <- rlang::quo_name(rlang::enquo(term))
  doc_col <- rlang::quo_name(rlang::enquo(document))
  n_col <- rlang::quo_name(rlang::enquo(n))

  missing_cols <- setdiff(c(term_col, doc_col, n_col), names(tbl))
  if (length(missing_cols) > 0) {
    stop(
      "Column(s) not found in `tbl`: ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  terms <- as.character(tbl[[term_col]])
  documents <- as.character(tbl[[doc_col]])
  counts <- tbl[[n_col]]

  doc_totals <- tapply(counts, documents, sum)
  n_docs <- length(doc_totals)
  term_rows <- table(terms)

  # Term frequency -----------------------------------------------------------
  rel_tf <- counts / as.numeric(doc_totals[documents])
  tf <- if (mode %in% c("A", "C")) {
    # Binary weighting. Vectorised so that missing values (document total of
    # 0) propagate as NA instead of breaking a data-frame row assignment.
    as.numeric(rel_tf > 0)
  } else if (mode %in% c("B", "D")) {
    # NOTE(review): the maximum is taken over the whole table, not within each
    # document -- confirm this is the intended normalisation.
    0.5 + 0.5 * rel_tf / max(rel_tf)
  } else {
    rel_tf
  }

  # Inverse document frequency -----------------------------------------------
  idf_by_term <- if (mode %in% c("C", "D")) {
    log((n_docs - term_rows) / term_rows)
  } else {
    log(n_docs / term_rows)
  }
  idf <- as.numeric(idf_by_term[terms])
  if (mode %in% c("C", "D")) {
    # Terms present in every document give log(0) = -Inf; floor at 0.
    idf <- pmax(idf, 0)
  }

  tbl$tf <- tf
  tbl$idf <- idf
  tbl$tf_idf <- tf * idf

  # A negative idf means a term occupies more rows than there are documents,
  # i.e. the input holds duplicated document-term rows.
  if (any(tbl$idf < 0, na.rm = TRUE)) {
    rlang::warn(paste("A value for tf_idf is negative:\n",
                      "Input should have exactly one row per document-term combination."))
  }

  tbl
}