# Analysis part: start by loading the required packages
library(dplyr)
library(stringr)
library(tm)
library(wordcloud)
library(SnowballC)
library(RColorBrewer)  # for brewer.pal(), used in the comparison cloud below
# load data - tweets we collected
load("tweets.Rdata")
## extract only the text field from the list of tweets
gettext <- function(mylist) {
  text <- character()
  for (i in seq_along(mylist)) {
    # each element of the unlisted tweet object exposes its text under ".->text"
    text <- append(text, mylist[[i]][[".->text"]])
  }
  return(text)
}
# for some reason unlist() inside the function doesn't work, so a sample call looks like this:
text <- gettext(unlist(tweets_nat))
# build and clean a corpus from a character vector of tweets
corp <- function(text) {
  text_df <- data_frame(line = 1, text = text)
  # strip non-ASCII characters (emoji etc.) that trip up tm
  text_df <- sapply(text_df$text, function(row) iconv(row, "latin1", "ASCII", sub = ""))
  # a corpus is a collection of text documents
  t_corpus <- Corpus(VectorSource(text_df))
  # clean the text and apply stemming
  t_clean <- tm_map(t_corpus, removePunctuation)
  t_clean <- tm_map(t_clean, content_transformer(tolower))
  t_clean <- tm_map(t_clean, removeWords, stopwords("english"))
  t_clean <- tm_map(t_clean, removeNumbers)
  t_clean <- tm_map(t_clean, stripWhitespace)
  t_clean <- tm_map(t_clean, stemDocument)
  return(t_clean)
}
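# optional sketch: inspect a few cleaned documents to confirm that punctuation,
# stopwords and numbers were removed before building the wordclouds, e.g.
# inspect(corp(text)[1:3])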
# getting a corpus for each party
lab_corp <- gettext(unlist(tweets_lab)) %>% corp()
lib_corp <- gettext(unlist(tweets_lib)) %>% corp()
nat_corp <- gettext(unlist(tweets_nat)) %>% corp()
# wordcloud function
mywordcloud <- function(corp) {
  wc <- wordcloud(corp, random.order = FALSE, max.words = 80, col = rainbow(80), scale = c(3, 0.2))
  return(wc)
}
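# note: wordcloud() places words at random, so re-running the script gives a
# slightly different layout each time; an optional seed before each cloud call
# makes the pictures reproducible, e.g.
# set.seed(42)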
# building wordclouds for the parties
# layout: a narrow strip on top for the title, the rest for the cloud
layout(matrix(c(1, 2), nrow=2), heights=c(0.25, 4))
par(mar=rep(0, 4))
# Labor
plot.new()
text(x=0.5, y=0.5, "Labor")
lab_wc <- lab_corp %>% mywordcloud()
dev.copy(png,"labor.png",width=8,height=6,units="in",res=100)
dev.off()
# Liberals
plot.new()
text(x=0.5, y=0.5, "Liberals")
lib_wc <- lib_corp %>% mywordcloud()
dev.copy(png,"liberals.png",width=8,height=6,units="in",res=100)
dev.off()
# Nationals
plot.new()
text(x=0.5, y=0.5, "Nationals")
nat_wc <- nat_corp %>% mywordcloud()
dev.copy(png,"nationals.png",width=8,height=6,units="in",res=100)
dev.off()
## Reducing dimensions: one term-frequency column per party
# define a helper that turns a corpus into a one-column data frame of term counts
my_tm <- function(corp, name) {
  term.matrix <- termFreq(corp) %>% as.matrix()
  colnames(term.matrix) <- name
  # transpose to merge later
  # term.matrix <- t(term.matrix)
  term.matrix <- as.data.frame(term.matrix)
  # keep the terms as a column so the party frames can be joined on it
  term.matrix$word <- row.names(term.matrix)
  return(term.matrix)
}
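# optional check: each helper call should yield a data frame with one count
# column (named after the party) plus a word column, e.g.
# head(my_tm(nat_corp, "Nationals"))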
# one term-frequency data frame per party
atm_nats <- my_tm(nat_corp, "Nationals")
atm_libs <- my_tm(lib_corp, "Liberals")
atm_labs <- my_tm(lab_corp, "Labor")
# join the three party frames on the word column
atm <- full_join(atm_nats, atm_libs, by = 'word') %>% full_join(atm_labs, by = 'word')
# formatting for further use: words become row names, parties become columns
row.names(atm) <- atm$word
# keep only the three party columns (the words now live in the row names)
atm <- atm[c(1, 3, 4)]
atm <- as.matrix(atm)
# terms a party never used become zero counts instead of NA
atm[is.na(atm)] <- 0
# save to file
write.csv(atm, 'word-party-freq-matrix.csv')
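# the saved matrix can be reloaded later with the words restored as row names,
# for example:
# atm <- as.matrix(read.csv('word-party-freq-matrix.csv', row.names = 1))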
# Comparison cloud: words that are specific to each party
plot.new()
text(x=0.5, y=0.5, "Comparison Cloud")
comparison.cloud(atm,max.words=80,scale=c(3,.2), random.order=FALSE, colors=brewer.pal(max(3,ncol(atm)),"Dark2"),
use.r.layout=FALSE, title.size=3,
title.colors=NULL, match.colors=FALSE,
title.bg.colors="grey90")
dev.copy(png,"comparisoncloud.png",width=8,height=6,units="in",res=100)
dev.off()
# Commonality cloud: words shared by all parties
plot.new()
text(x=0.5, y=0.5, "Commonality Cloud")
commonality.cloud(atm, max.words=80,random.order=FALSE)
dev.copy(png,"commonalitycloud.png",width=8,height=6,units="in",res=100)
dev.off()