-
Notifications
You must be signed in to change notification settings - Fork 0
/
tidy_analysis_indian_news.R
157 lines (118 loc) · 5.42 KB
/
tidy_analysis_indian_news.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
library(tidytext)
library(tidyverse)
library(lubridate)
library(igraph)
library(ggraph)
news <- read_csv("india-news-headlines.csv")
news %>% pull(headline_category) %>% str_to_lower(.) ->news$headline_category
news %>% pull(headline_text) %>% str_to_lower(.) -> news$headline_text
# convert the date column into "date" data-type
news %>% pull(publish_date) %>% ymd(.) -> news$publish_date
# filter out headlines related to modi, bjp, congress
news %>% filter(str_detect(news$headline_text,"(narendra modi)|(bjp)|(congress)|(manmohan singh)")) -> politics_ind_tidy
# find out how many categories are there in the dataset
politics_ind_tidy %>% pull(headline_category) %>% unique(.) %>%
as_data_frame(.) %>% count()
# build an alternative str_detect function
str_detect_politics <- function(string){
politics_ind_tidy %>% pull(headline_text) %>%
str_detect(., string)
}
# convert the categories into 3, congress, bjp and prime_minister
politics_ind_tidy %>%
mutate(headline_category = case_when(
str_detect_politics("(narendra modi)|(manmohan singh)") ~ "prime_minister",
str_detect_politics("(bjp)") ~ "political_party",
str_detect_politics("(congress)") ~ "political_party"
), headline_sub_category = case_when(
str_detect_politics("(manmohan singh)") ~ "manmohan",
str_detect_politics("(narendra modi)") ~ "modi",
str_detect_politics("(bjp)") ~ "bjp",
str_detect_politics("(congress)") ~ "congress"
) ) -> politics_ind_tidy
# convert it to tidy format
politics_ind_tidy %>% group_by(headline_category) %>%
unnest_tokens(., headline_words, headline_text) -> politics_ind_tidy
# merge categories into more managable ones
#politics_ind_tidy %>% pull(headline_category) %>% str_extract("[A-z0-9]{1,}\\.") %>%
# str_remove("\\.") -> politics_ind_tidy$headline_category
# convert NA category into Unspecified
#politics_ind_tidy %>% replace_na(list(headline_category ="Unspecified")) -> politics_ind_tidy
# save the tidy data
write_csv(politics_ind_tidy, "politics_ind_tidy.csv")
# read the tidy news csv file
politics_ind_tidy <- read_csv("politics_ind_tidy.csv")
# calculate word frequencies for words that are not stop words
politics_ind_tidy %>%
group_by(headline_category) %>%
anti_join(stop_words, by = c("headline_words" = "word")) %>%
count(headline_words, sort = TRUE) %>%
filter(n > 1000) %>%
ggplot(., aes(headline_words, n)) +
geom_col(aes(fill = headline_category)) +
coord_flip() +
theme(legend.position = "bottom")
# sentiment analysis
politics_ind_tidy %>%
#mutate( ind = as_factor(paste(year(publish_date),month(publish_date)))) %>%
inner_join(get_sentiments("bing"), by = c("headline_words" = "word")) %>%
count(headline_sub_category, index = year(publish_date) ,sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative) -> politics_sentiment
politics_sentiment %>%
ggplot(., aes(index, sentiment, fill = headline_sub_category)) +
geom_col(show.legend = FALSE) +
facet_wrap(~headline_sub_category, nrow = 3, scales = "free_y")
# calculate tf*idf scores and find out unique valuable words
# calculate the count for each group and each word
politics_ind_tidy %>%
ungroup() %>%
count(headline_sub_category, headline_words, sort = TRUE) %>%
ungroup()-> head_words
head_words %>%
group_by(headline_sub_category) %>%
summarise(total = sum(n)) -> total_words
head_words <- left_join(head_words, total_words)
head_words %>% bind_tf_idf(headline_words, headline_sub_category, n) -> head_words
head_words %>%
arrange(desc(tf_idf)) %>%
mutate(headline_words = factor(headline_words, levels = rev(unique(headline_words)))) %>%
group_by(headline_sub_category) %>%
#filter(tf_idf >=.1) %>%
top_n(10) %>%
ungroup() %>%
ggplot(aes(headline_words, tf_idf, fill = headline_sub_category)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~headline_sub_category, ncol = 2, scales = "free") +
coord_flip()
# see the total of counts over time mentioning modi and manmohan
news %>%
filter(str_detect(news$headline_text,"(narendra modi)|(manmohan singh)")) %>%
mutate(headline_category = str_extract(.$headline_text, "(narendra modi)|(manmohan singh)")) -> modi_manmohan
modi_manmohan %>%
group_by(year =year(publish_date)) %>%
count(year, headline_category) %>%
ggplot(., aes(year , n, group = headline_category, color = headline_category)) +
geom_line(size =1)
# bigrams
news %>%
filter(str_detect(news$headline_text,"(narendra modi)|(bjp)|(congress)|(manmohan singh)")) %>%
select(-headline_category) %>%
unnest_tokens(bigram, headline_text, n = 2, token = "ngrams") %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) -> political_bigrams
# bigram count
set.seed(2016)
a <- grid::arrow(type = "closed", length = unit(.05, "inches"))
political_bigrams %>%
count(word1, word2, sort = TRUE) %>%
filter(n > 170) %>%
graph_from_data_frame() %>%
ggraph(., layout = "fr") +
geom_edge_link(edge_alpha = .55, show.legend = FALSE,
arrow = a, end_cap = circle(.05, 'inches')) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)
# do topic modelling