-
Notifications
You must be signed in to change notification settings - Fork 0
/
Data Cleaning.R
261 lines (197 loc) · 9.66 KB
/
Data Cleaning.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
# GRAPH CODES FOR MY VISUALS
# doownloaded our various libraries
library(sf)
library(leaflet)
library(readr)
library(readxl)
library(tidyverse)
library(nnet)
library(rgdal)
library(janitor)
##############################################################
# DATA CLEANING FOR THE GENDER PLOT!
# Got the data for the official 2015 korean religions count data
x <- read_xlsx("Korea/raw-data/2015_Religions_Korea.xlsx") %>%
clean_names()
#fixes merged column problem from excel for the dataset
test<- x %>% fill(gender) %>% fill(korean_region)
#trying to filter data for only the total number of korean data
gender_data <- x %>%
filter(korean_region == "All of Korea") %>%
filter(gender != "Korea - Total") %>%
filter(age == "Total")
#needs to be a data frame in order to use gather
gender_data <- data.frame(gender_data)
#gather flips the rows and columns
gender_data_vertical <- gather(gender_data)
gender_data_vertical <- gender_data_vertical[11:30,]
gender_data_vertical<- gender_data_vertical %>%
mutate(gender = ifelse(row_number()%%2 == 0,
"Female",
"Male")) %>%
mutate(religion = ifelse(row_number()%%2 == 0,
str_c(key, "F"),
str_c(key, "M")))
gender_data_vertical <- transform(gender_data_vertical, value = as.numeric(value))
##############################################################
# KOREAN AGE DATA CLEANING
age_data <- x %>%
filter(korean_region == "All of Korea") %>%
filter(age != "Total") %>%
filter(gender == "Korea - Total") %>%
mutate(age_number = as.numeric(substr(age,1,2))) %>%
mutate(age2 = as.numeric(age_number)^2)
##############################################################
# WORLD RELIGION DATA CLEANING!!!
# Read in the data from the Pew Research Center's The Global Religious Landscape
# database ; needed to specify the various columns for each variable
# cleaned names to have useful labels for the graph
world<- read_csv("Korea/raw-data/World_Religion_Data.csv", col_types = cols(
Country = col_character(),
Christian = col_double(),
Muslim = col_double(),
Unaffiliated = col_double(),
Hindu = col_double(),
Buddhist = col_double(),
Folk = col_double(),
Other = col_double(),
Jewish = col_double())) %>%
clean_names() %>%
na.exclude()
# Need to clean up my data sources to have only the overall world totals data
# utilized summarize function in order to get totals of every single religion;
# numbers based upon information from totlas of every nation, numbers are the
# percentages of people who practice the religion
world_summary<- world %>%
summarize(christian = mean(as.numeric(christian)),
muslim = mean(as.numeric(muslim)),
unaffiliated = mean(as.numeric(unaffiliated)),
hindu = mean(as.numeric(hindu)),
buddhist = mean(as.numeric(buddhist)),
folk = mean(as.numeric(folk)),
other = mean(as.numeric(other)),
jewish = mean(as.numeric(jewish)))
# Get the separate data for korean numbers totals from the korean religion dataset
# Took out the country column as it was unncecessary
korea_totals <- world %>%
filter(country == "Korea, Republic of") %>%
select(-matches("country"))
# Merged the world data with the korea totals data in order to make a plot that could
# compare percentages of the world religions with the korean religions
world_korea_summary <- rbind(data.frame(world_summary),data.frame(korea_totals))
# flipped the rows and columns in order to have data in a format I can put in a bar
# plot with groupings
vertical_world <- gather(world_korea_summary)%>%
mutate(country = ifelse(row_number()%%2 == 1, "World","South Korea"))
######################################################################
# KOREA MAP DATA WRANGLING
# used the korea religions dataset :D
clean <- x %>%
filter(gender == "Total") %>%
tail(17) %>%
select(-matches("age")) %>%
select(-matches("gender"))
colnames(clean)[1]<-"id"
#############################################################
############### START OF SHAPEFILE STUFF ####################
#############################################################
# Read in Shape files of the korean administrative area boundaries
shp <- readOGR("Korea/raw-data/KOR_adm/KOR_adm1.shp", stringsAsFactors = F)
# tidyed the data for the region :D, need the region name so that the longitude data points match up the various busan seoul other area names
shp_df<- tidy(shp, region = "NAME_1")
head(shp_df)
joined <- inner_join(shp_df,clean, by = "id")
######################################################################
############# Line plot data ##############
# 2015, 2005, 1995, 1985 Religion data for Korea
x2015 <- read_xlsx("Korea/raw-data/2015_Religions_Korea.xlsx") %>%
clean_names()
x2005 <- read_xlsx("Korea/raw-data/2005_Religions_Korea.xlsx") %>%
clean_names()
x1995 <- read_xlsx("Korea/raw-data/1995_Religions_Korea.xlsx") %>%
clean_names()
x1985 <- read_xlsx("Korea/raw-data/1985_Religions_Korea.xlsx") %>%
clean_names()
# Fixes merged column problem from excel for the dataset
x2015 <- x2015 %>% fill(gender) %>% fill(korean_region)
x2005 <- x2005 %>% fill(gender) %>% fill(korean_region)
x1995 <- x1995 %>% fill(gender) %>% fill(korean_region)
x1985 <- x1985 %>% fill(gender) %>% fill(korean_region)
# Data cleaning, need to only get the regions of all the total populations of Korea of
# 2015, only the gender and age totals, and we only selected the specific religions
# we wanted to view on the line plot, christianity and buddhism and religious totals
# and non religious totals
totals2015 <- x2015 %>%
filter(korean_region == "All of Korea") %>%
filter(gender == "Korea - Total") %>%
filter(age == "Total") %>%
select(total,religious_total,christianity_protestant,
christianity_catholic,buddhism,no_religion) %>%
mutate(year = "2015")
# Data cleaning, need to only get the regions of all the total populations of Korea of
# 2005, only the gender and age totals, and we only selected the specific religions
# we wanted to view on the line plot, christianity and buddhism and religious totals
# and non religious totals
totals2005 <- x2005 %>%
filter(korean_region == "All of Korea") %>%
filter(gender == "Korea - Total") %>%
filter(age == "Total") %>%
select(total,religious_total,christianity_protestant,
christianity_catholic,buddhism,no_religion) %>%
mutate(year = "2005")
# Data cleaning, need to only get the regions of all the total populations of Korea of
# 1995, only the gender and age totals, and we only selected the specific religions
# we wanted to view on the line plot, christianity and buddhism and religious totals
# and non religious totals
totals1995 <- x1995 %>%
filter(korean_region == "All of Korea") %>%
filter(gender == "Korea - Total") %>%
filter(age == "Total") %>%
select(total,religious_total,christianity_protestant,
christianity_catholic,buddhism,no_religion) %>%
mutate(year = "1995")
# Data cleaning, need to only get the regions of all the total populations of Korea of
# 1985, only the gender and age totals, and we only selected the specific religions
# we wanted to view on the line plot, christianity and buddhism and religious totals
# and non religious totals
totals1985 <- x1985 %>%
filter(korean_region == "All of Korea") %>%
filter(gender == "Total") %>%
filter(age == "Total") %>%
select(total,religious_total,christianity_protestant,
christianity_catholic,buddhism,no_religion) %>%
mutate(year = "1985")
# Combined the four different years datas so that we could have it all in one set
combined2 <- rbind(totals1985, totals1995, totals2005, totals2015) %>%
mutate(freq = 6)
# Converted to a data frame, duplicated each row 6 times so that we could make a column
# that contains all of the various religion data for grouped line bar plots
combined <- as.data.frame(combined2[rep(row.names(combined2),
combined2$freq), 1:7])
# Made several rows that could contain numbers for the four different religous population
# numbers, and the total population numbers and non religious population numbers
# had a identifier row called religion and numbers in population
# needed a case_when function because we had so many different possibilities
combined <- combined %>%
mutate(religion = case_when(row_number() %% 6 == 1 ~ "Total Population",
row_number() %% 6 == 2 ~ "All Religions",
row_number() %% 6 == 3 ~ "Christianity Protestant",
row_number() %% 6 == 4 ~ "Christianity Catholic",
row_number() %% 6 == 5 ~ "Buddhism",
row_number() %% 6 == 0 ~ "No Religion",
TRUE ~ "no")) %>%
mutate(population = case_when(row_number() %% 6 == 1 ~ total,
row_number() %% 6 == 2 ~ religious_total,
row_number() %% 6 == 3 ~ christianity_protestant,
row_number() %% 6 == 4 ~ christianity_catholic,
row_number() %% 6 == 5 ~ buddhism,
row_number() %% 6 == 0 ~ no_religion,
TRUE ~ "no"))
#########################################################################
#Used write_rds() to write out vertical world data into a separate rds file
write_rds(vertical_world, "Korea/world.rds")
write_rds(gender_data_vertical, "Korea/gender.rds")
write_rds(age_data, "Korea/age.rds")
write_rds(world, "Korea/worldMap.rds")
write_rds(joined, "Korea/koreaMap.rds")
write_rds(combined, "Korea/lineplot.rds")