-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_packs.R
270 lines (238 loc) · 14.6 KB
/
data_packs.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
####
#### Created by Michalis Pavlis
####
##########################################################################################################################
# Produce Data Packs #
##########################################################################################################################
library(sf)
library(data.table)
library(whisker)
##########################################################################################################################
# Get data and subset columns, optionally rename columns #
##########################################################################################################################
# import data, subset columns by name or index (optional), rename columns (optional)
get_data <- function(path_and_file_name, select_cols_vec = NULL, new_col_names_vec = NULL, ...){
Df <- fread(path_and_file_name, ...)
if (!is.null(select_cols_vec)){
Df <- Df[, select_cols_vec, with = F]
}
if (!is.null(new_col_names_vec)){
if (length(new_col_names_vec) != ncol(Df)){
stop("the number of names in col_names should be the same as the number of (the selected) columns")
}
setnames(Df, 1:ncol(Df), new_col_names_vec)
}
return(Df)
}
##########################################################################################################################
# Join two data tables #
##########################################################################################################################
# inner join, optionally pass arguments for outer joins (e.g. all.x etc)
join_data <- function(table_x, table_y, by_x, by_y, ...){
setkeyv(table_x, by_x)
setkeyv(table_y, by_y)
return(merge.data.table(table_x, table_y, by.x = by_x, by.y = by_y, ...))
}
##########################################################################################################################
# Produce aggregate stats for a number of columns #
##########################################################################################################################
##### object to hold the names of the attribute columns (vector of characters),
##### the summary statistic (character) and how to treat NA values (boolean)
map_attr_stats <- setClass('map_attr_stats',
slots = c(
attr_col_name = "character",
stat_name = "character",
na_behaviour = "logical"),
validity = function(object){
if (length(object@stat_name) > 1 || length(object@na_behaviour) > 1){
return("stat_name and na_behaviour have to be of length 1")
}
return(T)
}
)
##### calculate summary statistic in a data.table, the group by can be one or more columns
aggregate_stats <- function(Dt, group_cols_names, attr_cols_names, stat_to_calc, na.rm = T, sort_by_key = T){
if (sort_by_key){
Dt <- setkeyv(Dt, group_cols_names)
}
if (na.rm){
out <- eval(parse(text = paste0("Dt[,lapply(.SD, ", stat_to_calc,", na.rm = na.rm), by=group_cols_names, .SDcols=attr_cols_names]")))
} else {
out <- eval(parse(text = paste0("Dt[,lapply(.SD, ", stat_to_calc,"), by=group_cols_names, .SDcols=attr_cols_names]")))
}
setnames(out, names(out), c(group_cols_names, paste(attr_cols_names, stat_to_calc, sep = "_")))
return(out)
}
##### calculate multiple summary statistics, return the result in a data.table
aggregate_multiple_stats <- function(Dt, group_cols_names, map_attr_stats_list){
Dt <- setkeyv(Dt, group_cols_names)
out_stat <- aggregate_stats(Dt, group_cols_names, attr_cols_names = map_attr_stats_list[[1]]@attr_col_name,
stat_to_calc = map_attr_stats_list[[1]]@stat_name, na.rm = map_attr_stats_list[[1]]@na_behaviour,
sort_by_key = F)
if (length(map_attr_stats_list) > 1){
for (i in 2:length(map_attr_stats_list)){
temp_stat <- aggregate_stats(Dt, group_cols_names, attr_cols_names = map_attr_stats_list[[i]]@attr_col_name,
stat_to_calc = map_attr_stats_list[[i]]@stat_name, na.rm = map_attr_stats_list[[i]]@na_behaviour,
sort_by_key = F)
out_stat <- join_data(out_stat, temp_stat, by_fields_t1 = group_cols_names, by_fields_t2 = group_cols_names)
}
}
return(out_stat)
}
##########################################################################################################################
# Metadata List #
##########################################################################################################################
metadata <- list(grantNo = "ES/L011840/1",
titl = "",
AuthEnty = c("Konstantinos Daras, University of Liverpool","Alex Singleton, University of Liverpool",
"Mark A Green, University of Liverpool","Alec Davies, University of Liverpool"),
depositr = "Konstantinos Daras, University of Liverpool",
fundAg = "Economic and Social Research Council",
producer= "ESRC Consumer Data Research Centre",
copyright = paste0("The following attribution statements must be used to acknowledge copyright and source in use of these datasets:\n",
"Data provided by the ESRC Consumer Data Research Centre;"),
dataCollector = "",
dataSrc = "",
topcClas = "",
keyword = "",
abstract = "",
timePrd = "",
collDate = "",
nation = "",
geogCover = "",
geogUnit = "",
anlyUnit = "",
dataKind = "",
accsPlac = c("Consumer Data Research Centre", "UK Data Service"),
contact = "[email protected]",
relPubl = "",
serName = ""
)
##########################################################################################################################
# Write Readme txt #
##########################################################################################################################
write_readme <- function(metadata_list, out_dir, csv_tables = T){
readme <- file(paste(out_dir, "readme.txt", sep="/"))
writeLines(c(metadata_list$titl,
"\n",
"+ Abstract",
metadata_list$abstract,
"\n",
"+ Contents",
"\t - readme.txt: Information about the CDRC Geodata pack",
"\t - metadata.xml: Metadata",
if(csv_tables) "\t - tables: Folder containing the csv files",
"\t - shapefiles: Folder containing the shapefiles",
"\n",
"+ Citation and Copyright",
metadata_list$copyright,
"\n",
"+ Funding",
paste("Funded by: Economic and Social Research Council", metadata_list$grantNo),
"\n",
"+ Other Information",
"Areas that contained no information in the original dataset, are marked with NA in the csv files and NULL in the shapefiles."
),
readme)
close(readme)
}
##########################################################################################################################
# Write Metadata xml #
##########################################################################################################################
write_xml <- function(xml_template, metadata_list, out_dir){
new_xml <- whisker.render(xml_template, metadata_list)
writeLines(new_xml, paste(out_dir,"metadata.xml", sep="/"))
}
##########################################################################################################################
# Write other stuff #
##########################################################################################################################
# write_other_text <- function(output_text_list, out_dir){
# for (out_file_name in output_text_list){
# out_file <- file(paste(out_dir, out_file_name, sep="/"))
# writeLines(output_text_list[[out_file_name]], out_file)
# close(out_file)
# }
# }
##########################################################################################################################
# Create Metadata #
##########################################################################################################################
# the argumants: area_code, area_name, area_scale, geography_scale, time_period are always required by add_to_metadata
# the function should be in every (x_data_pack)_functions.r script, other arguments can be passed in add_to_metadata
create_metadata <- function(area_code, area_name, area_scale, geography_scale, time_period, base_dir, xml_template,
metadata_list = metadata, csv_tables = T, ...){
metadata_list <- add_to_metadata(metadata_list = metadata_list, area_scale = area_scale, area_code = area_code,
area_name = area_name, geography_scale = geography_scale,time_period = time_period, ...)
out_dir <- paste(base_dir, area_code, sep = "/")
write_readme(metadata_list, out_dir, csv_tables = csv_tables)
write_xml(xml_template, metadata_list, out_dir)
}
###########################################################################################################################
# Create directories #
###########################################################################################################################
create_dirs <- function(dir_name, base_dir){
out_dir <- paste(base_dir, dir_name, sep = "/")
dir.create(out_dir)
return(out_dir)
}
###########################################################################################################################
# Subset data #
###########################################################################################################################
subset_data <- function(col_value, input_data, col_name, rm_col = F){
out_data <- subset(input_data, input_data[[col_name]] %in% col_value)
if (rm_col){
out_data[[col_name]] <- NULL
}
return(out_data)
}
###########################################################################################################################
# Merge shapefile with attribute data #
###########################################################################################################################
# merge_shps_attr <- function(sf_df, dt, by_x, by_y, all_x){
# if (! all_x){
# sf_df <- subset_data(dt[[by_y]], sf_df, by_x)
# }
# n <- ncol(sf_df)
# sf_df <- data.frame(sf_df, dt[match(sf_df[[by_x]], dt[[by_y]]), ])
# sf_df[, n+1] <- NULL
# return(sf_df)
# }
###########################################################################################################################
# Extract data per geographical unit #
###########################################################################################################################
extract_per_geog <- function(area_value, input_data, area_column, base_dir, geom_column = "geometry",
create_dir = T, name_suffix="", csv_tables = T){
out_data <- subset_data(area_value, input_data, area_column, rm_col = T)
if (create_dir){
if (csv_tables){
out_dir_csv <- create_dirs("tables", base_dir)
}
out_dir_shp <- create_dirs("shapefiles", base_dir)
} else {
if (csv_tables){
out_dir_csv <- paste0(base_dir,"/tables")
}
out_dir_shp <- paste0(base_dir,"/shapefiles")
}
if (csv_tables){
fwrite(out_data[,names(out_data) != geom_column], paste0(out_dir_csv, "/", area_value, name_suffix, ".csv"))
}
st_write(st_as_sf(as.data.frame(out_data[,c(1,20)])), paste0(out_dir_shp, "/", area_value, if(name_suffix != "") paste0("_", name_suffix), ".shp"), driver = "ESRI Shapefile")
}
##########################################################################################################################
# Create data pack #
# input: a sf object and a lookup data.table object #
# output: csvs and shapefiles per geography #
##########################################################################################################################
create_data_pack <- function(sf_df, dt_lookup, base_dir, sf_geom_column = "geometry", sf_geog_unit_column = "lsoa11cd",
dt_area_colunn = "lad11cd", dt_geog_unit_column = "lsoa11cd", name_suffix = "",
create_dir = T, csv_tables = T){
# create folders
if (create_dir) invisible(sapply(unique(dt_lookup[[dt_area_colunn]]), create_dirs, base_dir))
setDT(sf_df)
# join sf with lookup
Dt <- merge(dt_lookup, sf_df, by.x = dt_geog_unit_column, by.y = sf_geog_unit_column, all = T)
# extract data from data.table
invisible(sapply(unique(Dt[[dt_area_colunn]]), function(x)
extract_per_geog(area_value = x, input_data = Dt, area_column = dt_area_colunn, base_dir = paste(base_dir, x, sep = "/"),
geom_column = sf_geom_column, create_dir = create_dir, name_suffix = name_suffix, csv_tables = csv_tables)))
}