-
Notifications
You must be signed in to change notification settings - Fork 0
/
CoB_stitchingCategories.R
132 lines (100 loc) · 6.85 KB
/
CoB_stitchingCategories.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#Having got all the column names to something readable
#and removed any sum columns
#We're now onto finding common categories across as many decades as we can
#1991 postcode sectors are the common zone.
#1991 LBS data has vastly more detailed CoB breakdown than SAS. This is at 1991 postcode level.
source("Function_bulkColumnRecode.R")
library(dplyr)
#Load all decades to look at
seventy1 <- read.csv("1971/Scotland/1971Scotland_CoB_EnumDistrict_countryNamesAdded_n_MaleFemaleSummed.csv")
eighty1 <- read.csv("1981/Scotland/Scotland_1981_CoB_OutputArea_Total_noMaleFemale_countyNamesAdded_spacesRemovedFromLabels.csv")
ninety1_SAS <- read.csv("1991/Scotland/1991_Scotland_SAS_CoB_OutputAreas/1991_Scotland_SAS_CoB_OutputAreas__noMaleFemale_countyNamesAdded.csv")
ninety1_SAS_PCSzone <- read.csv("1991/Scotland/1991_Scotland_SAS_CoB_postcodeSectors/1991_Scotland_SAS_CoB_postcodeSectors_noMaleFemale_countyNamesAdded.csv")
ninety1_LBS <- read.csv("1991/Scotland/1991_Scotland_LBS_CoB_postcodeSectors/1991_Scotland_LBS_CoBPostcodeSectors_allPeople_notMF_countyNamesAdded.csv")
twoThousand1 <- read.csv("2001/Scotland/2001_Scots_CoB_univariatetable_countryNamesAdded.csv")
#twoThEleven <- read.csv("2011/Scotland/OutputArea_bulk_CoB_formatted/QS203SC_formatted.csv", as.is = T)
#use comma-less version
twoThEleven <- read.csv("2011/Scotland/OutputArea_bulk_CoB_formatted/QS203SC_formatted_commasRemoved.csv")
#2011 aggregated already to intermediate zones
# twoThEleven2 <- read.csv("2011/Scotland/CoB_IntermediateGeog_viaLookupFromOutputArea/2011_CoB_intermediateGeog.csv")
#2011 data issue:
#NAs introduced. Where are the non-numerics?
# twoThEleven2 <- data.frame(apply(twoThEleven[,2:ncol(twoThEleven)],2,as.numeric))
#
# twoThEleven2[is.na(twoThEleven2),1:2]
# #row col
# #[1,] 44680 3
# which(is.na(twoThEleven2),1:2)
#
# #Ah ha! Commas!
# twoThEleven[44680,4]
# table(0 + grepl(",",twoThEleven[2:nrow(twoThEleven),2:ncol(twoThEleven)]))
#
# #Remove commas, re-save...
# twoThEleven <- data.frame(apply(twoThEleven,c(2),function(x) gsub(",","",x)))
# write.csv(twoThEleven, "2011/Scotland/OutputArea_bulk_CoB_formatted/QS203SC_formatted_commasRemoved.csv", row.names = F)
#see xxxCoB_stitchingCategories for the code getting rid of headings (just takes headings from above file and combines)
#namesNoHeadinz <- read.csv("VariableCoding/1971to2011_CountryOfBirthCategories_Scotland_noRegionHeadings.csv")
#version with all columns to get the indexing actually correct
#namesNoHeadinz <- read.csv("VariableCoding/1971to2011_CountryOfBirthCategories_Scotland_noRegionHeadings_allCols.csv")
namesNoHeadinz <- read.csv("VariableCoding/APRIL17UPDATE_1971to2011_CountryOfBirthCategories_Scotland_noRegionHeadings_allCols.csv")
#drop 91 SAS for now, not using at present
namesNoHeadinz <- namesNoHeadinz[,c(-1,-4)]
#~~~~~~~~~~~~~~~~~~~~~~~~~
#FIVE CENSUS STITCHING----
#Passing everything in to function to make sure names across everything remains consistent
#The second list: indexes columns to keep from the dataframes to re-attach after re-coding
#All others: common variable name and, for each dataframe, columns to sum
#(or just a single column)
#Add in 91 SAS
# fiveCensusCombo <- list(list(seventy1,eighty1,ninety1_SAS,ninety1_LBS,twoThousand1,twoThEleven2),#all data to sum columns on
# list('1','2','1','1,2','1','1'),#columns to keep from those
# list('England','3','3','2','3','2','2'),#Common name of column, index of columns to combine in order
# list('Scotland','4','4','3','4','3','4'),#doing in strings cos typing all this in lists here would be reet messy
# list('Wales','5','5','4','5','5','5'),
# list('Rest of UK','6','6','5','6:9','4,6,9','3,6:7'),#Leaving out 91/01 "ireland part not specified"
# list('Irish Republic','7','7','6','10','7','8'),
# list('Old Commonwealth','8','8','7','12:14','59,65:66','48,54:55'),#Canada, Oz, New Zealand
# #list('Africa','9','9:10','15:25,72:78','32:41','27:32'),#No: this doesn't split new commonwealth out
# list('Africa (New-C)','9','9:10','8:9','15:25','34:35,37:38,40','27,29,31'),
# list('India','12','12','12','35','55','40'),
# list('Pakistan','13','17','13','36','56','41'),
# list('Other Europe','16','18:19','17:18','47:69,71','10:26,28:31','9:23,26'),
# list('SE Asia New C','14','14','14','38:40','47,49:50','37,43,45'),#Quote marks because racist! But very specific Colonial categories
# list('Caribbean New C','10','11','10','26:33','61:62','53'),
# list('New Commonwealth other','11,15','13,15:16','11,15:16','34,37,41:46','42,57,54','39'),
# list('Rest of World','17','20','19:20','70,72:92',
# '27,32:33,36,39,41,43:46,48,51:53,58,60,63:64,67:68','28,30,32')
# )
#Adding ninety1_SAS_PCSzone on the end.
#Should be same as ninety1_SAS: current third column.
fiveCensusCombo <- list(
list(seventy1,eighty1,ninety1_SAS,ninety1_LBS,twoThousand1,twoThEleven,ninety1_SAS_PCSzone),#all data to sum columns on
list('1','2','1','1,2','1','1','1'),#columns to keep from those
list('England','3','3','2','3','2','2','2'),#Common name of column, index of columns to combine in order
list('Scotland','4','4','3','4','3','4','3'),#doing in strings cos typing all this in lists here would be reet messy
list('Wales','5','5','4','5','5','5','4'),
list('Rest of UK','6','6','5','6:9','4,6,9','3,6:7','5'),#Leaving out 91/01 "ireland part not specified"
list('Irish Republic','7','7','6','10','7','8','6'),
list('Old Commonwealth','8','8','7','12:14','60,66:67','48,54:55','7'),#Canada, Oz, New Zealand
#list('Africa','9','9:10','15:25,72:78','32:41','27:32'),#No: this doesn't split new commonwealth out
list('Africa (New-C)','9','9:10','8:9','15:25','35:36,38:39,41','27,29,31','8:9'),
list('India','12','12','12','35','56','40','12'),
list('Pakistan','13','17','13','36','57','41','13'),
list('Other Europe','16','18:19','17:18','47:69,71','10:27,29:32','9:23,26','17:18'),
list('SE Asia New C','14','14','14','38:40','48,50:51','37,43,45','14'),#Quote marks because racist! But very specific Colonial categories
list('Caribbean New C','10','11','10','26:33','62:63','53','10'),
list('New Commonwealth other','11,15','13,15:16','11,15:16','34,37,41:46','43,58,55','39','11,15:16'),
list('Rest of World','17','20','19:20','70,72:92',
'28,33:34,37,40,42,44:47,49,52:54,59,61,64:65,68:69','28,30,32','19:20')
)
#Returns list of re-coded dataframes
#"Error: column_recode_lists_are_all_correct_length not equal to TRUE"
#Means lists defining recodes aren't all of a length that recodes in each dataframe
results <- recode(fiveCensusCombo)
looksee <- results[[5]]
#Save as five dataframes, ready for geog re-assigning (apart from '91 which is correct already)
savens <- c('71','81','91SAS','91LBS','01','11','91SAS_PCS')
lapply(1:length(savens),function(x) write.csv(results[[x]],
paste0("VariableCoding/CountryOfBirth_fiveCensusRecodes/",savens[x],".csv"),
row.names = F))