-
Notifications
You must be signed in to change notification settings - Fork 42
/
read PUMS codebook.R
49 lines (42 loc) · 1.6 KB
/
read PUMS codebook.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
## Read PUMS codebook & data
## author: Len Greski
## date: 16 August 2015
## prerequisites: split the REVISEDPUMS5_36.TXT file into two separate files,
## one for persons, and one for households
##
## NOTE: n = 953076 is passed to read.fwf() so that it reads a fixed number of
## records, which reduces memory allocation. The input file is 330 MB in size.
##
## Prerequisite: download the 5PCT PUMS record layout spreadsheet from
## http://www2.census.gov/census_2000/datasets/PUMS/FivePercent/5%25_PUMS_record_layout.xls
##
## Start the timer; the elapsed time is reported after the data file is read.
startTime <- Sys.time()

## readxl version of the codebook import
library(readxl)

## Cell range of the record layout, read from the second sheet of the workbook.
cellRange <- "A2:G1219"
codeBook <- read_xls(
  path = "./data/5%_PUMS_record_layout.xls",
  sheet = 2,
  range = cellRange
)
## Drop blank rows: keep only the layout rows that name a variable.
codeBook <- codeBook[!is.na(codeBook$VARIABLE), ]

## Convert to a data.table, then collapse exact duplicate rows.
library(data.table)
codeBook <- as.data.table(codeBook)
codeBook <- unique(codeBook)

## Coerce LEN to numeric. Entries that are not numbers become NA, and the
## rows carrying those NAs are then filtered out.
codeBook$LEN <- as.numeric(codeBook$LEN)
codeBook <- codeBook[!is.na(LEN)]

## Field widths come from the LEN (length) column of the codebook ...
colWidths <- codeBook[["LEN"]]
## ... and the column names come from its VARIABLE column.
colNames <- codeBook[["VARIABLE"]]
## Read the person-level PUMS records (previously separated from the combined
## file by the split PUMS R script) as a fixed-width file. Field widths and
## column names come from the codebook; n caps the number of records read.
personData <- read.fwf(
  "./data/PUMS_person_NY.txt",
  widths = colWidths,
  header = FALSE,
  n = 953076,
  col.names = colNames,
  stringsAsFactors = FALSE
)
endTime <- Sys.time()

## Report elapsed time and the in-memory size of the result. Both values are
## printed explicitly: a bare top-level expression is not auto-printed when
## the script is run through source() without echo, so the timing would
## otherwise be silently dropped.
print(endTime - startTime)
print(object.size(personData), units = "Mb")