-
Notifications
You must be signed in to change notification settings - Fork 0
/
pre_processing_census_for_neural_network.r
64 lines (32 loc) · 3.09 KB
/
pre_processing_census_for_neural_network.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
base = read.csv('census.csv')
base$X = NULL
#Get all unique values from column
unique(base$sex)
#Get quantity of each value
table(base$sex)
#replace Female with 0 and Male with 1
base$sex = as.numeric(factor(base$sex, levels = c(' Female', ' Male'), labels = c(0,1)))
#Check for NA, should be 0
base[is.na(base$sex)]
base$workclass = as.numeric(factor(base$workclass, levels = c(' Federal-gov', ' Local-gov', ' Private', ' Self-emp-inc', ' Self-emp-not-inc', ' State-gov', ' Without-pay'), labels = c(1, 2, 3, 4, 5, 6, 7)))
base$education = as.numeric(factor(base$education, levels = c(' 10th', ' 11th', ' 12th', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' Assoc-acdm', ' Assoc-voc', ' Bachelors', ' Doctorate', ' HS-grad', ' Masters', ' Preschool', ' Prof-school', ' Some-college'), labels = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)))
base$marital.status = as.numeric(factor(base$marital.status, levels = c(' Divorced', ' Married-AF-spouse', ' Married-civ-spouse', ' Married-spouse-absent', ' Never-married', ' Separated', ' Widowed'), labels = c(1, 2, 3, 4, 5, 6, 7)))
base$occupation = as.numeric(factor(base$occupation, levels = c(' Adm-clerical', ' Armed-Forces', ' Craft-repair', ' Exec-managerial', ' Farming-fishing', ' Handlers-cleaners', ' Machine-op-inspct', ' Other-service', ' Priv-house-serv', ' Prof-specialty', ' Protective-serv', ' Sales', ' Tech-support', ' Transport-moving'), labels = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14)))
base$relationship = as.numeric(factor(base$relationship, levels = c(' Husband', ' Not-in-family', ' Other-relative', ' Own-child', ' Unmarried', ' Wife'), labels = c(1, 2, 3, 4, 5, 6)))
base$race = as.numeric(factor(base$race, levels = c(' Amer-Indian-Eskimo', ' Asian-Pac-Islander', ' Black', ' Other', ' White'), labels = c(1, 2, 3, 4, 5)))
base$native.country = as.numeric(factor(base$native.country, levels = c(' Cambodia', ' Canada', ' China', ' Columbia', ' Cuba', ' Dominican-Republic', ' Ecuador', ' El-Salvador', ' England', ' France', ' Germany', ' Greece', ' Guatemala', ' Haiti', ' Holand-Netherlands', ' Honduras', ' Hong', ' Hungary', ' India', ' Iran', ' Ireland', ' Italy', ' Jamaica', ' Japan', ' Laos', ' Mexico', ' Nicaragua', ' Outlying-US(Guam-USVI-etc)', ' Peru', ' Philippines', ' Poland', ' Portugal', ' Puerto-Rico', ' Scotland', ' South', ' Taiwan', ' Thailand', ' Trinadad&Tobago', ' United-States', ' Vietnam', ' Yugoslavia'), labels = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41)))
#base$income = factor(base$income, levels = c(' <=50K', ' >50K'), labels = c(0, 1))
#Scale attributes
base[, 1] = as.numeric(scale(base[, 1]))
base[, 3] = as.numeric(scale(base[, 3]))
base[, 5] = as.numeric(scale(base[, 5]))
base[, 11:13] = as.numeric(scale(base[, 11:13]))
library(caTools)
# Set random seed
set.seed(1)
# Define training sample
division = sample.split(base$income, SplitRatio = 0.85)
# Save training sample
training_base = subset(base, division == TRUE)
# Save test base
test_base = subset(base, division == FALSE)