-
Notifications
You must be signed in to change notification settings - Fork 0
/
05bucket.py
115 lines (95 loc) · 4.19 KB
/
05bucket.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
from __future__ import print_function
import math
from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset
tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format
california_housing_dataframe = pd.read_csv('https://download.mlcc.google.cn/mledu-datasets/california_housing_train.csv', sep=',')
california_housing_dataframe = california_housing_dataframe.reindex(
np.random.permutation(california_housing_dataframe.index))
def preprocess_features(california_housing_dataframe):
"""Prepares input features from California housing data set.
Args:
california_housing_dataframe: A Pandas DataFrame expected to contain data
from the California housing data set.
Returns:
A DataFrame that contains the features to be used for the model, including
synthetic features.
"""
selected_features = california_housing_dataframe[
["latitude",
"longitude",
"housing_median_age",
"total_rooms",
"total_bedrooms",
"population",
"households",
"median_income"]]
processed_features = selected_features.copy()
# Create a synthetic feature.
processed_features["rooms_per_person"] = (
california_housing_dataframe["total_rooms"] /
california_housing_dataframe["population"])
return processed_features
training_examples = preprocess_features(california_housing_dataframe.head(12000))
def get_quantile_based_buckets(feature_values, num_buckets):
quantiles = feature_values.quantile(
[(i+1.)/(num_buckets + 1.) for i in range(num_buckets)])
return [quantiles[q] for q in quantiles.keys()]
def construct_feature_columns():
"""Construct the TensorFlow Feature Columns.
Returns:
A set of feature columns
"""
bucketized_households = tf.feature_column.bucketized_column(
tf.feature_column.numeric_column("households"),
boundaries=get_quantile_based_buckets(training_examples["households"], 10))
bucketized_longitude = tf.feature_column.bucketized_column(
tf.feature_column.numeric_column("longitude"),
boundaries=get_quantile_based_buckets(training_examples["longitude"], 50))
bucketized_latitude = tf.feature_column.bucketized_column(
tf.feature_column.numeric_column("latitude"),
boundaries=get_quantile_based_buckets(training_examples["latitude"], 50))
bucketized_housing_median_age = tf.feature_column.bucketized_column(
tf.feature_column.numeric_column("housing_median_age"),
boundaries=get_quantile_based_buckets(
training_examples["housing_median_age"], 10))
bucketized_total_rooms = tf.feature_column.bucketized_column(
tf.feature_column.numeric_column("total_rooms"),
boundaries=get_quantile_based_buckets(training_examples["total_rooms"], 10))
bucketized_total_bedrooms = tf.feature_column.bucketized_column(
tf.feature_column.numeric_column("total_bedrooms"),
boundaries=get_quantile_based_buckets(training_examples["total_bedrooms"], 10))
bucketized_population = tf.feature_column.bucketized_column(
tf.feature_column.numeric_column("population"),
boundaries=get_quantile_based_buckets(training_examples["population"], 10))
bucketized_median_income = tf.feature_column.bucketized_column(
tf.feature_column.numeric_column("median_income"),
boundaries=get_quantile_based_buckets(training_examples["median_income"], 10))
bucketized_rooms_per_person = tf.feature_column.bucketized_column(
tf.feature_column.numeric_column("rooms_per_person"),
boundaries=get_quantile_based_buckets(
training_examples["rooms_per_person"], 10))
long_x_lat = tf.feature_column.crossed_column(
set([bucketized_longitude, bucketized_latitude]), hash_bucket_size=1000)
feature_columns = set([
long_x_lat,
bucketized_longitude,
bucketized_latitude,
bucketized_housing_median_age,
bucketized_total_rooms,
bucketized_total_bedrooms,
bucketized_population,
bucketized_households,
bucketized_median_income,
bucketized_rooms_per_person])
return feature_columns
print(construct_feature_columns())