data_agg.py
import pandas as pd
from vega_datasets import data
def create_state_csv():
    # Load the origin-destination migration data.
    df = pd.read_csv('C:/Users/anjby/IDS/abnormal_distribution/MigrationPatternsData/od.csv')
    df.info(verbose=True, show_counts=True)

    # Aggregate migration counts by origin state, destination state, and pool.
    grouped_df = df.groupby(['o_state_name', 'd_state_name', 'pool'])['n']\
        .agg('sum')\
        .reset_index()

    # Split the pool column into race and quintile.
    splits = grouped_df['pool'].str.split('Q', expand=True)
    grouped_df['race'] = splits[0]
    grouped_df['quintile'] = splits[1]
    grouped_df.drop(['pool'], axis=1, inplace=True)

    # Add a unique numeric identifier for the source and destination states.
    pop = data.population_engineers_hurricanes()
    pop = pop[['state', 'id']]
    states = list(pop['state'])
    ids = list(pop['id'])

    # Map state names to ids; the District of Columbia is stored under "DC"
    # to match the naming in the migration data.
    state_id_dict = {}
    for idx, state in zip(ids, states):
        if state == "District of Columbia":
            state_id_dict["DC"] = idx
        else:
            state_id_dict[state] = idx

    def assign_state_id(state):
        return state_id_dict[state]

    grouped_df['o_state_id'] = grouped_df.apply(lambda x: assign_state_id(x['o_state_name']), axis=1)
    grouped_df['d_state_id'] = grouped_df.apply(lambda x: assign_state_id(x['d_state_name']), axis=1)

    # Reorder columns and write the aggregated data to disk.
    col_order = ['o_state_id', 'o_state_name', 'd_state_id', 'd_state_name',
                 'race', 'quintile', 'n']
    grouped_df = grouped_df.reindex(columns=col_order)
    grouped_df.to_csv('state_to_state_migration.csv')
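

# A minimal entry point, assuming the script is meant to be run directly.
# The original file only defines create_state_csv(); this guard is a
# convenience addition, not part of the source.
if __name__ == '__main__':
    create_state_csv()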