-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmultivariate_temporal_trajectories_visualization.py
131 lines (108 loc) · 5.05 KB
/
multivariate_temporal_trajectories_visualization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 08 14:28:40 2020
@author: Johannes
"""
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.cm import ScalarMappable
from matplotlib.colors import Normalize
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib.colors as colors
import matplotlib.cm as cmx
import os,sys
import matplotlib
# Global figure typography: register Arial as the sans-serif face and make
# sans-serif the default font family for all text drawn by matplotlib.
matplotlib.rcParams.update({
    'font.sans-serif': "Arial",
    'font.family': "sans-serif",
})
# Optional override of the global font size (disabled):
#matplotlib.rcParams['font.size'] = 4
### Load the country-level statistics.
# Source: United Nations, Department of Economic and Social Affairs,
# Population Division (2019). World Population Prospects 2019 -
# Special Aggregates, Online Edition. Rev. 1.
popdata, agedata, sexdata = (
    pd.read_csv(csv_name)
    for csv_name in ('totpop.csv', 'medianage.csv', 'sexratio.csv')
)

### Feature columns that are passed to the t-SNE transform.
relcols = ['pop', 'medianage', 'sexratio']

### Objects (here: countries) to be labelled in the plot, and the switch
### that turns this labelling on or off.
example_labels = [
    'Germany',
    'Mexico',
    'Burundi',
    'Australia',
    'Luxembourg',
    'Viet Nam',
    'Lesotho',
]
label_examples = True
## Cleaning and restructuring ################################################
# Keep only rows whose country code is below 900
# (presumably this drops aggregate regions -- TODO confirm against the CSVs).
agedata = agedata.loc[agedata['Country code'] < 900]
popdata = popdata.loc[popdata['Country code'] < 900]
sexdata = sexdata.loc[sexdata['Country code'] < 900]

## Lookup table: country code -> country name.
obj_dict = dict(
    agedata[['Country code', 'Region, subregion, country or area *']].values
)

# One snapshot per decade: 1950, 1960, ..., 2020.
years = np.arange(1950, 2021, 10)
year_cols = [str(int(year)) for year in years]

# Reduce every table to the key column plus the selected year columns ...
agedata = agedata[['Country code'] + year_cols]
popdata = popdata[['Country code'] + year_cols]
sexdata = sexdata[['Country code'] + year_cols]

# ... and prefix each year column with its variable name so that the columns
# stay distinguishable after the merge.
agedata.columns = ['Country code'] + ['medianage' + y for y in year_cols]
popdata.columns = ['Country code'] + ['pop' + y for y in year_cols]
sexdata.columns = ['Country code'] + ['sexratio' + y for y in year_cols]

# Inner-join the three tables on the country code.
mergeddata = (
    agedata
    .merge(popdata, on='Country code')
    .merge(sexdata, on='Country code')
)

# Every column (the key included) is turned into floats; blanks inside the
# textual representation of a value are removed before parsing.
for column in mergeddata.columns:
    as_text = mergeddata[column].map(str).str.replace(' ', '')
    mergeddata[column] = as_text.map(float)
# Reshape the wide table (one column per variable and year) into a long table
# with one row per (object, time) pair and the columns
# 'object', 'pop', 'medianage', 'sexratio', 'time'.
#
# The per-year frames are collected first and concatenated once:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# copied the whole accumulated table on every iteration.
year_frames = [
    pd.DataFrame({
        'object': mergeddata['Country code'],
        'pop': mergeddata['pop%s' % year],
        'medianage': mergeddata['medianage%s' % year],
        'sexratio': mergeddata['sexratio%s' % year],
        'time': year,
    })
    for year in years
]
# pd.concat raises on an empty list, so fall back to an empty frame when no
# years are configured. The original row index of each yearly frame is kept.
datadf = pd.concat(year_frames) if year_frames else pd.DataFrame()
###############################################################################
# Order the rows by object and, within each object, chronologically.
datadf = datadf.sort_values(by=['object', 'time'])

# Population is assumed to be skewed, so it is replaced by log(1 + pop).
datadf['pop'] = np.log(1 + datadf['pop'].values)

# Divide every feature by its largest non-NaN value, so that value maps to 1.
# (Only the maximum is normalised; the minimum is not shifted to 0.)
for feature in ('pop', 'medianage', 'sexratio'):
    datadf[feature] = datadf[feature] / np.nanmax(datadf[feature])
# Baseline t-SNE model, to be refined by the user.
# The iteration count is deliberately left at scikit-learn's default of 1000:
# the keyword for it was renamed from `n_iter` to `max_iter` in
# scikit-learn 1.5, so naming it explicitly breaks on either older or newer
# releases, while the default is 1000 in both.
model = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)

# Embed all (object, time) rows jointly so that the trajectories of different
# objects share one 2-D coordinate system.
tsnecoords = model.fit_transform(datadf[relcols].values)
datadf['tsne_x'] = tsnecoords[:, 0]
datadf['tsne_y'] = tsnecoords[:, 1]
# Plot trajectories: ##########################################################
# Each object becomes a polyline through its t-SNE points in chronological
# order; every segment is coloured by the time stamp of its starting point.
cmap = plt.get_cmap('jet')
# Normalise colours over the calendar-year range years[0]..years[-1], the same
# range the colorbar of this figure is built from, so that line colours and
# colorbar agree. (Normalising a running index over 0..len(years) did not.)
cNorm = colors.Normalize(vmin=years[0], vmax=years[-1])
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cmap)

fig = plt.figure()
plt.grid(False)

for objid, objdf in datadf.groupby('object'):
    objdf = objdf.sort_values(by='time')
    currptsx = objdf.tsne_x.values
    currptsy = objdf.tsne_y.values
    currtimes = objdf.time.values
    npoints = currptsx.shape[0]
    lastpoint = npoints - 1

    # Decide once per object whether its end points are labelled.
    is_example = label_examples and obj_dict[objid] in example_labels

    for segment in range(npoints):
        # A segment joins point `segment` to its successor; the last point
        # has no successor, so nothing is drawn for it.
        if segment < lastpoint:
            plt.plot(
                currptsx[segment:segment + 2],
                currptsy[segment:segment + 2],
                color=scalarMap.to_rgba(currtimes[segment]),
                alpha=0.6,
                linewidth=1,
                zorder=1,
            )

        # Mark and label the first and the last point of selected objects.
        # The label text is taken from `years` (as before) so it is rendered
        # as a plain year regardless of the dtype of the 'time' column.
        if is_example and segment in (0, lastpoint):
            plt.annotate(
                obj_dict[objid] + ' %s' % years[segment],
                (currptsx[segment], currptsy[segment]),
                color='white',
                fontsize=8,
                zorder=2,
            )
            plt.scatter(
                x=currptsx[segment],
                y=currptsy[segment],
                c='white',
                s=1,
                zorder=2,
            )
# Style the axes: black background (labels are drawn in white) and axis titles.
ax = plt.gca()
ax.set_facecolor('black')
ax.set_xlabel('t-SNE 1')
ax.set_ylabel('t-SNE 2')

# Colorbar spanning the first to the last year. The mappable is created only
# for the colorbar and is not attached to any Axes, so the Axes to take the
# space from must be passed explicitly; recent Matplotlib releases raise
# otherwise.
cmappable = ScalarMappable(norm=Normalize(years[0], years[-1]), cmap=cmap)
cmappable.set_array([years[0], years[-1]])
fig.colorbar(cmappable, ax=ax)

# Save first, show last: plt.show() may block until the window is closed, and
# anything added to the figure afterwards would not be visible on screen.
fig.savefig('tsne_trajectories.jpg', dpi=150)
plt.show()