-
Notifications
You must be signed in to change notification settings - Fork 0
/
insert_data.py
158 lines (118 loc) · 4.84 KB
/
insert_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import psycopg2
import spotipy
from datetime import datetime
from dateutil import tz
from psycopg2.extensions import AsIs
from json import loads
# Connection to database
conn = psycopg2.connect(database="spotify", user="postgres", password="password", host="127.0.0.1", port="5432")
cursor = conn.cursor()
# Listening files
files = [
'/Users/muhammadsohail/Desktop/MyData-7/endsong_0.json',
'/Users/muhammadsohail/Desktop/MyData-7/endsong_1.json',
'/Users/muhammadsohail/Desktop/MyData-7/endsong_2.json'
]
# Function to insert listening histroy into table
def insert_listens():
# Time zones for timestamp conversion
from_zone = tz.tzutc()
to_zone = tz.tzlocal()
# Function to filter columns I don't need
def removeCols(col):
return col not in [
'username',
'ip_addr_decrypted',
'user_agent_decrypted',
'episode_name',
'episode_show_name',
'spotify_episode_uri',
]
for filename in files:
with open(filename, 'r') as file:
data = loads(file.read()) # Load the JSON array for each file
columns = list(filter(removeCols, data[0].keys())) # Filtering out unwanted columns
for song in data:
if song['episode_name']: # I don't care about podcasts sadly
continue
song = dict(filter(removeCols, song.items())) # Filtering out unwanted data
song['ts'] = datetime.fromisoformat(song['ts'][:-1]) \
.replace(tzinfo=from_zone) \
.astimezone(to_zone) \
.isoformat() # Converting to local timezone
try:
values = [song[column] for column in columns]
insert_statement = 'insert into listens (%s) values %s'
cursor.execute(insert_statement, (AsIs(','.join(columns)), tuple(values))) # Inserting into 'listens' table
except Exception as e:
print(e, song)
break
conn.commit() # Commiting changes to DB
# Function to insert genres into table
def insert_genres():
client_id = ''
client_secret = '' # API constants
# Spotipy clients
ccm = spotipy.SpotifyClientCredentials(client_id=client_id, client_secret=client_secret, cache_handler=spotipy.MemoryCacheHandler())
sp = spotipy.Spotify(client_credentials_manager=ccm, requests_timeout=10)
songs = {}
artists = {}
genres = {}
# Iterating through listens and determining the number of streams for each song
for filename in files:
with open(filename, 'r') as file:
data = loads(file.read())
for song in data:
if not song['spotify_track_uri']:
continue
if not songs.get(song['spotify_track_uri']):
songs[song['spotify_track_uri']] = 0
songs[song['spotify_track_uri']] += 1
# Getting all the song URIs
song_uris = list(songs.keys())
# Iterating through the tracks and determining the number of streams for each artist
for i in range(0, len(song_uris), 50):
print(round(i / len(song_uris) * 100, 2), end='\t\r')
try:
tracks = sp.tracks(song_uris[i:i + 50])['tracks']
for track in tracks:
for artist in track['artists']:
if not artists.get(artist['id']):
artists[artist['id']] = 0
artists[artist['id']] += songs[track['uri']]
except Exception as e:
print(e, i, i + 50)
print()
# Getting artist URIs
artist_uris = list(artists.keys())
# Iterating through the artists and determining the number of streams for each genre
for i in range(0, len(artist_uris), 50):
print(round(i / len(artist_uris) * 100, 2), end='\t\r')
try:
api_artists = sp.artists(artist_uris[i:i + 50])['artists']
for artist in api_artists:
for genre in artist['genres']:
if not genres.get(genre):
genres[genre] = 0
genres[genre] += artists[artist['id']]
except Exception as e:
print(e, i, i + 50)
print()
# Mapping to appropriate dictionary structure
genre_data = [{ 'genre': g, 'listens': genres[g] } for g in genres]
columns = ['genre', 'listens']
# Inserting genre streams into table
for genre in genre_data:
try:
values = [genre[column] for column in columns]
insert_statement = 'insert into genres (%s) values %s'
cursor.execute(insert_statement, (AsIs(','.join(columns)), tuple(values)))
except Exception as e:
print(e, genre)
break
# Commiting changes to the DB
conn.commit()
insert_listens()
insert_genres()
# Closing the connection
conn.close()