ght_commits2es.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copy data from GHTorrent commits to Elasticsearch
#
# Copyright (C) Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# Authors:
# Alvaro del Castillo San Felix <[email protected]>
#
import argparse
import logging

import pymysql
from elasticsearch import helpers, Elasticsearch

# Set to True to verify HTTPS certificates when connecting to Elasticsearch
HTTPS_CHECK_CERT = False
# Global counter of commits published; reset and reported in publish_commits()
NUM_COMMITS = 0
# Number of items sent per Elasticsearch bulk request
BULK_CHUNK_SIZE = 10000
GITHUB_URL = "https://github.com"


def get_params():
    parser = argparse.ArgumentParser(usage="usage: ght2es.py [options]",
                                     description="Publish GHTorrent data in Elasticsearch")
    parser.add_argument("-e", "--elastic-url", required=True,
                        help="Elasticsearch URL with the metrics")
    parser.add_argument('-g', '--debug', action='store_true')
    parser.add_argument('-i', '--index', required=True, help='Index to fill with GHTorrent info')
    parser.add_argument('--db-name', default='gh-torrent', help='GHTorrent database (gh-torrent default)')
    parser.add_argument('--db-user', default='root', help='GHTorrent database user (root default)')
    parser.add_argument('--db-passwd', default='', help='GHTorrent database password ("" default)')
    return parser.parse_args()


def fetch_commits(es_index, db_con):
    """
    Fetch commits, adding the author login and org and the repository URL

    :param es_index: Elasticsearch index in which to publish the data
    :param db_con: connection to the GHTorrent database
    :return: a generator of items with the commit info to be added to es_index
    """
    global NUM_COMMITS

    commits_table = "commits2018"  # Working only with commits in 2018
    commits_sql = """
        SELECT c.created_at, sha, login as author_login, company, url, forked_from, language
        FROM %s c, projects p, users u
        WHERE c.project_id = p.id AND c.author_id = u.id
        """ % commits_table
    # commits_sql += " LIMIT 1000000"  # for debugging

    logging.info("Getting commits: %s", commits_sql)
    # Use a server-side cursor so the full result set is streamed
    # instead of being loaded into memory at once
    db_cursor = db_con.cursor(pymysql.cursors.SSCursor)
    db_cursor.execute(commits_sql)
    logging.info("SQL query finished")

    for commit_row in db_cursor:
        api_url = commit_row[4]
        if not api_url:
            # Without the API URL the repository cannot be identified; skip
            # the row (the original code would fail on it with a NameError)
            continue
        # The GitHub API URL ends in .../<owner>/<repo>
        [api, owner, repo] = api_url.rsplit("/", 2)
        project_url = GITHUB_URL + "/%s/%s" % (owner, repo)
        commit_json = {
            "created_at": commit_row[0],
            "sha": commit_row[1],
            "author_login": commit_row[2],
            "author_org": commit_row[3],
            "url": project_url,
            "repo_forked_from": commit_row[5],
            "repo_language": commit_row[6],
            "repo_name": owner + "_" + repo
        }
        item = {
            "_index": es_index,
            "_type": "item",
            "_source": commit_json
        }
        NUM_COMMITS += 1
        yield item


def publish_commits(es_url, es_index, db_con):
    """
    Publish all the commits

    :param es_url: URL for Elasticsearch
    :param es_index: index in Elasticsearch
    :param db_con: connection to the GHTorrent database
    :return: None
    """
    global NUM_COMMITS

    es_conn = Elasticsearch([es_url], timeout=100, verify_certs=HTTPS_CHECK_CERT)
    NUM_COMMITS = 0
    # Consume the fetch_commits() generator, indexing the items in bulk
    # requests of BULK_CHUNK_SIZE items each
    helpers.bulk(es_conn, fetch_commits(es_index, db_con), chunk_size=BULK_CHUNK_SIZE)
    logging.info("Total commits published in %s: %i", es_index, NUM_COMMITS)


def db_connect(name, user, passwd, host='localhost', port=3306):
    """
    Connect to the MySQL database.

    :param name: database name
    :param user: database connection user
    :param passwd: database connection password
    :param host: host on which the MySQL server is running
    :param port: port on which the MySQL server is listening
    :return: a connection to the database
    """
    try:
        db = pymysql.connect(user=user, passwd=passwd,
                             host=host, port=port,
                             db=name, use_unicode=True)
        return db
    except Exception:
        logging.error("Database connection error")
        raise


if __name__ == '__main__':
    args = get_params()

    if args.debug:
        logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(message)s')
        logging.debug("Debug mode activated")
    else:
        logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
    logging.getLogger("urllib3").setLevel(logging.WARNING)
    logging.getLogger("requests").setLevel(logging.WARNING)

    db_con = db_connect(args.db_name, args.db_user, args.db_passwd)
    publish_commits(args.elastic_url, args.index, db_con)
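
# A minimal usage sketch. The flags are the ones defined in get_params(); the
# Elasticsearch URL, index name, and database credentials shown here are
# hypothetical examples, not values taken from this repository:
#
#   python3 ght_commits2es.py -e https://localhost:9200 -i ghtorrent-commits \
#       --db-name ghtorrent --db-user root --db-passwd ''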