-
Notifications
You must be signed in to change notification settings - Fork 0
/
tab-languages.py
158 lines (129 loc) · 4.42 KB
/
tab-languages.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import configparser
import csv
import logging
import os
import time
import zipfile
from mongoengine import connect
from pymongo import ReadPreference
from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
from opac_schema.v1 import models
# Script configuration. The file name is relative, so it is looked up in the
# current working directory of the process.
CONFIG = configparser.ConfigParser()
CONFIG.read('config.ini')
# Timestamp (YYYYMMDD_HHMM) computed once at import time; main() embeds it in
# the names of the CSV and ZIP files it writes.
TIMENOW = time.strftime('%Y%m%d_%H%M')
def connect_mongodb():
    """Open the mongoengine connection described by ``[MONGO-OPAC]``.

    The section of the module-level ``CONFIG`` must provide the options
    ``dbname``, ``username``, ``password``, ``hostnames``, ``port``,
    ``replicaset`` and ``readpreference``. The read preference is
    ``SECONDARY`` only when ``readpreference`` is exactly ``'secondary'``;
    any other value selects ``PRIMARY``.

    Returns:
        None. ``ServerSelectionTimeoutError`` and ``ConnectionFailure`` are
        logged and swallowed, so callers cannot tell from the return value
        whether the connection attempt succeeded.

    Raises:
        configparser.NoSectionError: the ``[MONGO-OPAC]`` section is missing.
        KeyError: one of the required options is missing.
        ValueError: ``port`` is not an integer.
    """
    # Public configparser API instead of the private ``_sections`` mapping.
    # raw=True keeps the values verbatim, so a '%' inside a password is not
    # parsed as interpolation syntax.
    mdb = dict(CONFIG.items('MONGO-OPAC', raw=True))

    if mdb['readpreference'] == 'secondary':
        rp = ReadPreference.SECONDARY
    else:
        rp = ReadPreference.PRIMARY

    try:
        connect(
            db=mdb['dbname'],
            username=mdb['username'],
            password=mdb['password'],
            host='mongodb://{hostnames}'.format(hostnames=mdb['hostnames']),
            port=int(mdb['port']),
            replicaSet=mdb['replicaset'],
            read_preference=rp,
        )
    except (ServerSelectionTimeoutError, ConnectionFailure):
        # One ERROR record carrying both the context message and the
        # traceback; the former INFO line was hidden at the default level.
        logging.exception('timeout, connect failure')
def get_data(item):
    """Build the CSV row (as a dict) for one article.

    Args:
        item: object exposing the attributes ``_id``, ``pid``,
            ``scielo_pids`` (mapping or falsy), ``pdfs`` and ``htmls``
            (iterables of mappings that may carry a ``'lang'`` entry),
            ``type`` and ``doi``.

    Returns:
        dict with the keys ``pid_v3``, ``pid_v2``, ``aka``, ``type``,
        ``doi``, ``languages``, ``document_pt``, ``document_es``,
        ``document_en`` and ``document_other_languages``. ``aka`` and
        ``languages`` are ';'-joined in sorted order so the same article
        always yields the same text; the ``document_*`` values are 0 or 1.
    """
    # aka: the other known PIDs, minus the current v3 and v2 identifiers.
    other_pids = item.scielo_pids.get('other') if item.scielo_pids else None
    if other_pids:
        aka = ';'.join(sorted(set(other_pids) - {item._id, item.pid}))
    else:
        aka = ''

    # Languages found in the PDF and HTML renditions. Normalise first and
    # test for emptiness afterwards, so a missing, None or whitespace-only
    # value is skipped instead of raising or adding '' to the set.
    languages = set()
    for renditions in (item.pdfs, item.htmls):
        for rendition in renditions or ():
            lang = (rendition.get('lang') or '').strip().lower()
            if lang:
                languages.add(lang)

    # Anything besides pt/es/en counts as "other languages".
    other_languages = languages - {'pt', 'es', 'en'}

    return {
        'pid_v3': item._id,
        'pid_v2': item.pid,
        'aka': aka,
        # A document without a type is exported with an empty type instead of
        # failing the whole row.
        'type': (item.type or '').strip().lower(),
        'doi': item.doi,
        'languages': ';'.join(sorted(languages)),
        'document_pt': int('pt' in languages),
        'document_es': int('es' in languages),
        'document_en': int('en' in languages),
        'document_other_languages': int(bool(other_languages)),
    }
def main():
    """Export the public articles to a zipped CSV and prune old archives.

    Steps:
        1. Connect to MongoDB through ``connect_mongodb()``.
        2. Write one CSV row per ``models.Article`` with ``is_public=True``
           to ``<diroutput>/opac-tabs-<TIMENOW>.csv``; an article whose row
           cannot be built or written is logged and skipped.
        3. Compress the CSV into ``opac-tabs-<TIMENOW>.zip`` and delete the
           CSV. The archive is opened in mode ``'x'``, so an existing archive
           with the same name is never overwritten (the error is logged and
           the CSV is kept).
        4. Remove all but the three most recent ``opac-tabs-*.zip`` files;
           names sort chronologically because of the timestamp format.

    ``diroutput`` is read from the ``[DIRPATH]`` section of ``CONFIG``.
    """
    # MongoDB connection
    connect_mongodb()

    # Output directory and file names (public configparser API; raw=True
    # keeps a '%' in the path from being parsed as interpolation syntax).
    dirout = CONFIG.get('DIRPATH', 'diroutput', raw=True)
    csvfilename = os.path.join(dirout, 'opac-tabs-{now}.csv'.format(now=TIMENOW))
    zipfilename = os.path.join(dirout, 'opac-tabs-{now}.zip'.format(now=TIMENOW))

    # Creates missing parent directories too and tolerates an existing one.
    os.makedirs(dirout, exist_ok=True)

    # Read the articles and write them to the CSV file
    fieldnames = [
        'pid_v3', 'pid_v2', 'aka', 'type', 'doi', 'languages',
        'document_pt', 'document_es', 'document_en',
        'document_other_languages',
    ]
    # newline='' is what the csv module requires of the file object it
    # writes to; the explicit encoding makes the output locale-independent.
    with open(csvfilename, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        query = models.Article.objects.filter(is_public=True)
        logging.info('total records: %s', query.count())
        for item in query:
            try:
                writer.writerow(get_data(item))
            except Exception:
                # Best effort: one bad document must not abort the export.
                logging.exception('could not export article %s', item._id)

    # Zip the CSV, then drop the uncompressed copy
    try:
        if os.path.isfile(csvfilename):
            # The context manager closes the archive even if write() fails.
            with zipfile.ZipFile(zipfilename, mode='x') as zf:
                zf.write(csvfilename, compress_type=zipfile.ZIP_DEFLATED)
            os.remove(csvfilename)
    except Exception:
        logging.exception('could not zip %s into %s', csvfilename, zipfilename)

    # Remove old zip files keeping the 3 most recent
    archives = sorted(
        name for name in os.listdir(dirout)
        if name.startswith('opac-tabs-') and name.endswith('.zip')
    )
    # archives[:-3] is empty when there are three files or fewer.
    for stale in archives[:-3]:
        try:
            os.remove(os.path.join(dirout, stale))
        except OSError:
            # Keep going so one undeletable file does not block the rest.
            logging.exception('could not remove old archive %s', stale)
# Script entry point: run the export only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    main()