-
Notifications
You must be signed in to change notification settings - Fork 0
/
charset.py
67 lines (54 loc) · 1.9 KB
/
charset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import sys
import json
import gzip
import subprocess
import os
import re
import unicodedata
import time
import numpy as np
from argparse import ArgumentParser
from collections import Counter
from multiprocessing import Pool, Lock
def argparser():
    """Build the command-line parser for this script.

    The parser accepts one positional argument, ``input_dir``, and one
    optional integer flag, ``--processes`` (default 4).

    Returns:
        The configured ``ArgumentParser``; the caller invokes
        ``parse_args`` on it.
    """
    # Each entry is (positional flags, keyword options) for add_argument.
    argument_specs = (
        (('input_dir',), {'help': 'Input directory with jsonl files'}),
        (('--processes',), {'type': int, 'default': 4,
                            'help': 'Number of processes to use'}),
    )
    parser = ArgumentParser()
    for flags, options in argument_specs:
        parser.add_argument(*flags, **options)
    return parser
def update_charset(charset, text):
    """Add the characters of ``text`` that pass the filter to ``charset``.

    A character is kept when ``str.isprintable`` accepts it or when its
    Unicode general category is ``Zs`` (space separator); every other
    character is dropped. ``charset`` is modified in place through its
    ``update`` method and nothing is returned.
    """
    def is_kept(ch):
        return ch.isprintable() or unicodedata.category(ch) == 'Zs'

    charset.update(filter(is_kept, text))
def process_file(jsonl_file):
    """Count the characters found in the ``text`` field of a JSON Lines file.

    Each non-blank line is parsed as one JSON document and its ``'text'``
    value is passed to ``update_charset`` together with a counter local to
    this call.

    Args:
        jsonl_file: Path of the file to read.

    Returns:
        A ``Counter`` holding the characters accumulated by
        ``update_charset`` for this file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``'text'`` key.
    """
    charset = Counter()
    # Decode explicitly as UTF-8 (the JSON Lines encoding) so the result does
    # not depend on the locale's default encoding.
    with open(jsonl_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing inside json.loads.
            if not line.strip():
                continue
            record = json.loads(line)
            update_charset(charset, record['text'])
    return charset
def update_progress(p, start_time):
    """Write a one-line progress report to standard error.

    Args:
        p: Completed fraction; it is rendered as a percentage with two
            decimals.
        start_time: A ``time.time()`` timestamp; the time elapsed since it
            is rendered as ``HH:MM:SS.mmm``.
    """
    elapsed = time.time() - start_time
    hh = int(elapsed / 3600)
    mm = int(elapsed / 60) % 60
    ss = elapsed % 60
    clock = f"{hh:02d}:{mm:02d}:{ss:06.03f}"
    message = f"json_file done: {p:.2%} ({clock})\n"
    sys.stderr.write(message)
def main(argv):
    """Print every distinct character found in a directory's ``.jsonl`` files.

    The files directly inside ``input_dir`` whose names end in ``.jsonl``
    are handed to ``process_file`` through a process pool. A progress line
    is written via ``update_progress`` as each file finishes, and the
    per-file counters are merged into one. The sorted distinct characters
    are then written to standard output (without a trailing newline) and to
    standard error (followed by a newline).

    Args:
        argv: Full argument vector including the program name at index 0
            (e.g. ``sys.argv``). ``None`` falls back to argparse's default
            of reading ``sys.argv``.
    """
    # Parse the vector we were given instead of silently re-reading sys.argv.
    cli_args = None if argv is None else argv[1:]
    args = argparser().parse_args(cli_args)

    jsonl_files = [
        os.path.join(args.input_dir, name)
        for name in os.listdir(args.input_dir)
        if name.endswith('.jsonl')
    ]
    total_files = len(jsonl_files)
    start_time = time.time()

    # Merge each result as soon as it arrives rather than holding every
    # per-file counter in a list first.
    charset = Counter()
    with Pool(processes=args.processes) as pool:
        finished = enumerate(
            pool.imap_unordered(process_file, jsonl_files), start=1
        )
        for done, file_charset in finished:
            update_progress(done / total_files, start_time)
            charset.update(file_charset)

    chars = ''.join(sorted(charset))
    print(chars, end='')
    sys.stderr.write(f"{chars}\n")
# Script entry point: run main with the process arguments and use its return
# value as the exit status.
if __name__ == '__main__':
    raise SystemExit(main(sys.argv))