-
Notifications
You must be signed in to change notification settings - Fork 3
/
process_sentences.py
98 lines (71 loc) · 2.18 KB
/
process_sentences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#%%
import text_utils as tu
from pathlib import Path
from tqdm import tqdm
import datetime as dt
from collections import namedtuple
import argparse
#%%
# Command-line interface. Flag names, types and defaults are unchanged from
# the original; only the user-facing help/description text is corrected.
parser = argparse.ArgumentParser(
    description=(
        "Process a raw corpus txt file: split it into lines, check whether "
        "each sentence is valid, remove too-short and too-long sentences, "
        'and save the result to a file with the suffix "_lines"'
    )
)
parser.add_argument("corpus_file", type=str, help="corpus txt raw input file")
parser.add_argument(
    "-sp",
    "--split_each_line_as_doc",
    action="store_true",
    help=(
        "If set, each line from the corpus file will be treated as a document; "
        "a new line will be added after the last sentence from that line"
    ),
    default=False,
)
parser.add_argument(
    "-vs",
    "--check_valid_sentence",
    action="store_true",
    default=False,
    help=(
        "check if each extracted sentence is a valid Polish sentence; "
        "if not, do not save it in the output file"
    ),
)
parser.add_argument(
    "-ls",
    "--check_lang_sentence",
    action="store_true",
    default=False,
    help=(
        "check if each extracted sentence is in Polish; sentences in other "
        "languages are removed and not saved in the output file"
    ),
)
parser.add_argument(
    "-ml",
    "--max_sentence_length",
    type=int,
    default=700,
    help="remove sentences longer than this (in chars)",
)
parser.add_argument(
    "-u",
    "--krnnt_pos_url",
    type=str,
    default="http://localhost:9003",
    help="KRNNT pos tagger docker url",
)
args = parser.parse_args()
#%%
# Derive the output path from the input path: same directory and stem, with
# the "_lines.txt" suffix appended (e.g. corpus.txt -> corpus_lines.txt).
corpus_oscar_raw = args.corpus_file
p = Path(corpus_oscar_raw)
corpus_oscar_lines = f"{p.with_suffix('')}_lines.txt"

print("Start preparing corpus")
print(f"in file={corpus_oscar_raw}\nout file={corpus_oscar_lines}")

start = dt.datetime.now()
print(f"Start time: {start}")

# Delegate the actual cleaning to text_utils. Returns cleaning statistics
# plus two auxiliary lists (vl/pl are unused here -- presumably valid-line
# and processed-line collections; confirm against
# text_utils.corpus_process_sentence).
stats, vl, pl = tu.corpus_process_sentence(
    corpus_oscar_raw,
    corpus_oscar_lines,
    split_each_line_as_doc=args.split_each_line_as_doc,
    check_valid_sentence=args.check_valid_sentence,
    check_lang_sentence=args.check_lang_sentence,
    max_sentence_length=args.max_sentence_length,
    krnnt_url=args.krnnt_pos_url,
)

end = dt.datetime.now()
print(f"Finish. End time: {end} Start time: {start} took={end-start}")

from pprint import pprint

print("Cleaning stats")
pprint(stats)
#%%