-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenize.sh
executable file
·83 lines (76 loc) · 2.04 KB
/
tokenize.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/bin/bash
# display_help: print the usage summary and terminate the script.
# Outputs: the "Usage:" line goes to stderr; the option list goes to stdout.
# Exits:   always terminates the calling shell with status 1.
display_help() {
  printf '%s\n' "Usage: $0 [option...] " >&2
  printf '%s\n' \
    "" \
    " -h Display this help message" \
    " -s Specify the token separator in output (blank space is the default)" \
    " -n Do not break on hyphens" \
    " -a Do not break on apostrophes" \
    " -l One sentence per line" \
    " -o Specify output file name" \
    " -i Specify input file name" \
    ""
  exit 1
}
# Defaults; each one can be overridden by a command-line option.
SEP=" "                  # -s: token separator written to the output
INPUTFILE="/dev/stdin"   # -i: where the text is read from
OUTPUTFILE="/dev/stdout" # -o: where the tokens are written
NOHYPHEN=0               # -n: 1 = keep hyphens inside tokens
NOAPOS=0                 # -a: 1 = keep apostrophes inside tokens
ONESPL=0                 # -l: 1 = emit one sentence per line

# parse_options: read the script's short options into the globals above.
# Arguments: the command-line arguments ("$@").
# Globals:   writes SEP, INPUTFILE, OUTPUTFILE, NOHYPHEN, NOAPOS, ONESPL.
# Exits:     0 after printing help for -h; 1 (with usage on stderr) for an
#            unknown option or an option that is missing its argument.
parse_options() {
  local OPTION
  # Restart the scan so the function gives the same result on every call.
  OPTIND=1
  while getopts "hnals:o:i:" OPTION; do
    case "$OPTION" in
      s) SEP="$OPTARG" ;;
      n) NOHYPHEN=1 ;;
      a) NOAPOS=1 ;;
      l) ONESPL=1 ;;
      o) OUTPUTFILE="$OPTARG" ;;
      i) INPUTFILE="$OPTARG" ;;
      h)
        # display_help ends with its own "exit 1"; running it in a subshell
        # keeps that from deciding our status, so an explicit request for
        # help is reported as success.
        (display_help)
        exit 0
        ;;
      *)
        # getopts has already printed the diagnostic. Stop instead of
        # silently continuing with defaults the user did not ask for.
        display_help >&2
        exit 1
        ;;
    esac
  done
}

parse_options "$@"
# confirm_overwrite: ask before replacing an existing regular output file.
# Arguments: $1 - output path
#            $2 - input path (defaults to /dev/stdin)
# Returns:   0 if there is nothing to confirm or the user answered y/Y
#            (the old file is then removed); 1 if the user declined or no
#            answer could be read.
confirm_overwrite() {
  local target=$1
  local input=${2:-/dev/stdin}
  local reply=""

  # /dev/stdout is skipped on purpose: when stdout is redirected to an
  # existing file, "-f /dev/stdout" is true, yet removing the /dev/stdout
  # device link is never what the user asked for.
  if [[ -z "$target" || "$target" == /dev/stdout || ! -f "$target" ]]; then
    return 0
  fi

  if [[ "$input" == /dev/stdin && ! -t 0 ]]; then
    # stdin carries the text to tokenize; reading the answer from it would
    # swallow the first byte of the data, so ask the terminal instead.
    read -r -n 1 -p "File exist! Are you sure [y,n]? " reply < /dev/tty
  else
    read -r -n 1 -p "File exist! Are you sure [y,n]? " reply
  fi
  # Finish the prompt line on stderr, which is where read -p writes it.
  echo >&2

  if [[ "$reply" =~ ^[Yy]$ ]]; then
    # Quoted and guarded with "--" so names containing spaces or starting
    # with "-" remove exactly this file. A failed rm is reported by rm itself
    # and is not fatal: the later output redirection truncates the file.
    rm -- "$target"
    return 0
  fi
  return 1
}

confirm_overwrite "$OUTPUTFILE" "$INPUTFILE" || exit 1
# split_sentences: when ONESPL is 1, turn every sentence terminator (! . ?)
# together with the whitespace after it into a newline; otherwise pass the
# stream through unchanged.
# Globals: reads ONESPL.
split_sentences() {
  if [[ "$ONESPL" == 1 ]]; then
    # [[:space:]] replaces the gawk-only \s escape.
    awk '{ gsub(/[!.?][[:space:]]*/, "\n"); print }'
  else
    cat
  fi
}

# tokenize_stream: read text on stdin and write separator-delimited
# alphabetic tokens to stdout.
# Globals: reads SEP, NOHYPHEN, NOAPOS, ONESPL.
#
# The stages are invoked directly rather than assembled into strings and run
# through eval, so the separator is never parsed as shell or awk source and
# is never word-split. It is handed to awk with -v, which still expands
# escapes such as \t exactly as tr does for the squeeze stage.
tokenize_stream() {
  # Characters that may appear inside a token. The hyphen is appended last
  # so it stays literal inside the bracket expression.
  local keep="[:alpha:]"
  if [[ "$NOAPOS" == 1 ]]; then
    keep+="'"
  fi
  if [[ "$NOHYPHEN" == 1 ]]; then
    keep+="-"
  fi
  local drop_class="[^${keep}]"

  tr -d '\r' \
    | sed '/^$/d' \
    | awk '{$1=$1};1' \
    | split_sentences \
    | awk '{ gsub(/-{2,}/, ""); print }' \
    | tr -s '\n ' \
    | awk -v sep="$SEP" -v drop="$drop_class" '
        {
          # Every character outside the kept class becomes one separator.
          # split plus join inserts sep literally, so "&" and "\" in it
          # are not treated as replacement metacharacters.
          count = split($0, piece, drop)
          printf "%s", piece[1]
          for (i = 2; i <= count; i++) printf "%s%s", sep, piece[i]
          print ""
        }' \
    | tr -s -- "$SEP" \
    | awk -v sep="$SEP" '
        BEGIN { sep = sep ""; width = length(sep) }
        {
          # Strip leading and trailing separators by literal comparison,
          # so separators such as "." or "/" are not read as regex syntax.
          line = $0
          if (width > 0) {
            while (length(line) >= width && substr(line, length(line) - width + 1) == sep)
              line = substr(line, 1, length(line) - width)
            while (length(line) >= width && substr(line, 1, width) == sep)
              line = substr(line, width + 1)
          }
          print line
        }' \
    | sed '/^$/d'
}

tokenize_stream < "$INPUTFILE" > "$OUTPUTFILE"