-
Notifications
You must be signed in to change notification settings - Fork 8
/
configure
executable file
·91 lines (82 loc) · 2.63 KB
/
configure
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#make standalone
# Assemble the single-file build lib/LanguageIdentifier.js from the template
# src/Tika/LanguageIdentifier.js: the helper modules are spliced in, indented
# by one tab, right after the line that opens the LanguageIdentifier closure.
STANDALONE=lib/LanguageIdentifier.js
STANDALONE_TEMPLATE=src/Tika/LanguageIdentifier.js

# make_standalone - write $STANDALONE from $STANDALONE_TEMPLATE.
# Globals:  STANDALONE, STANDALONE_TEMPLATE (read); line_begin (written)
# Returns:  0 on success; 1 (with a message on stderr) when an input file is
#           unreadable, the marker line is missing, or the output directory
#           cannot be created. Nothing is written in those cases.
make_standalone() {
  # Modules embedded into the closure, in this order.
  set -- src/StringHashMap.js src/isUnicodeLetter.js \
    src/Tika/LanguageProfile.js src/Tika/ProfilingWriter.js

  # Validate every input up front so a half-assembled file is never produced.
  for input in "$STANDALONE_TEMPLATE" "$@"; do
    if [ ! -r "$input" ]; then
      printf 'configure: cannot read %s\n' "$input" >&2
      return 1
    fi
  done

  # Number of the first line that opens the closure. Lines 1..line_begin are
  # copied before the modules, the remaining lines after them.
  line_begin=$(sed -n '/var LanguageIdentifier = (function() {/{=;q;}' \
    "$STANDALONE_TEMPLATE")
  if [ -z "$line_begin" ]; then
    printf 'configure: closure marker not found in %s\n' \
      "$STANDALONE_TEMPLATE" >&2
    return 1
  fi

  mkdir -p "$(dirname "$STANDALONE")" || return 1

  # A literal tab; "\t" in a sed replacement is not portable.
  tab=$(printf '\t')
  {
    sed -n "1,${line_begin}p" "$STANDALONE_TEMPLATE"
    for module in "$@"; do
      # Drop the module's first nine lines and indent the rest by one tab.
      # NOTE(review): the nine lines are presumably each module's header
      # comment - confirm against the files in src/.
      sed -e '1,9d' -e "s!^!${tab}!" "$module"
    done
    sed "1,${line_begin}d" "$STANDALONE_TEMPLATE"
  } > "$STANDALONE"
}

# A failure is reported on stderr but does not abort the script.
make_standalone
#generate lng addons
for FILE in ngp/*.ngp
do
LNG=`basename $FILE .ngp`
SUM=0
for COUNT in $(sed '/^#/d;s/^\S\+\s\+\([0-9]\+\)$/\1/' ngp/$LNG.ngp)
do
SUM=`expr $SUM + $COUNT`;
done
echo $LNG $SUM
echo "/*!" > lib/Lng/$LNG.js
echo " * $LNG addon for LanguageIdentifier v0.2" >> lib/Lng/$LNG.js
echo " * http://tika.apache.org/" >> lib/Lng/$LNG.js
echo " * https://github.com/mazko/jsli" >> lib/Lng/$LNG.js
echo " *" >> lib/Lng/$LNG.js
echo " * Copyright `date +'%Y.%m.%d'`, Oleg Mazko" >> lib/Lng/$LNG.js
echo " * http://www.opensource.org/licenses/bsd-license.html" >> lib/Lng/$LNG.js
echo " */" >> lib/Lng/$LNG.js
echo "(function() {" >> lib/Lng/$LNG.js
echo "var ngrams = {" >> lib/Lng/$LNG.js
perl -CSD -p -e 's{([^\t\n\x20-\x7E])}{sprintf "\\u%04x", ord $1}eg' -f ngp/$LNG.ngp | sed '/^#/d;s/^\(\S\+\)\s\+\([0-9]\+\)$/"\1":\2,/' >> lib/Lng/$LNG.js
sed -i '$s/,$//' lib/Lng/$LNG.js
echo "};" >> lib/Lng/$LNG.js
echo "LanguageIdentifier.addProfile('$LNG', ngrams, $SUM);" >> lib/Lng/$LNG.js
echo "}());" >> lib/Lng/$LNG.js
done
#print how to include all addons in an html page
# print_addon_script_tags - print one <script> tag per ngp/<lng>.ngp profile,
# pointing at the matching ../lib/Lng/<lng>.js addon. Prints nothing when
# there is no profile.
print_addon_script_tags() {
  for profile in ngp/*.ngp; do
    # An unmatched glob stays literal; do not print a tag for "*".
    [ -f "$profile" ] || continue
    # The single-quoted format keeps the double quotes around the src value.
    printf '<script src="../lib/Lng/%s.js"></script>\n' \
      "$(basename "$profile" .ngp)"
  done
}

print_addon_script_tags
#convert multiline text to plain string
#cat en.test | tr '\n' ' ' | sed 's/"/\\\"/g' > 1
#check the total counts against the original Java Tika implementation
#LanguageIdentifier.initProfiles();
#Field field = LanguageIdentifier.class.getDeclaredField("PROFILES");
#field.setAccessible(true);
#Map<String, LanguageProfile> profiles = (Map<String, LanguageProfile>)field.get(null);
#for(Entry<String, LanguageProfile> entry : profiles.entrySet()) {
#System.out.println(entry.getKey() + ": " + entry.getValue().getCount());
#}
#
#output
#
#ro: 18795406
#ca: 21269815
#no: 1917111
#hu: 1838851
#lt: 21193343
#th: 1557558
#de: 6187872
#fi: 5881942
#sv: 5330933
#fr: 6094304
#be: 294317
#sl: 15068758
#sk: 12972964
#uk: 3846451
#is: 826407
#da: 5324113
#it: 6549432
#gl: 20288369
#el: 5581755
#pt: 5993011
#pl: 854272
#eo: 16541099
#en: 5333054
#ru: 925340
#et: 2373307
#es: 5928555
#nl: 5865945