-
Notifications
You must be signed in to change notification settings - Fork 14
/
GenerateMeta.groovy
148 lines (112 loc) · 4.89 KB
/
GenerateMeta.groovy
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/bin/env groovy
@Grab(group='org.apache.commons', module='commons-csv', version='1.10.0')
import groovy.transform.CompileStatic
import groovy.xml.slurpersupport.GPathResult
import groovy.xml.slurpersupport.NodeChild
import org.apache.commons.csv.CSVFormat
import org.apache.commons.csv.CSVParser
import org.apache.commons.csv.CSVPrinter
import org.apache.commons.csv.CSVRecord
// Consistency check for the corpus: compares the text files under ../data/good and
// ../data/so-so with meta/meta.csv, and writes meta_so-so.csv from the inline meta
// header (everything before <body>) of each text file that still carries one.
@groovy.transform.SourceURI
URI SOURCE_URI
// Data folders are resolved relative to the script location, not the working directory.
String SCRIPT_DIR = new File(SOURCE_URI).parent

// Columns of the generated CSV: the fixed meta columns plus ten numbered author slots.
def origHeaders = ["group", "cat", "file", "author_surname", "author_name", "title", "publ_in", "url", "publ_part", "publ_place", "publisher", "year", "pages", "length", "alt_orth", "errors", "comments"]
def newHeaders = []
for (int i = 1; i <= 10; i++) {
    newHeaders << "author_surname_$i"
    newHeaders << "author_name_$i"
}
def headers = origHeaders + newHeaders

// The first row of meta.csv supplies the column names used for the lookups below.
def csvFormat = CSVFormat.Builder.create()
        .setHeader()
        .setSkipHeaderRecord(true)
        .setIgnoreSurroundingSpaces(true)
        .setAllowMissingColumnNames(true)
        .build()

// Read the existing meta as UTF-8 (the same encoding the text files are read with)
// and make sure the reader is closed once all records are loaded.
List<CSVRecord> records = []
new File("meta/meta.csv").withReader('UTF-8') { Reader reader ->
    CSVParser metaParser = csvFormat.parse(reader)
    for (CSVRecord csvRecord : metaParser) {
        records << csvRecord
    }
}

// File names listed in the meta; each name matched by a text file is removed below,
// so whatever is left at the end is a meta record without a file.
def metasWithoutFile = records.collect { r -> r['file'] }

// The xml listing does not depend on the category, so it is read and checked only once.
File xmlFolder = new File(SCRIPT_DIR, "../data/disambig")
File[] xmlFolderEntries = xmlFolder.listFiles()
if (xmlFolderEntries == null) {
    // listFiles() returns null when the folder is missing or unreadable
    System.err.println "ERROR: cannot list xml folder: ${xmlFolder}"
}
def xmlFiles = xmlFolderEntries ? xmlFolderEntries.collect { it.name }.toSorted() : []
def wrongExtension = xmlFiles.findAll { !it.endsWith('.xml') }
if (wrongExtension) {
    println "Wrong extensions in xml folder:\n${wrongExtension.join('\n')}"
}

CSVPrinter printer = new CSVPrinter(new File("meta_so-so.csv").newWriter('UTF-8'), CSVFormat.EXCEL)
try {
    printer.printRecord(headers)

    def dirNames = ["good", "so-so"]

    dirNames.each { dirName ->
        println "Working on category: $dirName"

        File txtFolder = new File(SCRIPT_DIR, "../data/$dirName")
        File[] txtEntries = txtFolder.listFiles()
        if (txtEntries == null) {
            System.err.println "ERROR: cannot list text folder: ${txtFolder}"
            return
        }

        def files = txtEntries.findAll { it.name.endsWith('.txt') }.sort { it.name }

        files.each { File file ->
            String text = file.getText('utf-8')

            // every "good" text is expected to have a disambiguated xml counterpart
            if (dirName == "good" && !xmlFiles.contains(file.name.replaceFirst(/\.txt$/, '.xml'))) {
                println "xml missing for ${file.name}"
            }

            def trueLength = countWords(text)
            if (trueLength < 10) {
                System.err.println "ERROR: word count too small ${trueLength}: ${file.name}"
            }

            def record = records.find { r -> r['file'] == file.name }
            if (!record) {
                println "ERROR: File not in the meta: ${file.name}"
            }
            else {
                String metaLength = record['length']
                if (!metaLength?.isInteger()) {
                    // report a malformed length instead of aborting the whole run
                    println "ERROR: meta length is not a number: '${metaLength}' for: ${file.name}"
                }
                else if (Math.abs(trueLength - (metaLength as int)) > 5) {
                    println "Actual word count ${trueLength} does not match meta: ${record['length']} for: ${file.name}"
                }

                if (!file.name.startsWith("${record['cat']}_")) {
                    println "ERROR: meta category ${record['cat']} does not match filename: ${file.name}"
                }

                metasWithoutFile.remove(file.name)
            }

            if (text.contains("<body>")) {
                if (!text.trim().endsWith("</body>")) {
                    println "closing </body> missing - ${file.name}"
                }

                // Wrap everything in front of the first <body> into <text><meta>…</meta></text>
                String metaXml = text.replaceAll(/(?s)(.*?)<body>.*/, '<text><meta>$1</meta></text>')
                // Escape bare ampersands (e.g. in urls) so the header is well-formed XML;
                // references that are already escaped are left as they are.
                metaXml = metaXml.replaceAll(/&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#x[0-9a-fA-F]+);)/, '&amp;')

                try {
                    GPathResult xml = new groovy.xml.XmlSlurper().parseText(metaXml)

                    // report meta tags that have no column in the generated CSV
                    def metaItems = xml.children().getAt(0).children().collect { GPathResult it -> it.name() }
                    metaItems -= headers
                    metaItems -= "id"
                    if (metaItems) {
                        println "Unknown meta items for ${file.name}: $metaItems"
                    }

                    // inside with{} the bare names (id, title, url, …) resolve to child elements of <meta>
                    xml.meta.with {
                        def values = [dirName, id, file.name, author_surname, author_name, title, publ_in, url, publ_part, publ_place, publisher, year, pages, trueLength, alt_orth, errors, comments]
                        newHeaders.each {
                            values << xml.meta[it]
                        }
                        printer.printRecord(values)
                    }
                }
                catch (e) {
                    System.err.println("Failed to parse: ${file.name}")
                    throw e
                }

                if (dirName == "good") {
                    println "WARNING: Txt file still has meta: ${file.name}"
                }
            }
        }
    }

    if (metasWithoutFile) {
        println "ERROR: Meta records without files (${metasWithoutFile.size()}):\n${metasWithoutFile.toSorted().join('\n')}"
    }

    printer.flush()
}
finally {
    // closing the printer also closes the underlying writer, even when a file failed to parse
    printer.close()
}
/**
 * Counts the words in the body of a corpus text.
 *
 * Everything up to and including the first {@code <body>} tag is dropped, digit groups
 * joined by ':', ',', '.' or '-' (e.g. 10:30) are merged, and every token that starts
 * with a Cyrillic letter matched by the pattern below is counted as one word.
 *
 * @param text full content of a text file, with or without a meta header and {@code <body>} tag
 * @return number of word tokens found
 */
@CompileStatic
int countWords(String text) {
    // (?s) lets '.' cross line breaks, so a multi-line meta header in front of the
    // first <body> is removed as a whole instead of only the last header line
    def pureText = text.replaceFirst(/(?s).*?<body>/, '')
    pureText = pureText.replaceAll(/([0-9])[:,.-]([0-9])/, '$1$2').trim()
    def words = pureText =~ /(?ui)[а-яіїєґ][а-яіїєґa-z0-9\u0301'’ʼ\/\u2013-]*/
    return words.size()
}