forked from mdocekal/namegen
-
Notifications
You must be signed in to change notification settings - Fork 0
/
diff.py
executable file
·131 lines (105 loc) · 3.61 KB
/
diff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env python3
# encoding: utf-8
"""
Created on 10. 2. 2019
Compares two results of namegen.py and determines if they are the same.
@author: xdocek09
"""
import sys
import re
import os
class Morphs(object):
    """
    Parsed form of one morphs string, comparable independently of notation.

    The string is read as follows:
        * cases are separated by "|",
        * words inside a case are separated by a hyphen, an en dash, a comma,
          a whitespace character or a zero width space,
        * alternative variants of a word are separated by "/",
        * a variant has the shape word[tagRule]#type, where both the
          [tagRule] and the #type part may be missing.

    Two instances are equal when they have the same cases, the same number of
    words in each case and the same set of variants for each word. The order
    of the variants of a word and the concrete word separator do not matter.
    """

    # Single-character separators between the words of one case.
    _WORD_SEPARATOR = re.compile(r"-|–|,|\s|" + "\u200b")

    # Splits "word[tagRule]" into the word and the tag rule.
    _WORD_WITH_TAG_RULE = re.compile(r'(.+)\[(.+)\]', re.MULTILINE)

    def __init__(self, s):
        """
        Parses the morphs string.

        :param s: String with all cases separated by "|".
        """
        self.cases = [self.parseCase(c) for c in s.split("|")]
        self._origRepr = s
        # The hash is derived from the parsed structure (not from the raw
        # text) so that objects which compare equal always share a hash.
        # It is computed once here; later changes of self.cases are not
        # reflected in it.
        self._hash = hash(tuple(
            tuple(frozenset(variants) for variants in words)
            for words in self.cases
        ))

    def parseCase(self, c):
        """
        Parses one case.

        :param c: String with the words of a single case.
        :return: List with one item per word. Each item is a set of
            (word, tagRule, type) tuples, one tuple per variant. A missing
            tag rule or type is represented by an empty string.
        """
        words = []
        for w in self._WORD_SEPARATOR.split(c):
            variants = set()
            for variant in w.split("/"):
                # The type follows the last "#"; without a "#" the whole
                # variant is the word (with an optional tag rule).
                wordTagRule, separator, wType = variant.rpartition("#")
                if not separator:
                    wordTagRule, wType = variant, ""

                match = self._WORD_WITH_TAG_RULE.match(wordTagRule)
                if match is None:
                    # probably without tag rule
                    variants.add((wordTagRule, "", wType))
                else:
                    # word, tagRule, type
                    variants.add((match.group(1), match.group(2), wType))
            words.append(variants)
        return words

    def __hash__(self):
        return self._hash

    def __repr__(self):
        # The original, unparsed string.
        return self._origRepr

    def __str__(self):
        return repr(self)

    def __eq__(self, other):
        """
        Compares the parsed cases.

        :param other: Object to compare with.
        :return: True when other is a Morphs instance with the same number of
            cases, the same number of words in each case and equal variant
            sets for each word. False otherwise.
        """
        # Comparing the lists checks the lengths and then compares the
        # variant sets pairwise.
        return isinstance(other, Morphs) and self.cases == other.cases
def loadFile(fileName):
    """
    Loads the records of one result file grouped by name.

    Every non-empty line is split on tabulators. The first column is the
    name, the third column the name type, the fourth column the morphs and
    the fifth column is stored unchanged next to the parsed morphs. The
    second column is not used.

    :param fileName: Path to the file that should be loaded.
    :return: Dict that maps a name to the set of its records. A record is
        (nameType, "") when the morphs column is empty and
        (nameType, Morphs, fifth column) otherwise.
    :raise IndexError: When a non-empty line has less than four columns, or
        less than five columns while its morphs column is filled.
    """
    names = {}
    with open(fileName, "r") as f:
        for line in f:
            # Drop the line terminator. Otherwise it stays glued to the last
            # column, an empty morphs column at the end of a line is not
            # recognized as empty, and a last line without the terminator
            # differs from an otherwise identical terminated line.
            line = line.rstrip("\n")
            if not line:
                # blank line, nothing to load
                continue

            parts = line.split("\t")
            name = parts[0]
            nameType = parts[2]

            if len(parts[3]) == 0:
                res = (nameType, parts[3])
            else:
                res = (nameType, Morphs(parts[3]), parts[4])

            names.setdefault(name, set()).add(res)
    return names
def _hasNoMorphs(records):
    """
    Checks whether at least one record of a name carries no morphs.

    :param records: Set of records of one name as returned by loadFile.
    :return: True when some record consists of two items only (empty morphs
        column) or its Morphs object has no cases.
    """
    return any(len(v) == 2 or len(v[1].cases) == 0 for v in records)


# Both result files are mandatory; fail with a hint instead of an IndexError.
if len(sys.argv) < 3:
    sys.exit("Usage: " + sys.argv[0] + " FIRST_RESULT SECOND_RESULT")

first = loadFile(sys.argv[1])
second = loadFile(sys.argv[2])

numberOfDiff = 0
numberOfNoMorphs = 0

# Sorted so that the report is the same on every run; plain set iteration
# order of strings changes between interpreter runs.
for name in sorted(set(first) | set(second)):
    # A name that exists in one file only is just reported; it is not
    # included in any of the counters.
    if name not in first:
        print(name + "\t" + "is not in " + sys.argv[1] + ".")
        continue
    if name not in second:
        print(name + "\t" + "is not in " + sys.argv[2] + ".")
        continue

    noMorphsInFirst = _hasNoMorphs(first[name])
    noMorphsInSecond = _hasNoMorphs(second[name])

    if noMorphsInFirst != noMorphsInSecond:
        # Morphs are missing in exactly one of the files. When they are
        # missing in both, the records are compared the ordinary way below.
        numberOfNoMorphs += 1
        fileWithoutMorphs = sys.argv[1] if noMorphsInFirst else sys.argv[2]
        print(name + "\t" + "No morphs in " + fileWithoutMorphs + ".")
        continue

    # Report records that are in one file and not in the other one, in both
    # directions.
    for fileName, diffX, diffY in [(sys.argv[1], first[name], second[name]),
                                   (sys.argv[2], second[name], first[name])]:
        diffVar = diffX - diffY
        numberOfDiff += len(diffVar)
        # Ordered by the printed text to keep the report deterministic.
        for x in sorted(diffVar, key=str):
            print(fileName + "\t" + str(x))

print("Number of no morphs at one and some morphs at other:\t" + str(numberOfNoMorphs))
print("Number of different name variants:\t" + str(numberOfDiff))