-
Notifications
You must be signed in to change notification settings - Fork 0
/
readability_scores_calculator.py
160 lines (126 loc) · 6.21 KB
/
readability_scores_calculator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
"""Modules providing regular expression matching operations, simple modal dialog to
get a value from the user and a function ti calculate the square root of a given number"""
import re
from math import sqrt
from tkinter import filedialog
# Text analysis with RegEx
def caracteres(txt):
"""Counts the number of characters"""
return len(re.findall(r'\w', txt))
def voyelles(txt):
"""Counts the number of vowels"""
return len(re.findall(r'[aeiouœéèàâûù]', txt, re.I))
def digrammes(txt):
"""Counts the number of vowel digraphs"""
return len(re.findall(r'(au)|(eu)|(ou)|(oû)|(où)|(oi)|(œu)|(ei)|(ai)|(ée)|(ue)|(ui)|(ua)', txt, re.I))
def trigrammes(txt):
"""Counts the number of vowel trigraphs"""
return len(re.findall(r'(eau)|(oue)|(aie)', txt, re.I))
def syllabes_graphiques(txt):
"""Counts the number of written syllables by substracting the number of bigraphs and trigraphs from the number of vowels"""
return voyelles(txt) - (digrammes(txt)+trigrammes(txt))
def mots(txt):
"""Counts the number of words"""
return len(re.findall(r'\w+', txt))
def mots_plusdesixlettres(txt):
"""Counts the number of words consisting of more than six letters"""
return len(re.findall(r'\w{7,}', txt))
def mots_plusdetroissyllabes(txt):
"""Counts the number of words consisting of more than three syllables"""
mots_polysyllabiques = 0
for mot in txt.split():
if (syllabes_graphiques(mot)) >= 3 :
mots_polysyllabiques+=1
return mots_polysyllabiques
def phrases(txt):
"""Counts the number of sentences"""
if len(re.findall(r'\w{2,}\s?[.?!]', txt)) == 0:
return 1 # Prevents ZeroDivison error
return len(re.findall(r'\w{2,}\s?[.?!]', txt))
# Readability formulas
def ari(txt):
"""Calculates the Automated Readability Index (ARI)"""
return 4.71*(caracteres(txt)/mots(txt))+0.5*(mots(txt)/phrases(txt))-21.43
def coleman_liau(txt):
"""Calculates the Coleman-Liau index"""
return 0.0588*((caracteres(txt)/mots(txt))*100)-0.296*((phrases(txt)/mots(txt))*100)-15.8
def fkgl(txt):
"""Calculates the Flesch-Kincaid Grade Level (FKGL)"""
return 0.39*(mots(txt)/phrases(txt))+11.8*(syllabes_graphiques(txt)/mots(txt))-15.59
def gunning(txt):
"""Calculates the Gunning Fog index"""
return 0.4*((mots(txt)/phrases(txt))+100*(mots_plusdetroissyllabes(txt)/mots(txt)))
def smog(txt):
"""Calculates the Simple Measure of Gobbledygook index (SMOG)"""
return 1.043*(sqrt((mots_plusdetroissyllabes(txt)*(30/phrases(txt)))))+3.1291
def lix(txt):
"""Calculates the LIX index"""
return mots(txt)/phrases(txt)+100*(mots_plusdesixlettres(txt)/mots(txt))
def rix(txt):
"""Calculates the RIX index"""
return mots_plusdesixlettres(txt)/phrases(txt)
# Score analysis
def score_analysis(formula, score):
'''Gives the reading levels corresponding to a score for each formula'''
formula_scales = [['lix',59,50,40,30],
['rix',7.1,5.3,2.9,1.8],
['fkgl',15,12,5,1],
['gunning',17,13,7,1],
['smog',14,12,7,1],
['ari',15,9,5,1],
['coleman_liau',15,9,5,1]]
for column in range(7):
for row in range(5):
if formula_scales[column][row] == formula :
if score > formula_scales[column][1]:
return "Très difficile"
if score > formula_scales[column][2]:
return "Difficile"
if score > formula_scales[column][3]:
return "Intermédiaire"
if score > formula_scales[column][4]:
return "Facile"
return "Très facile"
# Print results
def print_readability_scores(txt):
"""Prints the readability scores and the grade levels corresponding"""
return print (f'''
La plupart des indices de lisibilité suivants ont été conçus pour l'Anglais. Les indices de lisibilité LIX
et RIX sont les plus fiables pour évaluer le niveau de difficulté des textes français.
INDICES DE LISIBILITE
---------------------
LIX................................ {round(lix(txt), 2)} : {score_analysis('lix',lix(txt))}
RIX................................ {round(rix(txt), 2)} : {score_analysis('rix',rix(txt))}
Gunning fog........................ {round(gunning(txt), 2)} : {score_analysis('gunning',gunning(txt))}
SMOG............................... {round(smog(txt), 2)} : {score_analysis('smog',smog(txt))}
Flesch-Kincaid..................... {round(fkgl(txt), 2)} : {score_analysis('fkgl',fkgl(txt))}
Coleman-Liau....................... {round(coleman_liau(txt), 2)} : {score_analysis('coleman_liau',coleman_liau(txt))}
Automated readability index........ {round(ari(txt), 2)} : {score_analysis('ari',ari(txt))}''')
def print_text_statistics(txt):
"""Prints the text stastistics that served to calculate the readability scores"""
return print(f'''
STATISTIQUES DU TEXTE
---------------------
Caractères......................... {caracteres(txt)}
Voyelles........................... {voyelles(txt)}
Digrammmes......................... {digrammes(txt)}
Trigrammes......................... {trigrammes(txt)}
Syllabes graphiques................ {syllabes_graphiques(txt)}
Mots............................... {mots(txt)}
Mots Longs (>6 lettres)............ {mots_plusdesixlettres(txt)}
Mots complexes (>2 syllabes)....... {mots_plusdetroissyllabes(txt)}
Phrases............................ {phrases(txt)}
''')
# User input
def user_input():
"""User input : raw text"""
filepath = filedialog.askopenfilename(title= "Chosissez un fichier en texte brut...",
filetypes=[("Texte brut", "*.txt")])
file=open(filepath, encoding="utf8", errors='ignore')
plain_text=file.read()
file.close()
return plain_text
# Program
texte = user_input()
print_text_statistics(texte)
print_readability_scores(texte)