From 39f421c9385b397ecccc22732bdebb2975da245f Mon Sep 17 00:00:00 2001 From: Nigel Date: Mon, 18 Sep 2023 20:38:25 +0100 Subject: [PATCH] Load bad words when loading good words in SQ When adding `good_words.txt` to the project dictionary in Spell Query, also load `bad_words.txt` if it exists. Always report bad words as bad spellings, even if they are in the dictionary or good words file. Mark bad words with asterisks in SQ report --- src/guiguts.pl | 1 + src/lib/Guiguts/ErrorCheck.pm | 115 ++++++++++++++++++++------------ src/lib/Guiguts/MultiLingual.pm | 66 +++++++----------- src/lib/Guiguts/SpellCheck.pm | 90 +++++++++++++++++-------- 4 files changed, 161 insertions(+), 111 deletions(-) diff --git a/src/guiguts.pl b/src/guiguts.pl index fec3eb73..90de18eb 100755 --- a/src/guiguts.pl +++ b/src/guiguts.pl @@ -220,6 +220,7 @@ our %charsuiteenabled = ( 'Basic Latin' => 1 ); # All projects allow Basic Latin character suite our %pagenumbers; our %projectdict; +our %projectbadwords; our %reghints = (); our %scannoslist; our %geometryhash; #Geometry of some windows in one hash. diff --git a/src/lib/Guiguts/ErrorCheck.pm b/src/lib/Guiguts/ErrorCheck.pm index f87427f6..f93f93f9 100644 --- a/src/lib/Guiguts/ErrorCheck.pm +++ b/src/lib/Guiguts/ErrorCheck.pm @@ -120,7 +120,7 @@ sub errorcheckpop_up { spellquerycleardict(); # Clear cache, so project dictionary will be reloaded below errorcheckpop_up( $textwindow, $top, $errorchecktype ); # Rerun Spell Query }, - -text => 'Add good_words.txt', + -text => 'Add good/bad words', -width => 16 )->grid( -row => 1, -column => $gcol - 1 ); my $btnskip = $ptopframeb->Button( @@ -564,7 +564,11 @@ sub errorcheckpop_up { } elsif ( $errorchecktype eq "Spell Query" ) { # Spell query also has a frequency count to append my $freq = spellqueryfrequency($line); next if $freq > $::spellquerythreshold; # If it's spelled the same way several times, it's probably not an error - $line .= " ($freq)"; + if ( $line =~ s/\*\*\*$// ) { # Spelling flagged as bad word + $line .= " *$freq*"; + } else { + $line .= " ($freq)"; # Ordinary misspelling + } } elsif ( $errorchecktype eq "EPUBCheck" ) { @@ -1480,6 +1484,11 @@ sub booklouperun { my %sqglobaldict = (); my %sqbadwordfreq = (); + # Codes returned by spellquerywordok() + my $SQWORDOKYES = 0; # Word in dictionary, or meets other criteria for being OK + my $SQWORDOKNO = 1; # Typically a spelling error + my $SQWORDOKBAD = 2; # Actual "bad word" - maybe an OK word, but bad in this project - needs marking specially + # # Run Spell Query on whole file sub spellqueryrun { @@ -1511,21 +1520,27 @@ sub booklouperun { $nextcol = $col + length($wd) + 1; # Step forward word length plus single space next unless $wd; # Empty word if two consecutive separators, e.g. period & space - next if spellquerywordok($wd); + my $wordok = spellquerywordok($wd); + next if $wordok == $SQWORDOKYES; # If word has leading straight apostrophe, it might be open single quote; trim it and check again if ( $wd =~ s/^'// ) { ++$col; # Allow for having removed apostrophe from start of word - next if spellquerywordok($wd); + $wordok = spellquerywordok($wd); + next if $wordok == $SQWORDOKYES; } # if trailing straight/curly apostrophe, it might be close single quote; trim it and check again - next if $wd =~ s/['$APOS]$//g and spellquerywordok($wd); - next if $wd =~ /^['$APOS]*$/; # OK if nothing left in string but zero or more quotes + if ( $wd =~ s/['$APOS]$//g ) { + $wordok = spellquerywordok($wd); + next if $wordok == $SQWORDOKYES; + } + next if $wd =~ /^['$APOS]*$/; # OK if nothing left in string but zero or more quotes # Format message - increment word frequency; final total gets appended later when populating dialog $sqbadwordfreq{$wd}++; - my $error = sprintf( "%d:%-2d - %s", $step, $col, $wd ); + my $badwd = $wordok == $SQWORDOKBAD ? "***" : ""; # Flag bad word to higher routine with asterisks + my $error = sprintf( "%d:%-2d - %s%s", $step, $col, $wd, $badwd ); utf8::encode($error); print $logfile "$error\n"; } @@ -1541,9 +1556,9 @@ sub booklouperun { my $wfword = shift; for my $wd ( split( /\W/, $wfword ) ) { - return 0 unless spellquerywordok($wd); # part not found + return 0 unless spellquerywordok($wd) == $SQWORDOKYES; # part not found } - return 1; # all parts ok + return 1; # all parts ok } # @@ -1551,35 +1566,39 @@ sub booklouperun { sub spellquerywordok { my $wd = shift; - # First check if word is in dictionary - return 1 if spellqueryindictapos($wd); + # Return status code if whole thing is a bad word or in the dictionary + my $wordok = spellqueryindictapos($wd); + return $wordok + if $wordok == $SQWORDOKBAD + or $wordok == $SQWORDOKYES; # Some languages use l', quest', etc., before word - accept if the "prefix" and the main word are both good # Prefix can be with or without apostrophe ("with" is safer to avoid prefix being accepted if standalone word) - return 1 + return $SQWORDOKYES if $wd =~ /^(\w+)['$APOS](\w+)/ - and ( spellqueryindictapos($1) or spellqueryindictapos( $1 . "'" ) ) - and spellqueryindictapos($2); + and (spellqueryindictapos($1) == $SQWORDOKYES + or spellqueryindictapos( $1 . "'" ) == $SQWORDOKYES ) + and spellqueryindictapos($2) == $SQWORDOKYES; # Now check numbers - return 1 if $wd =~ /^\d+$/; # word is all digits - return 1 if $wd =~ /^(\d*[02-9])?1st$/i; # ...1st, ...21st, ...31st, etc - return 1 if $wd =~ /^(\d*[02-9])?2n?d$/i; # ...2nd, ...22nd, ...32nd, etc (also 2d, 22d, etc) - return 1 if $wd =~ /^(\d*[02-9])?3r?d$/i; # ...3rd, ...23rd, ...33rd, etc (also 3d, 33d, etc) - return 1 if $wd =~ /^\d*[04-9]th$/i; # ...0th, ...4th, ...5th, etc - return 1 if $wd =~ /^\d*1[123]th$/i; # ...11th, ...12th, ...13th + return $SQWORDOKYES if $wd =~ /^\d+$/; # word is all digits + return $SQWORDOKYES if $wd =~ /^(\d*[02-9])?1st$/i; # ...1st, ...21st, ...31st, etc + return $SQWORDOKYES if $wd =~ /^(\d*[02-9])?2n?d$/i; # ...2nd, ...22nd, ...32nd, etc (also 2d, 22d, etc) + return $SQWORDOKYES if $wd =~ /^(\d*[02-9])?3r?d$/i; # ...3rd, ...23rd, ...33rd, etc (also 3d, 33d, etc) + return $SQWORDOKYES if $wd =~ /^\d*[04-9]th$/i; # ...0th, ...4th, ...5th, etc + return $SQWORDOKYES if $wd =~ /^\d*1[123]th$/i; # ...11th, ...12th, ...13th # Allow decades/years - return 1 if $wd =~ /^['$APOS]?\d\ds$/; # e.g. '20s or 20s (abbreviation for 1820s) - return 1 if $wd =~ /^['$APOS]\d\d$/; # e.g. '62 (abbreviation for 1862) - return 1 if $wd =~ /^1\d{3}s$/; # e.g. 1820s + return $SQWORDOKYES if $wd =~ /^['$APOS]?\d\ds$/; # e.g. '20s or 20s (abbreviation for 1820s) + return $SQWORDOKYES if $wd =~ /^['$APOS]\d\d$/; # e.g. '62 (abbreviation for 1862) + return $SQWORDOKYES if $wd =~ /^1\d{3}s$/; # e.g. 1820s # Allow abbreviations for shillings and pence (not pounds because 20l is common scanno for the number 201) - return 1 if $wd =~ /^\d{1,2}[sd]$/; # e.g. 15s or 6d (up to 2 digits of old English shillings and pence) + return $SQWORDOKYES if $wd =~ /^\d{1,2}[sd]$/; # e.g. 15s or 6d (up to 2 digits of old English shillings and pence) - return 1 if $wd =~ /^sc$/i; # DP markup + return $SQWORDOKYES if $wd =~ /^sc$/i; # DP markup - return 0; + return $SQWORDOKNO; } # @@ -1587,31 +1606,44 @@ sub booklouperun { sub spellqueryindictapos { my $wd = shift; - # First check if word is in dictionary - return 1 if spellqueryindict($wd); + my $wordok = spellqueryindict($wd); + + # Return status code if it's a bad word, in the dictionary or doesn't contain apostrophes + return $wordok + if $wordok == $SQWORDOKBAD + or $wordok == $SQWORDOKYES + or $wd !~ /['$APOS]/; - # Now try swapping straight/curly apostrophes and recheck + # Contains apostrophes - try swapping straight/curly and recheck if ( $wd =~ /$APOS/ ) { $wd =~ s/$APOS/'/g; - return 1 if spellqueryindict($wd); } elsif ( $wd =~ /'/ ) { $wd =~ s/'/$APOS/g; - return 1 if spellqueryindict($wd); } - return 0; + return spellqueryindict($wd); } # - # Return true if word is in dictionary: same case, lower case, or title case (e.g. LONDON matches London) + # Return whether word is in the bad_words list, dictionary, or neither (wrong spelling) + # Check same case, lower case, or title case (e.g. LONDON matches London) # Can't just do case-insensitive check because we don't want "london" to be OK. sub spellqueryindict { - my $wd = shift; - return 1 if $sqglobaldict{$wd}; - return 1 if $sqglobaldict{ lc $wd }; - return 1 - if length($wd) > 1 and $sqglobaldict{ substr( $wd, 0, 1 ) . lc substr( $wd, 1 ) }; + my $wd = shift; + my $lcwd = lc $wd; + my $tcwd = length($wd) > 1 ? substr( $wd, 0, 1 ) . substr( $lcwd, 1 ) : ''; + $tcwd = '' if $tcwd eq $lcwd; # No point in checking same word twice - return 0; + # First ensure bad words are always reported + return $SQWORDOKBAD if exists( $::projectbadwords{$wd} ); + return $SQWORDOKBAD if exists( $::projectbadwords{$lcwd} ); + return $SQWORDOKBAD if $tcwd and exists( $::projectbadwords{$tcwd} ); + + # Now check dictionary for good words + return $SQWORDOKYES if $sqglobaldict{$wd}; + return $SQWORDOKYES if $sqglobaldict{$lcwd}; + return $SQWORDOKYES if $tcwd and $sqglobaldict{$tcwd}; + + return $SQWORDOKNO; } # @@ -1642,7 +1674,8 @@ sub booklouperun { } # Now add project dictionary words - delete $::projectdict{$_} for keys %::projectdict; # Old spellcheck code doesn't clear hash in spellloadprojectdict() + delete $::projectdict{$_} for keys %::projectdict; # Old spellcheck code doesn't clear hash in spellloadprojectdict() + delete $::projectbadwords{$_} for keys %::projectbadwords; ::spellloadprojectdict(); $sqglobaldict{$_} = 1 for keys %::projectdict; @@ -1695,7 +1728,7 @@ sub booklouperun { # Return how many times bad word referred to in error message was found during the check sub spellqueryfrequency { my $line = shift; - return $sqbadwordfreq{$1} if $line =~ /^\d+:\d+ +- (.+)/; + return $sqbadwordfreq{$1} if $line =~ /^\d+:\d+ +- ([^*]+)/; # Ignore asterisks marking bad words return 0; } diff --git a/src/lib/Guiguts/MultiLingual.pm b/src/lib/Guiguts/MultiLingual.pm index a4c69881..2c3eebd1 100644 --- a/src/lib/Guiguts/MultiLingual.pm +++ b/src/lib/Guiguts/MultiLingual.pm @@ -680,60 +680,42 @@ sub includeprojectdict { # # Add all spelt foreign words to project dictionary sub addspeltforeignproject { - ::spellloadprojectdict(); - my $i = 0; - for my $key ( sort ( keys %distinctwords ) ) { - if ( ( $seenwordslang{$key} ) - && ( $seenwordslang{$key} ne $::multidicts[0] ) - && ( $seenwordslang{$key} ne 'user' ) ) { - $::projectdict{$key} = $seenwordslang{$key}; - $i++; - } - } - - my $section = "\%projectdict = (\n"; - for my $key ( sort keys %::projectdict ) { - $key =~ s/'/\\'/g; - $section .= "'$key' => '',\n"; - } - $section .= ");"; - utf8::encode($section); - open my $save, '>:bytes', $::lglobal{projectdictname}; - print $save $section; - close $save; - my $line = "$i words added to project dictionary"; - $multiwclistbox->delete( '0', 'end' ); - $multiwclistbox->insert( 'end', $line ); - $multiwclistbox->update; - updategloballists(); - getwordcounts(); + addcriterionproject('speltforeign'); } # # Add words occuring >= minfreq times to project dictionary sub addminfreqproject { + addcriterionproject('minfreq'); +} + +# +# Add all words that meet criterion to project dictionary +# Input argument should be 'speltforeign' or 'minfreq' +sub addcriterionproject { + my $criterion = shift; + ::spellloadprojectdict(); my $i = 0; for my $key ( keys %distinctwords ) { - unless ( $seenwordslang{$key} ) { - if ( $distinctwords{$key} >= $minfreq ) { - $::projectdict{$key} = 'freq'; - $seenwordslang{$key} = 'freq'; + if ( $criterion eq 'speltforeign' ) { + if ( ( $seenwordslang{$key} ) + && ( $seenwordslang{$key} ne $::multidicts[0] ) + && ( $seenwordslang{$key} ne 'user' ) ) { + $::projectdict{$key} = $seenwordslang{$key}; $i++; } + } elsif ( $criterion eq 'minfreq' ) { + unless ( $seenwordslang{$key} ) { + if ( $distinctwords{$key} >= $minfreq ) { + $::projectdict{$key} = 'freq'; + $seenwordslang{$key} = 'freq'; + $i++; + } + } } } - - my $section = "\%projectdict = (\n"; - for my $key ( sort keys %::projectdict ) { - $key =~ s/'/\\'/g; - $section .= "'$key' => '',\n"; - } - $section .= ");"; - utf8::encode($section); - open my $save, '>:bytes', $::lglobal{projectdictname}; - print $save $section; - close $save; + ::spellsaveprojdict(); my $line = "$i words added to project dictionary"; $multiwclistbox->delete( '0', 'end' ); $multiwclistbox->insert( 'end', $line ); diff --git a/src/lib/Guiguts/SpellCheck.pm b/src/lib/Guiguts/SpellCheck.pm index 730706c3..2eddc2f8 100644 --- a/src/lib/Guiguts/SpellCheck.pm +++ b/src/lib/Guiguts/SpellCheck.pm @@ -7,7 +7,7 @@ BEGIN { our ( @ISA, @EXPORT ); @ISA = qw(Exporter); @EXPORT = qw(&aspellstart &aspellstop &spellchecker &spellloadprojectdict &getmisspelledwords - &spelloptions &get_spellchecker_version &spellmyaddword &spelladdgoodwords); + &spelloptions &get_spellchecker_version &spellmyaddword &spelladdgoodwords &spellsaveprojdict); } # @@ -49,14 +49,22 @@ sub spellloadprojectdict { if ( ( defined $::lglobal{projectdictname} ) and ( -e $::lglobal{projectdictname} ) ) { open( my $fh, "<:encoding(utf8)", $::lglobal{projectdictname} ); + my $hashref = \%::projectdict; while ( my $line = <$fh> ) { - utf8::decode($line); - if ( $line eq "%projectdict = (\n" ) { next; } - if ( $line eq ");" ) { next; } - $line =~ s/' => '',\n$//g; # remove ending - $line =~ s/^'//g; # remove start - $line =~ s/\\'/'/g; # remove \' - $::projectdict{$line} = ''; + $line =~ s/[\n\r]+//; + if ( $line eq "%projectdict = (" ) { # following words are good + $hashref = \%::projectdict; + next; + } + if ( $line eq "%projectbadwords = (" ) { # following words are bad + $hashref = \%::projectbadwords; + next; + } + $line =~ s/' => '',$//g; # remove ending + $line =~ s/^'//g; # remove start + $line =~ s/\\'/'/g; # unescape single quote + next if $line eq ");" or $line eq ""; + $hashref->{$line} = ''; } } } @@ -219,14 +227,24 @@ sub spelladdword { # # Add a word to the project dictionary +# Optional second argument if it's a bad word sub spellmyaddword { my $textwindow = $::textwindow; my $term = shift; + my $bad = shift; unless ($term) { ::soundbell(); return; } - getprojectdic(); + return if $term =~ /^\s*$/; + ( $bad ? $::projectbadwords{$term} : $::projectdict{$term} ) = ''; + spellsaveprojdict(); +} + +# +# Save project dictionary +sub spellsaveprojdict { + getprojectdic(); # Get dict name into global if ( not defined $::lglobal{projectdictname} ) { my $dialog = $::top->Dialog( -text => "File must be saved before words can be added to project dictionary.", @@ -237,16 +255,21 @@ sub spellmyaddword { $dialog->Show; return; } - $::projectdict{$term} = ''; - open( my $dic, '>:bytes', "$::lglobal{projectdictname}" ); my $section = "\%projectdict = (\n"; + for my $term ( sort keys %::projectdict ) { + $term =~ s/'/\\'/g; + $section .= "'$term' => '',\n"; + } + $section .= ");\n\n"; - for my $term ( sort { $a cmp $b } keys %::projectdict ) { + $section .= "\%projectbadwords = (\n"; + for my $term ( sort keys %::projectbadwords ) { $term =~ s/'/\\'/g; $section .= "'$term' => '',\n"; } - $section .= ");"; - utf8::encode($section); + $section .= ");\n\n"; + + open( my $dic, '>:encoding(utf8)', "$::lglobal{projectdictname}" ); print $dic $section; close $dic; } @@ -453,7 +476,7 @@ sub spelladdtexttags { } # -# Add spellings from good_words.txt to the project dictionary +# Add spellings from good_words.txt & bad_words.txt to the project dictionary sub spelladdgoodwords { my $textwindow = $::textwindow; my $top = $::top; @@ -471,21 +494,32 @@ sub spelladdgoodwords { my $pwd = ::getcwd(); chdir $::globallastpath; - unless ( open( DAT, "good_words.txt" ) ) { + my $fh; + + # Load good words first + if ( open( $fh, "<:encoding(utf8)", "good_words.txt" ) ) { + ::busy(); + while ( my $line = <$fh> ) { + $line =~ s/\s+$//; + next if $line eq ''; + spellmyaddword($line); + } + close($fh); + + # The bad_words.txt file often doesn't exist, so don't error if that's the case + if ( open( $fh, "<:encoding(utf8)", "bad_words.txt" ) ) { + while ( my $line = <$fh> ) { + $line =~ s/\s+$//; + next if $line eq ''; + spellmyaddword( $line, "bad" ); + } + close($fh); + } + ::unbusy(); + } else { ::warnerror("Could not open good_words.txt"); - return; } - - # Remove all newlines and/or carriage returns whatever the current OS - ::busy(); - my @raw_data = map { s/[\n\r]+$//g; $_ } ; - close(DAT); chdir $pwd; - my $word = q{}; - foreach my $word (@raw_data) { - spellmyaddword($word); - } - ::unbusy(); } # @@ -711,7 +745,7 @@ sub endaspell { } # -# Load project dictionary +# Get project dictionary name into global variable (base filename + ".dic") sub getprojectdic { return unless $::lglobal{global_filename}; my $fname = $::lglobal{global_filename};