Skip to content
This repository has been archived by the owner on Aug 9, 2024. It is now read-only.

Commit

Permalink
Converts english to regex style and fixes issues with missing commas
Browse files: browse the repository at this point in the history
  • Loading branch information
halfak committed Jul 17, 2015
1 parent 8199ef1 commit 86095aa
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 99 deletions.
58 changes: 30 additions & 28 deletions revscoring/languages/english.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
import warnings

import enchant
Expand All @@ -8,13 +9,12 @@

STEMMER = SnowballStemmer("english")
STOPWORDS = set(stopwords.words('english'))
BADWORDS = set([
"anus", "ass",
"bitch", "bootlip", "butt",
BAD_REGEXES = [
"a+nus+", "ass+",
"bitch", "bootlip", "butt+",
"chlamydia", "cholo", "chug", "cocksuck", "coonass", "cracker", "cunt",
"dick", "dickhead", "dothead",
"fag", "faggot",
"fart", "fat", "fuck", "fucker",
"dick", "dothead",
"(f|ph)ag+(ot)?", "fart", "fat", "fuck",
"gipp", "gippo", "gonorrhea", "gook", "gringo", "gypo", "gyppie", "gyppo",
"gyppy",
"herpes", "hillbilly", "hiv", "homosexual", "hori",
Expand All @@ -23,32 +23,35 @@
"kike", "kwashi", "kyke",
"lesbian", "lick",
"motherfuck",
"nig", "nigar", "nigette", "nigga", "niggah", "niggar", "nigger",
"niggress", "nigguh", "niggur", "niglet", "nigor", "nigr", "nigra",
"peckerwood", "penis", "piss",
"nig", "nig+(a|e|u)+(r|h)+", "niggress"
"niglet", "nigor", "nigr", "nigra",
"pecker(wood)?", "peni(s)?", "piss",
"quashi",
"raghead", "redneck", "redskin", "roundeye",
"scabies", "shit", "shitty", "slut", "slutty", "spic", "spick", "spig",
"spigotty", "spik", "spook", "squarehead", "stupid", "suck", "syphilis",
"scabies", "shi+t+", "slut", "spi(g|c|k)+",
"spigotty", "spik", "spook", "squarehead", "st(u|oo+)pid", "suck",
"syphil+is",
"turd", "twat",
"wank", "wetback", "whore", "wog", "wop",
"yank", "yankee", "yid",
"zipperhead"
])
INFORMAL_WORDS = set([
'awesome', 'awesomest', 'awsome'
'bla', 'blah', 'boner', 'boobs', 'bullshit'
'cant', 'coolest', 'crap'
"dont", "dumb", "dumbass",
]
INFORMAL_REGEXES = [
'awesome', 'awesomest', 'awsome',
'bla', 'blah', 'boner', 'boobs', 'bullshit',
'cant', 'coolest', 'crap',

This comment has been minimized.

Copy link
@he7d3r

he7d3r Jul 17, 2015

Contributor

Solves #124 (comment).

"don'?t", "dumb", "dumbass",
"haha", "hello", "hey",
"kool",
"lol", "luv",
"meow",
'shove', 'smelly', 'sooo', 'stinky', 'sucking', 'sux'
'shove', 'smelly', 'sooo', 'stinky', 'sucking', 'sux', "shouldn\'t"
"tits",
"wuz",
'yall', 'yay', 'yea', 'yolo'])
STEMMED_BADWORDS = set(STEMMER.stem(w) for w in BADWORDS)

This comment has been minimized.

Copy link
@he7d3r

he7d3r Jul 17, 2015

Contributor

No more stemming? Why? o.O

"wasn'?t", "wuz", "won'?t",
'yall', 'yay', 'yea', 'yolo'
]
BAD_REGEX = re.compile("|".join(BAD_REGEXES))
INFORMAL_REGEX = re.compile("|".join(INFORMAL_REGEXES))
DICTIONARY = enchant.Dict("en")


Expand All @@ -59,20 +62,19 @@ def stem_word(word):
stem_word = LanguageUtility("stem_word", stem_word_process, depends_on=[])


def is_badword_process(stem_word):
def is_badword_process():
def is_badword(word):
return stem_word(word) in STEMMED_BADWORDS
return bool(BAD_REGEX.match(word.lower()))
return is_badword
is_badword = LanguageUtility("is_badword", is_badword_process, depends_on=[stem_word])
is_badword = LanguageUtility("is_badword", is_badword_process)


def is_informal_word_process(stem_word):
def is_informal_word_process():
def is_informal_word(word):
return stem_word(word) in INFORMAL_WORDS
return bool(INFORMAL_REGEX.match(word.lower()))
return is_informal_word
is_informal_word = LanguageUtility("is_informal_word",
is_informal_word_process, depends_on=[stem_word])

is_informal_word_process, depends_on=[])

def is_misspelled_process():
def is_misspelled(word):
Expand Down
67 changes: 0 additions & 67 deletions revscoring/languages/english.regex.py

This file was deleted.

8 changes: 4 additions & 4 deletions revscoring/languages/tests/test_english.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ def test_language():
eq_(stem_word()("Shitting"), "shit")
eq_(hash(stem_word), hash(language.stem_word))

assert is_badword(stem_word())("shit")
assert is_badword(stem_word())("shitty")
assert is_badword(stem_word())("Shitty")
assert not is_badword(stem_word())("hat")
assert is_badword()("shit")
assert is_badword()("shitty")
assert is_badword()("Shitty")
assert not is_badword()("hat")
eq_(hash(is_badword), hash(language.is_badword))

assert is_misspelled()("wjwkjb")
Expand Down

0 comments on commit 86095aa

Please sign in to comment.