Skip to content
This repository has been archived by the owner on Aug 9, 2024. It is now read-only.

Commit

Permalink
Merge pull request #129 from wiki-ai/english_regex
Browse files Browse the repository at this point in the history
Converts English language utilities to Regex style and fixes minor issues
  • Loading branch information
ToAruShiroiNeko committed Jul 17, 2015
2 parents 7bbe7c2 + 702b865 commit 110c1c0
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 102 deletions.
64 changes: 33 additions & 31 deletions revscoring/languages/english.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
import warnings

import enchant
Expand All @@ -8,13 +9,12 @@

# Stemmer instance for English; the stem_word utility calls STEMMER.stem().
STEMMER = SnowballStemmer("english")
# English stopwords held in a set; is_stopword tests membership against it.
STOPWORDS = set(stopwords.words('english'))
BADWORDS = set([
"anus", "ass",
"bitch", "bootlip", "butt",
BAD_REGEXES = [
"a+nus+", "ass+",
"bitch", "bootlip", "butt+",
"chlamydia", "cholo", "chug", "cocksuck", "coonass", "cracker", "cunt",
"dick", "dickhead", "dothead",
"fag", "faggot",
"fart", "fat", "fuck", "fucker",
"dick", "dothead",
"(f|ph)ag+(ot)?", "fart", "fat", "fuck",
"gipp", "gippo", "gonorrhea", "gook", "gringo", "gypo", "gyppie", "gyppo",
"gyppy",
"herpes", "hillbilly", "hiv", "homosexual", "hori",
Expand All @@ -23,70 +23,72 @@
"kike", "kwashi", "kyke",
"lesbian", "lick",
"motherfuck",
"nig", "nigar", "nigette", "nigga", "niggah", "niggar", "nigger",
"niggress", "nigguh", "niggur", "niglet", "nigor", "nigr", "nigra",
"peckerwood", "penis", "piss",
# Every row must end with a comma: adjacent string literals are implicitly
# concatenated, which would fuse "niggress" and "niglet" into one pattern.
"nig", "nig+(a|e|u)+(r|h)+", "niggress",
"niglet", "nigor", "nigr", "nigra",
"pecker(wood)?", "peni(s)?", "piss",
"quashi",
"raghead", "redneck", "redskin", "roundeye",
"scabies", "shit", "shitty", "slut", "slutty", "spic", "spick", "spig",
"spigotty", "spik", "spook", "squarehead", "stupid", "suck", "syphilis",
"scabies", "shi+t+", "slut", "spi(g|c|k)+",
"spigotty", "spik", "spook", "squarehead", "st(u|oo+)pid", "suck",
"syphil+is",
"turd", "twat",
"wank", "wetback", "whore", "wog", "wop",
"yank", "yankee", "yid",
"zipperhead"
])
INFORMAL_WORDS = set([
'awesome', 'awesomest', 'awsome'
'bla', 'blah', 'boner', 'boobs', 'bullshit'
'cant', 'coolest', 'crap'
"dont", "dumb", "dumbass",
]
# Regex fragments for informal / chat-style words.  They are OR-ed together
# into INFORMAL_REGEX and applied to the lower-cased word.
# Every row ends with a comma on purpose: adjacent string literals are
# implicitly concatenated, so a missing comma silently merges two patterns.
INFORMAL_REGEXES = [
    'awesome', 'awesomest', 'awsome',
    'bla', 'blah', 'boner', 'boobs', 'bullshit',
    'cant', 'coolest', 'crap',
    "don'?t", "dumb", "dumbass",
    "haha", "hello", "hey",
    "kool",
    "lol", "luv",
    "meow",
    'shove', 'smelly', 'sooo', 'stinky', 'sucking', 'sux', "shouldn't",
    "tits",
    "wasn'?t", "wuz", "won'?t",
    'yall', 'yay', 'yea', 'yolo',
]
# Join each pattern list into one alternation and compile it once at import
# time; the per-word checks then reuse the compiled objects.
BAD_REGEX, INFORMAL_REGEX = (
    re.compile("|".join(patterns))
    for patterns in (BAD_REGEXES, INFORMAL_REGEXES)
)
# Enchant dictionary for the "en" tag, used by the is_misspelled utility.
DICTIONARY = enchant.Dict("en")


def stem_word_process():
    """Build the callable behind the ``stem_word`` language utility.

    Returns:
        A one-argument function that stems ``word`` with the module-level
        ``STEMMER`` and lower-cases the result.
    """
    def stem_word(word):
        # Lower-casing happens after stemming, so the result is always
        # lower-case regardless of how the stemmer treats capitals.
        return STEMMER.stem(word).lower()
    return stem_word
# Wrap the factory in a LanguageUtility named "stem_word"; it has no
# dependencies, so none are passed.
stem_word = LanguageUtility("stem_word", stem_word_process)


def is_badword_process():
    """Build the callable behind the ``is_badword`` language utility.

    Returns:
        A one-argument function returning ``True`` when the lower-cased
        ``word`` starts with a match for ``BAD_REGEX``, else ``False``.
    """
    def is_badword(word):
        # re.match anchors at the start only, so inflected forms such as
        # "shitty" are caught by the "shi+t+" pattern.
        # NOTE(review): the same prefix behaviour flags any word that merely
        # begins with a pattern (e.g. "butter" via "butt+") -- confirm that
        # this is intended before relying on it.
        return bool(BAD_REGEX.match(word.lower()))
    return is_badword
# The regex-based check does not need the stemmer, so no dependencies are
# declared.
is_badword = LanguageUtility("is_badword", is_badword_process)


def is_informal_word_process():
    """Build the callable behind the ``is_informal_word`` language utility.

    Returns:
        A one-argument function returning ``True`` when the lower-cased
        ``word`` starts with a match for ``INFORMAL_REGEX``, else ``False``.
    """
    def is_informal_word(word):
        # re.match anchors at the start only; a word is informal as soon as
        # it begins with one of the patterns.
        # NOTE(review): this also flags longer words sharing a prefix with a
        # pattern (e.g. anything starting with "hey") -- confirm intended.
        return bool(INFORMAL_REGEX.match(word.lower()))
    return is_informal_word
# The regex-based check does not need the stemmer, so no dependencies are
# declared.
is_informal_word = LanguageUtility("is_informal_word",
                                   is_informal_word_process)


def is_misspelled_process():
    """Build the callable behind the ``is_misspelled`` language utility.

    Returns:
        A one-argument function returning ``True`` when ``DICTIONARY.check``
        rejects ``word``.  The word is passed through unchanged (no
        case-folding is applied here).
    """
    def is_misspelled(word):
        return not DICTIONARY.check(word)
    return is_misspelled

# No dependencies are declared for this utility.
is_misspelled = LanguageUtility("is_misspelled", is_misspelled_process)

def is_stopword_process():
    """Build the callable behind the ``is_stopword`` language utility.

    Returns:
        A one-argument function returning ``True`` when the lower-cased
        ``word`` is a member of ``STOPWORDS``.
    """
    def is_stopword(word):
        return word.lower() in STOPWORDS
    return is_stopword
# No dependencies are declared for this utility.
is_stopword = LanguageUtility("is_stopword", is_stopword_process)

# The English Language object: its dotted name plus the utilities defined
# above, listed in the same order as before.
english = Language(
    "revscoring.languages.english",
    [stem_word, is_badword, is_misspelled, is_stopword, is_informal_word],
)
Expand Down
67 changes: 0 additions & 67 deletions revscoring/languages/english.regex.py

This file was deleted.

8 changes: 4 additions & 4 deletions revscoring/languages/tests/test_english.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ def test_language():
eq_(stem_word()("Shitting"), "shit")
eq_(hash(stem_word), hash(language.stem_word))

assert is_badword(stem_word())("shit")
assert is_badword(stem_word())("shitty")
assert is_badword(stem_word())("Shitty")
assert not is_badword(stem_word())("hat")
assert is_badword()("shit")
assert is_badword()("shitty")
assert is_badword()("Shitty")
assert not is_badword()("hat")
eq_(hash(is_badword), hash(language.is_badword))

assert is_misspelled()("wjwkjb")
Expand Down

0 comments on commit 110c1c0

Please sign in to comment.