Merge pull request #60 from LuminosoInsight/gender-neutral-at
Recognize "@" in gender-neutral word endings as part of the token
Showing 56 changed files with 36,676 additions and 35,956 deletions.
setup.py
@@ -35,7 +35,7 @@
 setup(
     name="wordfreq",
-    version='2.1.0',
+    version='2.2.0',
     maintainer='Luminoso Technologies, Inc.',
     maintainer_email='info@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
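A quick way to confirm which release is installed after this bump (a sketch, not part of the diff; importlib.metadata is in the standard library from Python 3.8 onward):

from importlib.metadata import version

print(version("wordfreq"))  # "2.2.0" once this release is installed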
New test file
@@ -0,0 +1,109 @@
from wordfreq import tokenize, lossy_tokenize, word_frequency


def test_gender_neutral_at():
    # Recognize the gender-neutral @ in Spanish as part of the word
    text = "La protección de los derechos de tod@s l@s trabajador@s migrantes"
    assert tokenize(text, "es") == [
        "la",
        "protección",
        "de",
        "los",
        "derechos",
        "de",
        "tod@s",
        "l@s",
        "trabajador@s",
        "migrantes"
    ]

text = "el distrito 22@ de Barcelona" | ||
assert tokenize(text, 'es') == ["el", "distrito", "22@", "de", "barcelona"] | ||
assert lossy_tokenize(text, 'es') == ["el", "distrito", "00@", "de", "barcelona"] | ||
|
||
    # It also appears in Portuguese
    text = "direitos e deveres para @s membr@s da comunidade virtual"
    assert tokenize(text, "pt") == [
        "direitos",
        "e",
        "deveres",
        "para",
        "@s",
        "membr@s",
        "da",
        "comunidade",
        "virtual"
    ]

    # Because this is part of our tokenization, the language code doesn't
    # actually matter, as long as it's a language with Unicode tokenization
    text = "@s membr@s da comunidade virtual"
    assert tokenize(text, "en") == ["@s", "membr@s", "da", "comunidade", "virtual"]


def test_at_in_corpus():
    # We have a word frequency for "l@s"
    assert word_frequency('l@s', 'es') > 0

    # It's not just treated as a word break
    assert word_frequency('l@s', 'es') < word_frequency('l s', 'es')


def test_punctuation_at():
    # If the @ appears alone in a word, we consider it to be punctuation
    text = "operadores de canal, que são aqueles que têm um @ ao lado do nick"
    assert tokenize(text, "pt") == [
        "operadores",
        "de",
        "canal",
        "que",
        "são",
        "aqueles",
        "que",
        "têm",
        "um",
        "ao",
        "lado",
        "do",
        "nick"
    ]

    assert tokenize(text, "pt", include_punctuation=True) == [
        "operadores",
        "de",
        "canal",
        ",",
        "que",
        "são",
        "aqueles",
        "que",
        "têm",
        "um",
        "@",
        "ao",
        "lado",
        "do",
        "nick"
    ]

    # If the @ is not at the end of the word or part of the word ending '@s',
    # it is also punctuation
    text = "un archivo hosts.deny que contiene la línea ALL:ALL@ALL"
    assert tokenize(text, "es") == [
        "un",
        "archivo",
        "hosts.deny",
        "que",
        "contiene",
        "la",
        "línea",
        "all:all",
        "all"
    ]

    # Make sure not to catch e-mail addresses
    text = "info@something.example"
    assert tokenize(text, "en") == [
        "info",
        "something.example"
    ]