From 18dab62ba8f69fac4125b3cb4989412a027a74c9 Mon Sep 17 00:00:00 2001
From: Andrew DalPino
Date: Mon, 7 Sep 2020 16:35:01 -0500
Subject: [PATCH] Implemented Token Hashing Vectorizer transformer

---
 CHANGELOG.md                                  |   3 +
 composer.json                                 |   5 +-
 docs/transformers/bm25-transformer.md         |   4 +-
 docs/transformers/delta-tf-idf-transformer.md |   2 +-
 .../recursive-feature-eliminator.md           |  12 +-
 docs/transformers/token-hashing-vectorizer.md |  25 ++++
 src/Transformers/BM25Transformer.php          |   8 +-
 src/Transformers/DeltaTfIdfTransformer.php    |   4 +-
 src/Transformers/TokenHashingVectorizer.php   | 125 ++++++++++++++++++
 .../TokenHashingVectorizerTest.php            |  63 +++++++++
 10 files changed, 234 insertions(+), 17 deletions(-)
 create mode 100644 docs/transformers/token-hashing-vectorizer.md
 create mode 100644 src/Transformers/TokenHashingVectorizer.php
 create mode 100644 tests/Transformers/TokenHashingVectorizerTest.php

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e73132e..7325c99 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,6 @@
+- Unreleased
+    - Implemented Token Hashing Vectorizer transformer
+
 - 0.1.0-beta
     - Add Recursive Feature Eliminator feature selector
     - Implement BM25 TF-IDF Transformer
diff --git a/composer.json b/composer.json
index 0d066a9..f8bb8af 100644
--- a/composer.json
+++ b/composer.json
@@ -5,8 +5,9 @@
     "homepage": "https://rubixml.com",
     "license": "MIT",
     "keywords": [
-        "php", "machine-learning", "rubix", "ml", "extras", "neural-network", "deep-learning",
-        "analytics", "data-mining"
+        "php", "machine learning", "rubix", "ml", "extras", "neural network", "deep learning",
+        "analytics", "data mining", "php-ml", "php ml", "php ai", "artificial intelligence",
+        "ai", "rubixml", "rubix ml"
     ],
     "authors": [
         {
diff --git a/docs/transformers/bm25-transformer.md b/docs/transformers/bm25-transformer.md
index 5c522e2..0ca9406 100644
--- a/docs/transformers/bm25-transformer.md
+++ b/docs/transformers/bm25-transformer.md
@@ -1,9 +1,9 @@
 [source]
 
 # BM25 Transformer
-BM25 is a term frequency weighting scheme that takes term frequency (TF) saturation and document length into account.
+BM25 is a sublinear term frequency weighting scheme that takes term frequency (TF) saturation and document length into account.
 
-> **Note:** This transformer assumes that its input is made up of word frequency vectors such as those produced by [Word Count Vectorizer](word-count-vectorizer.md).
+> **Note:** BM25 Transformer assumes that its inputs are token frequency vectors such as those created by [Word Count Vectorizer](word-count-vectorizer.md).
 
 **Interfaces:** [Transformer](api.md#transformer), [Stateful](api.md#stateful), [Elastic](api.md#elastic)
 
diff --git a/docs/transformers/delta-tf-idf-transformer.md b/docs/transformers/delta-tf-idf-transformer.md
index 63f506c..9e8a27c 100644
--- a/docs/transformers/delta-tf-idf-transformer.md
+++ b/docs/transformers/delta-tf-idf-transformer.md
@@ -3,7 +3,7 @@
 # Delta TF-IDF Transformer
 A supervised TF-IDF (Term Frequency Inverse Document Frequency) Transformer that uses class labels to boost the TF-IDFs of terms by how informative they are. Terms that receive the highest boost are those whose concentration is primarily in one class whereas low weighted terms are more evenly distributed among the classes.
 
-> **Note:** This transformer assumes that its input is made up of word frequency vectors such as those produced by [Word Count Vectorizer](word-count-vectorizer.md).
+> **Note:** Delta TF-IDF Transformer assumes that its inputs are token frequency vectors such as those created by [Word Count Vectorizer](word-count-vectorizer.md).
 
 **Interfaces:** [Transformer](api.md#transformer), [Stateful](api.md#stateful), [Elastic](api.md#elastic)
 
diff --git a/docs/transformers/recursive-feature-eliminator.md b/docs/transformers/recursive-feature-eliminator.md
index eed76b9..92c96e2 100644
--- a/docs/transformers/recursive-feature-eliminator.md
+++ b/docs/transformers/recursive-feature-eliminator.md
@@ -14,12 +14,6 @@ Recursive Feature Eliminator or *RFE* is a supervised feature selector that uses
 | 2 | epochs | 1 | int | The maximum number of iterations to recurse upon the dataset. |
 | 3 | base | Auto | RanksFeatures | The base feature ranking learner instance. |
 
-## Additional Methods
-Return the final importances scores of the selected feature columns:
-``` php
-public importances() : ?array
-```
-
 ## Example
 ```php
 use Rubix\ML\Transformers\RecursiveFeatureEliminator;
@@ -28,5 +22,11 @@ use Rubix\ML\Regressors\RegressionTree;
 $transformer = new RecursiveFeatureEliminator(10, 2, new RegressionTree());
 ```
 
+## Additional Methods
+Return the final importances scores of the selected feature columns:
+``` php
+public importances() : ?array
+```
+
 ### References
 >- I. Guyon et al. (2002). Gene Selection for Cancer Classification using Support Vector Machines.
diff --git a/docs/transformers/token-hashing-vectorizer.md b/docs/transformers/token-hashing-vectorizer.md
new file mode 100644
index 0000000..cdedb8e
--- /dev/null
+++ b/docs/transformers/token-hashing-vectorizer.md
@@ -0,0 +1,25 @@
+[source]
+
+# Token Hashing Vectorizer
+Token Hashing Vectorizer builds token count vectors on the fly by employing a *hashing trick*. It is a stateless transformer that uses the CRC32 (Cyclic Redundancy Check) hashing algorithm to assign token occurrences to a bucket in a vector of user-specified dimensionality. The advantage of hashing over storing a fixed vocabulary is that there is no memory footprint; however, there is a chance that certain tokens will collide with other tokens, especially in lower-dimensional vector spaces.
+
+**Interfaces:** [Transformer](api.md#transformer)
+
+**Data Type Compatibility:** Continuous only
+
+## Parameters
+| # | Param | Default | Type | Description |
+|---|---|---|---|---|
+| 1 | dimensions | | int | The dimensionality of the vector space. |
+| 2 | tokenizer | Word | Tokenizer | The tokenizer used to extract tokens from blobs of text. |
+
+## Example
+```php
+use Rubix\ML\Transformers\TokenHashingVectorizer;
+use Rubix\ML\Other\Tokenizers\NGram;
+
+$transformer = new TokenHashingVectorizer(10000, new NGram(1, 2));
+```
+
+## Additional Methods
+This transformer does not have any additional methods.
diff --git a/src/Transformers/BM25Transformer.php b/src/Transformers/BM25Transformer.php
index 3c4ede7..1fd48d4 100644
--- a/src/Transformers/BM25Transformer.php
+++ b/src/Transformers/BM25Transformer.php
@@ -14,11 +14,11 @@
 /**
  * BM25 Transformer
  *
- * BM25 is a term frequency weighting scheme that takes term frequency (TF) saturation and
- * document length into account.
+ * BM25 is a sublinear term frequency weighting scheme that takes term frequency (TF)
+ * saturation and document length into account.
  *
- * > **Note**: This transformer assumes that its input is made up of term frequency vectors
- * such as those created by the Word Count Vectorizer.
+ * > **Note**: BM25 Transformer assumes that its inputs are made up of token frequency
+ * vectors such as those created by the Word Count Vectorizer.
  *
  * References:
  * [1] S. Robertson et al. (2009). The Probabilistic Relevance Framework: BM25 and Beyond.
diff --git a/src/Transformers/DeltaTfIdfTransformer.php b/src/Transformers/DeltaTfIdfTransformer.php
index 56e22a5..b0bea2f 100644
--- a/src/Transformers/DeltaTfIdfTransformer.php
+++ b/src/Transformers/DeltaTfIdfTransformer.php
@@ -19,8 +19,8 @@
  * that receive the highest boost are those whose concentration is primarily in one
  * class whereas low weighted terms are more evenly distributed among the classes.
  *
- * > **Note**: This transformer assumes that its input is made up of word frequency
- * vectors such as those created by the Word Count Vectorizer.
+ * > **Note**: Delta TF-IDF Transformer assumes that its inputs are made up of token
+ * frequency vectors such as those created by the Word Count Vectorizer.
  *
  * References:
  * [1] J. Martineau et al. (2009). Delta TFIDF: An Improved Feature Space for
diff --git a/src/Transformers/TokenHashingVectorizer.php b/src/Transformers/TokenHashingVectorizer.php
new file mode 100644
index 0000000..a9d62e8
--- /dev/null
+++ b/src/Transformers/TokenHashingVectorizer.php
@@ -0,0 +1,125 @@
+<?php
+
+namespace Rubix\ML\Transformers;
+
+use Rubix\ML\DataType;
+use Rubix\ML\Other\Tokenizers\Word;
+use Rubix\ML\Other\Tokenizers\Tokenizer;
+use InvalidArgumentException;
+
+/**
+ * Token Hashing Vectorizer
+ *
+ * Token Hashing Vectorizer builds token count vectors on the fly by employing a *hashing
+ * trick*. It is a stateless transformer that uses the CRC32 (Cyclic Redundancy Check)
+ * hashing algorithm to assign token occurrences to a bucket in a vector of user-specified
+ * dimensionality. The advantage of hashing over storing a fixed vocabulary is that there
+ * is no memory footprint; however, there is a chance that certain tokens will collide
+ * with other tokens, especially in lower-dimensional vector spaces.
+ *
+ * @author      Andrew DalPino
+ */
+class TokenHashingVectorizer implements Transformer
+{
+    /**
+     * The maximum dimensionality of the vector space, equal to the maximum
+     * value of a CRC32 hash.
+     *
+     * @var int
+     */
+    protected const MAX_DIMENSIONS = 4294967295;
+
+    /**
+     * The dimensionality of the vector space.
+     *
+     * @var int
+     */
+    protected $dimensions;
+
+    /**
+     * The tokenizer used to extract tokens from blobs of text.
+     *
+     * @var \Rubix\ML\Other\Tokenizers\Tokenizer
+     */
+    protected $tokenizer;
+
+    /**
+     * @param int $dimensions
+     * @param \Rubix\ML\Other\Tokenizers\Tokenizer|null $tokenizer
+     * @throws \InvalidArgumentException
+     */
+    public function __construct(int $dimensions, ?Tokenizer $tokenizer = null)
+    {
+        if ($dimensions < 1 or $dimensions > self::MAX_DIMENSIONS) {
+            throw new InvalidArgumentException('Dimensions must be'
+                . ' between 0 and ' . self::MAX_DIMENSIONS
+                . ", $dimensions given.");
+        }
+
+        $this->dimensions = $dimensions;
+        $this->tokenizer = $tokenizer ?? new Word();
+    }
+
+    /**
+     * Return the data types that this transformer is compatible with.
+     *
+     * @return \Rubix\ML\DataType[]
+     */
+    public function compatibility() : array
+    {
+        return DataType::all();
+    }
+
+    /**
+     * Transform the dataset in place.
+     *
+     * @param array[] $samples
+     */
+    public function transform(array &$samples) : void
+    {
+        $scale = $this->dimensions / self::MAX_DIMENSIONS;
+
+        foreach ($samples as &$sample) {
+            $vectors = [];
+
+            foreach ($sample as $column => $value) {
+                if (is_string($value)) {
+                    $template = array_fill(0, $this->dimensions, 0);
+
+                    $tokens = $this->tokenizer->tokenize($value);
+
+                    $counts = array_count_values($tokens);
+
+                    foreach ($counts as $token => $count) {
+                        $offset = (int) floor(crc32($token) * $scale);
+
+                        $template[$offset] += $count;
+                    }
+
+                    $vectors[] = $template;
+
+                    unset($sample[$column]);
+                }
+            }
+
+            $sample = array_merge($sample, ...$vectors);
+        }
+    }
+
+    /**
+     * Return the string representation of the object.
+     *
+     * @return string
+     */
+    public function __toString() : string
+    {
+        return "Token Hashing Vectorizer (dimensions: {$this->dimensions},"
+            . " tokenizer: {$this->tokenizer})";
+    }
+}
diff --git a/tests/Transformers/TokenHashingVectorizerTest.php b/tests/Transformers/TokenHashingVectorizerTest.php
new file mode 100644
index 0000000..779003d
--- /dev/null
+++ b/tests/Transformers/TokenHashingVectorizerTest.php
@@ -0,0 +1,63 @@
+<?php
+
+namespace Rubix\ML\Tests\Transformers;
+
+use Rubix\ML\Datasets\Unlabeled;
+use Rubix\ML\Other\Tokenizers\Word;
+use Rubix\ML\Transformers\Transformer;
+use Rubix\ML\Transformers\TokenHashingVectorizer;
+use PHPUnit\Framework\TestCase;
+
+/**
+ * @group Transformers
+ * @covers \Rubix\ML\Transformers\TokenHashingVectorizer
+ */
+class TokenHashingVectorizerTest extends TestCase
+{
+    /**
+     * @var \Rubix\ML\Datasets\Unlabeled
+     */
+    protected $dataset;
+
+    /**
+     * @var \Rubix\ML\Transformers\TokenHashingVectorizer
+     */
+    protected $transformer;
+
+    /**
+     * @before
+     */
+    protected function setUp() : void
+    {
+        $this->dataset = Unlabeled::quick([
+            ['the quick brown fox jumped over the lazy man sitting at a bus stop drinking a can of coke'],
+            ['with a dandy umbrella'],
+        ]);
+
+        $this->transformer = new TokenHashingVectorizer(20, new Word());
+    }
+
+    /**
+     * @test
+     */
+    public function build() : void
+    {
+        $this->assertInstanceOf(TokenHashingVectorizer::class, $this->transformer);
+        $this->assertInstanceOf(Transformer::class, $this->transformer);
+    }
+
+    /**
+     * @test
+     */
+    public function transform() : void
+    {
+        $this->dataset->apply($this->transformer);
+
+        $outcome = [
+            [1, 1, 0, 1, 2, 0, 0, 1, 3, 0, 0, 1, 0, 0, 2, 1, 0, 1, 5, 0],
+            [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0],
+        ];
+
+        $this->assertEquals($outcome, $this->dataset->samples());
+    }
+}