From 769c3e727050262584a4b2cdb7c4dd95fc4a3a46 Mon Sep 17 00:00:00 2001 From: Andrew DalPino Date: Sun, 13 Sep 2020 19:04:47 -0500 Subject: [PATCH] Added Laplace smoothing to Delta TF-IDF Transformer --- CHANGELOG.md | 4 +- benchmarks/DeltaTfIdfTransformerBench.php | 55 +++++ .../Transformers/BM25TransformerBench.php | 2 +- .../RecursiveFeatureEliminatorBench.php | 49 ----- .../TokenHashingVectorizerBench.php | 54 +++++ composer.json | 2 +- docs/transformers/delta-tf-idf-transformer.md | 6 +- .../recursive-feature-eliminator.md | 32 --- src/ModelOrchestra.php | 11 +- src/Transformers/BM25Transformer.php | 2 +- src/Transformers/DeltaTfIdfTransformer.php | 39 +++- .../RecursiveFeatureEliminator.php | 207 ------------------ .../DeltaTfIdfTransformerTest.php | 2 +- .../RecursiveFeatureEliminatorTest.php | 73 ------ 14 files changed, 157 insertions(+), 381 deletions(-) create mode 100644 benchmarks/DeltaTfIdfTransformerBench.php delete mode 100644 benchmarks/Transformers/RecursiveFeatureEliminatorBench.php create mode 100644 benchmarks/Transformers/TokenHashingVectorizerBench.php delete mode 100644 docs/transformers/recursive-feature-eliminator.md delete mode 100644 src/Transformers/RecursiveFeatureEliminator.php delete mode 100644 tests/Transformers/RecursiveFeatureEliminatorTest.php diff --git a/CHANGELOG.md b/CHANGELOG.md index 7325c99..37b1889 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,7 @@ -- Unreleased +- 0.2.0-beta1 - Implemented Token Hashing Vectorizer transformer + - Move Recursive Feature Eliminator (RFE) to main repo + - Added Laplace smoothing to Delta TF-IDF Transformer - 0.1.0-beta - Add Recursive Feature Eliminator feature selector diff --git a/benchmarks/DeltaTfIdfTransformerBench.php b/benchmarks/DeltaTfIdfTransformerBench.php new file mode 100644 index 0000000..7d68d6a --- /dev/null +++ b/benchmarks/DeltaTfIdfTransformerBench.php @@ -0,0 +1,55 @@ +greater(0.8); + + $samples = Matrix::gaussian(self::DATASET_SIZE, 4) + ->multiply($mask) + ->asArray(); + + $labels = Vector::rand(self::DATASET_SIZE) + ->greater(0.5) + ->asArray(); + + $this->dataset = Labeled::quick($samples, $labels); + + $this->transformer = new DeltaTfIdfTransformer(1.0); + } + + /** + * @Subject + * @Iterations(3) + * @OutputTimeUnit("milliseconds", precision=3) + */ + public function apply() : void + { + $this->dataset->apply($this->transformer); + } +} diff --git a/benchmarks/Transformers/BM25TransformerBench.php b/benchmarks/Transformers/BM25TransformerBench.php index adcef81..aec1747 100644 --- a/benchmarks/Transformers/BM25TransformerBench.php +++ b/benchmarks/Transformers/BM25TransformerBench.php @@ -41,7 +41,7 @@ public function setUp() : void /** * @Subject * @Iterations(3) - * @OutputTimeUnit("seconds", precision=3) + * @OutputTimeUnit("milliseconds", precision=3) */ public function apply() : void { diff --git a/benchmarks/Transformers/RecursiveFeatureEliminatorBench.php b/benchmarks/Transformers/RecursiveFeatureEliminatorBench.php deleted file mode 100644 index be0cdd7..0000000 --- a/benchmarks/Transformers/RecursiveFeatureEliminatorBench.php +++ /dev/null @@ -1,49 +0,0 @@ - new Blob([5.0, 3.42, 1.46, 0.24], [0.35, 0.38, 0.17, 0.1]), - 'Iris-versicolor' => new Blob([5.94, 2.77, 4.26, 1.33], [0.51, 0.31, 0.47, 0.2]), - 'Iris-virginica' => new Blob([6.59, 2.97, 5.55, 2.03], [0.63, 0.32, 0.55, 0.27]), - ]); - - $this->dataset = $generator->generate(self::DATASET_SIZE); - - $this->transformer = new RecursiveFeatureEliminator(2); - } - - /** - * @Subject - * @Iterations(3) - * @OutputTimeUnit("seconds", precision=3) - */ - public function apply() : void - { - $this->dataset->apply($this->transformer); - } -} diff --git a/benchmarks/Transformers/TokenHashingVectorizerBench.php b/benchmarks/Transformers/TokenHashingVectorizerBench.php new file mode 100644 index 0000000..fcad810 --- /dev/null +++ b/benchmarks/Transformers/TokenHashingVectorizerBench.php @@ -0,0 +1,54 @@ +dataset = Unlabeled::quick($samples); + + $this->transformer = new TokenHashingVectorizer(1000); + } + + /** + * @Subject + * @Iterations(3) + * @OutputTimeUnit("milliseconds", precision=3) + */ + public function apply() : void + { + $this->dataset->apply($this->transformer); + } +} diff --git a/composer.json b/composer.json index f8bb8af..d98d9cc 100644 --- a/composer.json +++ b/composer.json @@ -17,7 +17,7 @@ ], "require": { "php": ">=7.2", - "rubix/ml": "^0.1.0", + "rubix/ml": "^0.2.0", "rubix/tensor": "^2.0.4", "wamania/php-stemmer": "^2.0" }, diff --git a/docs/transformers/delta-tf-idf-transformer.md b/docs/transformers/delta-tf-idf-transformer.md index 9e8a27c..1cb31bc 100644 --- a/docs/transformers/delta-tf-idf-transformer.md +++ b/docs/transformers/delta-tf-idf-transformer.md @@ -10,13 +10,15 @@ A supervised TF-IDF (Term Frequency Inverse Document Frequency) Transformer that **Data Type Compatibility:** Continuous only ## Parameters -This transformer does not have any parameters. +| # | Param | Default | Type | Description | +|---|---|---|---|---| +| 1 | smoothing | 1.0 | float | The amount of additive (Laplace) smoothing to add to the term frequencies and inverse document frequencies (IDFs). | ## Example ```php use Rubix\ML\Transformers\DeltaTfIdfTransformer; -$transformer = new DeltaTfIdfTransformer(); +$transformer = new DeltaTfIdfTransformer(1.0); ``` ## Additional Methods diff --git a/docs/transformers/recursive-feature-eliminator.md b/docs/transformers/recursive-feature-eliminator.md deleted file mode 100644 index 92c96e2..0000000 --- a/docs/transformers/recursive-feature-eliminator.md +++ /dev/null @@ -1,32 +0,0 @@ -[source] - -# Recursive Feature Eliminator -Recursive Feature Eliminator or *RFE* is a supervised feature selector that uses the importance scores returned by a learner implementing the RanksFeatures interface to recursively drop feature columns with the lowest importance until max features is reached. - -**Interfaces:** [Transformer](api.md#transformer), [Stateful](api.md#stateful) - -**Data Type Compatibility:** Depends on the base learner - -## Parameters -| # | Param | Default | Type | Description | -|---|---|---|---|---| -| 1 | max features | | int | The maximum number of features to select. | -| 2 | epochs | 1 | int | The maximum number of iterations to recurse upon the dataset. | -| 3 | base | Auto | RanksFeatures | The base feature ranking learner instance. | - -## Example -```php -use Rubix\ML\Transformers\RecursiveFeatureEliminator; -use Rubix\ML\Regressors\RegressionTree; - -$transformer = new RecursiveFeatureEliminator(10, 2, new RegressionTree()); -``` - -## Additional Methods -Return the final importances scores of the selected feature columns: -``` php -public importances() : ?array -``` - -### References ->- I. Guyon et al. (2002). Gene Selection for Cancer Classification using Support Vector Machines. diff --git a/src/ModelOrchestra.php b/src/ModelOrchestra.php index 4126ac0..2d4b020 100644 --- a/src/ModelOrchestra.php +++ b/src/ModelOrchestra.php @@ -9,6 +9,7 @@ use Rubix\ML\Datasets\Unlabeled; use Rubix\ML\Other\Helpers\Params; use Rubix\ML\Backends\Tasks\Proba; +use Rubix\ML\Other\Helpers\Verifier; use Rubix\ML\Backends\Tasks\Predict; use Rubix\ML\Other\Traits\LoggerAware; use Rubix\ML\Other\Traits\PredictsSingle; @@ -241,13 +242,13 @@ public function train(Dataset $dataset) : void . ' Labeled training set.'); } - DatasetIsNotEmpty::check($dataset); - SamplesAreCompatibleWithEstimator::check($dataset, $this); + Verifier::check([ + DatasetIsNotEmpty::with($dataset), + SamplesAreCompatibleWithEstimator::with($dataset, $this), + ]); if ($this->logger) { - $this->logger->info('Learner init ' . Params::stringify($this->params())); - - $this->logger->info('Training started'); + $this->logger->info("$this initialized"); } [$left, $right] = $dataset->labelType()->isCategorical() diff --git a/src/Transformers/BM25Transformer.php b/src/Transformers/BM25Transformer.php index 1fd48d4..e14a7bd 100644 --- a/src/Transformers/BM25Transformer.php +++ b/src/Transformers/BM25Transformer.php @@ -164,7 +164,7 @@ public function fit(Dataset $dataset) : void */ public function update(Dataset $dataset) : void { - SamplesAreCompatibleWithTransformer::check($dataset, $this); + SamplesAreCompatibleWithTransformer::with($dataset, $this)->check(); if (is_null($this->dfs) or is_null($this->n)) { $this->fit($dataset); diff --git a/src/Transformers/DeltaTfIdfTransformer.php b/src/Transformers/DeltaTfIdfTransformer.php index b0bea2f..77353e0 100644 --- a/src/Transformers/DeltaTfIdfTransformer.php +++ b/src/Transformers/DeltaTfIdfTransformer.php @@ -34,6 +34,14 @@ */ class DeltaTfIdfTransformer implements Transformer, Stateful, Elastic { + /** + * The amount of additive (Laplace) smoothing to add to the inverse document + * frequencies (IDFs). + * + * @var float + */ + protected $smoothing; + /** * The class specific term frequencies of each word i.e. the number of * times a word appears in the context of a class label. @@ -80,6 +88,19 @@ class DeltaTfIdfTransformer implements Transformer, Stateful, Elastic */ protected $entropies; + /** + * @param float $smoothing + */ + public function __construct(float $smoothing = 1.0) + { + if ($smoothing <= 0.0) { + throw new InvalidArgumentException('Smoothing must be' + . " greater than 0, $smoothing given."); + } + + $this->smoothing = $smoothing; + } + /** * Return the data types that this transformer is compatible with. * @@ -127,11 +148,11 @@ public function fit(Dataset $dataset) : void $classes = $dataset->possibleOutcomes(); - $ones = array_fill(0, $dataset->numColumns(), 1); + $zeros = array_fill(0, $dataset->numColumns(), 0); - $this->tfs = array_fill_keys($classes, $ones); - $this->dfs = $this->totals = $ones; - $this->n = 1; + $this->tfs = array_fill_keys($classes, $zeros); + $this->dfs = $this->totals = $zeros; + $this->n = 0; $this->update($dataset); } @@ -148,7 +169,7 @@ public function update(Dataset $dataset) : void . ' labeled training set.'); } - SamplesAreCompatibleWithTransformer::check($dataset, $this); + SamplesAreCompatibleWithTransformer::with($dataset, $this)->check(); if (is_null($this->tfs) or is_null($this->dfs)) { $this->fit($dataset); @@ -176,17 +197,19 @@ public function update(Dataset $dataset) : void $this->n += $dataset->numRows(); + $nHat = $this->n + $this->smoothing; + $idfs = []; foreach ($this->dfs as $df) { - $idfs[] = 1.0 + log($this->n / $df); + $idfs[] = 1.0 + log($nHat / ($df + $this->smoothing)); } $entropies = array_fill(0, count($this->totals), 0.0); foreach ($this->tfs as $tfs) { foreach ($tfs as $column => $tf) { - $delta = $tf / $this->totals[$column]; + $delta = ($tf + $this->smoothing) / ($this->totals[$column] + $this->smoothing); $entropies[$column] += -$delta * log($delta); } @@ -225,6 +248,6 @@ public function transform(array &$samples) : void */ public function __toString() : string { - return 'Delta TF-IDF Transformer'; + return "Delta TF-IDF Transformer (smoothing: {$this->smoothing})"; } } diff --git a/src/Transformers/RecursiveFeatureEliminator.php b/src/Transformers/RecursiveFeatureEliminator.php deleted file mode 100644 index 7dd605f..0000000 --- a/src/Transformers/RecursiveFeatureEliminator.php +++ /dev/null @@ -1,207 +0,0 @@ -maxFeatures = $maxFeatures; - $this->epochs = $epochs; - $this->base = $base; - $this->fitBase = is_null($base); - } - - /** - * Return the data types that this transformer is compatible with. - * - * @return \Rubix\ML\DataType[] - */ - public function compatibility() : array - { - return DataType::all(); - } - - /** - * Is the transformer fitted? - * - * @return bool - */ - public function fitted() : bool - { - return isset($this->importances); - } - - /** - * Return the final importances scores of the selected feature columns. - * - * @return float[]|null - */ - public function importances() : ?array - { - return $this->importances; - } - - /** - * Fit the transformer to the dataset. - * - * @param \Rubix\ML\Datasets\Dataset $dataset - * @throws \InvalidArgumentException - */ - public function fit(Dataset $dataset) : void - { - if (!$dataset instanceof Labeled) { - throw new InvalidArgumentException('Transformer requires a' - . ' Labeled training set.'); - } - - if ($this->fitBase or is_null($this->base)) { - switch ($dataset->labelType()) { - case DataType::categorical(): - $this->base = new ClassificationTree(); - - break 1; - - case DataType::continuous(): - $this->base = new RegressionTree(); - - break 1; - - default: - throw new InvalidArgumentException('Label type is' - . ' not compatible with base learner.'); - } - } - - $n = $dataset->numColumns(); - - $selected = range(0, $n - 1); - - $k = (int) max(round(max($n - $this->maxFeatures, 0) / $this->epochs), 1); - - $subset = clone $dataset; - - do { - $this->base->train($subset); - - $importances = $this->base->featureImportances(); - - asort($importances); - - $dropped = array_slice($importances, 0, $k, true); - - $selected = array_values(array_diff_key($selected, $dropped)); - - $subset->dropColumns(array_keys($dropped)); - } while (count($selected) > $this->maxFeatures); - - $importances = array_diff_key($importances, $dropped ?? []); - - $this->importances = array_combine($selected, $importances) ?: []; - } - - /** - * Transform the dataset in place. - * - * @param array[] $samples - * @throws \RuntimeException - */ - public function transform(array &$samples) : void - { - if (is_null($this->importances)) { - throw new RuntimeException('Transformer has not been fitted.'); - } - - foreach ($samples as &$sample) { - $sample = array_values(array_intersect_key($sample, $this->importances)); - } - } - - /** - * Return the string representation of the object. - * - * @return string - */ - public function __toString() : string - { - return "Recursive Feature Eliminator {max features: {$this->maxFeatures}}" - . " {epochs: {$this->epochs} base: {$this->base}}"; - } -} diff --git a/tests/Transformers/DeltaTfIdfTransformerTest.php b/tests/Transformers/DeltaTfIdfTransformerTest.php index c1c8a01..d75d028 100644 --- a/tests/Transformers/DeltaTfIdfTransformerTest.php +++ b/tests/Transformers/DeltaTfIdfTransformerTest.php @@ -35,7 +35,7 @@ public function setUp() : void [0, 0, 2, 1, 0, 1, 1, 1, 0, 0, 0, 1], ], ['pos', 'pos', 'pos', 'neg', 'neg']); - $this->transformer = new DeltaTfIdfTransformer(); + $this->transformer = new DeltaTfIdfTransformer(1.0); } /** diff --git a/tests/Transformers/RecursiveFeatureEliminatorTest.php b/tests/Transformers/RecursiveFeatureEliminatorTest.php deleted file mode 100644 index 5f7ab6b..0000000 --- a/tests/Transformers/RecursiveFeatureEliminatorTest.php +++ /dev/null @@ -1,73 +0,0 @@ -generator = new Agglomerate([ - 'male' => new Blob([69.2, 195.7, 40.0], [1.0, 3.0, 0.3]), - 'female' => new Blob([63.7, 168.5, 38.1], [0.8, 2.5, 0.4]), - ], [0.45, 0.55]); - - $this->transformer = new RecursiveFeatureEliminator(1); - } - - /** - * @test - */ - public function build() : void - { - $this->assertInstanceOf(RecursiveFeatureEliminator::class, $this->transformer); - $this->assertInstanceOf(Transformer::class, $this->transformer); - $this->assertInstanceOf(Stateful::class, $this->transformer); - } - - /** - * @test - */ - public function fitTransform() : void - { - $dataset = $this->generator->generate(100); - - $this->assertEquals(3, $dataset->numColumns()); - - $dataset->apply($this->transformer); - - $this->assertEquals(1, $dataset->numColumns()); - } - - /** - * @test - */ - public function transformUnfitted() : void - { - $this->expectException(RuntimeException::class); - - $samples = $this->generator->generate(1)->samples(); - - $this->transformer->transform($samples); - } -}