Skip to content

Commit

Permalink
Added Laplace smoothing to Delta TF-IDF Transformer
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewdalpino committed Sep 14, 2020
1 parent 18dab62 commit 769c3e7
Show file tree
Hide file tree
Showing 14 changed files with 157 additions and 381 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
- Unreleased
- 0.2.0-beta1
- Implemented Token Hashing Vectorizer transformer
- Move Recursive Feature Eliminator (RFE) to main repo
- Added Laplace smoothing to Delta TF-IDF Transformer

- 0.1.0-beta
- Add Recursive Feature Eliminator feature selector
Expand Down
55 changes: 55 additions & 0 deletions benchmarks/DeltaTfIdfTransformerBench.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
<?php

namespace Rubix\ML\Benchmarks\Transformers;

use Tensor\Matrix;
use Tensor\Vector;
use Rubix\ML\Datasets\Labeled;
use Rubix\ML\Transformers\DeltaTfIdfTransformer;

/**
 * Benchmark for applying the Delta TF-IDF Transformer to a randomly
 * generated labeled dataset.
 *
 * @Groups({"Transformers"})
 * @BeforeMethods({"setUp"})
 */
class DeltaTfIdfTransformerBench
{
    protected const DATASET_SIZE = 10000;

    /**
     * The labeled dataset that the transformer is applied to.
     *
     * @var \Rubix\ML\Datasets\Labeled
     */
    public $dataset;

    /**
     * The transformer instance under benchmark.
     *
     * @var \Rubix\ML\Transformers\DeltaTfIdfTransformer
     */
    protected $transformer;

    /**
     * Build the dataset and the transformer used by the benchmark subject.
     * Registered through the BeforeMethods annotation on the class.
     */
    public function setUp() : void
    {
        $rows = self::DATASET_SIZE;

        // Presumably an element-wise 0/1 indicator that keeps roughly 20% of
        // the entries so the generated samples are mostly zeros.
        $keep = Matrix::rand($rows, 4)->greater(0.8);

        $features = Matrix::gaussian($rows, 4)->multiply($keep);

        // Threshold uniform noise at 0.5 to obtain a two-class label vector.
        $outcomes = Vector::rand($rows)->greater(0.5);

        $this->dataset = Labeled::quick(
            $features->asArray(),
            $outcomes->asArray()
        );

        $this->transformer = new DeltaTfIdfTransformer(1.0);
    }

    /**
     * Apply the transformer to the generated dataset.
     *
     * @Subject
     * @Iterations(3)
     * @OutputTimeUnit("milliseconds", precision=3)
     */
    public function apply() : void
    {
        $this->dataset->apply($this->transformer);
    }
}
2 changes: 1 addition & 1 deletion benchmarks/Transformers/BM25TransformerBench.php
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ public function setUp() : void
/**
* @Subject
* @Iterations(3)
* @OutputTimeUnit("seconds", precision=3)
* @OutputTimeUnit("milliseconds", precision=3)
*/
public function apply() : void
{
Expand Down
49 changes: 0 additions & 49 deletions benchmarks/Transformers/RecursiveFeatureEliminatorBench.php

This file was deleted.

54 changes: 54 additions & 0 deletions benchmarks/Transformers/TokenHashingVectorizerBench.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
<?php

namespace Rubix\ML\Benchmarks\Transformers;

use Rubix\ML\Datasets\Unlabeled;
use Rubix\ML\Transformers\TokenHashingVectorizer;

/**
 * Benchmark for applying the Token Hashing Vectorizer to a dataset of
 * shuffled text blobs.
 *
 * @Groups({"Transformers"})
 * @BeforeMethods({"setUp"})
 */
class TokenHashingVectorizerBench
{
    protected const DATASET_SIZE = 10000;

    protected const SAMPLE_TEXT = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec at nisl posuere, luctus sapien vel, maximus ex. Curabitur tincidunt, libero at commodo tempor, magna neque malesuada diam, vel blandit metus velit quis magna. Vestibulum auctor libero quam, eu ullamcorper nulla dapibus a. Mauris id ultricies sapien. Integer consequat mi eget vehicula vulputate. Mauris cursus nisi non semper dictum. Quisque luctus ex in tortor laoreet tincidunt. Vestibulum imperdiet purus sit amet sapien dignissim elementum. Mauris tincidunt eget ex eu laoreet. Etiam efficitur quam at purus sagittis hendrerit. Mauris tempus, sem in pulvinar imperdiet, lectus ipsum molestie ante, id semper nunc est sit amet sem. Nulla at justo eleifend, gravida neque eu, consequat arcu. Vivamus bibendum eleifend metus, id elementum orci aliquet ac. Praesent pellentesque nisi vitae tincidunt eleifend. Pellentesque quis ex et lorem laoreet hendrerit ut ac lorem. Aliquam non sagittis est.';

    /**
     * The unlabeled dataset of text samples that the transformer is applied to.
     *
     * @var \Rubix\ML\Datasets\Unlabeled
     */
    public $dataset;

    /**
     * The transformer instance under benchmark.
     *
     * @var \Rubix\ML\Transformers\TokenHashingVectorizer
     */
    protected $transformer;

    /**
     * Build the dataset and the transformer used by the benchmark subject.
     * Registered through the BeforeMethods annotation on the class.
     */
    public function setUp() : void
    {
        $samples = [];

        for ($i = 0; $i < self::DATASET_SIZE; ++$i) {
            // str_shuffle() returns the shuffled copy and leaves its argument
            // untouched, so the result has to be captured for the samples to
            // actually differ from one another.
            $samples[] = [str_shuffle(self::SAMPLE_TEXT)];
        }

        $this->dataset = Unlabeled::quick($samples);

        $this->transformer = new TokenHashingVectorizer(1000);
    }

    /**
     * Apply the transformer to the generated dataset.
     *
     * @Subject
     * @Iterations(3)
     * @OutputTimeUnit("milliseconds", precision=3)
     */
    public function apply() : void
    {
        $this->dataset->apply($this->transformer);
    }
}
2 changes: 1 addition & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
],
"require": {
"php": ">=7.2",
"rubix/ml": "^0.1.0",
"rubix/ml": "^0.2.0",
"rubix/tensor": "^2.0.4",
"wamania/php-stemmer": "^2.0"
},
Expand Down
6 changes: 4 additions & 2 deletions docs/transformers/delta-tf-idf-transformer.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@ A supervised TF-IDF (Term Frequency Inverse Document Frequency) Transformer that
**Data Type Compatibility:** Continuous only

## Parameters
This transformer does not have any parameters.
| # | Param | Default | Type | Description |
|---|---|---|---|---|
| 1 | smoothing | 1.0 | float | The amount of additive (Laplace) smoothing to add to the term frequencies and inverse document frequencies (IDFs). |

## Example
```php
use Rubix\ML\Transformers\DeltaTfIdfTransformer;

$transformer = new DeltaTfIdfTransformer();
$transformer = new DeltaTfIdfTransformer(1.0);
```

## Additional Methods
Expand Down
32 changes: 0 additions & 32 deletions docs/transformers/recursive-feature-eliminator.md

This file was deleted.

11 changes: 6 additions & 5 deletions src/ModelOrchestra.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
use Rubix\ML\Datasets\Unlabeled;
use Rubix\ML\Other\Helpers\Params;
use Rubix\ML\Backends\Tasks\Proba;
use Rubix\ML\Other\Helpers\Verifier;
use Rubix\ML\Backends\Tasks\Predict;
use Rubix\ML\Other\Traits\LoggerAware;
use Rubix\ML\Other\Traits\PredictsSingle;
Expand Down Expand Up @@ -241,13 +242,13 @@ public function train(Dataset $dataset) : void
. ' Labeled training set.');
}

DatasetIsNotEmpty::check($dataset);
SamplesAreCompatibleWithEstimator::check($dataset, $this);
Verifier::check([
DatasetIsNotEmpty::with($dataset),
SamplesAreCompatibleWithEstimator::with($dataset, $this),
]);

if ($this->logger) {
$this->logger->info('Learner init ' . Params::stringify($this->params()));

$this->logger->info('Training started');
$this->logger->info("$this initialized");
}

[$left, $right] = $dataset->labelType()->isCategorical()
Expand Down
2 changes: 1 addition & 1 deletion src/Transformers/BM25Transformer.php
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ public function fit(Dataset $dataset) : void
*/
public function update(Dataset $dataset) : void
{
SamplesAreCompatibleWithTransformer::check($dataset, $this);
SamplesAreCompatibleWithTransformer::with($dataset, $this)->check();

if (is_null($this->dfs) or is_null($this->n)) {
$this->fit($dataset);
Expand Down
39 changes: 31 additions & 8 deletions src/Transformers/DeltaTfIdfTransformer.php
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,14 @@
*/
class DeltaTfIdfTransformer implements Transformer, Stateful, Elastic
{
/**
* The amount of additive (Laplace) smoothing to add to the term frequencies
* and inverse document frequencies (IDFs).
*
* @var float
*/
protected $smoothing;

/**
* The class specific term frequencies of each word i.e. the number of
* times a word appears in the context of a class label.
Expand Down Expand Up @@ -80,6 +88,19 @@ class DeltaTfIdfTransformer implements Transformer, Stateful, Elastic
*/
protected $entropies;

/**
 * Create the transformer with the given amount of additive smoothing.
 *
 * @param float $smoothing
 * @throws InvalidArgumentException If the smoothing is less than or equal to 0.
 */
public function __construct(float $smoothing = 1.0)
{
    if ($smoothing <= 0.0) {
        $message = "Smoothing must be greater than 0, {$smoothing} given.";

        throw new InvalidArgumentException($message);
    }

    $this->smoothing = $smoothing;
}

/**
* Return the data types that this transformer is compatible with.
*
Expand Down Expand Up @@ -127,11 +148,11 @@ public function fit(Dataset $dataset) : void

$classes = $dataset->possibleOutcomes();

$ones = array_fill(0, $dataset->numColumns(), 1);
$zeros = array_fill(0, $dataset->numColumns(), 0);

$this->tfs = array_fill_keys($classes, $ones);
$this->dfs = $this->totals = $ones;
$this->n = 1;
$this->tfs = array_fill_keys($classes, $zeros);
$this->dfs = $this->totals = $zeros;
$this->n = 0;

$this->update($dataset);
}
Expand All @@ -148,7 +169,7 @@ public function update(Dataset $dataset) : void
. ' labeled training set.');
}

SamplesAreCompatibleWithTransformer::check($dataset, $this);
SamplesAreCompatibleWithTransformer::with($dataset, $this)->check();

if (is_null($this->tfs) or is_null($this->dfs)) {
$this->fit($dataset);
Expand Down Expand Up @@ -176,17 +197,19 @@ public function update(Dataset $dataset) : void

$this->n += $dataset->numRows();

$nHat = $this->n + $this->smoothing;

$idfs = [];

foreach ($this->dfs as $df) {
$idfs[] = 1.0 + log($this->n / $df);
$idfs[] = 1.0 + log($nHat / ($df + $this->smoothing));
}

$entropies = array_fill(0, count($this->totals), 0.0);

foreach ($this->tfs as $tfs) {
foreach ($tfs as $column => $tf) {
$delta = $tf / $this->totals[$column];
$delta = ($tf + $this->smoothing) / ($this->totals[$column] + $this->smoothing);

$entropies[$column] += -$delta * log($delta);
}
Expand Down Expand Up @@ -225,6 +248,6 @@ public function transform(array &$samples) : void
*/
public function __toString() : string
{
    // Report the smoothing setting alongside the name so that differently
    // configured instances can be told apart in their string form. The plain
    // name-only return that preceded this one made it unreachable.
    return "Delta TF-IDF Transformer (smoothing: {$this->smoothing})";
}
}
Loading

0 comments on commit 769c3e7

Please sign in to comment.