-
-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implemented Token Hashing Vectorizer transformer
- Loading branch information
1 parent
e8f5ed2
commit 18dab62
Showing
10 changed files
with
234 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
<span style="float:right;"><a href="https://github.com/RubixML/Extras/blob/master/src/Transformers/TokenHashingVectorizer.php">[source]</a></span> | ||
|
||
# Token Hashing Vectorizer | ||
Token Hashing Vectorizer builds token count vectors on the fly by employing a *hashing trick*. It is a stateless transformer that uses the CRC32 (Cyclic Redundancy Check) hashing algorithm to assign token occurrences to a bucket in a vector of user-specified dimensionality. The advantage of hashing over storing a fixed vocabulary is that there is no memory footprint; however, there is a chance that certain tokens will collide with other tokens, especially in lower-dimensional vector spaces.
|
||
**Interfaces:** [Transformer](api.md#transformer) | ||
|
||
**Data Type Compatibility:** Categorical
|
||
## Parameters | ||
| # | Param | Default | Type | Description | | ||
|---|---|---|---|---| | ||
| 1 | dimensions | | int | The dimensionality of the vector space. | | ||
| 2 | tokenizer | Word | Tokenizer | The tokenizer used to extract tokens from blobs of text. | | ||
|
||
## Example | ||
```php | ||
use Rubix\ML\Transformers\TokenHashingVectorizer; | ||
use Rubix\ML\Other\Tokenizers\NGram; | ||
|
||
$transformer = new TokenHashingVectorizer(10000, new NGram(1, 2)); | ||
``` | ||
|
||
## Additional Methods | ||
This transformer does not have any additional methods. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
<?php | ||
|
||
namespace Rubix\ML\Transformers; | ||
|
||
use Rubix\ML\DataType; | ||
use Rubix\ML\Datasets\Dataset; | ||
use Rubix\ML\Other\Tokenizers\Word; | ||
use Rubix\ML\Other\Tokenizers\Tokenizer; | ||
use InvalidArgumentException; | ||
use Stringable; | ||
|
||
use function count; | ||
use function is_string; | ||
|
||
/**
 * Token Hashing Vectorizer
 *
 * Token Hashing Vectorizer builds token count vectors on the fly by employing a *hashing
 * trick*. It is a stateless transformer that uses the CRC32 (Cyclic Redundancy Check)
 * hashing algorithm to assign token occurrences to a bucket in a vector of user-defined
 * dimensionality. The advantage of hashing over a fixed vocabulary is that there is no
 * memory footprint; however, there is a chance that certain tokens will collide with
 * other tokens, especially in lower-dimensional vector spaces.
 *
 * @category Machine Learning
 * @package Rubix/ML
 * @author Andrew DalPino
 */
class TokenHashingVectorizer implements Transformer, Stringable
{
    /**
     * The maximum number of dimensions supported i.e. the maximum value of a
     * CRC32 hash (2^32 - 1).
     *
     * @var int
     */
    protected const MAX_DIMENSIONS = 4294967295;

    /**
     * The dimensionality of the vector space.
     *
     * @var int
     */
    protected $dimensions;

    /**
     * The tokenizer used to extract tokens from blobs of text.
     *
     * @var \Rubix\ML\Other\Tokenizers\Tokenizer
     */
    protected $tokenizer;

    /**
     * @param int $dimensions
     * @param \Rubix\ML\Other\Tokenizers\Tokenizer|null $tokenizer
     * @throws \InvalidArgumentException
     */
    public function __construct(int $dimensions, ?Tokenizer $tokenizer = null)
    {
        if ($dimensions < 1 or $dimensions > self::MAX_DIMENSIONS) {
            // Message matches the actual valid range [1, MAX_DIMENSIONS];
            // the lower bound is 1, not 0.
            throw new InvalidArgumentException('Dimensions must be'
                . ' between 1 and ' . self::MAX_DIMENSIONS
                . ", $dimensions given.");
        }

        $this->dimensions = $dimensions;
        $this->tokenizer = $tokenizer ?? new Word();
    }

    /**
     * Return the data types that this transformer is compatible with.
     *
     * @return \Rubix\ML\DataType[]
     */
    public function compatibility() : array
    {
        return DataType::all();
    }

    /**
     * Transform the dataset in place.
     *
     * Each string column is tokenized, the tokens are hashed with CRC32, and
     * the hashes are scaled down to bucket offsets in a fixed-length count
     * vector that replaces the original column. Non-string columns are
     * passed through unchanged.
     *
     * @param array[] $samples
     */
    public function transform(array &$samples) : void
    {
        // Map the full 32-bit CRC32 range onto the target vector space.
        $scale = $this->dimensions / self::MAX_DIMENSIONS;

        // A hash of exactly MAX_DIMENSIONS would scale to an offset equal to
        // $this->dimensions - one past the end of the vector - so clamp to
        // the last valid bucket.
        $maxOffset = $this->dimensions - 1;

        foreach ($samples as &$sample) {
            $vectors = [];

            foreach ($sample as $column => $value) {
                if (is_string($value)) {
                    $template = array_fill(0, $this->dimensions, 0);

                    $tokens = $this->tokenizer->tokenize($value);

                    $counts = array_count_values($tokens);

                    foreach ($counts as $token => $count) {
                        // array_count_values() casts numeric-string keys to
                        // int, so cast back to string for crc32().
                        $offset = (int) floor(crc32((string) $token) * $scale);

                        $template[min($offset, $maxOffset)] += $count;
                    }

                    $vectors[] = $template;

                    unset($sample[$column]);
                }
            }

            $sample = array_merge($sample, ...$vectors);
        }
    }

    /**
     * Return the string representation of the object.
     *
     * @return string
     */
    public function __toString() : string
    {
        return "Token Hashing Vectorizer (dimensions: {$this->dimensions},"
            . " tokenizer: {$this->tokenizer})";
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
<?php | ||
|
||
namespace Rubix\ML\Tests\Transformers; | ||
|
||
use Rubix\ML\Datasets\Unlabeled; | ||
use Rubix\ML\Other\Tokenizers\Word; | ||
use Rubix\ML\Transformers\Transformer; | ||
use Rubix\ML\Transformers\TokenHashingVectorizer; | ||
use PHPUnit\Framework\TestCase; | ||
|
||
/**
 * @group Transformers
 * @covers \Rubix\ML\Transformers\TokenHashingVectorizer
 */
class TokenHashingVectorizerTest extends TestCase
{
    /**
     * A small unlabeled dataset of text blobs used as the fixture.
     *
     * @var \Rubix\ML\Datasets\Unlabeled
     */
    protected $dataset;

    /**
     * The transformer under test.
     *
     * @var \Rubix\ML\Transformers\TokenHashingVectorizer
     */
    protected $transformer;

    /**
     * @before
     */
    protected function setUp() : void
    {
        $samples = [
            ['the quick brown fox jumped over the lazy man sitting at a bus stop drinking a can of coke'],
            ['with a dandy umbrella'],
        ];

        $this->dataset = Unlabeled::quick($samples);

        $this->transformer = new TokenHashingVectorizer(20, new Word());
    }

    /**
     * @test
     */
    public function build() : void
    {
        $this->assertInstanceOf(Transformer::class, $this->transformer);
        $this->assertInstanceOf(TokenHashingVectorizer::class, $this->transformer);
    }

    /**
     * @test
     */
    public function transform() : void
    {
        $expected = [
            [1, 1, 0, 1, 2, 0, 0, 1, 3, 0, 0, 1, 0, 0, 2, 1, 0, 1, 5, 0],
            [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0],
        ];

        $this->dataset->apply($this->transformer);

        $this->assertEquals($expected, $this->dataset->samples());
    }
}