-
Notifications
You must be signed in to change notification settings - Fork 0
/
SpamDatasetReader.php
105 lines (90 loc) · 3.09 KB
/
SpamDatasetReader.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
<?php
namespace App\Module\ML\Infrastructure\Reader;
use App\Core\Application\Path\AppPathResolver;
use App\Module\ML\Domain\Constant;
use League\Csv\Exception;
use League\Csv\Reader;
use League\Csv\UnavailableStream;
use Ramsey\Collection\Collection;
use function Symfony\Component\String\u;
/**
 * Reads a spam/ham CSV dataset and turns every usable row into a cleaned
 * `['message' => string, 'is_spam' => 'yes'|'no']` entry.
 */
readonly class SpamDatasetReader
{
    public function __construct(
        private AppPathResolver $appPathResolver,
    ) {
    }

    /**
     * Loads the dataset file and returns its valid rows as cleaned, normalised samples.
     *
     * Rows are looked up by the column names `CATEGORY` and `MESSAGE`; rows that fail
     * {@see isRecordValid()} are skipped silently.
     *
     * NOTE(review): the column lookup relies on the CSV header row. With `$headerOffset = null`
     * the reader presumably yields index-keyed rows, in which case every row is rejected and
     * the returned collection is empty - confirm against the League CSV documentation.
     *
     * @param string   $datasetFilename file name handed to AppPathResolver::getDatasetPath()
     * @param int      $tooLongWordSize words with at least this many characters are removed
     * @param int|null $headerOffset    offset of the header row, forwarded to Reader::setHeaderOffset()
     *
     * @return Collection<array<string, string>>
     *
     * @throws UnavailableStream
     * @throws Exception
     */
    public function read(
        string $datasetFilename,
        int $tooLongWordSize = Constant::DEFAULT_TO_LONG_WORD_SIZE,
        ?int $headerOffset = 0,
    ): Collection {
        $reader = Reader::createFromPath($this->appPathResolver->getDatasetPath($datasetFilename));
        $reader->setHeaderOffset($headerOffset);

        $collection = new Collection('array');

        foreach ($reader->getRecords() as $record) {
            if (!$this->isRecordValid($record)) {
                continue;
            }

            $collection->add([
                'message' => $this->normalizeMessage($this->clearMessage($record['MESSAGE'], $tooLongWordSize)),
                // isRecordValid() guarantees CATEGORY is exactly '0' or '1'.
                'is_spam' => '1' === $record['CATEGORY'] ? 'yes' : 'no',
            ]);
        }

        return $collection;
    }

    /**
     * A record is usable when it has a non-empty MESSAGE and a CATEGORY of exactly '0' or '1'.
     *
     * The message is compared against the empty string explicitly: `empty()` would also
     * reject the perfectly valid message "0".
     *
     * @param array<string, string> $record
     */
    private function isRecordValid(array $record): bool
    {
        return isset($record['CATEGORY'], $record['MESSAGE'])
            && '' !== $record['MESSAGE']
            && in_array($record['CATEGORY'], ['0', '1'], true);
    }

    /**
     * Strips markup, header-like lines and over-long words from a raw message.
     *
     * @param string $message         raw message text as read from the dataset
     * @param int    $tooLongWordSize words with at least this many characters are removed
     */
    private function clearMessage(string $message, int $tooLongWordSize): string
    {
        $withoutNoise = u($message)
            // Empty <script>/<style> elements but keep the tags; strip_tags() removes those below.
            ->replaceMatches('/(<(script|style)\b[^>]*>).*?(<\/\2>)/is', '$1$3')
            // Drop everything from a "name:" token up to the next line break
            // (presumably aimed at mail headers such as "Content-Type: ...").
            ->replaceMatches('/([a-zA-Z\-]+:).*?(\r\n|\r|\n)/is', '')
            ->replaceMatches('/charset="[a-zA-Z0-9\-]*"/is', '')
            // Drop from "------=_" up to the next line break (presumably MIME part boundaries).
            ->replaceMatches('/(------=_).*?(\r\n|\r|\n)/is', '')
            ->replace('This is a multi-part message in MIME format.', '')
            ->toString();

        // Entities are decoded first, so encoded tags (&lt;b&gt;) are stripped as well.
        $result = trim(strip_tags(html_entity_decode($withoutNoise)));

        // Words arrive longest-first, so a word that contains another over-long word is
        // removed as a whole before the shorter one could cut a piece out of it.
        foreach ($this->getTooLongWords($result, $tooLongWordSize) as $word) {
            $result = u($result)->replace($word, '')->toString();
        }

        return trim($result);
    }

    /**
     * Flattens the message to a single line: line breaks and whitespace runs become one space.
     */
    private function normalizeMessage(string $message): string
    {
        return u($message)
            ->replaceMatches('/\r\n|\r|\n/is', ' ')
            ->replaceMatches('/\s+/uis', ' ')
            ->collapseWhitespace()
            ->toString();
    }

    /**
     * Collects the distinct whitespace-separated words that have at least $tooLongSize characters.
     *
     * Length is measured in UTF-8 characters rather than bytes, so non-ASCII words are not
     * penalised for their multi-byte encoding. Duplicates are detected by exact string
     * comparison (a loose comparison would treat "1000" and "1e3" as the same word).
     *
     * @return string[] distinct words, longest first
     */
    private function getTooLongWords(string $message, int $tooLongSize): array
    {
        $words = preg_split('/\s+/', $message, -1, PREG_SPLIT_NO_EMPTY);
        if (false === $words) {
            // preg_split() reports failure with false; treat that as "nothing to remove".
            return [];
        }

        $tooLong = array_values(array_unique(array_filter(
            $words,
            static fn (string $word): bool => mb_strlen($word, 'UTF-8') >= $tooLongSize,
        )));

        usort(
            $tooLong,
            static fn (string $a, string $b): int => mb_strlen($b, 'UTF-8') <=> mb_strlen($a, 'UTF-8'),
        );

        return $tooLong;
    }
}