Skip to content

Commit

Permalink
Initial
Browse files Browse the repository at this point in the history
  • Loading branch information
toddbruner committed Oct 31, 2024
1 parent 08efd57 commit 64ceb80
Show file tree
Hide file tree
Showing 8 changed files with 133 additions and 10 deletions.
21 changes: 21 additions & 0 deletions etc/test.sqlite.sql
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ CREATE TABLE IF NOT EXISTS metrics (

CREATE TABLE IF NOT EXISTS regex (
regex_id INTEGER PRIMARY KEY AUTOINCREMENT,
created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
name TEXT NOT NULL,
description TEXT NOT NULL,
Expand All @@ -33,6 +34,26 @@ BEGIN
UPDATE regex SET updated=CURRENT_TIMESTAMP WHERE regex_id=NEW.regex_id;
END;

-- Seed rows: user-defined test entities used by the parser tests.
-- Values are positional, in the column order declared by CREATE TABLE regex.
-- regex_id is NULL so the INTEGER PRIMARY KEY assigns the next id.
-- created/updated are given CURRENT_TIMESTAMP explicitly: SQLite applies a
-- column DEFAULT only when the column is omitted from the insert, so an
-- explicit NULL would be stored as NULL instead of the current time.
insert into regex values (
NULL,
CURRENT_TIMESTAMP,
CURRENT_TIMESTAMP,
'new closing dispo',
'User Defined Entity', 'new\ closing\ dispo', 'test_entity', 'udef', 101, 1);

insert into regex values (
NULL,
CURRENT_TIMESTAMP,
CURRENT_TIMESTAMP,
'foo',
'User Defined Entity', 'fufoo', 'test_entity', 'udef', 100, 0);

-- Match text contains a space but multiword is 0 on purpose.
insert into regex values (
NULL,
CURRENT_TIMESTAMP,
CURRENT_TIMESTAMP,
'sydney rox',
'User Defined Entity', 'sydney rox', 'test_entity', 'udef', 102, 0);


--
-- The Files table keeps track of the files created by imgmunger
Expand Down
41 changes: 36 additions & 5 deletions lib/Flair/Model/Regex.pm
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,15 @@ sub create_pg ($self, $regex_href) {
$self->log->debug("REGEX Create");
$self->log->debug("regex = ", { filter => \&Dumper, value => $regex_href});

my $href = dclone($regex_href);

if ($self->contains_whitespace($href->{match}) and ! $href->{multiword} ) {
$href->{multiword} = 1;
}

my $sql = $self->getSAL;
my ($stmt, @bind) = $sql->insert($self->tablename,
$regex_href,
$href,
{ returning => 'regex_id' });
$self->log_sql(__PACKAGE__, $stmt, @bind);

Expand All @@ -71,9 +77,14 @@ sub create_pg ($self, $regex_href) {
}

sub create_mysql ($self, $regex_href) {
my $sql = $self->getSAL;
my $href = dclone($regex_href);
$href->{'`match`'} = delete $href->{match};
my $sql = $self->getSAL;
my $href = dclone($regex_href);

if ($self->contains_whitespace($href->{match}) and ! $href->{multiword} ) {
$href->{multiword} = 1;
}

$href->{'`match`'} = delete $href->{match};
my ($stmt, @bind) = $sql->insert($self->tablename, $href);
$self->log_sql(__PACKAGE__, $stmt, @bind);
my $id = $self->do_query($stmt, @bind)->last_insert_id;
Expand All @@ -83,6 +94,11 @@ sub create_mysql ($self, $regex_href) {
sub create_sqlite ($self, $regex_href) {
my $sql = $self->getSAL;
my $href = dclone($regex_href);

if ($self->contains_whitespace($href->{match}) and ! $href->{multiword} ) {
$href->{multiword} = 1;
}

$href->{'`match`'} = delete $href->{match};
my ($stmt, @bind) = $sql->insert($self->tablename, $href);
$self->log_sql(__PACKAGE__, $stmt, @bind);
Expand All @@ -95,6 +111,17 @@ sub create_sqlite ($self, $regex_href) {
return $self->fetch($id);
}

# Report whether a regex match string contains a literal space character.
#
# $match - the match text of a regex record (may be undef)
#
# Returns a true value when $match holds at least one space, otherwise a
# false value. An undefined $match is treated as containing no space rather
# than triggering an "uninitialized value" warning. Only the space character
# is tested; tabs and newlines do not count.
sub contains_whitespace ($self, $match) {
    return unless defined $match;
    return $match =~ /[ ]/;
}

# Backslash-escape every bare space and tab in a regex match string so the
# whitespace stays significant when the pattern is compiled under /x.
#
# $match - the match text of a regex record (may be undef)
#
# Returns a new string; the argument is not modified. Returns undef (empty
# list in list context) when $match is undefined. Whitespace that is already
# preceded by an escaping backslash is left alone, so applying this twice
# gives the same result as applying it once.
sub escape_spaces ($self, $match) {
    return unless defined $match;

    # Scan left to right. An existing backslash pair (\x) is copied through
    # unchanged, which keeps "\ " and "\\" intact; any remaining space or
    # tab gets a backslash put in front of it.
    return $match =~ s{ (\\.) | ([\x20\t]) }{ defined $1 ? $1 : "\\$2" }gexsr;
}

sub update ($self, $id, $update_href) {
return $self->update_re($id, $update_href) if ($self->dbtype eq "pg");
return $self->update_mysql($id, $update_href) if ($self->dbtype eq "mysql");
Expand Down Expand Up @@ -250,8 +277,12 @@ sub build_flair_regexes ($self, $opts=undef) {

# Compile the match text of a regex record into a qr// object and store it
# in $href->{regex}.
#
# $href - regex record hashref. Reads {match}, {multiword} and {name};
#         writes {regex}, and sets {multiword} to 1 when the match text
#         contains a space but the record was not flagged multiword.
#
# Dies when {match} is undefined. Returns the compiled regex.
sub create_re ($self, $href) {
    my $match = $href->{match};

    # Validate before touching $match so an undefined value dies cleanly
    # instead of first emitting an "uninitialized value" warning below.
    die "Must provide match value in RE record." unless defined $match;

    if ($match =~ / / and ! $href->{multiword}) {
        $self->log->warn("Regular Expression $href->{name} contains spaces but was not marked multiword. Overriding multiword to true.");
        $href->{multiword} = 1;
    }

    # Multiword patterns are compiled without /x so the spaces in the match
    # text are literal. Single-word patterns keep /x and are wrapped in
    # word boundaries.
    $href->{regex} = $href->{multiword} ? qr/($match)/ims
                                        : qr/\b($match)\b/xims;
    return $href->{regex};
}

Expand Down
17 changes: 13 additions & 4 deletions lib/Flair/Parser.pm
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,15 @@ has 'public_suffix' => sub {
};

# Entry point for flairing a piece of text.
#
# $input    - the text to parse
# $edb      - passed through to descend()
# $falsepos - passed through to descend()
# $hint     - optional; passed through to descend()
#
# Fetches the current regex set once, then hands the text to descend().
# Returns the list produced by descend().
sub parse ($self, $input, $edb, $falsepos, $hint=undef) {
    $self->log->debug("PARSING $input");

    # NOTE(review): $clean is computed but the raw $input is what gets
    # parsed below -- confirm whether descend() should receive $clean.
    my $clean = $self->clean_input($input);

    # load the current set of regexes. This can be updated async
    # so we want to pay the price of database fetch to make sure we
    # have any new flair regexes included
    my $re_aref = $self->db->regex->build_flair_regexes();
    $self->log->debug("(parse) RE array has ".scalar(@$re_aref)." elements");

    # begin the parsing of the text, which is a recursive process
    my @new = $self->descend($edb, $input, $falsepos, $re_aref, $hint);
    return @new;
}

Expand All @@ -43,6 +45,9 @@ sub descend ($self, $edb, $input, $falsepos, $re_aref, $hint=undef) {
# suppress deep recursion warning
no warnings 'recursion';

$self->log->debug("Descending into $input");
$self->log->debug("RE array has ".scalar(@$re_aref)." elements");

# recursion end condition
return if $input eq '';

Expand All @@ -52,16 +57,20 @@ sub descend ($self, $edb, $input, $falsepos, $re_aref, $hint=undef) {
# if we have a hint, then only provide that re for parsing
# e.g. message_id columns, that way email does not match on it.
if (defined $hint) {
$self->log->debug("We have a hint!");
@regexes = grep { $_->{entity_type} eq $hint } @regexes;
}

# look for each regex. first found regex wins and we move on.
REGEX:
foreach my $re_href (@regexes) {


my $re = $re_href->{regex};
my $et = $re_href->{entity_type};

$self->log->debug("Using Regex ".$re_href->{name});

# get text before, the flair, and text after match
my ($pre, $flair, $post) = $self->find_flairable($input,
$re,
Expand Down Expand Up @@ -101,7 +110,6 @@ sub find_flairable ($self, $text, $re, $et, $edb, $falsepos) {
MATCH:
while ( $text =~ m/$re/g ) {

$self->log->trace("Text = ".$text);
# use perl special vars to get pre, match and post strings
# $-[0] = index of start of match, $+[0] = index of end of match
my $pre = substr($text, 0, $-[0]);
Expand All @@ -116,7 +124,7 @@ sub find_flairable ($self, $text, $re, $et, $edb, $falsepos) {
# check for false positive
if (! defined $flairable) {
$fp++;
$self->log->trace("no flairable: ");
$self->log->debug("no flairable: ");
$self->log->trace("PRE was $PRE");
# append $pre and $match to the PRE buffer
$PRE .= $pre.$match;
Expand All @@ -128,6 +136,7 @@ sub find_flairable ($self, $text, $re, $et, $edb, $falsepos) {
return $pre, $flairable, $post if $fp;
return $PRE.$pre, $flairable, $post;
}
$self->log->debug("No match");
# nothing found
return undef, undef, undef;
}
Expand Down
6 changes: 5 additions & 1 deletion lib/Flair/Processor.pm
Original file line number Diff line number Diff line change
Expand Up @@ -230,11 +230,13 @@ sub validate_data ($self, @args) {
return undef;
}

return {
my $retval = {
type => $type,
id => $id,
data => $data,
};
$self->log->debug("Data validated, returning ", {filter=>\&Dumper, value => $retval});
return $retval;
}

sub calculate_size ($self, $href) {
Expand Down Expand Up @@ -370,6 +372,8 @@ sub walk_tree ($self, $element, $edb, $falsepos) {
# recursively descend into html tree, look for flair when leaf node is found
return if $element->is_empty;

$self->log->trace("Walking tree element ",{filter=>\&Dumper, value => $element});

# concatenate adjacent text nodes
$element->normalize_content;

Expand Down
13 changes: 13 additions & 0 deletions t/db.t
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,19 @@ my $result = $apikeys->list({ fields => ['*'], where => undef });

print Dumper($result);

# Create a regex whose match text contains a space while multiword is 0,
# then dump whatever create() returns for manual inspection.
# NOTE(review): nothing is asserted here -- presumably the returned record
# should show multiword overridden to 1; confirm and add a real check.
my $href = $db->regex->create({
active => 1,
name => 'Test it',
description => 'Test spaces',
match => 'Test it',
entity_type => 'test_mw_entity',
regex_type => 'udef',
re_order => 10,
multiword => 0,
});

print Dumper($href);

done_testing();
exit 0;

Expand Down
15 changes: 15 additions & 0 deletions t/parser_test_data/udef_1
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Parser test case: a single-word user-defined entity ("fufoo") in the
# middle of a sentence.
# Presumably: text is the parser input, expect is the flaired output, and
# entities maps entity type => value => expected count -- confirm against
# the test harness that loads this file.
{
text => <<~'EOF',
this is fufoo stuff
EOF

expect => <<~'EOF',
this is <span class="entity test_entity" data-entity-type="test_entity" data-entity-value="fufoo">fufoo</span> stuff
EOF

entities => {
test_entity => {
"fufoo" => 1,
},
},
}
15 changes: 15 additions & 0 deletions t/parser_test_data/udef_2
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Parser test case: a multiword user-defined entity ("new closing dispo")
# that makes up the whole input.
# Presumably: text is the parser input, expect is the flaired output, and
# entities maps entity type => value => expected count -- confirm against
# the test harness that loads this file.
{
text => <<~'EOF',
new closing dispo
EOF

expect => <<~'EOF',
<span class="entity test_entity" data-entity-type="test_entity" data-entity-value="new closing dispo">new closing dispo</span>
EOF

entities => {
test_entity => {
"new closing dispo" => 1,
},
},
}
15 changes: 15 additions & 0 deletions t/parser_test_data/udef_3
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Parser test case: a user-defined entity containing a space ("sydney rox")
# that appears inside double quotes in the input.
# Presumably: text is the parser input, expect is the flaired output, and
# entities maps entity type => value => expected count -- confirm against
# the test harness that loads this file.
{
text => <<~'EOF',
The prof said "sydney rox"
EOF

expect => <<~'EOF',
The prof said "<span class="entity test_entity" data-entity-type="test_entity" data-entity-value="sydney rox">sydney rox</span>"
EOF

entities => {
test_entity => {
"sydney rox" => 1,
},
},
}

0 comments on commit 64ceb80

Please sign in to comment.