forked from ufal/hamledt
-
Notifications
You must be signed in to change notification settings - Fork 0
/
find_duplicate_sentences.pl
executable file
·71 lines (69 loc) · 1.69 KB
/
find_duplicate_sentences.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/env perl
# Reads sentences from CoNLL file, stores them in a hash, then reads sentences from another file and tries to find duplicates.
# Motivation: German UD lacks lemmas and features. But it is data from Google, which reportedly contains part of Tiger corpus. And we have morphology for Tiger, from CoNLL 2009.
# Copyright © 2016 Dan Zeman <[email protected]>
# License: GNU GPL
use utf8;
use open ':utf8';
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
# Strictures apply to everything from here to the end of the file.
use strict;
use warnings;

#------------------------------------------------------------------------------
# Reads a CoNLL-style file from an open filehandle and calls a callback once
# for every non-empty sentence.
#
# Parameters:
#   $fh                 ... filehandle opened for reading
#   $use_surface_tokens ... true: a multiword token line ("3-4\tzum") contributes
#                           its form and the word lines it spans ("3\tzu",
#                           "4\tdem") are skipped;
#                           false: multiword token lines are skipped and the
#                           individual word lines are used instead
#   $callback           ... code reference; receives the sentence as one string
#                           in which every token form is followed by one space
#
# Comment lines (starting with "#") and empty nodes ("8.1\t...") are ignored.
# Any other line that is not a token line (normally an empty line) terminates
# the current sentence. A sentence that is not followed by such a line before
# the end of the file is reported as well.
#------------------------------------------------------------------------------
sub for_each_sentence
{
    my ($fh, $use_surface_tokens, $callback) = @_;
    my $sentence = '';
    # ID of the last word covered by the most recent multiword token.
    my $skip_until;
    while(my $line = <$fh>)
    {
        # Remove the line terminator so that it can never end up in a form.
        $line =~ s/\r?\n\z//;
        next if($line =~ m/^\#/);
        # Empty nodes are not tokens and must not be mistaken for a sentence break.
        next if($line =~ m/^\d+\.\d+\t/);
        # UD has split contractions ("zu dem") while CoNLL 2009 has "zum". Get the surface tokens when applicable.
        if($line =~ m/^\d+-(\d+)\t/)
        {
            next unless($use_surface_tokens);
            $skip_until = $1;
            my (undef, $form) = split(/\t/, $line, 3);
            $sentence .= $form.' ';
        }
        elsif($line =~ m/^(\d+)\t/)
        {
            my $id = $1;
            # This word is already represented by the form of its multiword token.
            next if(defined($skip_until) && $id<=$skip_until);
            my (undef, $form) = split(/\t/, $line, 3);
            $sentence .= $form.' ';
        }
        else # empty line = end of sentence
        {
            # Repeated empty lines must not produce empty sentences.
            $callback->($sentence) if($sentence ne '');
            $sentence = '';
            # A multiword token at the end of this sentence must not hide the
            # first words of the next one.
            $skip_until = undef;
        }
    }
    # The file may end without the final empty line.
    $callback->($sentence) if($sentence ne '');
    return;
}

#------------------------------------------------------------------------------
# Remembers all sentences from the first filehandle (surface tokens), then
# reads the second filehandle (word lines only) and calls the callback for
# every sentence that also occurred in the first one. The callback is called
# in the order in which the sentences appear in the second file, once per
# occurrence there.
#------------------------------------------------------------------------------
sub duplicate_sentences
{
    my ($fh1, $fh2, $callback) = @_;
    my %seen;
    for_each_sentence($fh1, 1, sub { $seen{$_[0]}++; });
    for_each_sentence($fh2, 0, sub { $callback->($_[0]) if(exists($seen{$_[0]})); });
    return;
}

#------------------------------------------------------------------------------
# Entry point. Takes the paths of the two files and prints every sentence of
# the second file that also occurs in the first file, one sentence per line.
# Dies if a path is missing or a file cannot be opened.
#------------------------------------------------------------------------------
sub main
{
    my ($f1, $f2) = @_;
    if(!defined($f1) || !defined($f2))
    {
        die("Usage: $0 file1 file2\n");
    }
    # Three-argument open with lexical handles: the file name can never be
    # interpreted as a mode or a command.
    open(my $fh1, '<', $f1) or die("Cannot read $f1: $!");
    open(my $fh2, '<', $f2) or die("Cannot read $f2: $!");
    duplicate_sentences($fh1, $fh2, sub { print("$_[0]\n"); });
    close($fh1);
    close($fh2);
    return;
}

# Run only when executed as a script, so that the functions above can be
# loaded from another file without side effects.
main(@ARGV) unless(caller());