-
Notifications
You must be signed in to change notification settings - Fork 34
/
dup_subtr_finder.pl
executable file
·94 lines (74 loc) · 2.16 KB
/
dup_subtr_finder.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/perl
# Author: Daniel "Trizen" Șuteu
# License: GPLv3
# Date: 11 December 2013
# https://trizenx.blogspot.com
# Find the longest duplicated sub-strings inside a string/file (based on a given minimum length).
use 5.010;
use strict;
use autodie;
use warnings;
use List::Util qw(first);
use Data::Dump qw(pp);
use Getopt::Std qw(getopts);
sub find_substrings (&@) {
my ($code, $str, $min) = @_;
my @substrings;
my $len = length($str);
my $max = int($len / 2);
my @pos;
for (my $i = $max ; $i >= $min ; $i--) {
for (my $j = 0 ; $j <= $len - $i * 2 ; $j++) {
#die $i if $i > ($len - ($j + $i)); # not gonna happen
#say "=>> ", substr($str, $j, $i);
if (defined(my $arr = first { $j >= $_->[0] && $j <= $_->[1] } @pos)) {
$j = $arr->[1];
next;
}
if ((my $pos = index($str, substr($str, $j, $i), $j + $i)) != -1) {
$code->({pos => [$j, $pos], len => $i, substr => substr($str, $j, $i)});
push @pos, [$j, $j + $i]; # don't match again in substr
#push @pos, [$pos, $pos + $i]; # don't match again in dup-substr
$j += $i;
}
}
}
=old
for (my $j = 0 ; $j <= $len ; $j++) {
for (my $i = $len - $j > $max ? $max : $len - $j ; $i >= $min ; $i--) {
next if $i > ($len - ($j + $i));
if ((my $pos = index($str, substr($str, $j, $i), $j + $i)) != -1) {
$code->({pos => [$j, $pos], len => $i, substr => substr($str, $j, $i)});
$j += $i;
last;
}
}
}
=cut
return @substrings;
}
#
## MAIN
#
sub usage {
print <<"USAGE";
usage: $0 [options] [input-file]
options:
-m <int> : the minimum sub-string length
example: perl $0 -m 50 file.txt
USAGE
exit 1;
}
my %opt;
getopts('m:', \%opt);
my $file = @ARGV && (-f $ARGV[0]) ? shift() : usage();
my $minLen = $opt{m} || (-s $file) / 10;
# Dearly spider
find_substrings { say pp(shift) } (
do {
local $/;
open my $fh, '<', $file;
<$fh>;
},
$minLen
);