aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorunknown <gabocze@git.gnu.org.ua>2014-09-23 04:16:06 -0300
committerunknown <gabocze@git.gnu.org.ua>2014-09-23 04:16:06 -0300
commitfa6b243b08c2e483b8b6ed9654417607f158d416 (patch)
tree80a5e4953d4166f494316d68fb3515282a3bba39
parent84251691307b7e99bd7cbd15a20190ca14c65a1d (diff)
downloadvital-fa6b243b08c2e483b8b6ed9654417607f158d416.tar.gz
vital-fa6b243b08c2e483b8b6ed9654417607f158d416.tar.bz2
Changed file extensions from txt to {pl|pm}
-rw-r--r--TokensV2.pm174
-rw-r--r--synopsis.pl166
2 files changed, 340 insertions, 0 deletions
diff --git a/TokensV2.pm b/TokensV2.pm
new file mode 100644
index 0000000..619d7df
--- /dev/null
+++ b/TokensV2.pm
@@ -0,0 +1,174 @@
+=pod
+ TokensV2: Programme to discover tokens, where there are not.
+
+ Copyright 2013 Gabriel Czernikier
+
+
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+=cut
+use strict;
+#package declarations
+sub digest_single;
+sub parse_programme;
+sub printAll;
+
+my @REGEX;
+my @MASK;
+my $long_tokens;
+my @DIGEST_SINGLE;
+my $literal_char_count;
+
+sub digest_single { # $target, $regex, $eaten_left_literal_chars, $eaten_right_literal_chars
+ #return unless $_[1] ne '';
+ #return if length $_[1] < 8; # x>(.*?)<
+ return unless ($_[2]+$_[3])/$literal_char_count<0.999999;
+ return @{$DIGEST_SINGLE[$_[2]][$_[3]]} if defined $DIGEST_SINGLE[$_[2]][$_[3]];
+ pos($_[0]) = undef;
+ goto VISITING unless $_[0] =~ /(${_[1]})/g;
+ my $digit = 2;
+ while( my $cg = eval '$'.$digit++ ) {
+ my @suspicious_tokens = $cg =~ /$long_tokens/g;
+ goto VISITING if @suspicious_tokens>=2;
+ }
+ my $pff = (pos $_[0]);
+ my $pii = $pff - (length $1) if defined $pff;
+ $DIGEST_SINGLE[$_[2]][$_[3]] = [$_[2], $_[3], $pii, $pff];
+ return @{$DIGEST_SINGLE[$_[2]][$_[3]]};
+
+ VISITING:
+ my $re = $_[1];
+ # strip off left shelter, also understood as walking righwards across the regex .+? stopping at a hopefuly serviceable string
+ goto VISITING_2 unless $re =~ s/(.+?)(.\(\.\*\?\).|\([^.])/\2/; # .+? is also the "shelter"
+ my $increment_eaten_literal_chars = $+[1];
+ $increment_eaten_literal_chars -= 5 if $1 =~ /\(\.\*\?\)/; # discount the single occurrence of non-literal chars: (.*?), left alone surrounding ""
+ # expansions
+ $re =~ s/^\(\?:(?:[^)]|\)[^?+*])+\)\*//;
+ $re =~ s/^\(\?:((?:[^)]|\)[^?+*])+)\)\+/\1\(\?:\1\)\*/;
+ $re =~ s/^\(\?:((?:[^)]|\)[^?+*])+)\)\?/\1/;
+ my ($el, $er, $pi, $pf) = digest_single $_[0], $re, $_[2]+$increment_eaten_literal_chars, $_[3];
+
+ VISITING_2:
+ $re = $_[1];
+ # strip off right shelter, also understood as walking leftwards across the regex .+? stopping at a hopefuly serviceable string
+ goto SUBDIGEST unless $re =~ s/(.*(?:.\(\.\*\?\).|\)[?+*]))(.+?)$/\1/; # .+? is the "shelter"
+
+ $increment_eaten_literal_chars = $+[2] - $-[2];
+ $increment_eaten_literal_chars -= 5 if $2 =~ /\(\.\*\?\)/; # discount the single occurrence of non-literal chars: (.*?), left alone surrounding ""
+ # expansions
+ $re =~ s/\(\?:(?:[^)]|\)[^?+*])+\)\*$//;
+ $re =~ s/\(\?:((?:[^)]|\)[^?+*])+)\)\+$/\(\?:\1\)\*\1/;
+ $re =~ s/\(\?:(?:[^)]|\)[^?+*])+\)\?$//;
+ my ($el2, $er2, $pi2, $pf2) = digest_single $_[0], $re, $_[2], $_[3]+$increment_eaten_literal_chars;
+
+ SUBDIGEST:
+ $DIGEST_SINGLE[$_[2]][$_[3]] = [$el, $er, $pi, $pf] if (defined $pi) && ($er+$el<=$er2+$el2 || !defined $pi2);
+ return @{$DIGEST_SINGLE[$_[2]][$_[3]]} if defined $DIGEST_SINGLE[$_[2]][$_[3]];
+ $DIGEST_SINGLE[$_[2]][$_[3]] = [$el2, $er2, $pi2, $pf2] if (defined $pi2) && ($er2+$el2<$er+$el || !defined $pi);
+ return @{$DIGEST_SINGLE[$_[2]][$_[3]]} if defined $DIGEST_SINGLE[$_[2]][$_[3]];
+ $DIGEST_SINGLE[$_[2]][$_[3]] = [];
+ return @{$DIGEST_SINGLE[$_[2]][$_[3]]};
+}
+
+sub digest_multiple {
+ my $regex_num = 0;
+ my $Pos_Ini;
+ my $Eaten_Left;
+ my $Eaten_Right;
+ my $Pos_Fin;
+ my $Regex_Num;
+
+ my @DIGEST_MULTIPLE = ();
+ for my $regex (@REGEX) {
+ @DIGEST_SINGLE = undef;
+ my @literal_chars = $regex =~ /[^.*(?:)+]/g;
+ $literal_char_count = scalar @literal_chars;
+ my ($eaten_left, $eaten_right, $pos_ini, $pos_fin) = digest_single $_[0], $regex, 0, 0;
+ return $eaten_left, $eaten_right, $pos_ini, $pos_fin, $regex_num if(($eaten_right+$eaten_left)/$literal_char_count<0.2);
+ push @DIGEST_MULTIPLE, [$eaten_left, $eaten_right, $pos_ini, $pos_fin, $regex_num] if defined $pos_ini && $pos_fin!=0;
+ $regex_num++;
+ }
+
+ $regex_num = 0;
+ while(@DIGEST_MULTIPLE!=0) {
+ my ($eaten_left, $eaten_right, $pos_ini, $pos_fin) = @{shift @DIGEST_MULTIPLE};
+ if($eaten_right+$eaten_left<$Eaten_Right+$Eaten_Left || !defined $Pos_Ini) {
+ $Pos_Ini = $pos_ini;
+ $Eaten_Left = $eaten_left;
+ $Eaten_Right = $eaten_right;
+ $Pos_Fin = $pos_fin;
+ $Regex_Num = $regex_num;
+ }
+ $regex_num++;
+ }
+ return $Eaten_Left, $Eaten_Right, $Pos_Ini, $Pos_Fin, $Regex_Num if defined $Pos_Ini;
+}
+
+sub parse_programme {
+ return if $_[0] eq '';
+ return $_[0] if length $_[0] < 8; # <x>y</x>
+ my ($eaten_left, $eaten_right, $pos_ini, $pos_fin, $regex_num) = digest_multiple($_[0]);
+ if(not defined $pos_ini) {
+ #my $oldfh = select;
+ #select STDERR;
+ #print "$ARGV[1], [ini-no-parseable]${target}[fin-no-parseable]\n";
+ #select $oldfh;
+ return $_[0];
+ }
+ my $mask_right = @MASK[$regex_num];
+ $mask_right =~ s/.*(.{$eaten_right})/\1/;
+ my $mask_left = @MASK[$regex_num];
+ $mask_left =~ s/(.{$eaten_left}).*/\1/;
+ my $match_length = $pos_fin-$pos_ini;
+ my ($target_left,$match,$target_right) = $_[0] =~ /(.{$pos_ini})(.{$match_length})(.*)/;
+ shift; # discard unused argument
+ return $mask_left.$match.$mask_right, +[parse_programme $target_left], +[parse_programme $target_right], $regex_num;
+}
+
+sub printAll {
+ return if @_==0;
+ if (@_==1) {
+ print "[ini-nonparse]${_[0]}[fin-nonparse]\n";
+ return;
+ }
+ printAll @{$_[1]};
+ print "[ini-prod]${_[0]}[fin-prod]\n";
+ printAll @{$_[2]};
+}
+
+sub parse {
+ @REGEX = ();
+ @MASK = ();
+ my $target = shift;
+ my $regex = shift;
+ while($regex =~ /^(.+)$/mg) {
+ my $_ = $1;
+ my $other = $_;
+ # strip out non-literal chars from MASK
+ $other =~ s/\(\?:(.+?)\)\+/\1/g;
+ $other =~ s/\(\?:(.+?)\)\?/\1/g;
+ $other =~ s/\.\*\?//g;
+ push @MASK, $other;
+ # surround wildcards with capturing group for REGEX
+ s/(\.\*\?)/\(\1\)/g;
+ push @REGEX, $_;
+ }
+
+ $long_tokens = '\b' . join '\b|\b', grep length>=3,keys %{+{ map +($_=>undef), map /\w+/g, @REGEX }};
+ $long_tokens = $long_tokens . '\b';
+
+ parse_programme $target;
+}
+
+1; \ No newline at end of file
diff --git a/synopsis.pl b/synopsis.pl
new file mode 100644
index 0000000..f3aa9e1
--- /dev/null
+++ b/synopsis.pl
@@ -0,0 +1,166 @@
+=pod
+ synopsis.pl: Programme to demonstrate the use of TokensV2.pm.
+ This script should serve both as a synopsis on using TokensV2.pm and as a programme to format MS Messenger Logs recovered with Photorec.
+ See http://www.cgsecurity.org/wiki/PhotoRec
+
+
+ Copyright 2013 Gabriel Czernikier
+
+
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+=cut
+
+use TokensV2;
+
+sub printFile;
+my @FORMAT = (
+['<Message Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><From>(?:<User FriendlyName=".*?"/>)+</From><To>(?:<User FriendlyName=".*?"/>)+</To><Text(?: Style=".*?")?>.*?</Text></Message>',
+ sub {
+ my $fh = $_[1];
+ my ($d, $t, $f, $s, $T) = $_[0] =~ m|<Message Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?">(<From>(?:<User FriendlyName=".*?"/>)+</From>)<To>(?:<User FriendlyName=".*?"/>)+</To><Text(?: Style="(.*?)")?>(.*?)</Text></Message>|;
+ my $F = join '<br />', $f =~ m|<User FriendlyName="(.*?)"/>|g;
+ print $fh "<p><font size=\"-2\">($d&nbsp;$t)</font> $F<br /><span style=\"$s\">$T</span></p>";
+ }
+],
+['<Invitation Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><From><User FriendlyName=".*?"/></From><File>.*?</File><Text(?: Style=".*?")?>.*?</Text></Invitation>',
+ sub {
+ my $fh = $_[1];
+ my ($d, $t, $f, $s, $T) = $_[0] =~ m|<Invitation Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?"><From><User FriendlyName="(.*?)"/></From><File>.*?</File><Text(?: Style="(.*?)")?>(.*?)</Text></Invitation>|;
+ print $fh "<p><font size=\"-2\">($d&nbsp;$t)</font> $f<br /><span style=\"$s\">$T</span></p>";
+ }
+],
+['<InvitationResponse Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><From><User FriendlyName=".*?"/></From><File>.*?</File><Text(?: Style=".*?")?>.*?</Text></InvitationResponse>',
+ sub {
+ my $fh = $_[1];
+ my ($d, $t, $f, $s, $T) = $_[0] =~ m|<InvitationResponse Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?"><From><User FriendlyName="(.*?)"/></From><File>.*?</File><Text(?: Style="(.*?)")?>(.*?)</Text></InvitationResponse>|;
+ print $fh "<p><font size=\"-2\">($d&nbsp;$t)</font> $f<br /><span style=\"$s\">$T</span></p>";
+ }
+],
+['<Invitation Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><From><User FriendlyName=".*?"/></From><Application>.*?</Application><Text(?: Style=".*?")?>.*?</Text></Invitation>',
+ sub {
+ my $fh = $_[1];
+ my ($d, $t, $f, $s, $T) = $_[0] =~ m|<Invitation Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?"><From><User FriendlyName="(.*?)"/></From><Application>.*?</Application><Text(?: Style="(.*?)")?>(.*?)</Text></Invitation>|;
+ print $fh "<p><font size=\"-2\">($d&nbsp;$t)</font> $f<br /><span style=\"$s\">$T</span></p>";
+ }
+],
+['<InvitationResponse Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><From><User FriendlyName=".*?"/></From><Application>.*?</Application><Text(?: Style=".*?")?>.*?</Text></InvitationResponse>',
+ sub {
+ my $fh = $_[1];
+ my ($d, $t, $f, $s, $T) = $_[0] =~ m|<InvitationResponse Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?"><From><User FriendlyName="(.*?)"/></From><Application>.*?</Application><Text(?: Style="(.*?)")?>(.*?)</Text></InvitationResponse>|;
+ print $fh "<p><font size=\"-2\">($d&nbsp;$t)</font> $f<br /><span style=\"$s\">$T</span></p>";
+ }
+],
+['<Join Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><User FriendlyName=".*?"/><Text(?: Style=".*?")?>.*?</Text></Join>',
+ sub {
+ my $fh = $_[1];
+ my ($d, $t, $f, $s, $T) = $_[0] =~ m|<Join Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?"><User FriendlyName="(.*?)"/><Text(?: Style="(.*?)")?>(.*?)</Text></Join>|;
+ print $fh "<p><font size=\"-2\">($d&nbsp;$t)</font> $f<br /><span style=\"$s\">$T</span></p>";
+ }
+],
+['<Leave Date=".*?" Time=".*?" DateTime=".*?" SessionID=".*?"><User FriendlyName=".*?"/><Text(?: Style=".*?")?>.*?</Text></Leave>',
+ sub {
+ my $fh = $_[1];
+ my ($d, $t, $f, $s, $T) = $_[0] =~ m|<Leave Date="(.*?)" Time="(.*?)" DateTime=".*?" SessionID=".*?"><User FriendlyName="(.*?)"/><Text(?: Style="(.*?)")?>(.*?)</Text></Leave>|;
+ print $fh "<p><font size=\"-2\">($d&nbsp;$t)</font> $f<br /><span style=\"$s\">$T</span></p>";
+ }
+]
+);
+
+sub printFile {
+ return if @_==1;
+ if (@_==2) {
+ my $fh = $_[1];
+ ${_[0]} =~ s/</&lt;/g;
+ print $fh "<p>${_[0]}</p>\n";
+ return;
+ }
+ # @_==5
+ printFile @{$_[1]}, $_[4];
+ my $coderef = $FORMAT[$_[3]][1];
+ &$coderef($_[0], $_[4]);
+ my $fh = $_[4];
+ print $fh "\n";
+ printFile @{$_[2]}, $_[4];
+}
+
+sub joinParseTree {
+ return if @_==0;
+ if (@_==1) {
+ return ${_[0]};
+ }
+ # @_==4
+ return joinParseTree(@{$_[1]}) . $_[0] . joinParseTree(@{$_[2]});
+}
+
+open my $fH, "<:encoding(utf8)", "tokens-processed.txt"; # H for HASH, not handle
+my %HASH = map { /([^\s]*)/; $1 => undef } grep $_ ne "\n", <$fH>;
+close $fH;
+open $fH, ">>:encoding(utf8)", "tokens-processed.txt";
+print $fH "----------bookmark----------\n";
+
+local $/ = undef;
+
+my $regex = join "\n", map $$_[0], @FORMAT;
+while(glob "TXT/**") {
+ print "$_\n";
+ next if exists $HASH{$_};
+ open my $fh, "<:encoding(utf8)", $_;
+ my $text = <$fh>;
+ my $oName = $_;
+ $oName =~ s/TXT/HTML/;
+ $oName =~ s/\.txt/.html/;
+ my @parse_tree;
+ eval {
+ local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
+ alarm 5;
+ @parse_tree = parse($text, $regex);
+ alarm 0;
+ };
+ if ($@) {
+ die unless $@ eq "alarm\n"; # propagate unexpected errors
+ # timed out
+ print $fH "$_ timed out\n";
+ next;
+ }
+ my $parse_text = joinParseTree @parse_tree;
+ open $fhHTML, ">:encoding(utf8)", $oName;
+ print $fhHTML <<ENDDOC
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
+</head>
+<body>
+ENDDOC
+;
+ printFile @parse_tree, $fhHTML;
+ $text =~ s/-( *)(?=-)/- \1/g;
+ $text =~ s/(.{1,512})/<!-- \1 -->\n/g;
+ print $fhHTML "
+<!-- Original text (cut on every 512 Char, -( *)(?=-) sequences escaped as - \\1):-->
+$text";
+ $parse_text =~ s/-( *)(?=-)/- \1/g;
+ $parse_text =~ s/(.{1,512})/<!-- \1 -->\n/g;
+ print $fhHTML "
+<!-- Parsed text (cut on every 512 Char, -( *)(?=-) sequences escaped as - \\1):-->
+$parse_text
+</body>
+</html>";
+ close $fhHTML;
+ print $fH "$_\n";
+}
+

Return to:

Send suggestions and report system problems to the System administrator.