aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSergey Poznyakoff <gray@gnu.org.ua>2015-03-11 22:58:19 +0200
committerSergey Poznyakoff <gray@gnu.org.ua>2015-03-11 22:58:19 +0200
commitd68a62e91cd92593c8dafce7d7c7eb4591895153 (patch)
tree39bd49cf8068e30db2ee18819b37364966872f18
parent12f78f506fd58a607fffc0300686a0ef8943c1e7 (diff)
downloadregexp-opt-d68a62e91cd92593c8dafce7d7c7eb4591895153.tar.gz
regexp-opt-d68a62e91cd92593c8dafce7d7c7eb4591895153.tar.bz2
Document; fix minor glitches
-rw-r--r--String/Regexp.pm214
1 files changed, 193 insertions, 21 deletions
diff --git a/String/Regexp.pm b/String/Regexp.pm
index 164609f..7c92d39 100644
--- a/String/Regexp.pm
+++ b/String/Regexp.pm
@@ -1,3 +1,19 @@
+# -*- perl -*-
+# Copyright (C) 2015 Sergey Poznyakoff <gray@gnu.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
package String::Regexp;
use strict;
@@ -12,7 +28,15 @@ our @EXPORT_OK = ( qw(array_to_regexp) );
our @EXPORT = qw(array_to_regexp);
our $VERSION = "1.00";
-# split_prefix(ARRAY)
+# Synopsis:
+# my @res = split_prefix(ARRAY)
+# Arguments:
+# ARRAY is a sorted array of char array references.
+# Description:
+# Find N first elements of ARRAY sharing the longest prefix (of length L).
+# In other words, find N and L such that ARRAY[N][L+1] != ARRAY[N][L+1].
+# Return value:
+# (N, L)
sub split_prefix {
my $aref = shift;
@@ -38,18 +62,35 @@ sub split_prefix {
return ($n,$k);
}
+# Each node of the parse tree is a list. Its 0th element keeps the type of
+# the node. Its lowest byte is one of the following:
+
+# Rest of the node is a list of alternatives.
use constant T_ALT => 0x0;
+
+# A prefixed list of alternatives. Element 1 is the prefix string and
+# element 2 is a reference to the list.
use constant T_PFX => 0x1;
+# A suffixed list of alternatives. Element 1 is the suffix string and
+# element 2 is a reference to the list.
use constant T_SFX => 0x2;
+
+# This mask is used to get the node type:
use constant T_MASK => 0xf;
+# If the type is ORed with T_OPT, the element is optional.
use constant T_OPT => 0x10;
+# Synopsis:
+# my @list = regexp_opt(ARRAY)
+# Arguments:
+# ARRAY is a sorted array of char array references.
+# Description:
+# Recursively parse the array of arguments and return a parse tree.
sub regexp_opt {
my @t = @_;
my @output;
return [] if $#t == -1;
- push @output;
while (1) {
my @res = split_prefix \@t;
if ($res[1] <= 0) {
@@ -94,28 +135,68 @@ sub regexp_opt {
}
return \@output;
}
+
+# ###################################
+# Convert parse tree to a regexp
+#
+# The functions below take hash and string reference as their first two
+# arguments.
+#
+# The first argument is a reference to a configuration hash, which contains
+# the following keys:
+#
+# rxchars: A Perl regular expression matching special characters, which should
+# be escaped with a backslash on output:
+# posix [][\\<>.(){}?*^\$]
+# pcre [][\\.(){}?*^\$]
+#
+# group: A reference to a list of two elements containig markers for
+# parenthesized groups. Non-capturing groups are used, if possible.
+# posix [ '(', ')' ]
+# pcre [ '(?:', ')' ]
+#
+# The second argument is a reference to a string where the generated
+# expression will be stored.
+#
+# ###################################
+# Synopsis:
+# my $s = escape_re_chars(CONF, STRING)
+# Arguments:
+# See above.
+# Description:
+# Escape special characters in the STRING
+# Return value:
+# Escaped string, safe to use in regular expressions.
sub escape_re_chars {
- my $s = shift;
- $s =~ s/([][\\<>.(){}?*^\$])/\\$1/g;
+ my ($conf,$s) = @_;
+ $s =~ s/($conf->{rxchars})/\\$1/g;
return $s;
}
+# Synopsis:
+# posix_build_opt(CONF, STRING, LIST...)
+# Arguments:
+# CONF and STRING as described above.
+# LIST is a subtree.
+# Description:
+# Convert subtree into regular expression.
sub posix_build_opt {
+ my $conf = shift;
my $s = shift;
my $delim;
my @cclass;
- $$s .= '(' if ($#_ > 0);
+ $$s .= $conf->{group}[0] if ($#_ > 0);
foreach my $elt (@_) {
$$s .= $delim if defined $delim;
if (ref($elt) eq 'ARRAY') {
- trans_posix_recursive($elt, $s);
+ trans_posix_recursive($conf, $s, $elt);
$delim = '|';
} elsif (length($elt) == 1) {
push @cclass, $elt;
} else {
- $$s .= '(' . escape_re_chars($elt) . ')';
+ $$s .= $conf->{group}[0] . escape_re_chars($conf, $elt) . $conf->{group}[1];
$delim = '|';
}
}
@@ -148,7 +229,7 @@ sub posix_build_opt {
} elsif ($b eq '-') {
return -1;
} else {
- $a <=> $b;
+ $a cmp $b;
}
} @cclass;
@@ -170,7 +251,8 @@ sub posix_build_opt {
} elsif (ord($c) - ord($start) == 1) {
$end = $c;
} else {
- $$s .= "$start$end";
+ $$s .= $start;
+ $$s .= $end if defined $end;
$start = $c;
$end = undef;
}
@@ -189,50 +271,140 @@ sub posix_build_opt {
$$s .= ']';
}
- $$s .= ')' if ($#_ > 0);
+ $$s .= $conf->{group}[1] if ($#_ > 0);
}
+# Synopsis:
+# trans_posix_recursive(CONF, STRING, TREE...)
+# Arguments:
+# CONF and STRING as described above.
+# TREE is a list of tree nodes.
+# Description:
+# Recursively convert tree into a regular expression.
+# Return value:
+# Regular expression string.
sub trans_posix_recursive {
- my ($treeref, $s) = @_;
+ my ($conf, $s, $treeref) = @_;
my @tree = @{$treeref};
my $delim;
my $mode = shift @tree;
my $type = $mode & T_MASK;
if ($type == T_ALT) {
- posix_build_opt($s, @tree);
+ posix_build_opt($conf, $s, @tree);
$$s .= '?' if ($mode & T_OPT); # FIXME
} elsif ($type == T_PFX) {
- $$s .= '(' . escape_re_chars(shift(@tree));
- posix_build_opt($s, @{$tree[0]});
+ $$s .= $conf->{group}[0] . escape_re_chars($conf, shift(@tree));
+ posix_build_opt($conf, $s, @{$tree[0]});
$$s .= '?' if ($mode & T_OPT);
- $$s .= ')';
+ $$s .= $conf->{group}[1];
} elsif ($type == T_SFX) {
my $sfx = shift(@tree);
- $$s .= '(';
- posix_build_opt($s, @{$tree[0]});
+ $$s .= $conf->{group}[0];
+ posix_build_opt($conf, $s, @{$tree[0]});
$$s .= '?' if ($mode & T_OPT);
- $$s .= escape_re_chars($sfx). ')';
+ $$s .= escape_re_chars($conf, $sfx). $conf->{group}[1];
} else {
croak "unrecognized element type";
}
return $$s;
}
+# ########################################################
+# Generate POSIX and Perl-compatible regular expressions.
+# ########################################################
+
+# Synopsis:
+# my $s = trans_posix(TREE, OPTS)
+# Arguments:
+# TREE - a reference to a parse tree obtained from
+# regexp_opt;
+# OPTS - hash reference
+# Description:
+# Convert tree into POSIX regular expression.
+
sub trans_posix {
my ($tree, $opts) = @_;
+ my %conf = (
+ rxchars => '[][\\<>.(){}?*^\$]',
+ group => [ '(', ')' ]
+ );
my $s = '';
- trans_posix_recursive($tree, \$s);
+ trans_posix_recursive(\%conf, \$s, $tree);
$s = "\\<$s\\>" if $opts->{word};
return $s;
}
+# Synopsis:
+# my $s = trans_pcre(TREE, OPTS)
+# Arguments:
+# TREE - a reference to a parse tree obtained from
+# regexp_opt;
+# OPTS - hash reference
+# Description:
+# Convert tree into Perl-compatible regular expression.
+
+sub trans_pcre {
+ my ($tree, $opts) = @_;
+ my %conf = (
+ rxchars => '[][\\.(){}?*^\$]',
+ group => [ '(?:', ')' ]
+ );
+ my $s = '';
+ trans_posix_recursive(\%conf, \$s, $tree);
+ $s = "\\b$s\\b" if $opts->{word};
+ return $s;
+}
+
my %transtab = (
- posix => \&trans_posix
+ posix => \&trans_posix,
+ pcre => \&trans_pcre
);
+=pod
+
+=head1 NAME
+
+array_to_regexp - Convert list of strings to a regular expression
+
+=head1 SYNOPSIS
+
+use String::Regexp qw(:all);
+
+my $s = array_to_regexp(@strings);
+
+my $s = array_to_regexp(\%opts, @strings);
+
+=head1 DESCRIPTION
+
+Return a regexp to match a string in the list B<@strings>. First argument
+can be a reference to a hash, which controls how the regexp is built.
+Valid keys are:
+
+=over 4
+
+=item B<type> => B<posix>|B<pcre>
+
+Controls the flavor of the generated expression: POSIX or Perl-compatible one.
+Default is B<pcre>.
+
+=item B<word> => B<0>|B<1>
+
+If B<1>, make sure the expression matches single words. If B<0>, the
+expression will also match any word from B<@strings> appearing as a part
+of another word.
+
+Default is B<0>.
+
+=item B<debug> => B<0>|B<1>
+
+If B<1>, enable debugging output.
+
+=back
+
+=cut
sub array_to_regexp {
- my $trans = \&trans_posix;
+ my $trans = \&trans_pcre;
my $opts;
if (ref($_[0]) eq 'HASH') {

Return to:

Send suggestions and report system problems to the System administrator.