# This file is part of Config::AST -*- perl -*- # Copyright (C) 2017-2019 Sergey Poznyakoff # # Config::AST is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # Config::AST is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Config::AST. If not, see . package Config::AST; use strict; use warnings; use Carp; use Text::Locus; use Config::AST::Node qw(:sort); use Config::AST::Node::Section; use Config::AST::Node::Value; use Config::AST::Follow; use Data::Dumper; require Exporter; our @ISA = qw(Exporter); our %EXPORT_TAGS = ( 'sort' => [ qw(NO_SORT SORT_NATURAL SORT_PATH) ] ); our @EXPORT_OK = qw(NO_SORT SORT_NATURAL SORT_PATH); our $VERSION = "1.03"; =head1 NAME Config::AST - abstract syntax tree for configuration files =head1 SYNOPSIS my $cfg = new Config::AST(%opts); $cfg->parse() or die; $cfg->commit() or die; if ($cfg->is_set('core', 'variable')) { ... } my $x = $cfg->get('file', 'locking'); $cfg->set('file', 'locking', 'true'); $cfg->unset('file', 'locking'); =head1 DESCRIPTION This module aims to provide a generalized implementation of parse tree for various configuration files. It does not implement parser for any existing configuration file format. Instead, it provides an API that can be used by parsers to build internal representation for the particular configuration file format. See B module for an implementation of a parser based on this module. A configuration file in general is supposed to consist of statements of two kinds: simple statements and sections. A simple statement declares or sets a configuration parameter. Examples of simple statements are: # Bind configuration file: file "cache/named.root"; # Apache configuration file: ServerName example.com # Git configuration file: logallrefupdates = true A section statement groups together a number of another statements. These can be simple statements, as well as another sections. Examples of sections are (with subordinate statements replaced with ellipsis): # Bind configuration file: zone "." { ... }; # Apache configuration file: ... # Git configuration file: [core] ... The syntax of Git configuration file being one of the simplest, we will use it in the discussion below to illustrate various concepts. The abstract syntax tree (AST) for a configuration file consists of nodes. Each node represents a single statement and carries detailed information about that statement, in particular: =over 4 =item B Location of the statement in the configuration. It is represented by an object of class B. =item order 0-based number reflecting position of this node in the parent section node. =item value For simple statements - the value of this statement. =item subtree For sections - the subtree below this section. =back The type of each node can be determined using the following node attributes: =over 4 =item is_section True if node is a section node. =item is_value True if node is a simple statement. =back To retrieve a node, address it using its I, i.e. list of statement names that lead to this node. For example, in this simple configuration file: [core] filemode = true the path of the C statement is C. =head1 CONSTRUCTOR $cfg = new Config::AST(%opts); Creates new configuration parser object. Valid options are: =over 4 =item B => I Sets debug verbosity level. =item B => B<0> | B<1> If B<1>, enables case-insensitive keyword matching. Default is B<0>, i.e. the keywords are case-sensitive. =item B => \%hash Defines the I. =back =head3 Keyword lexicon The hash reference passed via the B keyword defines the keywords and sections allowed within a configuration file. In a simplest case, a keyword is described as name => 1 This means that B is a valid keyword, but does not imply anything about its properties. A more complex declaration is possible, in which the value is a hash reference, containing one or more of the following keywords: =over 4 =item mandatory => 0 | 1 Whether or not this setting is mandatory. =item default => I Default value for the setting. This value will be assigned if that particular statement is not explicitly used in the configuration file. If I is a CODE reference, it will be invoked as a method each time the value is accessed. Default values must be pure Perl values (not the values that should appear in the configuration file). They are not processed using the B callbacks (see below). =item array => 0 | 1 If B<1>, the value of the setting is an array. Each subsequent occurrence of the statement appends its value to the end of the array. =item re => I Defines a regular expression which the value must match. If it does not, a syntax error will be reported. =item select => I Reference to a method which will be called in order to decide whether to apply this hash to a particular configuration setting. The method is called as $self->$coderef($node, @path) where $node is the B object (use B<$vref-Evalue>, to obtain the actual value), and B<@path> is its pathname. =item check => I Defines a method which will be called after parsing the statement in order to verify its value. The I is called as $self->$coderef($valref, $prev_value, $locus) where B<$valref> is a reference to its value, and B<$prev_value> is the value of the previous instance of this setting. The function must return B, if the value is OK for that setting. In that case, it is allowed to modify the value referenced by B<$valref>. If the value is erroneous, the function must issue an appropriate error message using B<$cfg-Eerror>, and return 0. =back In taint mode, any value that matched B expression or passed the B function will be automatically untainted. To define a section, use the B
keyword, e.g.: core => { section => { pidfile => { mandatory => 1 }, verbose => { re => qr/^(?:on|off)/i } } } This says that the section named B can have two variables: B, which is mandatory, and B, whose value must be B, or B (case-insensitive). E.g.: [core] pidfile = /run/ast.pid verbose = off To accept arbitrary keywords, use B<*>. For example, the following declares B section, which must have the B setting and is allowed to have any other settings as well. code => { section => { pidfile => { mandatory => 1 }, '*' => 1 } } Everything said above applies to the B<'*'> as well. E.g. the following example declares the B section, which must have the B setting and is allowed to have I with arbitrary settings. code => { section => { pidfile = { mandatory => 1 }, '*' => { section => { '*' => 1 } } } } The special entry '*' => '*' means "any settings and any subsections are allowed". =cut sub new { my $class = shift; local %_ = @_; my $self = bless { _order => 0 }, $class; my $v; my $err; $self->{_debug} = delete $_{debug} || 0; $self->{_ci} = delete $_{ci} || 0; if (defined($v = delete $_{lexicon})) { if (ref($v) eq 'HASH') { $self->{_lexicon} = $v; } else { carp "lexicon must refer to a HASH"; ++$err; } } if (keys(%_)) { foreach my $k (keys %_) { carp "unknown parameter $k" } ++$err; } croak "can't create configuration instance" if $err; $self->reset; return $self; } =head2 $cfg->lexicon($hashref) Returns current lexicon. If B<$hashref> is supplied, installs it as a new lexicon. =cut sub lexicon { my $self = shift; if (@_) { my $lexicon = shift; carp "too many arguments" if @_; carp "lexicon must refer to a HASH" unless ref($lexicon) eq 'HASH'; $self->reset; $self->{_lexicon} = $lexicon; } return $self->{_lexicon}; } =head1 PARSING This module provides a framework for parsing, but does not implement parsers for any particular configuration formats. To implement a parser, the programmer must write a class that inherits from B. This class should implement the B method which, when called, will actually perform the parsing and build the AST using methods described in section B (see below). The caller must then perform the following operations =over 4 =item B<1.> Create an instance of the derived class B<$cfg>. =item B<2.> Call the B<$cfg-Eparse> method. =item B<3.> On success, call the B<$cfg-Ecommit> method. =back =head2 $cfg->parse(...) Abstract method that is supposed to actually parse the configuration file and build the parse tree from it. Derived classes must overload it. The must return true on success and false on failure. Eventual errors in the configuration should be reported using B. =cut sub parse { my ($self) = @_; croak "call to abstract method" } =head2 $cfg->commit Must be called after B to finalize the parse tree. This function does the actual syntax checking and applied default values to the statements where such are defined. Returns true on success. =cut sub commit { my ($self) = @_; # FIXME $self->fixup_tree($self->tree, $self->{_lexicon}) if exists $self->{_lexicon}; return $self->{_error_count} == 0; } sub fixup_tree { my ($self, $section, $params, @path) = @_; while (my ($k, $d) = each %{$params}) { next unless ref($d) eq 'HASH'; if (exists($d->{default}) && !$section->has_key($k)) { my $n; my $dfl = ref($d->{default}) eq 'CODE' ? sub { $self->${ \ $d->{default} } } : $d->{default}; if (exists($d->{section})) { $n = new Config::AST::Node::Section( default => 1, subtree => $dfl ); } else { $n = new Config::AST::Node::Value( default => 1, value => $dfl ); } $section->subtree($k => $n); } if (exists($d->{section})) { if ($k eq '*') { if (keys(%{$section->subtree})) { while (my ($name, $vref) = each %{$section->subtree}) { if (my $sel = $d->{select}) { if ($self->$sel($vref, @path, $name)) { next; } } elsif ($vref->is_section) { $self->fixup_tree($vref, $d->{section}, @path, $name); } } } else { my $node = new Config::AST::Node::Section; $self->fixup_tree($node, $d->{section}, @path, $k); if ($node->keys > 0) { # If the newly created node contains any subnodes # after fixup, they were created because syntax # contained mandatory variables with default values. # Treat sections containing such variables as # mandatory and report them. my %h; foreach my $p (map { pop @{$_->[0]}; join(' ', (@path, $k, @{$_->[0]})) } $node->flatten(sort => SORT_PATH)) { unless ($h{$p}) { $self->error("no section matches mandatory [$p]"); $self->{_error_count}++; $h{$p} = 1; } } } } } else { my $node; unless ($node = $section->subtree($k)) { $node = new Config::AST::Node::Section; } if ((!exists($d->{select}) || $self->${ \ $d->{select} }($node, @path, $k))) { $self->fixup_tree($node, $d->{section}, @path, $k); } if ($node->keys > 0) { $section->subtree($k => $node); } } } if ($d->{mandatory} && !$section->has_key($k)) { $self->error(exists($d->{section}) ? "mandatory section [" . join(' ', @path, $k) . "] not present" : "mandatory variable \"" . join('.', @path, $k) . "\" not set", locus => $section->locus); $self->{_error_count}++; } } } =head2 $cfg->reset Destroys the parse tree and clears error count, thereby preparing the object for parsing another file. =cut sub reset { my $self = shift; $self->{_error_count} = 0; delete $self->{_tree}; } =head1 METHODS =head2 $cfg->error($message) =head2 $cfg->error($message, locus => $loc) Prints the B<$message> on STDERR. If B is given, its value must be a reference to a valid B(3) object. In that case, the object will be formatted first, then followed by a ": " and the B<$message>. =cut sub error { my $self = shift; my $err = shift; local %_ = @_; print STDERR "$_{locus}: " if $_{locus}; print STDERR "$err\n"; } =head2 $cfg->debug($lev, @msg) If B<$lev> is greater than or equal to the B value used when creating B<$cfg>, outputs on standard error the strings from @msg, separating them with a single space character. Otherwise, does nothing. =cut sub debug { my $self = shift; my $lev = shift; return unless $self->{_debug} >= $lev; $self->error("DEBUG: " . join(' ', @_)); } =head1 NODE RETRIEVAL A node is addressed by its path, i.e. a list of names of the configuration sections leading to the statement plus the name of the statement itself. For example, the statement: pidfile = /var/run/x.pid has the path ( 'pidfile' ) The path of the B statement in section B, e.g.: [core] pidfile = /var/run/x.pid is ( 'core', 'pidfile' ) Similarly, the path of the B setting in the following configuration file: [item foo] file = bar is ( 'item', 'foo', 'bar' ) =head2 $node = $cfg->getnode(@path); Retrieves the AST node referred to by B<@path>. If no such node exists, returns C. =cut sub getnode { my $self = shift; my $node = $self->{_tree} or return undef; for (@_) { $node = $node->subtree($self->{_ci} ? lc($_) : $_) or return undef; } return $node; } =head2 $var = $cfg->get(@path); Returns the B(3) corresponding to the configuration variable represented by its path, or C if the variable is not set. =cut sub get { my $self = shift; croak "no variable to get" unless @_; if (my $node = $self->getnode(@_)) { return $node->value; } } =head2 $cfg->is_set(@path) Returns true if the configuration variable addressed by B<@path> is set. =cut sub is_set { my $self = shift; return defined $self->getnode(@_); } =head2 $cfg->is_section(@path) Returns true if the configuration section addressed by B<@path> is defined. =cut sub is_section { my $self = shift; my $node = $self->getnode(@_); return defined($node) && $node->is_section; } =head2 $cfg->is_variable(@path) Returns true if the configuration setting addressed by B<@path> is set and is a simple statement. =cut sub is_variable { my $self = shift; my $node = $self->getnode(@_); return defined($node) && $node->is_value; } =head2 $cfg->tree Returns the parse tree. =cut sub tree { my $self = shift; return $self->{_tree} //= new Config::AST::Node::Section(locus => new Text::Locus); } =head2 $cfg->subtree(@path) Returns the configuration subtree associated with the statement indicated by B<@path>. =cut sub subtree { my $self = shift; return $self->tree->subtree(@_); } =head1 DIRECT ADDRESSING Direct addressing allows programmer to access configuration settings as if they were methods of the configuration class. For example, to retrieve the node at path qw(foo bar baz) one can write: $node = $cfg->foo->bar->baz This statement is equivalent to $node = $cfg->getnode(qw(foo bar baz)) except that if the node in question does not exist, direct access returns a I, and B returns C. Null node is a special node representing a missing node. Its B method returns true and it can be used in conditional context as a boolean value, e.g.: if (my $node = $cfg->foo->bar->baz) { $val = $node->value; } Direct addressing is enabled only if lexicon is provided (either during creation of the object, or later, via the B method). Obviously, statements that have names coinciding with one of the methods of the B class (or any of its subclasses) can't be used in direct addressing. In other words, you can't have a top-level statement called C and access it as $cfg->tree This statement will always refer to the method B of the B class. Another possible problem when using direct access are keywords with dashes. Currently a kludge is implemented to make it possible to access such keywords: when looking for a matching keyword, double underscores compare equal to a single dash. For example, to retrieve the C node, use $cfg->files->temp__dir; =cut our $AUTOLOAD; sub AUTOLOAD { my $self = shift; $AUTOLOAD =~ s/(?:(.*)::)?(.+)//; my ($p, $m) = ($1, $2); croak "Can't locate object method \"$m\" via package \"$p\"" if @_ || !$self->lexicon; return Config::AST::Follow->new($self->tree, $self->lexicon)->${\$m}; } sub DESTROY { } =head1 CONSTRUCTING THE SYNTAX TREE The methods described in this section are intended for use by the parser implementers. They should be called from the implementation of the B method in order to construct the tree. =cut sub _section_lexicon { my ($self, $kw, $name) = @_; if (defined($kw)) { if (ref($kw) eq 'HASH') { my $synt; if (exists($kw->{$name})) { $synt = $kw->{$name}; } elsif (exists($kw->{'*'})) { $synt = $kw->{'*'}; if ($synt eq '*') { return { '*' => '*' }; } } if (defined($synt) && ref($synt) eq 'HASH' && exists($synt->{section})) { return $synt->{section}; } } } return } use constant TAINT => eval '${^TAINT}'; use constant TESTS => TAINT && defined eval 'require Taint::Util'; =head2 $cfg->add_node($path, $node) Adds the node in the node corresponding to B<$path>. B<$path> can be either a list of keyword names, or its string representation, where names are separated by dots. I.e., the following two calls are equivalent: $cfg->add_node(qw(core pidfile), $node) $cfg->add_node('core.pidfile', $node) =cut sub add_node { my ($self, $path, $node) = @_; unless (ref($path) eq 'ARRAY') { $path = [ split(/\./, $path) ] } my $kw = $self->{_lexicon} // { '*' => '*' }; my $tree = $self->tree; my $pn = $#{$path}; my $name; my $locus = $node->locus; for (my $i = 0; $i < $pn; $i++) { $name = ${$path}[$i]; unless ($tree->is_section) { $self->error(join('.', @{$path}[0..$i]) . ": not a section"); $self->{_error_count}++; return; } $kw = $self->_section_lexicon($kw, $name); unless ($kw) { $self->error(join('.', @{$path}[0..$i]) . ": unknown section"); $self->{_error_count}++; return; } if (my $subtree = $tree->subtree($name)) { $tree = $subtree; } else { $tree = $tree->subtree( $name => new Config::AST::Node::Section( order => $self->{_order}++, locus => $locus->clone) ); } } $name = ${$path}[-1]; my $x = $kw->{$name} // $kw->{'*'}; if (!defined($x)) { $self->error("keyword \"$name\" is unknown", locus => $locus); $self->{_error_count}++; return; } if ($node->is_section) { if ($tree->has_key($name)) { $tree->locus->union($locus); $tree->subtree($name)->merge($node); } else { $tree->subtree($name => $node); } return $node; } my $v = $node->value; if (ref($x) eq 'HASH') { if (exists($x->{section})) { $self->error('"'.join('.', @{$path})."\" must be a section", locus => $locus); $self->{_error_count}++; return; } my $errstr; my $prev_val; if ($tree->has_key($name)) { # FIXME: is_value? $prev_val = $tree->subtree($name)->value; } my $nchecks; # Number of checks passed if (exists($x->{re})) { if ($v !~ /$x->{re}/) { $self->error("invalid value for $name", locus => $locus); $self->{_error_count}++; return; } $nchecks++; } if (my $ck = $x->{check}) { unless ($self->$ck(\$v, $prev_val, $locus)) { $self->{_error_count}++; return; } $nchecks++; } if ($nchecks && TESTS) { Taint::Util::untaint($v); } if ($x->{array}) { if (!defined($prev_val)) { $v = [ $v ]; } else { $v = [ @{$prev_val}, $v ]; } } } $tree->locus->union($locus->clone); my $newnode; if ($newnode = $tree->subtree($name)) { $newnode->locus->union($locus); } else { $newnode = $tree->subtree($name => $node); } $newnode->order($self->{order}++); $newnode->value($v); return $newnode; } =head2 $cfg->add_value($path, $value, $locus) Adds a statement node with the given B<$value> and B<$locus> in position, indicated by $path. =cut sub add_value { my ($self, $path, $value, $locus) = @_; $self->add_node($path, new Config::AST::Node::Value(value => $value, locus => $locus)); } =head2 $cfg->set(@path, $value) Sets the configuration variable B<@path> to B<$value>. =cut sub set { my $self = shift; my $node = $self->tree; while ($#_ > 1) { croak "not a section" unless $node->is_section; my $arg = shift; if (my $n = $node->subtree($arg)) { $node = $n; } else { $node = $node->subtree( $arg => new Config::AST::Node::Section ); } } my $v = $node->subtree($_[0]) || $node->subtree($_[0] => new Config::AST::Node::Value( order => $self->{_order}++ )); $v->value($_[1]); $v->default(0); return $v; } =head2 cfg->unset(@path) Unsets the configuration variable. =cut sub unset { my $self = shift; my $node = $self->{_tree} or return; my @path; for (@_) { return unless $node->is_section && $node->has_key($_); push @path, [ $node, $_ ]; $node = $node->subtree($_); } while (1) { my $loc = pop @path; $loc->[0]->delete($loc->[1]); last unless ($loc->[0]->keys == 0); } } =head1 AUXILIARY METHODS =head2 @array = $cfg->names_of(@path) If B<@path> refers to an existing configuration section, returns a list of names of variables and subsections defined within that section. Otherwise, returns empty list. For example, if you have [item foo] x = 1 [item bar] x = 1 [item baz] y = 2 the call $cfg->names_of('item') will return ( 'foo', 'bar', 'baz' ) =cut sub names_of { my $self = shift; my $node = $self->getnode(@_); return () unless defined($node) && $node->is_section; return $node->keys; } =head2 @array = $cfg->flatten() =head2 @array = $cfg->flatten(sort => $sort) Returns a I representation of the configuration, as a list of pairs B<[ $path, $value ]>, where B<$path> is a reference to the variable pathname, and B<$value> is a B object. The I<$sort> argument controls the ordering of the entries in the returned B<@array>. It is either a code reference suitable to pass to the Perl B function, or one of the following constants: =over 4 =item NO_SORT Don't sort the array. Statements will be placed in an apparently random order. =item SORT_NATURAL Preserve relative positions of the statements. Entries in the array will be in the same order as they appeared in the configuration file. This is the default. =item SORT_PATH Sort by pathname. =back These constants are not exported by default. You can either import the ones you need, or use the B<:sort> keyword to import them all, e.g.: use Config::AST qw(:sort); @array = $cfg->flatten(sort => SORT_PATH); =cut sub flatten { my $self = shift; $self->tree->flatten(@_); } =head2 $h = $cfg->as_hash =head2 $h = $cfg->as_hash($map) Returns parse tree converted to a hash reference. If B<$map> is supplied, it must be a reference to a function. For each I<$key>/I<$value> pair, this function will be called as: ($newkey, $newvalue) = &{$map}($what, $key, $value) where B<$what> is C
or C, depending on the type of the hash entry being processed. Upon successful return, B<$newvalue> will be inserted in the hash slot for the key B<$newkey>. If B<$what> is C
, B<$value> is always a reference to an empty hash (since the parse tree is traversed in pre-order fashion). In that case, the B<$map> function is supposed to do whatever initialization that is necessary for the new subtree and return as B<$newvalue> either B<$value> itself, or a reference to a hash available inside the B<$value>. For example: sub map { my ($what, $name, $val) = @_; if ($name eq 'section') { $val->{section} = {}; $val = $val->{section}; } ($name, $val); } =cut sub as_hash { my $self = shift; $self->tree->as_hash(@_); } =head2 $cfg->canonical(%args) Returns the canonical string representation of the configuration tree. For details, please refer to the documentation of this method in class B. =cut sub canonical { my $self = shift; $self->tree->canonical(@_); } sub lint_node { my ($self, $lexicon, $node, @path) = @_; $lexicon = {} unless ref($lexicon) eq 'HASH'; if (exists($lexicon->{section})) { return unless $node->is_section; } else { return if $node->is_section; } if (exists($lexicon->{select}) && !$self->${ \ $lexicon->{select} }($node, @path)) { return; } if ($node->is_section) { $self->lint_subtree($lexicon->{section}, $node, @path); } else { my $val = $node->value; my %opts = ( locus => $node->locus ); if (ref($val) eq 'ARRAY') { if ($lexicon->{array}) { my @ar; foreach my $v (@$val) { if (exists($lexicon->{re})) { if ($v !~ /$lexicon->{re}/) { $self->error("invalid value for $path[-1]", %opts); $self->{_error_count}++; next; } } if (my $ck = $lexicon->{check}) { unless ($self->$ck(\$v, @ar ? $ar[-1] : undef, $node->locus)) { $self->{_error_count}++; next; } } push @ar, $v; } $node->value(\@ar); return; } else { $val = pop(@$val); } } if (exists($lexicon->{re})) { if ($val !~ /$lexicon->{re}/) { $self->error("invalid value for $path[-1]", %opts); $self->{_error_count}++; return; } } if (my $ck = $lexicon->{check}) { unless ($self->$ck(\$val, undef, $node->locus)) { $self->{_error_count}++; return; } } $node->value($val); } } sub lint_subtree { my ($self, $lexicon, $node, @path) = @_; while (my ($var, $value) = each %{$node->subtree}) { if (exists($lexicon->{$var})) { $self->lint_node($lexicon->{$var}, $value, @path, $var); } elsif (exists($lexicon->{'*'})) { $self->lint_node($lexicon->{'*'}, $value, @path, $var); } elsif ($value->is_section) { next; } else { $self->error("keyword \"$var\" is unknown", locus => $value->locus); $self->{_error_count}++; } } } =head2 $cfg->lint(\%lex) Checks the syntax according to the keyword lexicon B<%lex>. On success, applies eventual default values and returns true. On errors, reports them using B and returns false. This method provides a way to delay syntax checking for a later time, which is useful, e.g. if some parts of the parser are loaded as modules after calling B. =cut sub lint { my ($self, $lexicon) = @_; # $synt->{'*'} = { section => { '*' => 1 }} ; $self->lint_subtree($lexicon, $self->tree); $self->fixup_tree($self->tree, $lexicon); return $self->{_error_count} == 0; } =head1 SEE ALSO B. B. =cut 1;