File Coverage

File:blib/lib/XML/PP.pm
Coverage:79.1%

linestmtbrancondsubtimecode
1package XML::PP;
2
3
4
4
4
280837
1
54
use strict;
4
4
4
4
5
3
83
use warnings;
5
6
4
4
4
535
13262
70
use Params::Get 0.13;
7
4
4
4
7
3
44
use Scalar::Util;
8
4
4
4
524
52424
3400
use Return::Set;
9
10 - 18
=head1 NAME

XML::PP - A simple XML parser

=head1 VERSION

Version 0.07

=cut
19
20our $VERSION = '0.07';
21
22 - 74
=head1 SYNOPSIS

  use XML::PP;

  my $parser = XML::PP->new();
  my $xml = '<note id="1"><to priority="high">Tove</to><from>Jani</from><heading>Reminder</heading><body importance="high">Don\'t forget me this weekend!</body></note>';
  my $tree = $parser->parse($xml);

  print $tree->{name};       # 'note'
  print $tree->{children}[0]->{name};     # 'to'

=head1 DESCRIPTION

You almost certainly do not need this module.
For most tasks,
use L<XML::Simple> or L<XML::LibXML>.
C<XML::PP> exists only for the most lightweight scenarios where you can't get one of the above modules to install,
for example,
CI/CD machines running Windows that get stuck with L<https://stackoverflow.com/questions/11468141/cant-load-c-strawberry-perl-site-lib-auto-xml-libxml-libxml-dll-for-module-x>.

C<XML::PP> is a simple, lightweight XML parser written in pure Perl.
It does not rely on external libraries like C<XML::LibXML> and is suitable for small XML parsing tasks.
This module supports basic XML document parsing, including namespace handling, attributes, and text nodes.

=head1 METHODS

=head2 new

  my $parser = XML::PP->new();
  my $parser = XML::PP->new(strict => 1);
  my $parser = XML::PP->new(warn_on_error => 1);

Creates a new C<XML::PP> object.
It can take several optional arguments:

=over 4

=item * C<strict> - If set to true, the parser dies when it encounters unknown entities or unescaped ampersands.

=item * C<warn_on_error> - If true, the parser emits warnings for unknown or malformed XML entities. This is enabled automatically if C<strict> is enabled.

=item * C<logger>

Used for warnings and traces.
It can be an object that understands warn() and trace() messages,
such as a L<Log::Log4perl> or L<Log::Any> object,
a reference to code,
a reference to an array,
or a filename.

=back

=cut
75
76# Constructor for creating a new XML::PP object
77sub new
78{
79
8
270965
        my $class = shift;
80
8
16
        my $params = Params::Get::get_params(undef, @_) || {};
81
82        # strict implies warn_on_error
83
8
89
        if($params->{strict}) {
84
2
1
                $params->{warn_on_error} = 1;
85        }
86
87
8
8
7
14
        my $self = bless { %{$params} }, $class;
88
89
8
11
        if(my $logger = $self->{'logger'}) {
90
0
0
                if(!Scalar::Util::blessed($logger)) {
91                        # Don't "use" at the top, because of circular dependancy:
92                        #       Log::Abstraction->Config::Abstraction->XML::PP
93
0
0
0
0
                        eval { require Log::Abstraction };
94
0
0
                        if($@) {
95
0
0
                                die $@;
96                        }
97
0
0
                        Log::Abstraction->import();
98
0
0
                        $self->{'logger'} = Log::Abstraction->new($logger);
99                }
100        }
101
102
8
12
        return $self;
103}
104
105 - 126
=head2 parse

  my $tree = $parser->parse($xml_string);

Parses the XML string and returns a tree structure representing the XML content.
The returned structure is a hash reference with the following fields:

=over 4

=item * C<name> - The tag name of the node.

=item * C<ns> - The namespace prefix (if any).

=item * C<ns_uri> - The namespace URI (if any).

=item * C<attributes> - A hash reference of attributes.

=item * C<children> - An array reference of child nodes (either text nodes or further elements).

=back

=cut
127
128# Parse the given XML string and return the root node
129sub parse
130{
131
11
6614
        my $self = shift;
132
11
15
        my $params = Params::Get::get_params('xml', \@_);
133
11
124
        my $xml_string = $params->{'xml'};
134
135
11
14
        if(ref($xml_string) eq 'SCALAR') {
136
0
0
0
0
                $xml_string = ${$xml_string};
137        }
138        # Check if the XML string is empty
139        # if (!$xml_string || $xml_string !~ /<\?xml/) {
140                # $self->_handle_error("Invalid or empty XML document provided");
141
11
13
        if (!$xml_string) {
142                # $self->_handle_error("Empty XML document provided");
143
0
0
                return {};
144        }
145
146
11
14
        $xml_string =~ s/<!--.*?-->//sg;  # Ignore comments
147
11
12
        $xml_string =~ s/<\?xml.+\?>//;   # Ignore the header
148
149
11
87
        $xml_string =~ s/^\s+|\s+$//g;  # Trim whitespace
150        # Check if the XML string is empty
151
11
14
        return $self->_parse_node(\$xml_string, {});
152}
153
154 - 252
=head2 collapse_structure

Collapse an XML-like structure into a simplified hash (like L<XML::Simple>).

  use XML::PP;

  my $input = {
      name => 'note',
      children => [
          { name => 'to', children => [ { text => 'Tove' } ] },
          { name => 'from', children => [ { text => 'Jani' } ] },
          { name => 'heading', children => [ { text => 'Reminder' } ] },
          { name => 'body', children => [ { text => 'Don\'t forget me this weekend!' } ] },
      ],
      attributes => { id => 'n1' },
  };

  my $result = collapse_structure($input);

  # Output:
  # {
  #     note => {
  #         to      => 'Tove',
  #         from    => 'Jani',
  #         heading => 'Reminder',
  #         body    => 'Don\'t forget me this weekend!',
  #     }
  # }

The C<collapse_structure> subroutine takes a nested hash structure (representing an XML-like data structure) and collapses it into a simplified hash where each child element is mapped to its name as the key, and the text content is mapped as the corresponding value. The final result is wrapped in a C<note> key, which contains a hash of all child elements.

This subroutine is particularly useful for flattening XML-like data into a more manageable hash format, suitable for further processing or display.

C<collapse_structure> accepts a single argument:

=over 4

=item * C<$node> (Required)

A hash reference representing a node with the following structure:

  {
      name      => 'element_name',  # Name of the element (e.g., 'note', 'to', etc.)
      children  => [                # List of child elements
          { name => 'child_name', children => [{ text => 'value' }] },
          ...
      ],
      attributes => { ... },        # Optional attributes for the element
      ns_uri => ... ,               # Optional namespace URI
      ns => ... ,                   # Optional namespace
  }

The C<children> key holds an array of child elements. Each child element may have its own C<name> and C<text>, and the function will collapse all text values into key-value pairs.

=back

The subroutine returns a hash reference that represents the collapsed structure, where the top-level key is C<note> and its value is another hash containing the child elements' names as keys and their corresponding text values as values.

For example:

  {
      note => {
          to      => 'Tove',
          from    => 'Jani',
          heading => 'Reminder',
          body    => 'Don\'t forget me this weekend!',
      }
  }

=over 4

=item Basic Example:

Given the following input structure:

  my $input = {
      name => 'note',
      children => [
          { name => 'to', children => [ { text => 'Tove' } ] },
          { name => 'from', children => [ { text => 'Jani' } ] },
          { name => 'heading', children => [ { text => 'Reminder' } ] },
          { name => 'body', children => [ { text => 'Don\'t forget me this weekend!' } ] },
      ],
  };

Calling C<collapse_structure> will return:

  {
      note => {
          to      => 'Tove',
          from    => 'Jani',
          heading => 'Reminder',
          body    => 'Don\'t forget me this weekend!',
      }
  }

=back

=cut
253
254sub collapse_structure {
255
8
327
        my ($self, $node) = @_;
256        # my $self = shift;
257        # my $params = Params::Get::get_params('node', \@_);
258        # my $node = $params->{'node'};
259
260
8
20
        return {} unless ref $node eq 'HASH' && $node->{children};
261
262
8
6
        my %result;
263
8
8
6
8
        for my $child (@{ $node->{children} }) {
264
19
20
                my $name = $child->{name} or next;
265
19
10
                my $value;
266
267
19
19
30
26
                if ($child->{children} && @{ $child->{children} }) {
268
18
18
10
34
                        if (@{ $child->{children} } == 1 && exists $child->{children}[0]{text}) {
269
15
12
                                $value = $child->{children}[0]{text};
270                        } else {
271
3
5
                                $value = $self->collapse_structure($child)->{$name};
272                        }
273                }
274
275
19
56
                next unless defined $value && $value ne '';
276
277                # Handle multiple same-name children as an array
278
17
15
                if (exists $result{$name}) {
279
2
4
                        $result{$name} = [ $result{$name} ] unless ref $result{$name} eq 'ARRAY';
280
2
2
2
3
                        push @{ $result{$name} }, $value;
281                } else {
282
15
18
                        $result{$name} = $value;
283                }
284        }
285
8
19
        return { $node->{name} => \%result };
286}
287
288 - 297
=head2 _parse_node

  my $node = $self->_parse_node($xml_ref, $nsmap);

Recursively parses an individual XML node.
This method is used internally by the C<parse> method.
It handles the parsing of tags, attributes, text nodes, and child elements.
It also manages namespaces and handles self-closing tags.

=cut
298
299# Internal method to parse an individual XML node
300sub _parse_node {
301
24
20
        my ($self, $xml_ref, $nsmap) = @_;
302
303
24
20
        if(!defined($xml_ref)) {
304
0
0
                if($self->{'logger'}) {
305
0
0
                        $self->{'logger'}->fatal('BUG: _parse_node, xml_ref not defined');
306                }
307
0
0
                die 'BUG: _parse_node, xml_ref not defined';
308        }
309
310        # Match the start of a tag (self-closing or regular)
311
24
59
        $$xml_ref =~ s{^\s*<([^\s/>]+)([^>]*)\s*(/?)>}{}s or do {
312
0
0
                $self->_handle_error('Expected a valid XML tag, but none found at position: ' . pos($$xml_ref));
313
0
0
                return;
314        };
315
316
24
49
        my ($raw_tag, $attr_string, $self_close) = ($1, $2 || '', $3);
317
318        # Check for malformed self-closing tags
319
24
21
        if($self_close && $$xml_ref !~ /^\s*<\/(?:\w+:)?$raw_tag\s*>/) {
320
0
0
                $self->_handle_error("Malformed self-closing tag for <$raw_tag>");
321
0
0
                return;
322        }
323
324        # Handle possible trailing slash like <line break="yes"/>
325
24
25
        if($attr_string =~ s{/\s*$}{}) {
326
1
1
                $self_close = 1;
327        }
328
329
24
30
        my ($ns, $tag) = $raw_tag =~ /^([^:]+):(.+)$/
330                ? ($1, $2)
331                : (undef, $raw_tag);
332
333
24
21
        my %local_nsmap = (%$nsmap);
334
335        # XMLNS declarations
336
24
33
        while ($attr_string =~ /(\w+)(?::(\w+))?="([^"]*)"/g) {
337
12
17
                my ($k1, $k2, $v) = ($1, $2, $3);
338
12
29
                if ($k1 eq 'xmlns' && !defined $k2) {
339
0
0
                        $local_nsmap{''} = $v;
340                } elsif ($k1 eq 'xmlns' && defined $k2) {
341
1
2
                        $local_nsmap{$k2} = $v;
342                }
343        }
344
345        # Normalize whitespace between attributes but not inside quotes
346        # - Collapse run of whitespace to one space
347        # - Remove leading/trailing whitespace
348        # - Preserve quoted attribute values
349        {
350
24
24
16
10
                my $tmp = $attr_string;
351
352                # Replace all whitespace sequences outside of quotes with a single space
353                # This works because it alternates: quoted | non-quoted
354
24
34
                my @parts = $tmp =~ /"[^"]*"|'[^']*'|[^\s"'']+/g;
355
356                # Rejoin non-quoted segments with a single space
357
24
30
                $attr_string = join(' ', @parts);
358        }
359
360
24
25
        my %attributes;
361
24
37
        pos($attr_string) = 0;
362
363        # Accept name="value" and name='value' (value captured lazily, same quote used to open/close)
364        # Attribute name follows XML Name-ish rules: start with letter/underscore/colon, then letters/digits/._:-
365
24
44
        while ($attr_string =~ /([A-Za-z_:][-A-Za-z0-9_.:]*)\s*=\s*(['"])(.*?)\2/g) {
366
12
15
                my ($attr, $quote, $v) = ($1, $2, $3);
367
368                # Skip xmlns declarations (already handled)
369
12
11
                next if $attr =~ /^xmlns(?::|$)/;
370
371                # Decode XML entities inside attribute values
372
11
9
                $attributes{$attr} = $self->_decode_entities($v);
373        }
374
375        my $node = {
376                name => $tag,
377                ns => $ns,
378
22
44
                ns_uri => defined $ns ? $local_nsmap{$ns} : undef,
379                attributes => \%attributes,
380                children => [],
381        };
382
383        # Return immediately if self-closing tag
384
22
19
        return $node if $self_close;
385
386        # Capture text
387
21
29
        if ($$xml_ref =~ s{^([^<]+)}{}s) {
388
21
30
                my $text = $self->_decode_entities($1);
389
21
32
                $text =~ s/^\s+|\s+$//g;
390
21
15
19
17
                push @{ $node->{children} }, { text => $text } if $text ne '';
391        }
392
393        # Recursively parse children
394
21
36
        while ($$xml_ref =~ /^\s*<([^\/>"][^>]*)>/) {
395
13
15
                my $child = $self->_parse_node($xml_ref, \%local_nsmap);
396
13
13
780
31
                push @{ $node->{children} }, $child if $child;
397        }
398
399        # Consume closing tag
400
21
556
        $$xml_ref =~ s{^\s*</(?:\w+:)?$tag\s*>}{}s;
401
402
21
44
        return Return::Set::set_return($node, { 'type' => 'hashref', 'min' => 1 });
403}
404
405# Internal helper to decode XML entities
406sub _decode_entities {
407
32
31
        my ($self, $text) = @_;
408
409
32
22
        return undef unless defined $text;
410
411        # Decode known named entities
412
32
37
        $text =~ s/&lt;/</g;
413
32
17
        $text =~ s/&gt;/>/g;
414
32
17
        $text =~ s/&amp;/&/g;
415
32
15
        $text =~ s/&quot;/"/g;
416
32
22
        $text =~ s/&apos;/'/g;
417
418        # Decode decimal numeric entities
419
32
3
14
6
        $text =~ s/&#(\d+);/chr($1)/eg;
420
421        # Decode hex numeric entities
422
32
1
19
2
        $text =~ s/&#x([0-9a-fA-F]+);/chr(hex($1))/eg;
423
424
32
20
        if ($text =~ /&([^;]*);/) {
425
3
3
                my $entity = $1;
426
3
10
                unless ($entity =~ /^(lt|gt|amp|quot|apos)$/ || $entity =~ /^#(?:x[0-9a-fA-F]+|\d+)$/) {
427
3
2
                        my $msg = "Unknown or malformed XML entity: &$entity;";
428
3
3
                        $self->_handle_error($msg);
429                }
430        }
431
432
31
28
        if ($text =~ /&/) {
433
7
6
                my $msg = "Unescaped ampersand detected: $text";
434
7
7
                $self->_handle_error($msg);
435        }
436
437
30
36
        return $text;
438}
439
440sub _handle_error {
441
10
5
        my ($self, $message) = @_;
442
443
10
8
        my $error_message = __PACKAGE__ . ": XML Parsing Error: $message";
444
445
10
8
        if($self->{strict}) {
446                # Throws an error if strict mode is enabled
447
2
1
                if($self->{'logger'}) {
448
0
0
                        $self->{'logger'}->fatal($error_message);
449                }
450
2
7
                die $error_message;
451        } elsif ($self->{warn_on_error}) {
452                # Otherwise, just warn
453
3
3
                if($self->{'logger'}) {
454
0
0
                        $self->{'logger'}->warn($error_message);
455                } else {
456
3
9
                        warn $error_message;
457                }
458        } else {
459
5
4
                if($self->{'logger'}) {
460
0
0
                        $self->{'logger'}->notice($error_message);
461                } else {
462
5
14
                        print STDERR "Warning: $error_message\n";
463                }
464        }
465}
466
467 - 505
=head1 AUTHOR

Nigel Horne, C<< <njh at nigelhorne.com> >>

=head1 SEE ALSO

=over 4

=item * Test coverage report: L<https://nigelhorne.github.io/XML-PP/coverage/>

=item * L<XML::LibXML>

=item * L<XML::Simple>

=back

=head1 SUPPORT

This module is provided as-is without any warranty.

=head1 LICENSE AND COPYRIGHT

Copyright 2025 Nigel Horne.

Usage is subject to licence terms.

The licence terms of this software are as follows:

=over 4

=item * Personal single user, single computer use: GPL2

=item * All other users (including Commercial, Charity, Educational, Government)
  must apply in writing for a licence for use from Nigel Horne at the
  above e-mail.

=back

=cut
506
5071;