| File: | blib/lib/XML/PP.pm |
| Coverage: | 79.1% |
| line | stmt | bran | cond | sub | time | code |
|---|---|---|---|---|---|---|
| 1 | package XML::PP; | |||||
| 2 | ||||||
| 3 | 4 4 4 | 280837 1 54 | use strict; | |||
| 4 | 4 4 4 | 5 3 83 | use warnings; | |||
| 5 | ||||||
| 6 | 4 4 4 | 535 13262 70 | use Params::Get 0.13; | |||
| 7 | 4 4 4 | 7 3 44 | use Scalar::Util; | |||
| 8 | 4 4 4 | 524 52424 3400 | use Return::Set; | |||
| 9 | ||||||
| 10 - 18 | =head1 NAME XML::PP - A simple XML parser =head1 VERSION Version 0.07 =cut | |||||
| 19 | ||||||
| 20 | our $VERSION = '0.07'; | |||||
| 21 | ||||||
| 22 - 74 | =head1 SYNOPSIS
use XML::PP;
my $parser = XML::PP->new();
my $xml = '<note id="1"><to priority="high">Tove</to><from>Jani</from><heading>Reminder</heading><body importance="high">Don\'t forget me this weekend!</body></note>';
my $tree = $parser->parse($xml);
print $tree->{name}; # 'note'
print $tree->{children}[0]->{name}; # 'to'
=head1 DESCRIPTION
You almost certainly do not need this module.
For most tasks,
use L<XML::Simple> or L<XML::LibXML>.
C<XML::PP> exists only for the most lightweight scenarios where you can't get one of the above modules to install,
for example,
CI/CD machines running Windows that get stuck with L<https://stackoverflow.com/questions/11468141/cant-load-c-strawberry-perl-site-lib-auto-xml-libxml-libxml-dll-for-module-x>.
C<XML::PP> is a simple, lightweight XML parser written in pure Perl.
It does not rely on external libraries like C<XML::LibXML> and is suitable for small XML parsing tasks.
This module supports basic XML document parsing, including namespace handling, attributes, and text nodes.
=head1 METHODS
=head2 new
my $parser = XML::PP->new();
my $parser = XML::PP->new(strict => 1);
my $parser = XML::PP->new(warn_on_error => 1);
Creates a new C<XML::PP> object.
It can take several optional arguments:
=over 4
=item * C<strict> - If set to true, the parser dies when it encounters unknown entities or unescaped ampersands.
=item * C<warn_on_error> - If true, the parser emits warnings for unknown or malformed XML entities. This is enabled automatically if C<strict> is enabled.
=item * C<logger>
Used for warnings and traces.
It can be an object that understands warn() and trace() messages,
such as a L<Log::Log4perl> or L<Log::Any> object,
a reference to code,
a reference to an array,
or a filename.
=back
=cut | |||||
| 75 | ||||||
| 76 | # Constructor for creating a new XML::PP object | |||||
| 77 | sub new | |||||
| 78 | { | |||||
| 79 | 8 | 270965 | my $class = shift; | |||
| 80 | 8 | 16 | my $params = Params::Get::get_params(undef, @_) || {}; | |||
| 81 | ||||||
| 82 | # strict implies warn_on_error | |||||
| 83 | 8 | 89 | if($params->{strict}) { | |||
| 84 | 2 | 1 | $params->{warn_on_error} = 1; | |||
| 85 | } | |||||
| 86 | ||||||
| 87 | 8 8 | 7 14 | my $self = bless { %{$params} }, $class; | |||
| 88 | ||||||
| 89 | 8 | 11 | if(my $logger = $self->{'logger'}) { | |||
| 90 | 0 | 0 | if(!Scalar::Util::blessed($logger)) { | |||
| 91 | # Don't "use" at the top, because of circular dependancy: | |||||
| 92 | # Log::Abstraction->Config::Abstraction->XML::PP | |||||
| 93 | 0 0 | 0 0 | eval { require Log::Abstraction }; | |||
| 94 | 0 | 0 | if($@) { | |||
| 95 | 0 | 0 | die $@; | |||
| 96 | } | |||||
| 97 | 0 | 0 | Log::Abstraction->import(); | |||
| 98 | 0 | 0 | $self->{'logger'} = Log::Abstraction->new($logger); | |||
| 99 | } | |||||
| 100 | } | |||||
| 101 | ||||||
| 102 | 8 | 12 | return $self; | |||
| 103 | } | |||||
| 104 | ||||||
| 105 - 126 | =head2 parse my $tree = $parser->parse($xml_string); Parses the XML string and returns a tree structure representing the XML content. The returned structure is a hash reference with the following fields: =over 4 =item * C<name> - The tag name of the node. =item * C<ns> - The namespace prefix (if any). =item * C<ns_uri> - The namespace URI (if any). =item * C<attributes> - A hash reference of attributes. =item * C<children> - An array reference of child nodes (either text nodes or further elements). =back =cut | |||||
| 127 | ||||||
| 128 | # Parse the given XML string and return the root node | |||||
| 129 | sub parse | |||||
| 130 | { | |||||
| 131 | 11 | 6614 | my $self = shift; | |||
| 132 | 11 | 15 | my $params = Params::Get::get_params('xml', \@_); | |||
| 133 | 11 | 124 | my $xml_string = $params->{'xml'}; | |||
| 134 | ||||||
| 135 | 11 | 14 | if(ref($xml_string) eq 'SCALAR') { | |||
| 136 | 0 0 | 0 0 | $xml_string = ${$xml_string}; | |||
| 137 | } | |||||
| 138 | # Check if the XML string is empty | |||||
| 139 | # if (!$xml_string || $xml_string !~ /<\?xml/) { | |||||
| 140 | # $self->_handle_error("Invalid or empty XML document provided"); | |||||
| 141 | 11 | 13 | if (!$xml_string) { | |||
| 142 | # $self->_handle_error("Empty XML document provided"); | |||||
| 143 | 0 | 0 | return {}; | |||
| 144 | } | |||||
| 145 | ||||||
| 146 | 11 | 14 | $xml_string =~ s/<!--.*?-->//sg; # Ignore comments | |||
| 147 | 11 | 12 | $xml_string =~ s/<\?xml.+\?>//; # Ignore the header | |||
| 148 | ||||||
| 149 | 11 | 87 | $xml_string =~ s/^\s+|\s+$//g; # Trim whitespace | |||
| 150 | # Check if the XML string is empty | |||||
| 151 | 11 | 14 | return $self->_parse_node(\$xml_string, {}); | |||
| 152 | } | |||||
| 153 | ||||||
| 154 - 252 | =head2 collapse_structure
Collapse an XML-like structure into a simplified hash (like L<XML::Simple>).
use XML::PP;
my $input = {
name => 'note',
children => [
{ name => 'to', children => [ { text => 'Tove' } ] },
{ name => 'from', children => [ { text => 'Jani' } ] },
{ name => 'heading', children => [ { text => 'Reminder' } ] },
{ name => 'body', children => [ { text => 'Don\'t forget me this weekend!' } ] },
],
attributes => { id => 'n1' },
};
my $result = collapse_structure($input);
# Output:
# {
# note => {
# to => 'Tove',
# from => 'Jani',
# heading => 'Reminder',
# body => 'Don\'t forget me this weekend!',
# }
# }
The C<collapse_structure> subroutine takes a nested hash structure (representing an XML-like data structure) and collapses it into a simplified hash where each child element is mapped to its name as the key, and the text content is mapped as the corresponding value. The final result is wrapped in a C<note> key, which contains a hash of all child elements.
This subroutine is particularly useful for flattening XML-like data into a more manageable hash format, suitable for further processing or display.
C<collapse_structure> accepts a single argument:
=over 4
=item * C<$node> (Required)
A hash reference representing a node with the following structure:
{
name => 'element_name', # Name of the element (e.g., 'note', 'to', etc.)
children => [ # List of child elements
{ name => 'child_name', children => [{ text => 'value' }] },
...
],
attributes => { ... }, # Optional attributes for the element
ns_uri => ... , # Optional namespace URI
ns => ... , # Optional namespace
}
The C<children> key holds an array of child elements. Each child element may have its own C<name> and C<text>, and the function will collapse all text values into key-value pairs.
=back
The subroutine returns a hash reference that represents the collapsed structure, where the top-level key is C<note> and its value is another hash containing the child elements' names as keys and their corresponding text values as values.
For example:
{
note => {
to => 'Tove',
from => 'Jani',
heading => 'Reminder',
body => 'Don\'t forget me this weekend!',
}
}
=over 4
=item Basic Example:
Given the following input structure:
my $input = {
name => 'note',
children => [
{ name => 'to', children => [ { text => 'Tove' } ] },
{ name => 'from', children => [ { text => 'Jani' } ] },
{ name => 'heading', children => [ { text => 'Reminder' } ] },
{ name => 'body', children => [ { text => 'Don\'t forget me this weekend!' } ] },
],
};
Calling C<collapse_structure> will return:
{
note => {
to => 'Tove',
from => 'Jani',
heading => 'Reminder',
body => 'Don\'t forget me this weekend!',
}
}
=back
=cut | |||||
| 253 | ||||||
| 254 | sub collapse_structure { | |||||
| 255 | 8 | 327 | my ($self, $node) = @_; | |||
| 256 | # my $self = shift; | |||||
| 257 | # my $params = Params::Get::get_params('node', \@_); | |||||
| 258 | # my $node = $params->{'node'}; | |||||
| 259 | ||||||
| 260 | 8 | 20 | return {} unless ref $node eq 'HASH' && $node->{children}; | |||
| 261 | ||||||
| 262 | 8 | 6 | my %result; | |||
| 263 | 8 8 | 6 8 | for my $child (@{ $node->{children} }) { | |||
| 264 | 19 | 20 | my $name = $child->{name} or next; | |||
| 265 | 19 | 10 | my $value; | |||
| 266 | ||||||
| 267 | 19 19 | 30 26 | if ($child->{children} && @{ $child->{children} }) { | |||
| 268 | 18 18 | 10 34 | if (@{ $child->{children} } == 1 && exists $child->{children}[0]{text}) { | |||
| 269 | 15 | 12 | $value = $child->{children}[0]{text}; | |||
| 270 | } else { | |||||
| 271 | 3 | 5 | $value = $self->collapse_structure($child)->{$name}; | |||
| 272 | } | |||||
| 273 | } | |||||
| 274 | ||||||
| 275 | 19 | 56 | next unless defined $value && $value ne ''; | |||
| 276 | ||||||
| 277 | # Handle multiple same-name children as an array | |||||
| 278 | 17 | 15 | if (exists $result{$name}) { | |||
| 279 | 2 | 4 | $result{$name} = [ $result{$name} ] unless ref $result{$name} eq 'ARRAY'; | |||
| 280 | 2 2 | 2 3 | push @{ $result{$name} }, $value; | |||
| 281 | } else { | |||||
| 282 | 15 | 18 | $result{$name} = $value; | |||
| 283 | } | |||||
| 284 | } | |||||
| 285 | 8 | 19 | return { $node->{name} => \%result }; | |||
| 286 | } | |||||
| 287 | ||||||
| 288 - 297 | =head2 _parse_node my $node = $self->_parse_node($xml_ref, $nsmap); Recursively parses an individual XML node. This method is used internally by the C<parse> method. It handles the parsing of tags, attributes, text nodes, and child elements. It also manages namespaces and handles self-closing tags. =cut | |||||
| 298 | ||||||
| 299 | # Internal method to parse an individual XML node | |||||
| 300 | sub _parse_node { | |||||
| 301 | 24 | 20 | my ($self, $xml_ref, $nsmap) = @_; | |||
| 302 | ||||||
| 303 | 24 | 20 | if(!defined($xml_ref)) { | |||
| 304 | 0 | 0 | if($self->{'logger'}) { | |||
| 305 | 0 | 0 | $self->{'logger'}->fatal('BUG: _parse_node, xml_ref not defined'); | |||
| 306 | } | |||||
| 307 | 0 | 0 | die 'BUG: _parse_node, xml_ref not defined'; | |||
| 308 | } | |||||
| 309 | ||||||
| 310 | # Match the start of a tag (self-closing or regular) | |||||
| 311 | 24 | 59 | $$xml_ref =~ s{^\s*<([^\s/>]+)([^>]*)\s*(/?)>}{}s or do { | |||
| 312 | 0 | 0 | $self->_handle_error('Expected a valid XML tag, but none found at position: ' . pos($$xml_ref)); | |||
| 313 | 0 | 0 | return; | |||
| 314 | }; | |||||
| 315 | ||||||
| 316 | 24 | 49 | my ($raw_tag, $attr_string, $self_close) = ($1, $2 || '', $3); | |||
| 317 | ||||||
| 318 | # Check for malformed self-closing tags | |||||
| 319 | 24 | 21 | if($self_close && $$xml_ref !~ /^\s*<\/(?:\w+:)?$raw_tag\s*>/) { | |||
| 320 | 0 | 0 | $self->_handle_error("Malformed self-closing tag for <$raw_tag>"); | |||
| 321 | 0 | 0 | return; | |||
| 322 | } | |||||
| 323 | ||||||
| 324 | # Handle possible trailing slash like <line break="yes"/> | |||||
| 325 | 24 | 25 | if($attr_string =~ s{/\s*$}{}) { | |||
| 326 | 1 | 1 | $self_close = 1; | |||
| 327 | } | |||||
| 328 | ||||||
| 329 | 24 | 30 | my ($ns, $tag) = $raw_tag =~ /^([^:]+):(.+)$/ | |||
| 330 | ? ($1, $2) | |||||
| 331 | : (undef, $raw_tag); | |||||
| 332 | ||||||
| 333 | 24 | 21 | my %local_nsmap = (%$nsmap); | |||
| 334 | ||||||
| 335 | # XMLNS declarations | |||||
| 336 | 24 | 33 | while ($attr_string =~ /(\w+)(?::(\w+))?="([^"]*)"/g) { | |||
| 337 | 12 | 17 | my ($k1, $k2, $v) = ($1, $2, $3); | |||
| 338 | 12 | 29 | if ($k1 eq 'xmlns' && !defined $k2) { | |||
| 339 | 0 | 0 | $local_nsmap{''} = $v; | |||
| 340 | } elsif ($k1 eq 'xmlns' && defined $k2) { | |||||
| 341 | 1 | 2 | $local_nsmap{$k2} = $v; | |||
| 342 | } | |||||
| 343 | } | |||||
| 344 | ||||||
| 345 | # Normalize whitespace between attributes but not inside quotes | |||||
| 346 | # - Collapse run of whitespace to one space | |||||
| 347 | # - Remove leading/trailing whitespace | |||||
| 348 | # - Preserve quoted attribute values | |||||
| 349 | { | |||||
| 350 | 24 24 | 16 10 | my $tmp = $attr_string; | |||
| 351 | ||||||
| 352 | # Replace all whitespace sequences outside of quotes with a single space | |||||
| 353 | # This works because it alternates: quoted | non-quoted | |||||
| 354 | 24 | 34 | my @parts = $tmp =~ /"[^"]*"|'[^']*'|[^\s"'']+/g; | |||
| 355 | ||||||
| 356 | # Rejoin non-quoted segments with a single space | |||||
| 357 | 24 | 30 | $attr_string = join(' ', @parts); | |||
| 358 | } | |||||
| 359 | ||||||
| 360 | 24 | 25 | my %attributes; | |||
| 361 | 24 | 37 | pos($attr_string) = 0; | |||
| 362 | ||||||
| 363 | # Accept name="value" and name='value' (value captured lazily, same quote used to open/close) | |||||
| 364 | # Attribute name follows XML Name-ish rules: start with letter/underscore/colon, then letters/digits/._:- | |||||
| 365 | 24 | 44 | while ($attr_string =~ /([A-Za-z_:][-A-Za-z0-9_.:]*)\s*=\s*(['"])(.*?)\2/g) { | |||
| 366 | 12 | 15 | my ($attr, $quote, $v) = ($1, $2, $3); | |||
| 367 | ||||||
| 368 | # Skip xmlns declarations (already handled) | |||||
| 369 | 12 | 11 | next if $attr =~ /^xmlns(?::|$)/; | |||
| 370 | ||||||
| 371 | # Decode XML entities inside attribute values | |||||
| 372 | 11 | 9 | $attributes{$attr} = $self->_decode_entities($v); | |||
| 373 | } | |||||
| 374 | ||||||
| 375 | my $node = { | |||||
| 376 | name => $tag, | |||||
| 377 | ns => $ns, | |||||
| 378 | 22 | 44 | ns_uri => defined $ns ? $local_nsmap{$ns} : undef, | |||
| 379 | attributes => \%attributes, | |||||
| 380 | children => [], | |||||
| 381 | }; | |||||
| 382 | ||||||
| 383 | # Return immediately if self-closing tag | |||||
| 384 | 22 | 19 | return $node if $self_close; | |||
| 385 | ||||||
| 386 | # Capture text | |||||
| 387 | 21 | 29 | if ($$xml_ref =~ s{^([^<]+)}{}s) { | |||
| 388 | 21 | 30 | my $text = $self->_decode_entities($1); | |||
| 389 | 21 | 32 | $text =~ s/^\s+|\s+$//g; | |||
| 390 | 21 15 | 19 17 | push @{ $node->{children} }, { text => $text } if $text ne ''; | |||
| 391 | } | |||||
| 392 | ||||||
| 393 | # Recursively parse children | |||||
| 394 | 21 | 36 | while ($$xml_ref =~ /^\s*<([^\/>"][^>]*)>/) { | |||
| 395 | 13 | 15 | my $child = $self->_parse_node($xml_ref, \%local_nsmap); | |||
| 396 | 13 13 | 780 31 | push @{ $node->{children} }, $child if $child; | |||
| 397 | } | |||||
| 398 | ||||||
| 399 | # Consume closing tag | |||||
| 400 | 21 | 556 | $$xml_ref =~ s{^\s*</(?:\w+:)?$tag\s*>}{}s; | |||
| 401 | ||||||
| 402 | 21 | 44 | return Return::Set::set_return($node, { 'type' => 'hashref', 'min' => 1 }); | |||
| 403 | } | |||||
| 404 | ||||||
| 405 | # Internal helper to decode XML entities | |||||
| 406 | sub _decode_entities { | |||||
| 407 | 32 | 31 | my ($self, $text) = @_; | |||
| 408 | ||||||
| 409 | 32 | 22 | return undef unless defined $text; | |||
| 410 | ||||||
| 411 | # Decode known named entities | |||||
| 412 | 32 | 37 | $text =~ s/</</g; | |||
| 413 | 32 | 17 | $text =~ s/>/>/g; | |||
| 414 | 32 | 17 | $text =~ s/&/&/g; | |||
| 415 | 32 | 15 | $text =~ s/"/"/g; | |||
| 416 | 32 | 22 | $text =~ s/'/'/g; | |||
| 417 | ||||||
| 418 | # Decode decimal numeric entities | |||||
| 419 | 32 3 | 14 6 | $text =~ s/&#(\d+);/chr($1)/eg; | |||
| 420 | ||||||
| 421 | # Decode hex numeric entities | |||||
| 422 | 32 1 | 19 2 | $text =~ s/&#x([0-9a-fA-F]+);/chr(hex($1))/eg; | |||
| 423 | ||||||
| 424 | 32 | 20 | if ($text =~ /&([^;]*);/) { | |||
| 425 | 3 | 3 | my $entity = $1; | |||
| 426 | 3 | 10 | unless ($entity =~ /^(lt|gt|amp|quot|apos)$/ || $entity =~ /^#(?:x[0-9a-fA-F]+|\d+)$/) { | |||
| 427 | 3 | 2 | my $msg = "Unknown or malformed XML entity: &$entity;"; | |||
| 428 | 3 | 3 | $self->_handle_error($msg); | |||
| 429 | } | |||||
| 430 | } | |||||
| 431 | ||||||
| 432 | 31 | 28 | if ($text =~ /&/) { | |||
| 433 | 7 | 6 | my $msg = "Unescaped ampersand detected: $text"; | |||
| 434 | 7 | 7 | $self->_handle_error($msg); | |||
| 435 | } | |||||
| 436 | ||||||
| 437 | 30 | 36 | return $text; | |||
| 438 | } | |||||
| 439 | ||||||
| 440 | sub _handle_error { | |||||
| 441 | 10 | 5 | my ($self, $message) = @_; | |||
| 442 | ||||||
| 443 | 10 | 8 | my $error_message = __PACKAGE__ . ": XML Parsing Error: $message"; | |||
| 444 | ||||||
| 445 | 10 | 8 | if($self->{strict}) { | |||
| 446 | # Throws an error if strict mode is enabled | |||||
| 447 | 2 | 1 | if($self->{'logger'}) { | |||
| 448 | 0 | 0 | $self->{'logger'}->fatal($error_message); | |||
| 449 | } | |||||
| 450 | 2 | 7 | die $error_message; | |||
| 451 | } elsif ($self->{warn_on_error}) { | |||||
| 452 | # Otherwise, just warn | |||||
| 453 | 3 | 3 | if($self->{'logger'}) { | |||
| 454 | 0 | 0 | $self->{'logger'}->warn($error_message); | |||
| 455 | } else { | |||||
| 456 | 3 | 9 | warn $error_message; | |||
| 457 | } | |||||
| 458 | } else { | |||||
| 459 | 5 | 4 | if($self->{'logger'}) { | |||
| 460 | 0 | 0 | $self->{'logger'}->notice($error_message); | |||
| 461 | } else { | |||||
| 462 | 5 | 14 | print STDERR "Warning: $error_message\n"; | |||
| 463 | } | |||||
| 464 | } | |||||
| 465 | } | |||||
| 466 | ||||||
| 467 - 505 | =head1 AUTHOR Nigel Horne, C<< <njh at nigelhorne.com> >> =head1 SEE ALSO =over 4 =item * Test coverage report: L<https://nigelhorne.github.io/XML-PP/coverage/> =item * L<XML::LibXML> =item * L<XML::Simple> =back =head1 SUPPORT This module is provided as-is without any warranty. =head1 LICENSE AND COPYRIGHT Copyright 2025 Nigel Horne. Usage is subject to licence terms. The licence terms of this software are as follows: =over 4 =item * Personal single user, single computer use: GPL2 =item * All other users (including Commercial, Charity, Educational, Government) must apply in writing for a licence for use from Nigel Horne at the above e-mail. =back =cut | |||||
| 506 | ||||||
| 507 | 1; | |||||