| File: | lib/Schema/Validator.pm |
| Coverage: | 97.5% |
| line | stmt | bran | cond | sub | time | code |
|---|---|---|---|---|---|---|
| 1 | package Schema::Validator; | |||||
| 2 | ||||||
| 3 | # --------------------------------------------------------------------------- | |||||
| 4 | # Schema::Validator -- ISO 8601 datetime validation and Schema.org vocabulary | |||||
| 5 | # loading. Purely functional; all symbols are opt-in via import list. | |||||
| 6 | # --------------------------------------------------------------------------- | |||||
| 7 | ||||||
| 8 | 7 7 7 | 768237 5 91 | use strict; | |||
| 9 | 7 7 7 | 9 5 131 | use warnings; | |||
| 10 | 7 7 7 | 1288 40561 13 | use autodie qw(:all); | |||
| 11 | ||||||
| 12 | 7 7 7 | 52565 7 194 | use Carp qw(carp croak); | |||
| 13 | 7 7 7 | 1780 1735673 206 | use DateTime::Format::ISO8601; | |||
| 14 | 7 7 7 | 334 4372 188 | use Encode qw(decode encode); | |||
| 15 | 7 7 7 | 14 5 75 | use File::Spec; | |||
| 16 | 7 7 7 | 16 8 173 | use JSON::MaybeXS qw(decode_json); | |||
| 17 | 7 7 7 | 2322 140729 153 | use LWP::UserAgent; | |||
| 18 | 7 7 7 | 204 4455 182 | use Params::Get qw(get_params); | |||
| 19 | 7 7 7 | 302 13991 138 | use Params::Validate::Strict qw(validate_strict); | |||
| 20 | 7 7 7 | 13 7 117 | use Readonly; | |||
| 21 | 7 7 7 | 22 4 96 | use Scalar::Util qw(reftype); | |||
| 22 | ||||||
| 23 | 7 7 7 | 10 6 5522 | use base 'Exporter'; | |||
| 24 | ||||||
| 25 | # Only these two symbols may be imported by callers via 'use ... qw(...)'. | |||||
| 26 | our @EXPORT_OK = qw(is_valid_datetime load_dynamic_vocabulary); | |||||
| 27 | ||||||
| 28 | our $VERSION = '0.03'; | |||||
| 29 | ||||||
| 30 | # --------------------------------------------------------------------------- | |||||
| 31 | # Package globals: both are populated as a side-effect of | |||||
| 32 | # load_dynamic_vocabulary(). Callers may read them after that call. | |||||
| 33 | # --------------------------------------------------------------------------- | |||||
| 34 | ||||||
| 35 | # rdfs:Class items from the Schema.org JSON-LD graph, keyed by class label | |||||
| 36 | our %dynamic_schema; | |||||
| 37 | ||||||
| 38 | # rdf:Property items from the Schema.org JSON-LD graph, keyed by property label | |||||
| 39 | our %dynamic_properties; | |||||
| 40 | ||||||
| 41 | # =========================================================================== | |||||
| 42 | # CONSTANTS | |||||
| 43 | # =========================================================================== | |||||
| 44 | # All magic strings and numbers are confined here; nothing below uses bare | |||||
| 45 | # literals. Every constant mirrors a key in %config so runtime overrides | |||||
| 46 | # are possible without re-opening the Readonly namespace. | |||||
| 47 | # --------------------------------------------------------------------------- | |||||
| 48 | ||||||
| 49 | # Default cache directory: $CACHEDIR env var if set, otherwise the system | |||||
| 50 | # temporary directory. Evaluated once at module load time. | |||||
| 51 | Readonly::Scalar my $DEFAULT_CACHE_DIR => | |||||
| 52 | (defined $ENV{CACHEDIR} && length $ENV{CACHEDIR}) | |||||
| 53 | ? $ENV{CACHEDIR} | |||||
| 54 | : File::Spec->tmpdir(); | |||||
| 55 | ||||||
| 56 | # Default cache filename -- stored in $DEFAULT_CACHE_DIR, never in CWD. | |||||
| 57 | Readonly::Scalar my $DEFAULT_CACHE_FILE => | |||||
| 58 | File::Spec->catfile($DEFAULT_CACHE_DIR, 'schemaorg_dynamic_vocabulary.jsonld'); | |||||
| 59 | ||||||
| 60 | # 86400 == 60 * 60 * 24: cache is considered fresh for one full day. | |||||
| 61 | Readonly::Scalar my $DEFAULT_CACHE_DURATION => 86_400; | |||||
| 62 | ||||||
| 63 | # Canonical URL for the Schema.org full vocabulary in JSON-LD format. | |||||
| 64 | Readonly::Scalar my $DEFAULT_VOCAB_URL => 'https://schema.org/version/latest/schemaorg-current-https.jsonld'; | |||||
| 65 | ||||||
| 66 | # HTTP timeout for the vocabulary download request, in seconds. | |||||
| 67 | Readonly::Scalar my $DEFAULT_UA_TIMEOUT => 30; | |||||
| 68 | ||||||
| 69 | # JSON-LD structural keys and RDF type labels used when traversing @graph. | |||||
| 70 | Readonly::Scalar my $AT_GRAPH => '@graph'; | |||||
| 71 | Readonly::Scalar my $RDF_CLASS => 'rdfs:Class'; | |||||
| 72 | Readonly::Scalar my $RDF_PROPERTY => 'rdf:Property'; | |||||
| 73 | Readonly::Scalar my $RDFS_LABEL => 'rdfs:label'; | |||||
| 74 | Readonly::Scalar my $RDFS_LABEL_FULL => 'http://www.w3.org/2000/01/rdf-schema#label'; | |||||
| 75 | ||||||
| 76 | # =========================================================================== | |||||
| 77 | # CONFIGURATION | |||||
| 78 | # =========================================================================== | |||||
| 79 | # Callers may override any key before calling an exported function, or inject | |||||
| 80 | # a full replacement via Object::Configure->configure('Schema::Validator', \%h). | |||||
| 81 | # --------------------------------------------------------------------------- | |||||
| 82 | our %config = ( | |||||
| 83 | cache_file => $DEFAULT_CACHE_FILE, | |||||
| 84 | cache_duration => $DEFAULT_CACHE_DURATION, | |||||
| 85 | vocab_url => $DEFAULT_VOCAB_URL, | |||||
| 86 | ua_timeout => $DEFAULT_UA_TIMEOUT, | |||||
| 87 | ); | |||||
| 88 | ||||||
| 89 | # =========================================================================== | |||||
| 90 | # PUBLIC INTERFACE (POD + code) | |||||
| 91 | # =========================================================================== | |||||
| 92 | ||||||
| 93 - 248 | =head1 NAME
Schema::Validator - Tools for validating and loading Schema.org vocabulary definitions
=head1 VERSION
Version 0.03
=head1 SYNOPSIS
use Schema::Validator qw(is_valid_datetime load_dynamic_vocabulary);
# Validate a date or datetime string
if (is_valid_datetime('2024-11-14')) {
print "Valid date\n";
}
# Load and query the Schema.org vocabulary
my $classes = load_dynamic_vocabulary();
if (exists $classes->{'Person'}) {
print "Person class is defined\n";
}
# Override a config value for a single call
my $classes = load_dynamic_vocabulary(ua_timeout => 60);
=head1 DESCRIPTION
C<Schema::Validator> provides two utilities for working with Schema.org
structured data:
=over 4
=item * L</is_valid_datetime> -- validates a string against the ISO 8601
date/datetime subset used by Schema.org.
=item * L</load_dynamic_vocabulary> -- downloads (and caches for 24 hours)
the full Schema.org JSON-LD vocabulary and exposes all class and property
definitions as a hashref and via package globals.
=back
=head2 Configuration
Runtime behaviour is controlled by the package-level C<%Schema::Validator::config>
hash. Supported keys and their defaults:
cache_file => "$CACHEDIR/schemaorg_dynamic_vocabulary.jsonld" # or tmpdir
cache_duration => 86400 # seconds
vocab_url => 'https://schema.org/.../schemaorg-current-https.jsonld'
ua_timeout => 30 # seconds
Override any key before calling an exported function:
$Schema::Validator::config{ua_timeout} = 60;
Or supply a complete replacement via L<Object::Configure>:
Object::Configure->configure('Schema::Validator', \%my_config);
=head1 PACKAGE VARIABLES
=head2 %dynamic_schema
Package hash keyed by Schema.org class label (e.g. C<Person>, C<Event>).
Values are the raw item hashrefs from the JSON-LD C<@graph> array.
Populated as a side-effect of L</load_dynamic_vocabulary>.
=head2 %dynamic_properties
Package hash keyed by Schema.org property label (e.g. C<name>, C<startDate>).
Values are the raw item hashrefs from the JSON-LD C<@graph> array.
Populated as a side-effect of L</load_dynamic_vocabulary>.
=head1 FUNCTIONS
=head2 is_valid_datetime
=head3 PURPOSE
Tests whether a scalar string conforms to one of the ISO 8601
date or datetime formats accepted by Schema.org:
YYYY-MM-DD (date only)
YYYY-MM-DDTHH:MM (T separator, no seconds)
YYYY-MM-DD HH:MM (space separator, no seconds)
YYYY-MM-DDTHH:MM:SS (T separator, with seconds)
YYYY-MM-DD HH:MM:SS (space separator, with seconds)
Optional timezone designators (C<Z>, C<+HH:MM>, C<-HH:MM>) are B<accepted>.
Calendar sanity B<is> enforced: out-of-range values (e.g. month 99) are B<rejected>.
=head3 ARGUMENTS
=over 4
=item * C<string> (required, scalar) -- the candidate string to test.
Both positional (C<is_valid_datetime('2024-11-14')>) and named
(C<is_valid_datetime(string =E<gt> '2024-11-14')>) calling conventions
are accepted.
=back
=head3 RETURNS
C<1> if the string is in a supported format; C<0> otherwise.
Returns C<0> for C<undef> or an empty string without throwing.
=head3 SIDE EFFECTS
None.
=head3 NOTES
Delegates to C<DateTime::Format::ISO8601->parse_datetime()> for semantic
validation, so out-of-range values (e.g. month 99) are rejected.
The space-separator variant (C<YYYY-MM-DD HH:MM>) is normalised to a T
separator before parsing since the module requires strict ISO 8601.
Timezone designators (C<Z>, C<+HH:MM>, C<-HH:MM>) are now accepted.
=head3 EXAMPLE
use Schema::Validator qw(is_valid_datetime);
is_valid_datetime('2024-11-14'); # 1
is_valid_datetime('2024-11-14T15:30:00'); # 1
is_valid_datetime('2024-11-14 15:30'); # 1 (space sep normalised)
is_valid_datetime('2024-11-14T15:30:00Z'); # 1 (UTC timezone)
is_valid_datetime('2024-11-14T15:30:00+01:00'); # 1 (offset timezone)
is_valid_datetime('2024-99-01'); # 0 (invalid month)
is_valid_datetime('28/06/2025'); # 0
is_valid_datetime(undef); # 0 (no exception)
is_valid_datetime(''); # 0 (no exception)
# Named calling convention
is_valid_datetime(string => '2024-11-14'); # 1
=head3 API SPECIFICATION
=head4 Input (Params::Validate::Strict)
{
string => {
type => 'string',
optional => 0,
},
}
=head4 Output (Return::Set)
{
type => 'boolean'
description => '1 (valid) or 0 (invalid, undef, or empty input)'
}
=cut | |||||
| 249 | ||||||
| 250 | sub is_valid_datetime { | |||||
| 251 | # Accept both positional (is_valid_datetime($s)) and named | |||||
| 252 | # (is_valid_datetime(string => $s)) calling conventions. | |||||
| 253 | # Validate: value must be a scalar or undef (undef returns 0 cleanly below). | |||||
| 254 | 113 | 633022 | my $p = validate_strict( | |||
| 255 | input => get_params('string', \@_), | |||||
| 256 | schema => { 'string' => { type => 'string', optional => 0 } }, | |||||
| 257 | ); | |||||
| 258 | ||||||
| 259 | 107 | 8366 | my $string = $p->{string}; | |||
| 260 | ||||||
| 261 | # Treat undef or empty string as invalid without throwing. | |||||
| 262 | 107 | 234 | return 0 unless defined $string && length $string; | |||
| 263 | ||||||
| 264 | # Normalise the space-separator variant to T before handing off to the | |||||
| 265 | # module, which requires strict ISO 8601 (T separator only). | |||||
| 266 | 90 | 183 | (my $normalised = $string) =~ s/^(\d{4}-\d{2}-\d{2}) (?=\d{2}:)/$1T/; | |||
| 267 | ||||||
| 268 | # Delegate to DateTime::Format::ISO8601 for full semantic validation; | |||||
| 269 | # a truthy (DateTime) object means valid, undef/$@ means invalid. | |||||
| 270 | 90 90 | 75 140 | return eval { DateTime::Format::ISO8601->parse_datetime($normalised) } ? 1 : 0; | |||
| 271 | } | |||||
| 272 | ||||||
| 273 | # =========================================================================== | |||||
| 274 | ||||||
| 275 - 381 | =head2 load_dynamic_vocabulary
=head3 PURPOSE
Downloads the complete Schema.org vocabulary from the official JSON-LD
endpoint, parses it into class and property lookup tables, caches the raw
JSON-LD locally, and returns the class table as a hashref.
The cache is considered fresh for C<cache_duration> seconds (default 24 hours).
On network failure the function falls back to a stale cache rather than
returning an empty result, and emits a C<carp> warning.
=head3 ARGUMENTS
All arguments are optional; defaults come from C<%Schema::Validator::config>.
=over 4
=item * C<cache_file> (optional, scalar) -- path to the local cache file.
Defaults to C<$config{cache_file}>: C<$CACHEDIR/schemaorg_dynamic_vocabulary.jsonld>
if C<$ENV{CACHEDIR}> is set, otherwise C<File::Spec-E<gt>tmpdir()> is used.
=item * C<cache_duration> (optional, scalar) -- cache validity window in seconds.
Defaults to C<$config{cache_duration}>.
=item * C<vocab_url> (optional, scalar) -- URL of the JSON-LD vocabulary endpoint.
Defaults to C<$config{vocab_url}>.
=item * C<ua_timeout> (optional, scalar) -- LWP::UserAgent timeout in seconds.
Defaults to C<$config{ua_timeout}>.
=back
Both zero-argument and named calling conventions are supported:
load_dynamic_vocabulary();
load_dynamic_vocabulary(ua_timeout => 60);
=head3 RETURNS
A hashref mapping class labels (e.g. C<'Person'>) to their raw JSON-LD
definition hashrefs from the C<@graph> array.
Returns an empty hashref C<{}> on all failure paths (network unreachable,
no cache, JSON parse error). Never throws.
=head3 SIDE EFFECTS
=over 4
=item * Populates C<%Schema::Validator::dynamic_schema> with class definitions.
=item * Populates C<%Schema::Validator::dynamic_properties> with property definitions.
=item * Creates or updates the local cache file on a successful download.
=item * Emits C<carp> warnings on network failures, I/O errors, or JSON
parse errors.
=back
=head3 NOTES
The default cache directory is determined once at module load time: the
C<$CACHEDIR> environment variable is used if set; otherwise C<File::Spec-E<gt>tmpdir()>
is used (typically C</tmp> on Unix). Override for the session with:
$Schema::Validator::config{cache_file} = '/my/path/vocab.jsonld';
The C<bin/validate-schema> CLI tool imports this function from the module and
uses C<cache_file =E<gt> $path> to store its cache under C<~/.cache/schema_validator/>.
=head3 EXAMPLE
use Schema::Validator qw(load_dynamic_vocabulary);
my $classes = load_dynamic_vocabulary();
printf "%d classes loaded\n", scalar keys %{$classes};
# Check for a specific class in the returned hashref
print "Has Person\n" if exists $classes->{'Person'};
# Or query the package globals directly after the call
Schema::Validator::load_dynamic_vocabulary();
my @names = sort keys %Schema::Validator::dynamic_schema;
=head3 API SPECIFICATION
=head4 Input (Params::Validate::Strict)
{
cache_file => { type => 'string', optional => 1 },
cache_duration => { type => 'string', optional => 1 },
vocab_url => { type => 'string', optional => 1 },
ua_timeout => { type => 'string', optional => 1 },
}
=head4 Output (Return::Set)
{
type => 'hashref',
description => 'class-label => JSON-LD item hashref'
# ON_FAILURE => 'empty hashref {}; never throws'
# SIDE_EFFECTS => 'populates %dynamic_schema and %dynamic_properties'
}
=cut | |||||
| 382 | ||||||
| 383 | sub load_dynamic_vocabulary { | |||||
| 384 | 104 | 279910 | my $params; | |||
| 385 | ||||||
| 386 | # Validate types of any supplied overrides (all are optional scalars). | |||||
| 387 | 104 | 138 | if(scalar(@_)) { | |||
| 388 | 103 | 172 | $params = validate_strict( | |||
| 389 | input => get_params(undef, \@_), | |||||
| 390 | schema => { | |||||
| 391 | cache_file => { type => 'string', optional => 1 }, | |||||
| 392 | cache_duration => { type => 'integer', optional => 1 }, | |||||
| 393 | vocab_url => { type => 'string', optional => 1 }, | |||||
| 394 | ua_timeout => { type => 'integer', optional => 1 }, | |||||
| 395 | } | |||||
| 396 | ); | |||||
| 397 | } | |||||
| 398 | ||||||
| 399 | # Merge caller overrides with module-level configuration defaults. | |||||
| 400 | 102 | 13253 | my $cache_file = $params->{cache_file} // $config{cache_file}; | |||
| 401 | 102 | 130 | my $cache_duration = $params->{cache_duration} // $config{cache_duration}; | |||
| 402 | 102 | 190 | my $vocab_url = $params->{vocab_url} // $config{vocab_url}; | |||
| 403 | 102 | 162 | my $ua_timeout = $params->{ua_timeout} // $config{ua_timeout}; | |||
| 404 | ||||||
| 405 | 102 | 58 | my $content; | |||
| 406 | ||||||
| 407 | # Attempt to read a fresh cache file. Open directly to avoid the TOCTOU | |||||
| 408 | # race that would exist between a separate -e test and the open call. | |||||
| 409 | 102 | 929 | if (-e $cache_file && (time - (stat($cache_file))[9] < $cache_duration)) { | |||
| 410 | 33 33 | 36 52 | eval { $content = _slurp_file($cache_file) }; | |||
| 411 | 33 | 850 | carp "Could not read cache '$cache_file': $@" if $@; | |||
| 412 | } | |||||
| 413 | ||||||
| 414 | # If no usable content yet, try to download the vocabulary. | |||||
| 415 | 102 | 314 | unless (defined $content) { | |||
| 416 | 71 | 76 | $content = _fetch_url($vocab_url, $ua_timeout); | |||
| 417 | ||||||
| 418 | 71 | 222 | if (defined $content) { | |||
| 419 | # Persist the download to the cache (best-effort; warn, do not die). | |||||
| 420 | 53 53 | 40 50 | eval { _spit_file($cache_file, $content) }; | |||
| 421 | 53 | 860 | carp "Could not write cache '$cache_file': $@" if $@; | |||
| 422 | } else { | |||||
| 423 | # Network failed; fall back to a stale cache if one exists. | |||||
| 424 | 18 | 64 | if (-e $cache_file) { | |||
| 425 | 8 8 | 6 11 | eval { $content = _slurp_file($cache_file) }; | |||
| 426 | 8 | 1045 | if ($@) { | |||
| 427 | 3 | 31 | carp "Could not read stale cache '$cache_file': $@"; | |||
| 428 | } else { | |||||
| 429 | 5 | 32 | carp "Network unavailable; using stale cache '$cache_file'"; | |||
| 430 | } | |||||
| 431 | } | |||||
| 432 | } | |||||
| 433 | } | |||||
| 434 | ||||||
| 435 | # All content-acquisition strategies failed; return empty result. | |||||
| 436 | 102 | 985 | unless (defined $content) { | |||
| 437 | 13 | 121 | carp 'load_dynamic_vocabulary: no vocabulary content available'; | |||
| 438 | 13 | 1450 | return {}; | |||
| 439 | } | |||||
| 440 | ||||||
| 441 | # Parse the JSON; treat errors as non-fatal warnings. | |||||
| 442 | 89 89 | 60 933 | my $data = eval { decode_json($content) }; | |||
| 443 | 89 | 2226 | if ($@) { | |||
| 444 | 4 | 45 | carp "Failed to parse vocabulary JSON: $@"; | |||
| 445 | 4 | 572 | return {}; | |||
| 446 | } | |||||
| 447 | ||||||
| 448 | # Guard against decode_json returning a non-object (e.g. a JSON array, | |||||
| 449 | # a bare number, or any other non-hash type). Calling exists on a | |||||
| 450 | # non-hashref dies; catching it here keeps the "never throws" contract. | |||||
| 451 | 85 | 126 | unless (ref($data) eq 'HASH') { | |||
| 452 | 2 | 25 | carp "Vocabulary JSON is not a JSON object"; | |||
| 453 | 2 | 208 | return {}; | |||
| 454 | } | |||||
| 455 | ||||||
| 456 | # Confirm the expected JSON-LD graph structure is present. | |||||
| 457 | 83 | 156 | unless (exists $data->{$AT_GRAPH} && ref($data->{$AT_GRAPH}) eq 'ARRAY') { | |||
| 458 | 5 | 56 | carp "Vocabulary JSON is missing the '\@graph' array"; | |||
| 459 | 5 | 680 | return {}; | |||
| 460 | } | |||||
| 461 | ||||||
| 462 | # Delegate parsing to the internal graph processor. | |||||
| 463 | 78 | 86 | my ($classes, $props) = _parse_graph($data->{$AT_GRAPH}); | |||
| 464 | ||||||
| 465 | # Populate package globals as documented side-effects. | |||||
| 466 | 78 78 | 52 486 | %dynamic_schema = %{$classes}; | |||
| 467 | 78 78 | 75 93 | %dynamic_properties = %{$props}; | |||
| 468 | ||||||
| 469 | # Report the result count via carp (informational, not an error). | |||||
| 470 | 78 | 859 | carp sprintf( | |||
| 471 | 'Dynamic vocabulary loaded: %d classes, %d properties', | |||||
| 472 | scalar(keys %dynamic_schema), | |||||
| 473 | scalar(keys %dynamic_properties), | |||||
| 474 | ); | |||||
| 475 | ||||||
| 476 | # Return the class hashref; callers needing properties use the global. | |||||
| 477 | 78 | 7452 | return $classes; | |||
| 478 | } | |||||
| 479 | ||||||
| 480 | # =========================================================================== | |||||
| 481 | # INTERNAL HELPERS | |||||
| 482 | # All routines below begin with _ and are not part of the public API. | |||||
| 483 | # =========================================================================== | |||||
| 484 | ||||||
| 485 | # --------------------------------------------------------------------------- | |||||
| 486 | # _slurp_file($path) | |||||
| 487 | # | |||||
| 488 | # Purpose: Read the complete contents of a file into a scalar. | |||||
| 489 | # Entry: $path is a path to an existing, readable file. | |||||
| 490 | # Returns: The file contents as a scalar string. | |||||
| 491 | # Side fx: None beyond reading the file. | |||||
| 492 | # Notes: autodie causes open/close to throw on failure; callers should | |||||
| 493 | # wrap in eval { } and handle $@ if a non-fatal path is needed. | |||||
| 494 | # --------------------------------------------------------------------------- | |||||
| 495 | sub _slurp_file { | |||||
| 496 | 38 | 9111 | my ($path) = @_; | |||
| 497 | ||||||
| 498 | # Open the file; autodie will throw if this fails. | |||||
| 499 | 38 | 64 | open my $fh, '<', $path; | |||
| 500 | ||||||
| 501 | # Temporarily undefine $/ to read the whole file in one operation. | |||||
| 502 | 37 | 5073 | local $/; | |||
| 503 | 37 | 335 | my $content = <$fh>; | |||
| 504 | ||||||
| 505 | 37 | 55 | close $fh; | |||
| 506 | 37 | 2247 | return $content; | |||
| 507 | } | |||||
| 508 | ||||||
| 509 | # --------------------------------------------------------------------------- | |||||
| 510 | # _spit_file($path, $content) | |||||
| 511 | # | |||||
| 512 | # Purpose: Write a scalar string to a file, creating or truncating it. | |||||
| 513 | # Entry: $path is a writable path; $content is a defined scalar. | |||||
| 514 | # Returns: 1 on success. | |||||
| 515 | # Side fx: Creates or overwrites $path. | |||||
| 516 | # Notes: autodie causes open/close to throw on failure; wrap in eval | |||||
| 517 | # when the write is non-critical (e.g. cache population). | |||||
| 518 | # --------------------------------------------------------------------------- | |||||
| 519 | sub _spit_file { | |||||
| 520 | 32 | 9323 | my ($path, $content) = @_; | |||
| 521 | ||||||
| 522 | # Open for writing; autodie throws on permission or path errors. | |||||
| 523 | 32 | 51 | open my $fh, '>', $path; | |||
| 524 | 31 | 3985 | print $fh $content; | |||
| 525 | 31 | 38 | close $fh; | |||
| 526 | ||||||
| 527 | 31 | 1980 | return 1; | |||
| 528 | } | |||||
| 529 | ||||||
| 530 | # --------------------------------------------------------------------------- | |||||
| 531 | # _fetch_url($url, $timeout) | |||||
| 532 | # | |||||
| 533 | # Purpose: Perform an HTTP GET and return the decoded response body. | |||||
| 534 | # Entry: $url is a valid absolute HTTP/HTTPS URL; $timeout is a positive | |||||
| 535 | # integer (seconds). | |||||
| 536 | # Returns: Decoded response content on success; undef on HTTP error. | |||||
| 537 | # Side fx: Network I/O; emits carp on non-success HTTP status. | |||||
| 538 | # Notes: Transport-level errors (DNS failure, TLS error) may propagate as | |||||
| 539 | # exceptions from LWP::UserAgent; callers should wrap in eval if | |||||
| 540 | # they need a guaranteed non-throwing call. | |||||
| 541 | # --------------------------------------------------------------------------- | |||||
| 542 | sub _fetch_url { | |||||
| 543 | 70 | 11029 | my ($url, $timeout) = @_; | |||
| 544 | ||||||
| 545 | # Build a minimal UA; timeout prevents indefinite hangs. | |||||
| 546 | 70 | 121 | my $ua = LWP::UserAgent->new(timeout => $timeout); | |||
| 547 | 70 | 163 | my $res = $ua->get($url); | |||
| 548 | ||||||
| 549 | # Treat any non-2xx status as a soft failure so callers can try fallbacks. | |||||
| 550 | 70 | 174 | unless ($res->is_success) { | |||
| 551 | 17 | 53 | carp "Failed to fetch '$url': ", $res->status_line; | |||
| 552 | 17 | 2293 | return; | |||
| 553 | } | |||||
| 554 | ||||||
| 555 | 53 | 124 | return $res->decoded_content; | |||
| 556 | } | |||||
| 557 | ||||||
| 558 | # --------------------------------------------------------------------------- | |||||
| 559 | # _extract_label($item) | |||||
| 560 | # | |||||
| 561 | # Purpose: Extract the rdfs:label string from a JSON-LD graph item hashref. | |||||
| 562 | # Entry: $item is a hashref that may contain 'rdfs:label' or the full | |||||
| 563 | # URI equivalent key. | |||||
| 564 | # Returns: The label as a plain string, or undef if no label is found. | |||||
| 565 | # Side fx: None. | |||||
| 566 | # Notes: Schema.org JSON-LD may represent the label as a scalar string or | |||||
| 567 | # as an array (for multi-language entries); this function always | |||||
| 568 | # returns the first (or only) value. | |||||
| 569 | # --------------------------------------------------------------------------- | |||||
| 570 | sub _extract_label { | |||||
| 571 | 1177 | 7048 | my ($item) = @_; | |||
| 572 | ||||||
| 573 | # Try the compact key first; fall back to the full RDF URI form. | |||||
| 574 | 1177 | 891 | my $label = $item->{$RDFS_LABEL} // $item->{$RDFS_LABEL_FULL}; | |||
| 575 | 1177 | 784 | return unless defined $label; | |||
| 576 | ||||||
| 577 | # If the label is multi-valued, take the first entry. | |||||
| 578 | 1175 | 1044 | return ref($label) eq 'ARRAY' ? $label->[0] : $label; | |||
| 579 | } | |||||
| 580 | ||||||
| 581 | # --------------------------------------------------------------------------- | |||||
| 582 | # _parse_graph(\@graph) | |||||
| 583 | # | |||||
| 584 | # Purpose: Iterate over a JSON-LD @graph array and partition items into | |||||
| 585 | # Schema.org class definitions and property definitions. | |||||
| 586 | # Entry: $graph_ref is an arrayref of item hashrefs as decoded from the | |||||
| 587 | # Schema.org JSON-LD vocabulary. | |||||
| 588 | # Returns: Two hashrefs: (\%classes, \%properties), each keyed by label. | |||||
| 589 | # Items are also indexed by the short name extracted from their | |||||
| 590 | # @id URI so that both 'MusicEvent' and its label resolve correctly. | |||||
| 591 | # Side fx: None. | |||||
| 592 | # Notes: Items with no recognisable label or @type are silently skipped. | |||||
| 593 | # The @id short-name index uses //= so the label always wins if | |||||
| 594 | # it differs. | |||||
| 595 | # --------------------------------------------------------------------------- | |||||
| 596 | sub _parse_graph { | |||||
| 597 | 90 | 17388 | my ($graph_ref) = @_; | |||
| 598 | ||||||
| 599 | 90 | 61 | my (%classes, %props); | |||
| 600 | ||||||
| 601 | # Iterate every item in the JSON-LD graph array. | |||||
| 602 | 90 90 | 50 85 | for my $item (@{$graph_ref}) { | |||
| 603 | ||||||
| 604 | # Skip items that do not declare an RDF type. | |||||
| 605 | 1174 | 883 | next unless exists $item->{'@type'}; | |||
| 606 | 1172 | 685 | my $item_type = $item->{'@type'}; | |||
| 607 | ||||||
| 608 | # Normalise @type: the spec allows either a scalar or an array. | |||||
| 609 | 1172 2 | 825 3 | my @types = ref($item_type) eq 'ARRAY' ? @{$item_type} : ($item_type); | |||
| 610 | ||||||
| 611 | # Extract the human-readable label; skip items with none. | |||||
| 612 | 1172 | 720 | my $label = _extract_label($item) or next; | |||
| 613 | ||||||
| 614 | # Index rdfs:Class items under their label and their @id short name. | |||||
| 615 | 1167 1169 | 724 1027 | if (grep { $_ eq $RDF_CLASS } @types) { | |||
| 616 | 1092 | 920 | $classes{$label} = $item; | |||
| 617 | ||||||
| 618 | # Secondary index by short URI fragment (e.g. 'MusicGroup'). | |||||
| 619 | 1092 | 767 | if (my $id = $item->{'@id'}) { | |||
| 620 | 1090 | 1048 | (my $short = $id) =~ s{.*/}{}; | |||
| 621 | 1090 | 868 | $classes{$short} //= $item; | |||
| 622 | } | |||||
| 623 | } | |||||
| 624 | ||||||
| 625 | # Index rdf:Property items under their label and @id short name. | |||||
| 626 | 1167 1169 | 630 1076 | if (grep { $_ eq $RDF_PROPERTY } @types) { | |||
| 627 | 76 | 65 | $props{$label} = $item; | |||
| 628 | ||||||
| 629 | # Secondary index by short URI fragment (e.g. 'startDate'). | |||||
| 630 | 76 | 63 | if (my $id = $item->{'@id'}) { | |||
| 631 | 75 | 88 | (my $short = $id) =~ s{.*/}{}; | |||
| 632 | 75 | 85 | $props{$short} //= $item; | |||
| 633 | } | |||||
| 634 | } | |||||
| 635 | } | |||||
| 636 | ||||||
| 637 | 90 | 112 | return (\%classes, \%props); | |||
| 638 | } | |||||
| 639 | ||||||
| 640 | # =========================================================================== | |||||
| 641 | # END OF MODULE POD | |||||
| 642 | # =========================================================================== | |||||
| 643 | ||||||
| 644 | =encoding utf-8 | |||||
| 645 | ||||||
| 646 - 793 | =head1 FILES
=head2 schemaorg_dynamic_vocabulary.jsonld
Cache file written to C<$CACHEDIR> (if set) or the system temporary directory
(C<File::Spec-E<gt>tmpdir()>), unless overridden via C<$config{cache_file}>.
Contains the downloaded Schema.org vocabulary in JSON-LD format. Refreshed
when older than C<$config{cache_duration}> seconds.
=head1 ERROR HANDLING
The module uses C<carp> rather than C<die> for recoverable failures:
=over 4
=item * Failed HTTP requests emit C<carp> and trigger the stale-cache fallback.
=item * JSON parse errors emit C<carp> and return C<{}>.
=item * File I/O errors emit C<carp>; the download path is attempted next.
=item * C<croak> is reserved for programmer errors (bad argument types).
=back
=head1 BUGS
=over 4
=item * Cache invalidation is time-based only; no checksum or version check.
=back
=head1 SEE ALSO
=over 4
=item * L<Test Dashboard|https://nigelhorne.github.io/Schema-Validator/coverage/>
=back
=head1 REPOSITORY
L<https://github.com/nigelhorne/schema-validator>
=head2 FORMAL SPECIFICATION
=head3 is_valid_datetime
Let CHAR denote the set of all Unicode code points and
DIGIT = { c : CHAR | c in {'0'..'9'} }.
Let seqN(S) = { s : seq S | #s = N }.
YEAR â seqN(4, DIGIT)
MONTH â seqN(2, DIGIT)
DAY â seqN(2, DIGIT)
HOUR â seqN(2, DIGIT)
MINUTE â seqN(2, DIGIT)
SECOND â seqN(2, DIGIT)
SEP â { 'T', ' ' }
DATE â { d : seq CHAR | â y â YEAR; mo â MONTH; dy â DAY
⢠d = y ⢠â¨'-'⩠⢠mo ⢠â¨'-'⩠⢠dy }
HHMM â { t : seq CHAR | â h â HOUR; m â MINUTE
⢠t = h ⢠â¨':'⩠⢠m }
HHMMSS â { t : seq CHAR | â h â HOUR; m â MINUTE; s â SECOND
⢠t = h ⢠â¨':'⩠⢠m ⢠â¨':'⩠⢠s }
TIMEFRAG â { tf : seq CHAR | â sep â SEP; hm â (HHMM ⪠HHMMSS)
⢠tf = â¨sep⩠⢠hm }
DATETIME â DATE ⪠{ dt : seq CHAR | â d â DATE; tf â TIMEFRAG
⢠dt = d ⢠tf }
ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ
IsValidDatetime
ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ
str? : seq CHAR
result! : B
ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ
result! ⺠str? â DATETIME
ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ
=head3 load_dynamic_library
Let FILE, DUR, URL be the resolved config values.
Let now : N be the current UNIX epoch time.
Let mtime : PATH -> N map a path to its last-modification time.
Let readable, writeable : PATH -> B be filesystem predicates.
Let reachable : URL -> B test HTTP reachability.
Let slurp : PATH -> seq CHAR and spit : PATH x seq CHAR -> 1.
Let fetch : URL x N -> seq CHAR (second arg is timeout).
Let decode_json : seq CHAR -> ITEM.
Let label : ITEM -> (LABEL | {}) extract rdfs:label.
Let types : ITEM -> P TYPE extract @type values.
FRESH â ( -e(FILE) ) â§ ( (now - mtime(FILE)) < DUR )
ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ
LoadDynamicVocabulary
ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ
ÎVocabularyStore
cache_file? : PATH
cache_duration? : N
vocab_url? : URL
ua_timeout? : N
result! : CLASS_LABEL ⸠ITEM
ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ
content : seq CHAR
FRESH â§ readable(cache_file?)
â content = slurp(cache_file?)
¬FRESH ⧠reachable(vocab_url?)
â content = fetch(vocab_url?, ua_timeout?)
â§ ( writeable(cache_file?) â spit(cache_file?, content) )
¬FRESH ⧠¬reachable(vocab_url?) ⧠-e(cache_file?)
â content = slurp(cache_file?)
graph â (decode_json content)[AT_GRAPH]
dynamic_schema' =
{ item â graph | RDF_CLASS â types(item) â§ label(item) â â
⢠label(item) ⦠item }
dynamic_properties' =
{ item â graph | RDF_PROPERTY â types(item) â§ label(item) â â
⢠label(item) ⦠item }
result! = dynamic_schema'
ââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââââ
=head1 AUTHOR
Nigel Horne, C<< <njh at nigelhorne.com> >>
=head1 LICENCE AND COPYRIGHT
Copyright 2025-2026 Nigel Horne.
Usage is subject to the GPL2 licence terms.
If you use it,
please let me know.
=cut | |||||
| 794 | ||||||
| 795 | 1; | |||||