| File: | blib/lib/SEO/Inspector.pm | 
| Coverage: | 54.7% | 
| line | stmt | bran | cond | sub | time | code | 
|---|---|---|---|---|---|---|
| 1 | package SEO::Inspector; | |||||
| 2 | ||||||
| 3 | 6 6 6 | 349188 2 82 | use strict; | |||
| 4 | 6 6 6 | 8 4 107 | use warnings; | |||
| 5 | ||||||
| 6 | 6 6 6 | 11 4 121 | use Carp; | |||
| 7 | 6 6 6 | 1396 1023435 22 | use Mojo::UserAgent; | |||
| 8 | 6 6 6 | 148 5 8 | use Mojo::URL; | |||
| 9 | 6 6 6 | 1259 18209 19 | use Module::Pluggable require => 1, search_path => 'SEO::Inspector::Plugin'; | |||
| 10 | 6 6 6 | 1497 147006 98 | use Object::Configure 0.14; | |||
| 11 | 6 6 6 | 17 26 117 | use Params::Get 0.13; | |||
| 12 | 6 6 6 | 9 29 11818 | use Params::Validate::Strict 0.11; | |||
| 13 | ||||||
| 14 - 22 | =head1 NAME SEO::Inspector - Run SEO checks on HTML or URLs =head1 VERSION Version 0.02 =cut | |||||
| 23 | ||||||
| 24 | our $VERSION = '0.02'; | |||||
| 25 | ||||||
| 26 - 166 | =head1 SYNOPSIS
  use SEO::Inspector;
  my $inspector = SEO::Inspector->new(url => 'https://example.com');
  # Run plugins
  my $html = '<html><body>......</body></html>';
  my $plugin_results = $inspector->check_html($html);
  # Run built-in checks
  my $builtin_results = $inspector->run_all($html);
  # Check a single URL and get all results
  my $all_results = $inspector->check_url('https://example.com');
=head1 DESCRIPTION
SEO::Inspector provides:
=over 4
=item * 14 built-in SEO checks
=item * Plugin system: dynamically load modules under SEO::Inspector::Plugin namespace
=item * Methods to check HTML strings or fetch and analyze a URL
=back
=head1 PLUGIN SYSTEM
In addition to the built-in SEO checks, C<SEO::Inspector> supports a flexible
plugin system.
Plugins allow you to extend the checker with new rules or
specialized analysis without modifying the core module.
=head2 How Plugins Are Found
Plugins are loaded dynamically from the C<SEO::Inspector::Plugin> namespace.
For example, a module called:
  package SEO::Inspector::Plugin::MyCheck;
will be detected and loaded automatically if it is available in C<@INC>.
You can also tell the constructor to search additional directories by passing
the C<plugin_dirs> argument:
  my $inspector = SEO::Inspector->new(
    plugin_dirs => ['t/lib', '/path/to/custom/plugins'],
  );
Each directory must contain files under a subpath corresponding to the
namespace, for example:
  /path/to/custom/plugins/SEO/Inspector/Plugin/Foo.pm
=head2 Plugin Interface
A plugin must provide at least two methods:
=over 4
=item * C<new>
Constructor, called with no arguments.
=item * C<run($html)>
Given a string of raw HTML, return a hashref describing the result of the check.
The hashref should have at least these keys:
  {
    name   => 'My Check',
    status => 'ok' | 'warn' | 'error',
    notes  => 'human-readable message',
    resolution => 'how to resolve'
  }
=back
=head2 Running Plugins
You can run all loaded plugins against a piece of HTML with:
  my $results = $inspector->check_html($html);
This returns a hashref keyed by plugin name (lowercased), each value being the
hashref returned by the plugin's C<run> method.
Plugins are also run automatically when you call C<check_url>:
  my $results = $inspector->check_url('https://example.com');
That result will include both built-in checks and plugin checks.
=head2 Example Plugin
Here is a minimal example plugin that checks whether the page contains
the string "Hello":
        package SEO::Inspector::Plugin::HelloCheck;
        use strict;
        use warnings;
        sub new { bless {}, shift }
        sub run {
                my ($self, $html) = @_;
                if($html =~ /Hello/) {
                        return { name => 'Hello Check', status => 'ok', notes => 'found Hello' };
                } else {
                        return { name => 'Hello Check', status => 'warn', notes => 'no Hello', resolution => 'add a hello field' };
                }
        }
        1;
Place this file under C<lib/SEO/Inspector/Plugin/HelloCheck.pm> (or another
directory listed in C<plugin_dirs>), and it will be discovered automatically.
=head2 Naming Conventions
The plugin key stored in C<< $inspector->{plugins} >> is derived from the final
part of the package name, lowercased. For example:
        SEO::Inspector::Plugin::HelloCheck -> "hellocheck"
This is the key you will see in the hashref returned by C<check_html> or
C<check_url>.
=head1 METHODS
=head2 new(%args)
Create a new inspector object.
Accepts optional C<ua>, C<plugins>, C<url> and C<plugin_dirs> arguments.
If C<plugin_dirs> isn't given, it tries hard to find the right place.
=cut | |||||
| 167 | ||||||
| 168 | sub new { | |||||
| 169 | 5 | 306202 | my $class = shift; | |||
| 170 | ||||||
| 171 | 5 | 18 | my $params = Params::Validate::Strict::validate_strict({ | |||
| 172 | args => Params::Get::get_params(undef, \@_) || {}, | |||||
| 173 | schema => { | |||||
| 174 | 'ua' => { | |||||
| 175 | 'type' => 'object', | |||||
| 176 | 'can' => 'get', | |||||
| 177 | 'optional' => 1, | |||||
| 178 | }, 'plugins' => { | |||||
| 179 | 'type' => 'hashref', | |||||
| 180 | 'optional' => 1, | |||||
| 181 | }, 'url' => { | |||||
| 182 | 'type' => 'string', | |||||
| 183 | 'matches' => qr{ | |||||
| 184 | ^ | |||||
| 185 | (?:https?://)? # Optional http or https scheme | |||||
| 186 | (?:www\.)? # Optional www subdomain | |||||
| 187 | [a-zA-Z0-9-]+ # Domain name part (alphanumeric and hyphens) | |||||
| 188 | (?:\.[a-zA-Z0-9-]+)+ # Subdomains or TLDs (e.g., .com, .co.uk) | |||||
| 189 | (?::\d{1,5})? # Optional port number | |||||
| 190 | (?:/[^\s]*)? # Optional path (any non-whitespace characters) | |||||
| 191 | $ | |||||
| 192 | }x, | |||||
| 193 | 'optional' => 1, | |||||
| 194 | }, 'plugin_dirs' => { | |||||
| 195 | 'type' => 'arrayref', | |||||
| 196 | 'optional' => 1, | |||||
| 197 | } | |||||
| 198 | } | |||||
| 199 | }); | |||||
| 200 | ||||||
| 201 | 5 | 425 | $params = Object::Configure::configure($class, $params); | |||
| 202 | ||||||
| 203 | 5 | 109675 | $params->{'ua'} ||= Mojo::UserAgent->new(); | |||
| 204 | 5 | 39 | $params->{'plugins'} ||= {}; | |||
| 205 | ||||||
| 206 | 5 5 | 4 19 | my $self = bless { %{$params} }, $class; | |||
| 207 | ||||||
| 208 | 5 | 13 | $self->load_plugins(); | |||
| 209 | ||||||
| 210 | 5 | 11 | return $self; | |||
| 211 | } | |||||
| 212 | ||||||
| 213 - 217 | =head2 load_plugins Loads plugins from the C<SEO::Inspector::Plugin> namespace. =cut | |||||
| 218 | ||||||
| 219 | sub load_plugins | |||||
| 220 | { | |||||
| 221 | 5 | 5 | my $self = $_[0]; | |||
| 222 | ||||||
| 223 | 5 | 16 | for my $plugin ($self->plugins()) { | |||
| 224 | 13 | 4178 | my $key = lc($plugin =~ s/.*:://r); | |||
| 225 | 13 | 28 | $self->{plugins}{$key} = $plugin->new(); | |||
| 226 | } | |||||
| 227 | 5 | 11 | if($self->{plugin_dirs}) { | |||
| 228 | 2 2 | 2 2 | for my $dir (@{$self->{plugin_dirs}}) { | |||
| 229 | 2 | 5 | local @INC = ($dir, @INC); | |||
| 230 | ||||||
| 231 | 2 | 7 | my $finder = Module::Pluggable::Object->new( | |||
| 232 | search_path => ['SEO::Inspector::Plugin'], | |||||
| 233 | require => 1, | |||||
| 234 | instantiate => 'new', | |||||
| 235 | ); | |||||
| 236 | ||||||
| 237 | 2 | 11 | for my $plugin ($finder->plugins) { | |||
| 238 | 6 | 24 | my $key = lc(ref($plugin) =~ s/.*:://r); | |||
| 239 | 6 | 28 | $self->{plugins}{$key} = $plugin; | |||
| 240 | } | |||||
| 241 | } | |||||
| 242 | } | |||||
| 243 | } | |||||
| 244 | ||||||
| 245 | # Fetch HTML from URL or object default | |||||
| 246 | sub _fetch_html { | |||||
| 247 | 0 | 0 | my ($self, $url) = @_; | |||
| 248 | ||||||
| 249 | 0 | 0 | $url //= $self->{url}; | |||
| 250 | 0 | 0 | croak 'URL missing' unless $url; | |||
| 251 | ||||||
| 252 | 0 | 0 | my $res = $self->{'ua'}->get($url)->result; | |||
| 253 | 0 | 0 | if ($res->is_error) { | |||
| 254 | 0 | 0 | croak 'Fetch failed: ', $res->message(); | |||
| 255 | } | |||||
| 256 | 0 | 0 | return $res->body; | |||
| 257 | } | |||||
| 258 | ||||||
| 259 - 263 | =head2 check($check_name, $html) Run a single built-in check or plugin on provided HTML (or fetch from object URL if HTML not provided). =cut | |||||
| 264 | ||||||
| 265 | # ------------------------------- | |||||
| 266 | # Run a single plugin or built-in check | |||||
| 267 | # ------------------------------- | |||||
| 268 | sub check { | |||||
| 269 | 60 | 41 | my ($self, $check_name, $html) = @_; | |||
| 270 | 60 | 38 | $html //= $self->_fetch_html(); | |||
| 271 | ||||||
| 272 | 60 | 144 | my %dispatch = ( | |||
| 273 | title => \&_check_title, | |||||
| 274 | meta_description => \&_check_meta_description, | |||||
| 275 | canonical => \&_check_canonical, | |||||
| 276 | robots_meta => \&_check_robots_meta, | |||||
| 277 | viewport => \&_check_viewport, | |||||
| 278 | h1_presence => \&_check_h1_presence, | |||||
| 279 | word_count => \&_check_word_count, | |||||
| 280 | links_alt_text => \&_check_links_alt_text, | |||||
| 281 | check_structured_data => \&_check_structured_data, | |||||
| 282 | check_headings => \&_check_headings, | |||||
| 283 | check_links => \&_check_links, | |||||
| 284 | open_graph => \&_check_open_graph, | |||||
| 285 | twitter_cards => \&_check_twitter_cards, | |||||
| 286 | page_size => \&_check_page_size, | |||||
| 287 | readability => \&_check_readability, | |||||
| 288 | ); | |||||
| 289 | ||||||
| 290 | # built-in checks | |||||
| 291 | 60 | 47 | if (exists $dispatch{$check_name}) { | |||
| 292 | 60 | 69 | return $dispatch{$check_name}->($self, $html); | |||
| 293 | } else { | |||||
| 294 | 0 | 0 | croak "Unknown check $check_name"; | |||
| 295 | } | |||||
| 296 | ||||||
| 297 | # plugin checks | |||||
| 298 | 0 | 0 | if (exists $self->{plugins}{$check_name}) { | |||
| 299 | 0 | 0 | my $plugin = $self->{plugins}{$check_name}; | |||
| 300 | 0 | 0 | return $plugin->run($html); | |||
| 301 | } | |||||
| 302 | ||||||
| 303 | 0 | 0 | return { name => $check_name, status => 'unknown', notes => '' }; | |||
| 304 | } | |||||
| 305 | ||||||
| 306 - 310 | =head2 run_all($html) Run all built-in checks on HTML (or object URL). =cut | |||||
| 311 | ||||||
| 312 | # ------------------------------- | |||||
| 313 | # Run all built-in checks | |||||
| 314 | # ------------------------------- | |||||
| 315 | sub run_all | |||||
| 316 | { | |||||
| 317 | 4 | 820 | my ($self, $html) = @_; | |||
| 318 | 4 | 7 | $html //= $self->_fetch_html(); | |||
| 319 | ||||||
| 320 | 4 | 3 | my %results; | |||
| 321 | 4 | 7 | for my $check (qw( | |||
| 322 | title meta_description canonical robots_meta viewport h1_presence word_count links_alt_text | |||||
| 323 | check_structured_data check_headings check_links | |||||
| 324 | open_graph twitter_cards page_size readability | |||||
| 325 | )) { | |||||
| 326 | 60 | 50 | $results{$check} = $self->check($check, $html); | |||
| 327 | } | |||||
| 328 | ||||||
| 329 | 4 | 7 | return \%results; | |||
| 330 | } | |||||
| 331 | ||||||
| 332 - 336 | =head2 check_html($html) Run all loaded plugins on HTML. =cut | |||||
| 337 | ||||||
| 338 | # ------------------------------- | |||||
| 339 | # Run all plugins on HTML | |||||
| 340 | # ------------------------------- | |||||
| 341 | sub check_html { | |||||
| 342 | 5 | 864 | my ($self, $html) = @_; | |||
| 343 | 5 | 9 | $html //= $self->_fetch_html(); | |||
| 344 | 5 | 13 | my %results; | |||
| 345 | ||||||
| 346 | 5 5 | 6 12 | for my $key (keys %{ $self->{plugins} }) { | |||
| 347 | 15 | 36 | my $plugin = $self->{plugins}{$key}; | |||
| 348 | 15 | 22 | $results{$key} = $plugin->run($html); | |||
| 349 | } | |||||
| 350 | ||||||
| 351 | 5 | 8 | return \%results; | |||
| 352 | } | |||||
| 353 | ||||||
| 354 - 358 | =head2 check_url($url) Fetch the URL and run all plugins and built-in checks. =cut | |||||
| 359 | ||||||
| 360 | # ------------------------------- | |||||
| 361 | # Run URL: fetch and check | |||||
| 362 | # ------------------------------- | |||||
| 363 | sub check_url { | |||||
| 364 | 1 | 7 | my ($self, $url) = @_; | |||
| 365 | ||||||
| 366 | 1 | 2 | $url //= $self->{url}; | |||
| 367 | ||||||
| 368 | 1 | 2 | croak('URL missing') unless $url; | |||
| 369 | ||||||
| 370 | 1 | 1 | my $html = $self->_fetch_html($url); | |||
| 371 | ||||||
| 372 | 1 | 49 | my $plugin_results = $self->check_html($html); | |||
| 373 | 1 | 2 | my $builtin_results = $self->run_all($html); | |||
| 374 | ||||||
| 375 | # merge all results | |||||
| 376 | 1 | 8 | my %results = (%$plugin_results, %$builtin_results, _html => $html); | |||
| 377 | 1 | 3 | return \%results; | |||
| 378 | } | |||||
| 379 | ||||||
| 380 | # ------------------------------- | |||||
| 381 | # Built-in check implementations | |||||
| 382 | # ------------------------------- | |||||
| 383 | sub _check_title { | |||||
| 384 | 4 | 4 | my ($self, $html) = @_; | |||
| 385 | ||||||
| 386 | 4 | 15 | if ($html =~ /<title>(.*?)<\/title>/is) { | |||
| 387 | 4 | 9 | my $title = $1; | |||
| 388 | 4 | 10 | $title =~ s/^\s+|\s+$//g; # trim | |||
| 389 | 4 | 30 | $title =~ s/\s{2,}/ /g; # collapse spaces | |||
| 390 | ||||||
| 391 | 4 | 4 | my $len = length($title); | |||
| 392 | 4 | 10 | my $status = 'ok'; | |||
| 393 | 4 | 6 | my $notes = "title present ($len chars)"; | |||
| 394 | ||||||
| 395 | 4 | 9 | if ($len == 0) { | |||
| 396 | 0 | 0 | $status = 'error'; | |||
| 397 | 0 | 0 | $notes = 'empty title'; | |||
| 398 | } elsif ($len < 10) { | |||||
| 399 | 4 | 3 | $status = 'warn'; | |||
| 400 | 4 | 4 | $notes = "title too short ($len chars)"; | |||
| 401 | } elsif ($len > 65) { | |||||
| 402 | 0 | 0 | $status = 'warn'; | |||
| 403 | 0 | 0 | $notes = "title too long ($len chars)"; | |||
| 404 | } | |||||
| 405 | ||||||
| 406 | # Flag really weak titles | |||||
| 407 | 4 | 16 | if ($title =~ /^(home|untitled|index)$/i) { | |||
| 408 | 0 | 0 | $status = 'warn'; | |||
| 409 | 0 | 0 | $notes = "generic title: $title"; | |||
| 410 | } | |||||
| 411 | ||||||
| 412 | 4 | 18 | return { name => 'Title', status => $status, notes => $notes }; | |||
| 413 | } | |||||
| 414 | ||||||
| 415 | 0 | 0 | return { name => 'Title', status => 'error', notes => 'missing title' }; | |||
| 416 | } | |||||
| 417 | ||||||
| 418 | ||||||
| 419 | sub _check_meta_description { | |||||
| 420 | 4 | 8 | my ($self, $html) = @_; | |||
| 421 | 4 | 18 | if ($html =~ /<meta\s+name=["']description["']\s+content=["'](.*?)["']/is) { | |||
| 422 | 3 | 5 | my $desc = $1; | |||
| 423 | 3 | 9 | return { name => 'Meta Description', status => 'ok', notes => 'meta description present' }; | |||
| 424 | } | |||||
| 425 | 1 | 3 | return { name => 'Meta Description', status => 'warn', notes => 'missing meta description' }; | |||
| 426 | } | |||||
| 427 | ||||||
| 428 | sub _check_canonical | |||||
| 429 | { | |||||
| 430 | 4 | 5 | my ($self, $html) = @_; | |||
| 431 | 4 | 8 | if ($html =~ /<link\s+rel=["']canonical["']\s+href=["'](.*?)["']/is) { | |||
| 432 | 0 | 0 | return { name => 'Canonical', status => 'ok', notes => 'canonical link present' }; | |||
| 433 | } | |||||
| 434 | return { | |||||
| 435 | 4 | 16 | name => 'Canonical', | |||
| 436 | status => 'warn', | |||||
| 437 | notes => 'missing canonical link', | |||||
| 438 | resolution => 'Add canonical link to <head>: <link rel="canonical" href="https://your-domain.com/this-page-url"> - use the preferred URL for this page to prevent duplicate content issues' | |||||
| 439 | }; | |||||
| 440 | } | |||||
| 441 | ||||||
| 442 | sub _check_robots_meta { | |||||
| 443 | 4 | 4 | my ($self, $html) = @_; | |||
| 444 | 4 | 11 | if ($html =~ /<meta\s+name=["']robots["']\s+content=["'](.*?)["']/is) { | |||
| 445 | 0 | 0 | return { name => 'Robots Meta', status => 'ok', notes => 'robots meta present' }; | |||
| 446 | } | |||||
| 447 | return { | |||||
| 448 | 4 | 14 | name => 'Robots Meta', | |||
| 449 | status => 'warn', | |||||
| 450 | notes => 'missing robots meta', | |||||
| 451 | resolution => 'Add robots meta tag to <head>: <meta name="robots" content="index, follow"> for normal indexing, or <meta name="robots" content="noindex, nofollow"> to prevent indexing - controls how search engines crawl and index this page' | |||||
| 452 | }; | |||||
| 453 | } | |||||
| 454 | ||||||
| 455 | sub _check_viewport { | |||||
| 456 | 4 | 3 | my ($self, $html) = @_; | |||
| 457 | 4 | 10 | if ($html =~ /<meta\s+name=["']viewport["']\s+content=["'](.*?)["']/is) { | |||
| 458 | 0 | 0 | return { name => 'Viewport', status => 'ok', notes => 'viewport meta present' }; | |||
| 459 | } | |||||
| 460 | return { | |||||
| 461 | 4 | 19 | name => 'Viewport', | |||
| 462 | status => 'warn', | |||||
| 463 | notes => 'missing viewport meta', | |||||
| 464 | resolution => 'Add viewport meta tag to <head>: <meta name="viewport" content="width=device-width, initial-scale=1.0"> - essential for mobile responsiveness and Google mobile-first indexing' | |||||
| 465 | }; | |||||
| 466 | } | |||||
| 467 | ||||||
| 468 | sub _check_h1_presence { | |||||
| 469 | 4 | 5 | my ($self, $html) = @_; | |||
| 470 | 4 | 17 | if ($html =~ /<h1\b[^>]*>(.*?)<\/h1>/is) { | |||
| 471 | 3 | 21 | return { name => 'H1 Presence', status => 'ok', notes => 'h1 tag present' }; | |||
| 472 | } | |||||
| 473 | 1 | 5 | return { name => 'H1 Presence', status => 'warn', notes => 'missing h1' }; | |||
| 474 | } | |||||
| 475 | ||||||
| 476 | sub _check_word_count { | |||||
| 477 | 4 | 4 | my ($self, $html) = @_; | |||
| 478 | 4 | 6 | my $text = $html; | |||
| 479 | 4 | 20 | $text =~ s/<[^>]+>//g; | |||
| 480 | 4 | 5 | my $words = scalar split /\s+/, $text; | |||
| 481 | 4 | 16 | return { name => 'Word Count', status => $words > 0 ? 'ok' : 'warn', notes => "$words words" }; | |||
| 482 | } | |||||
| 483 | ||||||
| 484 | sub _check_links_alt_text { | |||||
| 485 | 4 | 4 | my ($self, $html) = @_; | |||
| 486 | 4 | 3 | my @missing; | |||
| 487 | 4 | 14 | while ($html =~ /<img\b(.*?)>/gis) { | |||
| 488 | 0 | 0 | my $attr = $1; | |||
| 489 | 0 | 0 | push @missing, $1 unless $attr =~ /alt=/i; | |||
| 490 | } | |||||
| 491 | 4 | 7 | if(scalar(@missing)) { | |||
| 492 | return { | |||||
| 493 | 0 | 0 | name => 'Links Alt Text', | |||
| 494 | status => 'warn', | |||||
| 495 | notes => scalar(@missing) . ' images missing alt', | |||||
| 496 | resolution => 'Add alt attributes to all images: <img src="image.jpg" alt="Descriptive text"> - describe the image content for screen readers and SEO. Use alt="" for decorative images that don\'t add meaning' | |||||
| 497 | }; | |||||
| 498 | } | |||||
| 499 | ||||||
| 500 | return { | |||||
| 501 | 4 | 14 | name => 'Links Alt Text', | |||
| 502 | status => 'ok', | |||||
| 503 | notes => 'all images have alt' | |||||
| 504 | }; | |||||
| 505 | } | |||||
| 506 | ||||||
| 507 | sub _check_structured_data { | |||||
| 508 | 4 | 5 | my ($self, $html) = @_; | |||
| 509 | ||||||
| 510 | 4 | 9 | my @jsonld = ($html =~ /<script\b[^>]*type=["']application\/ld\+json["'][^>]*>(.*?)<\/script>/gis); | |||
| 511 | ||||||
| 512 | 4 | 6 | if(scalar(@jsonld)) { | |||
| 513 | return { | |||||
| 514 | 0 | 0 | name => 'Structured Data', | |||
| 515 | status => 'ok', | |||||
| 516 | notes => scalar(@jsonld) . ' JSON-LD block(s) found' | |||||
| 517 | }; | |||||
| 518 | } | |||||
| 519 | ||||||
| 520 | return { | |||||
| 521 | 4 | 12 | name => 'Structured Data', | |||
| 522 | status => 'warn', | |||||
| 523 | notes => 'no structured data found', | |||||
| 524 | resolution => 'Add JSON-LD structured data to <head>: <script type="application/ld+json">{"@context": "https://schema.org", "@type": "WebPage", "name": "Page Title", "description": "Page description"}</script> - helps search engines understand your content better and enables rich snippets' | |||||
| 525 | } | |||||
| 526 | } | |||||
| 527 | ||||||
| 528 | # _check_headings | |||||
| 529 | # ---------------- | |||||
| 530 | # Analyzes the HTML document for heading structure and returns a structured | |||||
| 531 | # SEO/a11y report. | |||||
| 532 | # | |||||
| 533 | # Checks performed: | |||||
| 534 | # - Presence of headings (<h1>â<h6>), with counts of each level. | |||||
| 535 | # - Ensures exactly one <h1> exists (warns if missing or multiple). | |||||
| 536 | # - Validates heading hierarchy (no skipped levels, e.g. <h3> should not appear before an <h2>). | |||||
| 537 | # - Flags suspicious heading text lengths (too short < 2 chars, or too long > 120 chars). | |||||
| 538 | # | |||||
| 539 | # Returns: | |||||
| 540 | # { | |||||
| 541 | # name => 'Headings', | |||||
| 542 | # status => 'ok' | 'warn', | |||||
| 543 | # notes => 'summary of counts and issues' | |||||
| 544 | # } | |||||
| 545 | # | |||||
| 546 | # Notes: | |||||
| 547 | # - Status is 'warn' if issues are found, otherwise 'ok'. | |||||
| 548 | # - 'error' status is reserved for future use (currently unused). | |||||
| 549 | ||||||
| 550 | sub _check_headings { | |||||
| 551 | 4 | 4 | my ($self, $html) = @_; | |||
| 552 | ||||||
| 553 | 4 | 5 | my %counts; | |||
| 554 | my @headings; | |||||
| 555 | ||||||
| 556 | # Capture all headings and their order | |||||
| 557 | 4 | 15 | while ($html =~ /<(h[1-6])\b[^>]*>(.*?)<\/\1>/gi) { | |||
| 558 | 3 | 4 | my $tag = lc $1; | |||
| 559 | 3 | 5 | my $text = $2 // ''; | |||
| 560 | 3 | 4 | $text =~ s/\s+/ /g; # normalize whitespace | |||
| 561 | 3 | 6 | $text =~ s/^\s+|\s+$//g; | |||
| 562 | ||||||
| 563 | 3 | 6 | $counts{$tag}++; | |||
| 564 | 3 | 10 | push @headings, { level => substr($tag, 1), text => $text }; | |||
| 565 | } | |||||
| 566 | ||||||
| 567 | 4 | 4 | my @issues; | |||
| 568 | 4 | 4 | my $status = 'ok'; | |||
| 569 | ||||||
| 570 | # Check for no headings | |||||
| 571 | 4 | 8 | if (!%counts) { | |||
| 572 | return { | |||||
| 573 | 1 | 3 | name => 'Headings', | |||
| 574 | status => 'warn', | |||||
| 575 | notes => 'no headings found', | |||||
| 576 | }; | |||||
| 577 | } | |||||
| 578 | ||||||
| 579 | # Check H1 presence/uniqueness | |||||
| 580 | 3 | 9 | if (!$counts{h1}) { | |||
| 581 | 0 | 0 | push @issues, 'missing <h1>'; | |||
| 582 | 0 | 0 | $status = 'warn'; | |||
| 583 | } | |||||
| 584 | elsif ($counts{h1} > 1) { | |||||
| 585 | 0 | 0 | push @issues, 'multiple <h1> tags'; | |||
| 586 | 0 | 0 | $status = 'warn'; | |||
| 587 | } | |||||
| 588 | ||||||
| 589 | # Check heading hierarchy (no skipped levels) | |||||
| 590 | 3 | 4 | my $last_level = 0; | |||
| 591 | 3 | 5 | for my $h (@headings) { | |||
| 592 | 3 | 3 | my $level = $h->{level}; | |||
| 593 | 3 | 6 | if ($last_level && $level > $last_level + 1) { | |||
| 594 | 0 | 0 | push @issues, "skipped heading level before <h$level>"; | |||
| 595 | 0 | 0 | $status = 'warn'; | |||
| 596 | } | |||||
| 597 | 3 | 3 | $last_level = $level; | |||
| 598 | } | |||||
| 599 | ||||||
| 600 | # Check heading text length (too short or too long) | |||||
| 601 | 3 | 5 | for my $h (@headings) { | |||
| 602 | 3 | 3 | my $len = length($h->{text}); | |||
| 603 | 3 | 12 | if ($len < 2) { | |||
| 604 | 0 | 0 | push @issues, "<h$h->{level}> too short"; | |||
| 605 | 0 | 0 | $status = 'warn'; | |||
| 606 | } | |||||
| 607 | elsif ($len > 120) { | |||||
| 608 | 0 | 0 | push @issues, "<h$h->{level}> too long"; | |||
| 609 | 0 | 0 | $status = 'warn'; | |||
| 610 | } | |||||
| 611 | } | |||||
| 612 | ||||||
| 613 | # Summarize counts and issues | |||||
| 614 | 3 3 | 5 7 | my $summary = join ', ', map { "$_: $counts{$_}" } sort keys %counts; | |||
| 615 | 3 | 12 | $summary .= @issues ? " | Issues: " . join('; ', @issues) : ''; | |||
| 616 | ||||||
| 617 | return { | |||||
| 618 | 3 | 12 | name => 'Headings', | |||
| 619 | status => $status, | |||||
| 620 | notes => $summary, | |||||
| 621 | }; | |||||
| 622 | } | |||||
| 623 | ||||||
| 624 | sub _check_links { | |||||
| 625 | 4 | 4 | my ($self, $html) = @_; | |||
| 626 | ||||||
| 627 | 4 | 3 | my $base_host; | |||
| 628 | 4 | 10 | if ($self->{url} && $self->{url} =~ m{^https?://}i) { | |||
| 629 | 0 | 0 | $base_host = Mojo::URL->new($self->{url})->host; | |||
| 630 | } | |||||
| 631 | ||||||
| 632 | 4 | 6 | my ($total, $internal, $external, $badtext) = (0,0,0,0); | |||
| 633 | ||||||
| 634 | # common "bad" link text patterns (exact match or just punctuation around) | |||||
| 635 | 4 | 8 | my $bad_rx = qr/^(?:click\s*here|read\s*more|more|link|here|details)$/i; | |||
| 636 | 4 | 2 | my $badtext_content; | |||
| 637 | ||||||
| 638 | 4 | 11 | while ($html =~ m{<a\b([^>]*)>(.*?)</a>}gis) { | |||
| 639 | 0 | 0 | my $attrs = $1; | |||
| 640 | 0 | 0 | my $text = $2 // ''; | |||
| 641 | ||||||
| 642 | 0 | 0 | $total++; | |||
| 643 | ||||||
| 644 | # get href (prefer quoted values) | |||||
| 645 | 0 | 0 | my ($href) = $attrs =~ /\bhref\s*=\s*"(.*?)"/i; | |||
| 646 | 0 | 0 | $href //= ($attrs =~ /\bhref\s*=\s*'(.*?)'/i ? $1 : undef); | |||
| 647 | 0 | 0 | $href //= ($attrs =~ /\bhref\s*=\s*([^\s>]+)/i ? $1 : undef); | |||
| 648 | ||||||
| 649 | # classify internal vs external | |||||
| 650 | 0 | 0 | if (defined $href && $href =~ m{^\s*https?://}i) { | |||
| 651 | # attempt to compare host | |||||
| 652 | 0 | 0 | my ($host) = $href =~ m{^\s*https?://([^/:\s]+)}i; | |||
| 653 | 0 | 0 | if (defined $base_host && defined $host) { | |||
| 654 | 0 | 0 | if (lc $host eq lc $base_host) { | |||
| 655 | 0 | 0 | $internal++; | |||
| 656 | } else { | |||||
| 657 | 0 | 0 | $external++; | |||
| 658 | } | |||||
| 659 | } else { | |||||
| 660 | # no base host to compare; treat as external if absolute URL | |||||
| 661 | 0 | 0 | $external++; | |||
| 662 | } | |||||
| 663 | } else { | |||||
| 664 | # relative URL or fragment or mailto/etc -> treat as internal | |||||
| 665 | 0 | 0 | $internal++; | |||
| 666 | } | |||||
| 667 | ||||||
| 668 | # normalize visible text: strip tags, trim whitespace, collapse spaces | |||||
| 669 | 0 | 0 | $text =~ s/<[^>]+>//g; | |||
| 670 | 0 | 0 | $text =~ s/^\s+|\s+$//g; | |||
| 671 | 0 | 0 | $text =~ s/\s+/ /g; | |||
| 672 | ||||||
| 673 | # check for bad link text (exact-ish) | |||||
| 674 | 0 | 0 | if($text =~ $bad_rx) { | |||
| 675 | 0 | 0 | $badtext++; | |||
| 676 | 0 | 0 | $badtext_content = $text; | |||
| 677 | } | |||||
| 678 | } | |||||
| 679 | ||||||
| 680 | 4 | 13 | my $status = ($external || $badtext) ? 'warn' : ($total ? 'ok' : 'warn'); | |||
| 681 | ||||||
| 682 | 4 | 5 | if ($total) { | |||
| 683 | 0 | 0 | if($badtext == 0) { | |||
| 684 | return { | |||||
| 685 | 0 | 0 | name => 'Links', | |||
| 686 | status => $status, | |||||
| 687 | notes => sprintf('%d total (%d internal, %d external)', | |||||
| 688 | $total, $internal, $external), | |||||
| 689 | } | |||||
| 690 | } | |||||
| 691 | 0 | 0 | if($badtext == 1) { | |||
| 692 | return { | |||||
| 693 | 0 | 0 | name => 'Links', | |||
| 694 | status => $status, | |||||
| 695 | notes => sprintf('%d total (%d internal, %d external). 1 link with poor anchor text', | |||||
| 696 | $total, $internal, $external), | |||||
| 697 | resolution => "fix link text '$badtext_content'" | |||||
| 698 | } | |||||
| 699 | } | |||||
| 700 | return { | |||||
| 701 | 0 | 0 | name => 'Links', | |||
| 702 | status => $status, | |||||
| 703 | notes => sprintf('%d total (%d internal, %d external). %d links with poor anchor text', | |||||
| 704 | $total, $internal, $external, $badtext), | |||||
| 705 | }; | |||||
| 706 | } | |||||
| 707 | return { | |||||
| 708 | 4 | 17 | name => 'Links', | |||
| 709 | status => $status, | |||||
| 710 | notes => 'no links found' | |||||
| 711 | } | |||||
| 712 | } | |||||
| 713 | ||||||
| 714 | # Checks for essential Open Graph tags that improve social media sharing | |||||
| 715 | sub _check_open_graph { | |||||
| 716 | 4 | 5 | my ($self, $html) = @_; | |||
| 717 | ||||||
| 718 | 4 | 4 | my %og_tags; | |||
| 719 | 4 | 8 | my @required = qw(title description image url); | |||
| 720 | ||||||
| 721 | # Extract all Open Graph meta tags | |||||
| 722 | 4 | 16 | while ($html =~ /<meta\s+(?:property|name)=["']og:([^"']+)["']\s+content=["']([^"']*)["']/gis) { | |||
| 723 | 0 | 0 | $og_tags{$1} = $2; | |||
| 724 | } | |||||
| 725 | ||||||
| 726 | 4 16 | 12 30 | my @missing = grep { !exists $og_tags{$_} || !$og_tags{$_} } @required; | |||
| 727 | 4 | 5 | my $found = keys %og_tags; | |||
| 728 | ||||||
| 729 | 4 | 7 | my $status = @missing ? 'warn' : 'ok'; | |||
| 730 | 4 | 6 | my $notes; | |||
| 731 | ||||||
| 732 | 4 | 4 | if ($found == 0) { | |||
| 733 | 4 | 6 | $notes = 'no Open Graph tags found'; | |||
| 734 | 4 | 7 | $status = 'warn'; | |||
| 735 | } elsif (@missing) { | |||||
| 736 | 0 | 0 | $notes = sprintf('%d OG tags found, missing: %s', $found, join(', ', @missing)); | |||
| 737 | } else { | |||||
| 738 | 0 | 0 | $notes = sprintf('all essential OG tags present (%d total)', $found); | |||
| 739 | } | |||||
| 740 | ||||||
| 741 | return { | |||||
| 742 | 4 | 14 | name => 'Open Graph', | |||
| 743 | status => $status, | |||||
| 744 | notes => $notes, | |||||
| 745 | resolution => 'Add missing tags to <head>: <meta property="og:title" content="Your Page Title">, <meta property="og:description" content="Brief page description">' | |||||
| 746 | }; | |||||
| 747 | } | |||||
| 748 | ||||||
| 749 | # Checks for Twitter Card meta tags for better Twitter sharing | |||||
| 750 | sub _check_twitter_cards { | |||||
| 751 | 4 | 3 | my ($self, $html) = @_; | |||
| 752 | ||||||
| 753 | 4 | 2 | my %twitter_tags; | |||
| 754 | 4 | 8 | my @recommended = qw(card title description); | |||
| 755 | ||||||
| 756 | # Extract Twitter Card meta tags | |||||
| 757 | 4 | 12 | while ($html =~ /<meta\s+(?:property|name)=["']twitter:([^"']+)["']\s+content=["']([^"']*)["']/gis) { | |||
| 758 | 0 | 0 | $twitter_tags{$1} = $2; | |||
| 759 | } | |||||
| 760 | ||||||
| 761 | 4 12 | 5 15 | my @missing = grep { !exists $twitter_tags{$_} || !$twitter_tags{$_} } @recommended; | |||
| 762 | 4 | 4 | my $found = keys %twitter_tags; | |||
| 763 | ||||||
| 764 | 4 | 6 | my $status = @missing ? 'warn' : 'ok'; | |||
| 765 | 4 | 4 | my $notes; | |||
| 766 | ||||||
| 767 | 4 | 7 | if ($found == 0) { | |||
| 768 | 4 | 3 | $notes = 'no Twitter Card tags found'; | |||
| 769 | 4 | 4 | $status = 'warn'; | |||
| 770 | } elsif (@missing) { | |||||
| 771 | 0 | 0 | $notes = sprintf('%d Twitter tags found, missing: %s', $found, join(', ', @missing)); | |||
| 772 | } else { | |||||
| 773 | 0 | 0 | $notes = sprintf('essential Twitter Card tags present (%d total)', $found); | |||
| 774 | } | |||||
| 775 | ||||||
| 776 | return { | |||||
| 777 | 4 | 12 | name => 'Twitter Cards', | |||
| 778 | status => $status, | |||||
| 779 | notes => $notes, | |||||
| 780 | resolution => 'Add missing tags to <head>: <meta name="twitter:card" content="summary">, <meta name="twitter:title" content="Your Page Title">' | |||||
| 781 | }; | |||||
| 782 | } | |||||
| 783 | ||||||
| 784 | # Checks HTML size and warns if too large (impacts loading speed) | |||||
| 785 | sub _check_page_size { | |||||
| 786 | 4 | 6 | my ($self, $html) = @_; | |||
| 787 | ||||||
| 788 | 4 | 2 | my $size_bytes = length($html); | |||
| 789 | 4 | 8 | my $size_kb = int($size_bytes / 1024); | |||
| 790 | ||||||
| 791 | 4 | 5 | my $status = 'ok'; | |||
| 792 | 4 | 5 | my $notes = "${size_kb}KB HTML size"; | |||
| 793 | 4 | 3 | my $resolution = ''; | |||
| 794 | ||||||
| 795 | 4 | 16 | if ($size_bytes > 1_048_576) { # > 1MB | |||
| 796 | 0 | 0 | $status = 'error'; | |||
| 797 | 0 | 0 | $notes .= ' (too large, over 1MB)'; | |||
| 798 | 0 | 0 | $resolution = 'Consider optimizing: minify CSS/JS, compress images, remove unused elements, enable server compression'; | |||
| 799 | } elsif ($size_bytes > 102_400) { # > 100KB | |||||
| 800 | 0 | 0 | $status = 'warn'; | |||
| 801 | 0 | 0 | $notes .= ' (large, consider optimization)'; | |||
| 802 | } elsif ($size_bytes < 1024) { # < 1KB | |||||
| 803 | 4 | 3 | $status = 'warn'; | |||
| 804 | 4 | 4 | $notes .= ' (suspiciously small)'; | |||
| 805 | } else { | |||||
| 806 | 0 | 0 | $notes .= ' (good size)'; | |||
| 807 | } | |||||
| 808 | ||||||
| 809 | return { | |||||
| 810 | 4 | 10 | name => 'Page Size', | |||
| 811 | status => $status, | |||||
| 812 | notes => $notes, | |||||
| 813 | resolution => $resolution | |||||
| 814 | }; | |||||
| 815 | } | |||||
| 816 | ||||||
| 817 | # Calculates approximate Flesch Reading Ease score for content readability | |||||
| 818 | sub _check_readability { | |||||
| 819 | 4 | 4 | my ($self, $html) = @_; | |||
| 820 | ||||||
| 821 | # Extract text content (remove scripts, styles, and HTML tags) | |||||
| 822 | 4 | 5 | my $text = $html; | |||
| 823 | 4 | 8 | $text =~ s/<script\b[^>]*>.*?<\/script>//gis; | |||
| 824 | 4 | 6 | $text =~ s/<style\b[^>]*>.*?<\/style>//gis; | |||
| 825 | 4 | 19 | $text =~ s/<[^>]+>//g; | |||
| 826 | 4 | 7 | $text =~ s/\s+/ /g; | |||
| 827 | 4 | 9 | $text =~ s/^\s+|\s+$//g; | |||
| 828 | ||||||
| 829 | return { | |||||
| 830 | 4 | 18 | name => 'Readability', | |||
| 831 | status => 'warn', | |||||
| 832 | notes => 'insufficient text for analysis', | |||||
| 833 | resolution => 'Add more content to the page - aim for at least 300 words of meaningful text', | |||||
| 834 | } if length($text) < 100; | |||||
| 835 | ||||||
| 836 | # Count sentences (approximate) | |||||
| 837 | 0 | my $sentences = () = $text =~ /[.!?]+/g; | ||||
| 838 | 0 | $sentences = 1 if $sentences == 0; # avoid division by zero | ||||
| 839 | ||||||
| 840 | # Count words | |||||
| 841 | 0 | my @words = split /\s+/, $text; | ||||
| 842 | 0 | my $word_count = @words; | ||||
| 843 | ||||||
| 844 | return { | |||||
| 845 | 0 | name => 'Readability', | ||||
| 846 | status => 'warn', | |||||
| 847 | notes => 'insufficient content for analysis', | |||||
| 848 | resolution => 'Add more substantial content - aim for at least 300 words for proper SEO value', | |||||
| 849 | } if $word_count < 50; | |||||
| 850 | ||||||
| 851 | # Count syllables (very basic approximation) | |||||
| 852 | 0 | my $syllables = 0; | ||||
| 853 | 0 | for my $word (@words) { | ||||
| 854 | 0 | $word = lc($word); | ||||
| 855 | 0 | $word =~ s/[^a-z]//g; # remove punctuation | ||||
| 856 | 0 | next if length($word) == 0; | ||||
| 857 | ||||||
| 858 | # Simple syllable counting heuristic | |||||
| 859 | 0 | my $vowels = () = $word =~ /[aeiouy]/g; | ||||
| 860 | 0 | $syllables += $vowels > 0 ? $vowels : 1; | ||||
| 861 | 0 | $syllables-- if $word =~ /e$/; # silent e | ||||
| 862 | } | |||||
| 863 | 0 | $syllables = $word_count if $syllables < $word_count; # minimum 1 syllable per word | ||||
| 864 | ||||||
| 865 | # Flesch Reading Ease formula | |||||
| 866 | 0 | my $avg_sentence_length = $word_count / $sentences; | ||||
| 867 | 0 | my $avg_syllables_per_word = $syllables / $word_count; | ||||
| 868 | 0 | my $flesch_score = 206.835 - (1.015 * $avg_sentence_length) - (84.6 * $avg_syllables_per_word); | ||||
| 869 | ||||||
| 870 | 0 | my $status = 'ok'; | ||||
| 871 | 0 | my $level; | ||||
| 872 | my $notes; | |||||
| 873 | 0 | my $resolution = ''; | ||||
| 874 | ||||||
| 875 | 0 | if ($flesch_score >= 90) { | ||||
| 876 | 0 | $level = 'very easy'; | ||||
| 877 | } elsif ($flesch_score >= 80) { | |||||
| 878 | 0 | $level = 'easy'; | ||||
| 879 | } elsif ($flesch_score >= 70) { | |||||
| 880 | 0 | $level = 'fairly easy'; | ||||
| 881 | } elsif ($flesch_score >= 60) { | |||||
| 882 | 0 | $level = 'standard'; | ||||
| 883 | } elsif ($flesch_score >= 50) { | |||||
| 884 | 0 | $level = 'fairly difficult'; | ||||
| 885 | 0 | $status = 'warn'; | ||||
| 886 | 0 | $resolution = 'Consider simplifying: use shorter sentences (aim for 15-20 words), choose simpler words, break up long paragraphs, add bullet points or lists'; | ||||
| 887 | } elsif ($flesch_score >= 30) { | |||||
| 888 | 0 | $level = 'difficult'; | ||||
| 889 | 0 | $status = 'warn'; | ||||
| 890 | 0 | $resolution = 'Improve readability: use much shorter sentences (10-15 words), replace complex words with simpler alternatives, add more paragraph breaks, use active voice'; | ||||
| 891 | } else { | |||||
| 892 | 0 | $level = 'very difficult'; | ||||
| 893 | 0 | $status = 'warn'; | ||||
| 894 | 0 | $resolution = 'Significantly simplify content: break long sentences into multiple short ones, replace jargon with plain language, add explanations for technical terms, use more white space and formatting'; | ||||
| 895 | } | |||||
| 896 | ||||||
| 897 | 0 | $notes = sprintf('Flesch score: %.1f (%s) - %d words, %d sentences', | ||||
| 898 | $flesch_score, $level, $word_count, $sentences); | |||||
| 899 | ||||||
| 900 | return { | |||||
| 901 | 0 | name => 'Readability', | ||||
| 902 | status => $status, | |||||
| 903 | notes => $notes, | |||||
| 904 | resolution => $resolution, | |||||
| 905 | }; | |||||
| 906 | } | |||||
| 907 | ||||||
| 908 - 982 | =head1 AUTHOR Nigel Horne, C<< <njh at nigelhorne.com> >> =head1 SEE ALSO =over 4 =item * Test coverage report: L<https://nigelhorne.github.io/SEO-Inspector/coverage/> =item * L<https://github.com/nigelhorne/SEO-Checker> =item * L<https://github.com/sethblack/python-seo-analyzer> =back =head1 REPOSITORY L<https://github.com/nigelhorne/SEO-Inspector> =head1 SUPPORT This module is provided as-is without any warranty. Please report any bugs or feature requests to C<bug-seo-inspector at rt.cpan.org>, or through the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=SEO-Inspector>. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes. You can find documentation for this module with the perldoc command. perldoc SEO::Inspector You can also look for information at: =over 4 =item * MetaCPAN L<https://metacpan.org/dist/SEO-Inspector> =item * RT: CPAN's request tracker L<https://rt.cpan.org/NoAuth/Bugs.html?Dist=SEO-Inspector> =item * CPAN Testers' Matrix L<http://matrix.cpantesters.org/?dist=SEO-Inspector> =item * CPAN Testers Dependencies L<http://deps.cpantesters.org/?module=SEO::Inspector> =back =head1 LICENCE AND COPYRIGHT Copyright 2025 Nigel Horne. Usage is subject to licence terms. The licence terms of this software are as follows: =over 4 =item * Personal single user, single computer use: GPL2 =item * All other users (including Commercial, Charity, Educational, Government) must apply in writing for a licence for use from Nigel Horne at the above e-mail. =back =cut | |||||
| 983 | ||||||
| 984 | 1; | |||||
| 985 | ||||||