File: | blib/lib/SEO/Inspector.pm |
Coverage: | 55.1% |
line | stmt | bran | cond | sub | time | code |
---|---|---|---|---|---|---|
1 | package SEO::Inspector; | |||||
2 | ||||||
3 | 6 6 6 | 348202 6 76 | use strict; | |||
4 | 6 6 6 | 11 2 105 | use warnings; | |||
5 | ||||||
6 | 6 6 6 | 8 10 118 | use Carp; | |||
7 | 6 6 6 | 1380 1023494 22 | use Mojo::UserAgent; | |||
8 | 6 6 6 | 145 5 6 | use Mojo::URL; | |||
9 | 6 6 6 | 1261 17819 16 | use Module::Pluggable require => 1, search_path => 'SEO::Inspector::Plugin'; | |||
10 | 6 6 6 | 1370 136209 84 | use Object::Configure 0.14; | |||
11 | 6 6 6 | 15 24 11116 | use Params::Get 0.13; | |||
12 | ||||||
13 - 21 | =head1 NAME SEO::Inspector - Run SEO checks on HTML or URLs =head1 VERSION Version 0.02 =cut | |||||
22 | ||||||
23 | our $VERSION = '0.02'; | |||||
24 | ||||||
25 - 164 | =head1 SYNOPSIS use SEO::Inspector; my $inspector = SEO::Inspector->new(url => 'https://example.com'); # Run plugins my $html = '<html><body>......</body></html>'; my $plugin_results = $inspector->check_html($html); # Run built-in checks my $builtin_results = $inspector->run_all($html); # Check a single URL and get all results my $all_results = $inspector->check_url('https://example.com'); =head1 DESCRIPTION SEO::Inspector provides: =over 4 =item * 14 built-in SEO checks =item * Plugin system: dynamically load modules under SEO::Inspector::Plugin namespace =item * Methods to check HTML strings or fetch and analyze a URL =back =head1 PLUGIN SYSTEM In addition to the built-in SEO checks, C<SEO::Inspector> supports a flexible plugin system. Plugins allow you to extend the checker with new rules or specialized analysis without modifying the core module. =head2 How Plugins Are Found Plugins are loaded dynamically from the C<SEO::Inspector::Plugin> namespace. For example, a module called: package SEO::Inspector::Plugin::MyCheck; will be detected and loaded automatically if it is available in C<@INC>. You can also tell the constructor to search additional directories by passing the C<plugin_dirs> argument: my $inspector = SEO::Inspector->new( plugin_dirs => ['t/lib', '/path/to/custom/plugins'], ); Each directory must contain files under a subpath corresponding to the namespace, for example: /path/to/custom/plugins/SEO/Inspector/Plugin/Foo.pm =head2 Plugin Interface A plugin must provide at least two methods: =over 4 =item * C<new> Constructor, called with no arguments. =item * C<run($html)> Given a string of raw HTML, return a hashref describing the result of the check. The hashref should have at least these keys: { name => 'My Check', status => 'ok' | 'warn' | 'error', notes => 'human-readable message', resolution => 'how to resolve' } =back =head2 Running Plugins You can run all loaded plugins against a piece of HTML with: my $results = $inspector->check_html($html); This returns a hashref keyed by plugin name (lowercased), each value being the hashref returned by the plugin's C<run> method. Plugins are also run automatically when you call C<check_url>: my $results = $inspector->check_url('https://example.com'); That result will include both built-in checks and plugin checks. =head2 Example Plugin Here is a minimal example plugin that checks whether the page contains the string "Hello": package SEO::Inspector::Plugin::HelloCheck; use strict; use warnings; sub new { bless {}, shift } sub run { my ($self, $html) = @_; if($html =~ /Hello/) { return { name => 'Hello Check', status => 'ok', notes => 'found Hello' }; } else { return { name => 'Hello Check', status => 'warn', notes => 'no Hello', resolution => 'add a hello field' }; } } 1; Place this file under C<lib/SEO/Inspector/Plugin/HelloCheck.pm> (or another directory listed in C<plugin_dirs>), and it will be discovered automatically. =head2 Naming Conventions The plugin key stored in C<< $inspector->{plugins} >> is derived from the final part of the package name, lowercased. For example: SEO::Inspector::Plugin::HelloCheck -> "hellocheck" This is the key you will see in the hashref returned by C<check_html> or C<check_url>. =head1 METHODS =head2 new(%args) Create a new inspector object. Accepts optional C<url> and C<plugin_dirs> arguments. If C<plugin_dirs> isn't given, it tries hard to find the right place. =cut | |||||
165 | ||||||
166 | # ------------------------------- | |||||
167 | # Constructor | |||||
168 | # ------------------------------- | |||||
169 | sub new { | |||||
170 | 5 | 297747 | my $class = shift; | |||
171 | 5 | 16 | my $params = Object::Configure::configure($class, Params::Get::get_params(undef, \@_)); | |||
172 | ||||||
173 | 5 | 110010 | $params->{'ua'} ||= Mojo::UserAgent->new(); | |||
174 | 5 | 44 | $params->{'plugins'} ||= {}; | |||
175 | ||||||
176 | 5 5 | 4 12 | my $self = bless { %{$params} }, $class; | |||
177 | ||||||
178 | 5 | 9 | $self->load_plugins(); | |||
179 | ||||||
180 | 5 | 11 | return $self; | |||
181 | } | |||||
182 | ||||||
183 - 187 | =head2 load_plugins Loads plugins from the C<SEO::Inspector::Plugin> namespace. =cut | |||||
188 | ||||||
189 | # ------------------------------- | |||||
190 | # Load plugins from SEO::Inspector::Plugin namespace | |||||
191 | # ------------------------------- | |||||
192 | sub load_plugins { | |||||
193 | 5 | 5 | my $self = $_[0]; | |||
194 | ||||||
195 | 5 | 14 | for my $plugin ($self->plugins()) { | |||
196 | 13 | 4110 | my $key = lc($plugin =~ s/.*:://r); | |||
197 | 13 | 30 | $self->{plugins}{$key} = $plugin->new(); | |||
198 | } | |||||
199 | 5 | 10 | if($self->{plugin_dirs}) { | |||
200 | 2 2 | 2 3 | for my $dir (@{$self->{plugin_dirs}}) { | |||
201 | 2 | 5 | local @INC = ($dir, @INC); | |||
202 | ||||||
203 | 2 | 6 | my $finder = Module::Pluggable::Object->new( | |||
204 | search_path => ['SEO::Inspector::Plugin'], | |||||
205 | require => 1, | |||||
206 | instantiate => 'new', | |||||
207 | ); | |||||
208 | ||||||
209 | 2 | 11 | for my $plugin ($finder->plugins) { | |||
210 | 6 | 24 | my $key = lc(ref($plugin) =~ s/.*:://r); | |||
211 | 6 | 26 | $self->{plugins}{$key} = $plugin; | |||
212 | } | |||||
213 | } | |||||
214 | } | |||||
215 | } | |||||
216 | ||||||
217 | # ------------------------------- | |||||
218 | # Fetch HTML from URL or object default | |||||
219 | # ------------------------------- | |||||
220 | sub _fetch_html { | |||||
221 | 0 | 0 | my ($self, $url) = @_; | |||
222 | 0 | 0 | $url //= $self->{url}; | |||
223 | 0 | 0 | croak 'URL missing' unless $url; | |||
224 | ||||||
225 | 0 | 0 | my $res = $self->{ua}->get($url)->result; | |||
226 | 0 | 0 | if ($res->is_error) { | |||
227 | 0 | 0 | croak 'Fetch failed: ', $res->message(); | |||
228 | } | |||||
229 | 0 | 0 | return $res->body; | |||
230 | } | |||||
231 | ||||||
232 - 236 | =head2 check($check_name, $html) Run a single built-in check or plugin on provided HTML (or fetch from object URL if HTML not provided). =cut | |||||
237 | ||||||
238 | # ------------------------------- | |||||
239 | # Run a single plugin or built-in check | |||||
240 | # ------------------------------- | |||||
241 | sub check { | |||||
242 | 60 | 33 | my ($self, $check_name, $html) = @_; | |||
243 | 60 | 47 | $html //= $self->_fetch_html(); | |||
244 | ||||||
245 | 60 | 135 | my %dispatch = ( | |||
246 | title => \&_check_title, | |||||
247 | meta_description => \&_check_meta_description, | |||||
248 | canonical => \&_check_canonical, | |||||
249 | robots_meta => \&_check_robots_meta, | |||||
250 | viewport => \&_check_viewport, | |||||
251 | h1_presence => \&_check_h1_presence, | |||||
252 | word_count => \&_check_word_count, | |||||
253 | links_alt_text => \&_check_links_alt_text, | |||||
254 | check_structured_data => \&_check_structured_data, | |||||
255 | check_headings => \&_check_headings, | |||||
256 | check_links => \&_check_links, | |||||
257 | open_graph => \&_check_open_graph, | |||||
258 | twitter_cards => \&_check_twitter_cards, | |||||
259 | page_size => \&_check_page_size, | |||||
260 | readability => \&_check_readability, | |||||
261 | ); | |||||
262 | ||||||
263 | # built-in checks | |||||
264 | 60 | 43 | if (exists $dispatch{$check_name}) { | |||
265 | 60 | 57 | return $dispatch{$check_name}->($self, $html); | |||
266 | } else { | |||||
267 | 0 | 0 | croak "Unknown check $check_name"; | |||
268 | } | |||||
269 | ||||||
270 | # plugin checks | |||||
271 | 0 | 0 | if (exists $self->{plugins}{$check_name}) { | |||
272 | 0 | 0 | my $plugin = $self->{plugins}{$check_name}; | |||
273 | 0 | 0 | return $plugin->run($html); | |||
274 | } | |||||
275 | ||||||
276 | 0 | 0 | return { name => $check_name, status => 'unknown', notes => '' }; | |||
277 | } | |||||
278 | ||||||
279 - 283 | =head2 run_all($html) Run all built-in checks on HTML (or object URL). =cut | |||||
284 | ||||||
285 | # ------------------------------- | |||||
286 | # Run all built-in checks | |||||
287 | # ------------------------------- | |||||
288 | sub run_all | |||||
289 | { | |||||
290 | 4 | 894 | my ($self, $html) = @_; | |||
291 | 4 | 5 | $html //= $self->_fetch_html(); | |||
292 | ||||||
293 | 4 | 3 | my %results; | |||
294 | 4 | 8 | for my $check (qw( | |||
295 | title meta_description canonical robots_meta viewport h1_presence word_count links_alt_text | |||||
296 | check_structured_data check_headings check_links | |||||
297 | open_graph twitter_cards page_size readability | |||||
298 | )) { | |||||
299 | 60 | 50 | $results{$check} = $self->check($check, $html); | |||
300 | } | |||||
301 | ||||||
302 | 4 | 5 | return \%results; | |||
303 | } | |||||
304 | ||||||
305 - 309 | =head2 check_html($html) Run all loaded plugins on HTML. =cut | |||||
310 | ||||||
311 | # ------------------------------- | |||||
312 | # Run all plugins on HTML | |||||
313 | # ------------------------------- | |||||
314 | sub check_html { | |||||
315 | 5 | 691 | my ($self, $html) = @_; | |||
316 | 5 | 10 | $html //= $self->_fetch_html(); | |||
317 | 5 | 3 | my %results; | |||
318 | ||||||
319 | 5 5 | 5 9 | for my $key (keys %{ $self->{plugins} }) { | |||
320 | 15 | 24 | my $plugin = $self->{plugins}{$key}; | |||
321 | 15 | 20 | $results{$key} = $plugin->run($html); | |||
322 | } | |||||
323 | ||||||
324 | 5 | 12 | return \%results; | |||
325 | } | |||||
326 | ||||||
327 - 331 | =head2 check_url($url) Fetch the URL and run all plugins and built-in checks. =cut | |||||
332 | ||||||
333 | # ------------------------------- | |||||
334 | # Run URL: fetch and check | |||||
335 | # ------------------------------- | |||||
336 | sub check_url { | |||||
337 | 1 | 7 | my ($self, $url) = @_; | |||
338 | ||||||
339 | 1 | 1 | $url //= $self->{url}; | |||
340 | ||||||
341 | 1 | 2 | croak('URL missing') unless $url; | |||
342 | ||||||
343 | 1 | 1 | my $html = $self->_fetch_html($url); | |||
344 | ||||||
345 | 1 | 32 | my $plugin_results = $self->check_html($html); | |||
346 | 1 | 2 | my $builtin_results = $self->run_all($html); | |||
347 | ||||||
348 | # merge all results | |||||
349 | 1 | 4 | my %results = (%$plugin_results, %$builtin_results, _html => $html); | |||
350 | 1 | 2 | return \%results; | |||
351 | } | |||||
352 | ||||||
353 | # ------------------------------- | |||||
354 | # Built-in check implementations | |||||
355 | # ------------------------------- | |||||
356 | sub _check_title { | |||||
357 | 4 | 5 | my ($self, $html) = @_; | |||
358 | ||||||
359 | 4 | 13 | if ($html =~ /<title>(.*?)<\/title>/is) { | |||
360 | 4 | 7 | my $title = $1; | |||
361 | 4 | 8 | $title =~ s/^\s+|\s+$//g; # trim | |||
362 | 4 | 5 | $title =~ s/\s{2,}/ /g; # collapse spaces | |||
363 | ||||||
364 | 4 | 4 | my $len = length($title); | |||
365 | 4 | 3 | my $status = 'ok'; | |||
366 | 4 | 15 | my $notes = "title present ($len chars)"; | |||
367 | ||||||
368 | 4 | 6 | if ($len == 0) { | |||
369 | 0 | 0 | $status = 'error'; | |||
370 | 0 | 0 | $notes = 'empty title'; | |||
371 | } elsif ($len < 10) { | |||||
372 | 4 | 4 | $status = 'warn'; | |||
373 | 4 | 5 | $notes = "title too short ($len chars)"; | |||
374 | } elsif ($len > 65) { | |||||
375 | 0 | 0 | $status = 'warn'; | |||
376 | 0 | 0 | $notes = "title too long ($len chars)"; | |||
377 | } | |||||
378 | ||||||
379 | # Flag really weak titles | |||||
380 | 4 | 14 | if ($title =~ /^(home|untitled|index)$/i) { | |||
381 | 0 | 0 | $status = 'warn'; | |||
382 | 0 | 0 | $notes = "generic title: $title"; | |||
383 | } | |||||
384 | ||||||
385 | 4 | 18 | return { name => 'Title', status => $status, notes => $notes }; | |||
386 | } | |||||
387 | ||||||
388 | 0 | 0 | return { name => 'Title', status => 'error', notes => 'missing title' }; | |||
389 | } | |||||
390 | ||||||
391 | ||||||
392 | sub _check_meta_description { | |||||
393 | 4 | 4 | my ($self, $html) = @_; | |||
394 | 4 | 13 | if ($html =~ /<meta\s+name=["']description["']\s+content=["'](.*?)["']/is) { | |||
395 | 3 | 3 | my $desc = $1; | |||
396 | 3 | 9 | return { name => 'Meta Description', status => 'ok', notes => 'meta description present' }; | |||
397 | } | |||||
398 | 1 | 4 | return { name => 'Meta Description', status => 'warn', notes => 'missing meta description' }; | |||
399 | } | |||||
400 | ||||||
401 | sub _check_canonical | |||||
402 | { | |||||
403 | 4 | 4 | my ($self, $html) = @_; | |||
404 | 4 | 10 | if ($html =~ /<link\s+rel=["']canonical["']\s+href=["'](.*?)["']/is) { | |||
405 | 0 | 0 | return { name => 'Canonical', status => 'ok', notes => 'canonical link present' }; | |||
406 | } | |||||
407 | return { | |||||
408 | 4 | 11 | name => 'Canonical', | |||
409 | status => 'warn', | |||||
410 | notes => 'missing canonical link', | |||||
411 | resolution => 'Add canonical link to <head>: <link rel="canonical" href="https://your-domain.com/this-page-url"> - use the preferred URL for this page to prevent duplicate content issues' | |||||
412 | }; | |||||
413 | } | |||||
414 | ||||||
415 | sub _check_robots_meta { | |||||
416 | 4 | 8 | my ($self, $html) = @_; | |||
417 | 4 | 12 | if ($html =~ /<meta\s+name=["']robots["']\s+content=["'](.*?)["']/is) { | |||
418 | 0 | 0 | return { name => 'Robots Meta', status => 'ok', notes => 'robots meta present' }; | |||
419 | } | |||||
420 | return { | |||||
421 | 4 | 28 | name => 'Robots Meta', | |||
422 | status => 'warn', | |||||
423 | notes => 'missing robots meta', | |||||
424 | resolution => 'Add robots meta tag to <head>: <meta name="robots" content="index, follow"> for normal indexing, or <meta name="robots" content="noindex, nofollow"> to prevent indexing - controls how search engines crawl and index this page' | |||||
425 | }; | |||||
426 | } | |||||
427 | ||||||
428 | sub _check_viewport { | |||||
429 | 4 | 4 | my ($self, $html) = @_; | |||
430 | 4 | 10 | if ($html =~ /<meta\s+name=["']viewport["']\s+content=["'](.*?)["']/is) { | |||
431 | 0 | 0 | return { name => 'Viewport', status => 'ok', notes => 'viewport meta present' }; | |||
432 | } | |||||
433 | return { | |||||
434 | 4 | 13 | name => 'Viewport', | |||
435 | status => 'warn', | |||||
436 | notes => 'missing viewport meta', | |||||
437 | resolution => 'Add viewport meta tag to <head>: <meta name="viewport" content="width=device-width, initial-scale=1.0"> - essential for mobile responsiveness and Google mobile-first indexing' | |||||
438 | }; | |||||
439 | } | |||||
440 | ||||||
441 | sub _check_h1_presence { | |||||
442 | 4 | 3 | my ($self, $html) = @_; | |||
443 | 4 | 13 | if ($html =~ /<h1\b[^>]*>(.*?)<\/h1>/is) { | |||
444 | 3 | 9 | return { name => 'H1 Presence', status => 'ok', notes => 'h1 tag present' }; | |||
445 | } | |||||
446 | 1 | 3 | return { name => 'H1 Presence', status => 'warn', notes => 'missing h1' }; | |||
447 | } | |||||
448 | ||||||
449 | sub _check_word_count { | |||||
450 | 4 | 4 | my ($self, $html) = @_; | |||
451 | 4 | 10 | my $text = $html; | |||
452 | 4 | 20 | $text =~ s/<[^>]+>//g; | |||
453 | 4 | 13 | my $words = scalar split /\s+/, $text; | |||
454 | 4 | 15 | return { name => 'Word Count', status => $words > 0 ? 'ok' : 'warn', notes => "$words words" }; | |||
455 | } | |||||
456 | ||||||
457 | sub _check_links_alt_text { | |||||
458 | 4 | 5 | my ($self, $html) = @_; | |||
459 | 4 | 2 | my @missing; | |||
460 | 4 | 13 | while ($html =~ /<img\b(.*?)>/gis) { | |||
461 | 0 | 0 | my $attr = $1; | |||
462 | 0 | 0 | push @missing, $1 unless $attr =~ /alt=/i; | |||
463 | } | |||||
464 | 4 | 5 | if(scalar(@missing)) { | |||
465 | return { | |||||
466 | 0 | 0 | name => 'Links Alt Text', | |||
467 | status => 'warn', | |||||
468 | notes => scalar(@missing) . ' images missing alt', | |||||
469 | resolution => 'Add alt attributes to all images: <img src="image.jpg" alt="Descriptive text"> - describe the image content for screen readers and SEO. Use alt="" for decorative images that don\'t add meaning' | |||||
470 | }; | |||||
471 | } | |||||
472 | ||||||
473 | return { | |||||
474 | 4 | 10 | name => 'Links Alt Text', | |||
475 | status => 'ok', | |||||
476 | notes => 'all images have alt' | |||||
477 | }; | |||||
478 | } | |||||
479 | ||||||
480 | sub _check_structured_data { | |||||
481 | 4 | 6 | my ($self, $html) = @_; | |||
482 | ||||||
483 | 4 | 9 | my @jsonld = ($html =~ /<script\b[^>]*type=["']application\/ld\+json["'][^>]*>(.*?)<\/script>/gis); | |||
484 | ||||||
485 | 4 | 12 | if(scalar(@jsonld)) { | |||
486 | return { | |||||
487 | 0 | 0 | name => 'Structured Data', | |||
488 | status => 'ok', | |||||
489 | notes => scalar(@jsonld) . ' JSON-LD block(s) found' | |||||
490 | }; | |||||
491 | } | |||||
492 | ||||||
493 | return { | |||||
494 | 4 | 13 | name => 'Structured Data', | |||
495 | status => 'warn', | |||||
496 | notes => 'no structured data found', | |||||
497 | resolution => 'Add JSON-LD structured data to <head>: <script type="application/ld+json">{"@context": "https://schema.org", "@type": "WebPage", "name": "Page Title", "description": "Page description"}</script> - helps search engines understand your content better and enables rich snippets' | |||||
498 | } | |||||
499 | } | |||||
500 | ||||||
501 | # _check_headings | |||||
502 | # ---------------- | |||||
503 | # Analyzes the HTML document for heading structure and returns a structured | |||||
504 | # SEO/a11y report. | |||||
505 | # | |||||
506 | # Checks performed: | |||||
507 | # - Presence of headings (<h1>â<h6>), with counts of each level. | |||||
508 | # - Ensures exactly one <h1> exists (warns if missing or multiple). | |||||
509 | # - Validates heading hierarchy (no skipped levels, e.g. <h3> should not appear before an <h2>). | |||||
510 | # - Flags suspicious heading text lengths (too short < 2 chars, or too long > 120 chars). | |||||
511 | # | |||||
512 | # Returns: | |||||
513 | # { | |||||
514 | # name => 'Headings', | |||||
515 | # status => 'ok' | 'warn', | |||||
516 | # notes => 'summary of counts and issues' | |||||
517 | # } | |||||
518 | # | |||||
519 | # Notes: | |||||
520 | # - Status is 'warn' if issues are found, otherwise 'ok'. | |||||
521 | # - 'error' status is reserved for future use (currently unused). | |||||
522 | ||||||
523 | sub _check_headings { | |||||
524 | 4 | 8 | my ($self, $html) = @_; | |||
525 | ||||||
526 | 4 | 4 | my %counts; | |||
527 | my @headings; | |||||
528 | ||||||
529 | # Capture all headings and their order | |||||
530 | 4 | 13 | while ($html =~ /<(h[1-6])\b[^>]*>(.*?)<\/\1>/gi) { | |||
531 | 3 | 4 | my $tag = lc $1; | |||
532 | 3 | 7 | my $text = $2 // ''; | |||
533 | 3 | 4 | $text =~ s/\s+/ /g; # normalize whitespace | |||
534 | 3 | 6 | $text =~ s/^\s+|\s+$//g; | |||
535 | ||||||
536 | 3 | 4 | $counts{$tag}++; | |||
537 | 3 | 13 | push @headings, { level => substr($tag, 1), text => $text }; | |||
538 | } | |||||
539 | ||||||
540 | 4 | 2 | my @issues; | |||
541 | 4 | 4 | my $status = 'ok'; | |||
542 | ||||||
543 | # Check for no headings | |||||
544 | 4 | 6 | if (!%counts) { | |||
545 | return { | |||||
546 | 1 | 2 | name => 'Headings', | |||
547 | status => 'warn', | |||||
548 | notes => 'no headings found', | |||||
549 | }; | |||||
550 | } | |||||
551 | ||||||
552 | # Check H1 presence/uniqueness | |||||
553 | 3 | 11 | if (!$counts{h1}) { | |||
554 | 0 | 0 | push @issues, 'missing <h1>'; | |||
555 | 0 | 0 | $status = 'warn'; | |||
556 | } | |||||
557 | elsif ($counts{h1} > 1) { | |||||
558 | 0 | 0 | push @issues, 'multiple <h1> tags'; | |||
559 | 0 | 0 | $status = 'warn'; | |||
560 | } | |||||
561 | ||||||
562 | # Check heading hierarchy (no skipped levels) | |||||
563 | 3 | 3 | my $last_level = 0; | |||
564 | 3 | 4 | for my $h (@headings) { | |||
565 | 3 | 4 | my $level = $h->{level}; | |||
566 | 3 | 4 | if ($last_level && $level > $last_level + 1) { | |||
567 | 0 | 0 | push @issues, "skipped heading level before <h$level>"; | |||
568 | 0 | 0 | $status = 'warn'; | |||
569 | } | |||||
570 | 3 | 3 | $last_level = $level; | |||
571 | } | |||||
572 | ||||||
573 | # Check heading text length (too short or too long) | |||||
574 | 3 | 3 | for my $h (@headings) { | |||
575 | 3 | 3 | my $len = length($h->{text}); | |||
576 | 3 | 6 | if ($len < 2) { | |||
577 | 0 | 0 | push @issues, "<h$h->{level}> too short"; | |||
578 | 0 | 0 | $status = 'warn'; | |||
579 | } | |||||
580 | elsif ($len > 120) { | |||||
581 | 0 | 0 | push @issues, "<h$h->{level}> too long"; | |||
582 | 0 | 0 | $status = 'warn'; | |||
583 | } | |||||
584 | } | |||||
585 | ||||||
586 | # Summarize counts and issues | |||||
587 | 3 3 | 6 7 | my $summary = join ', ', map { "$_: $counts{$_}" } sort keys %counts; | |||
588 | 3 | 7 | $summary .= @issues ? " | Issues: " . join('; ', @issues) : ''; | |||
589 | ||||||
590 | return { | |||||
591 | 3 | 10 | name => 'Headings', | |||
592 | status => $status, | |||||
593 | notes => $summary, | |||||
594 | }; | |||||
595 | } | |||||
596 | ||||||
597 | sub _check_links { | |||||
598 | 4 | 6 | my ($self, $html) = @_; | |||
599 | ||||||
600 | 4 | 3 | my $base_host; | |||
601 | 4 | 10 | if ($self->{url} && $self->{url} =~ m{^https?://}i) { | |||
602 | 0 | 0 | $base_host = Mojo::URL->new($self->{url})->host; | |||
603 | } | |||||
604 | ||||||
605 | 4 | 9 | my ($total, $internal, $external, $badtext) = (0,0,0,0); | |||
606 | ||||||
607 | # common "bad" link text patterns (exact match or just punctuation around) | |||||
608 | 4 | 8 | my $bad_rx = qr/^(?:click\s*here|read\s*more|more|link|here|details)$/i; | |||
609 | ||||||
610 | 4 | 10 | while ($html =~ m{<a\b([^>]*)>(.*?)</a>}gis) { | |||
611 | 0 | 0 | my $attrs = $1; | |||
612 | 0 | 0 | my $text = $2 // ''; | |||
613 | ||||||
614 | 0 | 0 | $total++; | |||
615 | ||||||
616 | # get href (prefer quoted values) | |||||
617 | 0 | 0 | my ($href) = $attrs =~ /\bhref\s*=\s*"(.*?)"/i; | |||
618 | 0 | 0 | $href //= ($attrs =~ /\bhref\s*=\s*'(.*?)'/i ? $1 : undef); | |||
619 | 0 | 0 | $href //= ($attrs =~ /\bhref\s*=\s*([^\s>]+)/i ? $1 : undef); | |||
620 | ||||||
621 | # classify internal vs external | |||||
622 | 0 | 0 | if (defined $href && $href =~ m{^\s*https?://}i) { | |||
623 | # attempt to compare host | |||||
624 | 0 | 0 | my ($host) = $href =~ m{^\s*https?://([^/:\s]+)}i; | |||
625 | 0 | 0 | if (defined $base_host && defined $host) { | |||
626 | 0 | 0 | if (lc $host eq lc $base_host) { | |||
627 | 0 | 0 | $internal++; | |||
628 | } else { | |||||
629 | 0 | 0 | $external++; | |||
630 | } | |||||
631 | } else { | |||||
632 | # no base host to compare; treat as external if absolute URL | |||||
633 | 0 | 0 | $external++; | |||
634 | } | |||||
635 | } else { | |||||
636 | # relative URL or fragment or mailto/etc -> treat as internal | |||||
637 | 0 | 0 | $internal++; | |||
638 | } | |||||
639 | ||||||
640 | # normalize visible text: strip tags, trim whitespace, collapse spaces | |||||
641 | 0 | 0 | $text =~ s/<[^>]+>//g; | |||
642 | 0 | 0 | $text =~ s/^\s+|\s+$//g; | |||
643 | 0 | 0 | $text =~ s/\s+/ /g; | |||
644 | ||||||
645 | # check for bad link text (exact-ish) | |||||
646 | 0 | 0 | if ($text =~ $bad_rx) { | |||
647 | 0 | 0 | $badtext++; | |||
648 | } | |||||
649 | } | |||||
650 | ||||||
651 | 4 | 20 | my $status = ($external || $badtext) ? 'warn' : ($total ? 'ok' : 'warn'); | |||
652 | ||||||
653 | 4 | 6 | my $notes; | |||
654 | 4 | 8 | if ($total) { | |||
655 | 0 | 0 | $notes = sprintf("%d total (%d internal, %d external). %d link(s) with poor anchor text", | |||
656 | $total, $internal, $external, $badtext); | |||||
657 | } else { | |||||
658 | 4 | 4 | $notes = 'no links found'; | |||
659 | } | |||||
660 | ||||||
661 | return { | |||||
662 | 4 | 13 | name => 'Links', | |||
663 | status => $status, | |||||
664 | notes => $notes, | |||||
665 | }; | |||||
666 | } | |||||
667 | ||||||
668 | # Checks for essential Open Graph tags that improve social media sharing | |||||
669 | sub _check_open_graph { | |||||
670 | 4 | 4 | my ($self, $html) = @_; | |||
671 | ||||||
672 | 4 | 6 | my %og_tags; | |||
673 | 4 | 6 | my @required = qw(title description image url); | |||
674 | ||||||
675 | # Extract all Open Graph meta tags | |||||
676 | 4 | 10 | while ($html =~ /<meta\s+(?:property|name)=["']og:([^"']+)["']\s+content=["']([^"']*)["']/gis) { | |||
677 | 0 | 0 | $og_tags{$1} = $2; | |||
678 | } | |||||
679 | ||||||
680 | 4 16 | 5 19 | my @missing = grep { !exists $og_tags{$_} || !$og_tags{$_} } @required; | |||
681 | 4 | 6 | my $found = keys %og_tags; | |||
682 | ||||||
683 | 4 | 5 | my $status = @missing ? 'warn' : 'ok'; | |||
684 | 4 | 9 | my $notes; | |||
685 | ||||||
686 | 4 | 7 | if ($found == 0) { | |||
687 | 4 | 2 | $notes = 'no Open Graph tags found'; | |||
688 | 4 | 4 | $status = 'warn'; | |||
689 | } elsif (@missing) { | |||||
690 | 0 | 0 | $notes = sprintf('%d OG tags found, missing: %s', $found, join(', ', @missing)); | |||
691 | } else { | |||||
692 | 0 | 0 | $notes = sprintf('all essential OG tags present (%d total)', $found); | |||
693 | } | |||||
694 | ||||||
695 | return { | |||||
696 | 4 | 14 | name => 'Open Graph', | |||
697 | status => $status, | |||||
698 | notes => $notes, | |||||
699 | resolution => 'Add missing tags to <head>: <meta property="og:title" content="Your Page Title">, <meta property="og:description" content="Brief page description">' | |||||
700 | }; | |||||
701 | } | |||||
702 | ||||||
703 | # Checks for Twitter Card meta tags for better Twitter sharing | |||||
704 | sub _check_twitter_cards { | |||||
705 | 4 | 4 | my ($self, $html) = @_; | |||
706 | ||||||
707 | 4 | 2 | my %twitter_tags; | |||
708 | 4 | 4 | my @recommended = qw(card title description); | |||
709 | ||||||
710 | # Extract Twitter Card meta tags | |||||
711 | 4 | 12 | while ($html =~ /<meta\s+(?:property|name)=["']twitter:([^"']+)["']\s+content=["']([^"']*)["']/gis) { | |||
712 | 0 | 0 | $twitter_tags{$1} = $2; | |||
713 | } | |||||
714 | ||||||
715 | 4 12 | 4 16 | my @missing = grep { !exists $twitter_tags{$_} || !$twitter_tags{$_} } @recommended; | |||
716 | 4 | 3 | my $found = keys %twitter_tags; | |||
717 | ||||||
718 | 4 | 4 | my $status = @missing ? 'warn' : 'ok'; | |||
719 | 4 | 4 | my $notes; | |||
720 | ||||||
721 | 4 | 5 | if ($found == 0) { | |||
722 | 4 | 4 | $notes = 'no Twitter Card tags found'; | |||
723 | 4 | 3 | $status = 'warn'; | |||
724 | } elsif (@missing) { | |||||
725 | 0 | 0 | $notes = sprintf('%d Twitter tags found, missing: %s', $found, join(', ', @missing)); | |||
726 | } else { | |||||
727 | 0 | 0 | $notes = sprintf('essential Twitter Card tags present (%d total)', $found); | |||
728 | } | |||||
729 | ||||||
730 | return { | |||||
731 | 4 | 25 | name => 'Twitter Cards', | |||
732 | status => $status, | |||||
733 | notes => $notes, | |||||
734 | resolution => 'Add missing tags to <head>: <meta name="twitter:card" content="summary">, <meta name="twitter:title" content="Your Page Title">' | |||||
735 | }; | |||||
736 | } | |||||
737 | ||||||
738 | # Checks HTML size and warns if too large (impacts loading speed) | |||||
739 | sub _check_page_size { | |||||
740 | 4 | 5 | my ($self, $html) = @_; | |||
741 | ||||||
742 | 4 | 2 | my $size_bytes = length($html); | |||
743 | 4 | 7 | my $size_kb = int($size_bytes / 1024); | |||
744 | ||||||
745 | 4 | 4 | my $status = 'ok'; | |||
746 | 4 | 5 | my $notes = "${size_kb}KB HTML size"; | |||
747 | 4 | 2 | my $resolution = ''; | |||
748 | ||||||
749 | 4 | 15 | if ($size_bytes > 1_048_576) { # > 1MB | |||
750 | 0 | 0 | $status = 'error'; | |||
751 | 0 | 0 | $notes .= ' (too large, over 1MB)'; | |||
752 | 0 | 0 | $resolution = 'Consider optimizing: minify CSS/JS, compress images, remove unused elements, enable server compression'; | |||
753 | } elsif ($size_bytes > 102_400) { # > 100KB | |||||
754 | 0 | 0 | $status = 'warn'; | |||
755 | 0 | 0 | $notes .= ' (large, consider optimization)'; | |||
756 | } elsif ($size_bytes < 1024) { # < 1KB | |||||
757 | 4 | 4 | $status = 'warn'; | |||
758 | 4 | 3 | $notes .= ' (suspiciously small)'; | |||
759 | } else { | |||||
760 | 0 | 0 | $notes .= ' (good size)'; | |||
761 | } | |||||
762 | ||||||
763 | return { | |||||
764 | 4 | 16 | name => 'Page Size', | |||
765 | status => $status, | |||||
766 | notes => $notes, | |||||
767 | resolution => $resolution | |||||
768 | }; | |||||
769 | } | |||||
770 | ||||||
771 | # Calculates approximate Flesch Reading Ease score for content readability | |||||
772 | sub _check_readability { | |||||
773 | 4 | 5 | my ($self, $html) = @_; | |||
774 | ||||||
775 | # Extract text content (remove scripts, styles, and HTML tags) | |||||
776 | 4 | 2 | my $text = $html; | |||
777 | 4 | 8 | $text =~ s/<script\b[^>]*>.*?<\/script>//gis; | |||
778 | 4 | 6 | $text =~ s/<style\b[^>]*>.*?<\/style>//gis; | |||
779 | 4 | 38 | $text =~ s/<[^>]+>//g; | |||
780 | 4 | 6 | $text =~ s/\s+/ /g; | |||
781 | 4 | 11 | $text =~ s/^\s+|\s+$//g; | |||
782 | ||||||
783 | return { | |||||
784 | 4 | 19 | name => 'Readability', | |||
785 | status => 'warn', | |||||
786 | notes => 'insufficient text for analysis', | |||||
787 | resolution => 'Add more content to the page - aim for at least 300 words of meaningful text', | |||||
788 | } if length($text) < 100; | |||||
789 | ||||||
790 | # Count sentences (approximate) | |||||
791 | 0 | my $sentences = () = $text =~ /[.!?]+/g; | ||||
792 | 0 | $sentences = 1 if $sentences == 0; # avoid division by zero | ||||
793 | ||||||
794 | # Count words | |||||
795 | 0 | my @words = split /\s+/, $text; | ||||
796 | 0 | my $word_count = @words; | ||||
797 | ||||||
798 | return { | |||||
799 | 0 | name => 'Readability', | ||||
800 | status => 'warn', | |||||
801 | notes => 'insufficient content for analysis', | |||||
802 | resolution => 'Add more substantial content - aim for at least 300 words for proper SEO value', | |||||
803 | } if $word_count < 50; | |||||
804 | ||||||
805 | # Count syllables (very basic approximation) | |||||
806 | 0 | my $syllables = 0; | ||||
807 | 0 | for my $word (@words) { | ||||
808 | 0 | $word = lc($word); | ||||
809 | 0 | $word =~ s/[^a-z]//g; # remove punctuation | ||||
810 | 0 | next if length($word) == 0; | ||||
811 | ||||||
812 | # Simple syllable counting heuristic | |||||
813 | 0 | my $vowels = () = $word =~ /[aeiouy]/g; | ||||
814 | 0 | $syllables += $vowels > 0 ? $vowels : 1; | ||||
815 | 0 | $syllables-- if $word =~ /e$/; # silent e | ||||
816 | } | |||||
817 | 0 | $syllables = $word_count if $syllables < $word_count; # minimum 1 syllable per word | ||||
818 | ||||||
819 | # Flesch Reading Ease formula | |||||
820 | 0 | my $avg_sentence_length = $word_count / $sentences; | ||||
821 | 0 | my $avg_syllables_per_word = $syllables / $word_count; | ||||
822 | 0 | my $flesch_score = 206.835 - (1.015 * $avg_sentence_length) - (84.6 * $avg_syllables_per_word); | ||||
823 | ||||||
824 | 0 | my $status = 'ok'; | ||||
825 | 0 | my $level; | ||||
826 | my $notes; | |||||
827 | 0 | my $resolution = ''; | ||||
828 | ||||||
829 | 0 | if ($flesch_score >= 90) { | ||||
830 | 0 | $level = 'very easy'; | ||||
831 | } elsif ($flesch_score >= 80) { | |||||
832 | 0 | $level = 'easy'; | ||||
833 | } elsif ($flesch_score >= 70) { | |||||
834 | 0 | $level = 'fairly easy'; | ||||
835 | } elsif ($flesch_score >= 60) { | |||||
836 | 0 | $level = 'standard'; | ||||
837 | } elsif ($flesch_score >= 50) { | |||||
838 | 0 | $level = 'fairly difficult'; | ||||
839 | 0 | $status = 'warn'; | ||||
840 | 0 | $resolution = 'Consider simplifying: use shorter sentences (aim for 15-20 words), choose simpler words, break up long paragraphs, add bullet points or lists'; | ||||
841 | } elsif ($flesch_score >= 30) { | |||||
842 | 0 | $level = 'difficult'; | ||||
843 | 0 | $status = 'warn'; | ||||
844 | 0 | $resolution = 'Improve readability: use much shorter sentences (10-15 words), replace complex words with simpler alternatives, add more paragraph breaks, use active voice'; | ||||
845 | } else { | |||||
846 | 0 | $level = 'very difficult'; | ||||
847 | 0 | $status = 'warn'; | ||||
848 | 0 | $resolution = 'Significantly simplify content: break long sentences into multiple short ones, replace jargon with plain language, add explanations for technical terms, use more white space and formatting'; | ||||
849 | } | |||||
850 | ||||||
851 | 0 | $notes = sprintf('Flesch score: %.1f (%s) - %d words, %d sentences', | ||||
852 | $flesch_score, $level, $word_count, $sentences); | |||||
853 | ||||||
854 | return { | |||||
855 | 0 | name => 'Readability', | ||||
856 | status => $status, | |||||
857 | notes => $notes, | |||||
858 | resolution => $resolution, | |||||
859 | }; | |||||
860 | } | |||||
861 | ||||||
862 - 936 | =head1 AUTHOR Nigel Horne, C<< <njh at nigelhorne.com> >> =head1 SEE ALSO =over 4 =item * Test coverage report: L<https://nigelhorne.github.io/SEO-Inspector/coverage/> =item * L<https://github.com/nigelhorne/SEO-Checker> =item * L<https://github.com/sethblack/python-seo-analyzer> =back =head1 REPOSITORY L<https://github.com/nigelhorne/SEO-Inspector> =head1 SUPPORT This module is provided as-is without any warranty. Please report any bugs or feature requests to C<bug-seo-inspector at rt.cpan.org>, or through the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=SEO-Inspector>. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes. You can find documentation for this module with the perldoc command. perldoc SEO::Inspector You can also look for information at: =over 4 =item * MetaCPAN L<https://metacpan.org/dist/SEO-Inspector> =item * RT: CPAN's request tracker L<https://rt.cpan.org/NoAuth/Bugs.html?Dist=SEO-Inspector> =item * CPAN Testers' Matrix L<http://matrix.cpantesters.org/?dist=SEO-Inspector> =item * CPAN Testers Dependencies L<http://deps.cpantesters.org/?module=SEO::Inspector> =back =head1 LICENCE AND COPYRIGHT Copyright 2025 Nigel Horne. Usage is subject to licence terms. The licence terms of this software are as follows: =over 4 =item * Personal single user, single computer use: GPL2 =item * All other users (including Commercial, Charity, Educational, Government) must apply in writing for a licence for use from Nigel Horne at the above e-mail. =back =cut | |||||
937 | ||||||
938 | 1; | |||||
939 |