| File: | blib/lib/Geo/Address/Parser/Rules/IRL.pm |
| Coverage: | 80.4% |
| line | stmt | bran | cond | sub | time | code |
|---|---|---|---|---|---|---|
| 1 | package Geo::Address::Parser::Rules::IRL; | |||||
| 2 | ||||||
| 3 | 1 1 1 | 125366 1 16 | use strict; | |||
| 4 | 1 1 1 | 1 1 18 | use warnings; | |||
| 5 | 1 1 1 | 3 1 2 | use utf8; | |||
| 6 | ||||||
| 7 | # use Geo::Coder::Abbreviations; | |||||
| 8 | 1 1 1 | 206 1200 351 | use Text::Capitalize qw(capitalize_title); | |||
| 9 | ||||||
| 10 - 36 | =head1 NAME Geo::Address::Parser::Rules::IRL - Parsing rules for Irish addresses =head1 DESCRIPTION Parses a flat Irish address string into components: name, road, city, region, postal code, and country. =head1 EXPORTS =head2 parse_address($text) Returns a hashref with keys: =over =item * name =item * road =item * city =item * region =item * postal_code =item * country =back =cut | |||||
| 37 | ||||||
| 38 | our $VERSION = '0.07'; | |||||
| 39 | ||||||
| 40 | # heuristics for detecting building/venue names | |||||
| 41 | my $BUILDING_RE = qr/\b(?:house|hall|mill|centre|center|museum|church|hotel|inn|club|school|library|theatre)\b/i; | |||||
| 42 | ||||||
| 43 | # Eircode-ish pattern (basic) | |||||
| 44 | my $eircode_re = qr/\b[A-Z0-9]{3}\s?[A-Z0-9]{4}\b/i; | |||||
| 45 | ||||||
| 46 | sub parse_address { | |||||
| 47 | 4 | 10257 | my ($class, $text) = @_; | |||
| 48 | 4 | 5 | return unless defined $text; | |||
| 49 | ||||||
| 50 | # Basic normalisation | |||||
| 51 | 4 | 16 | $text =~ s/^\s+|\s+$//g; | |||
| 52 | 4 | 8 | $text =~ s/\s{2,}/ /g; | |||
| 53 | ||||||
| 54 | # Expand abbreviations if available | |||||
| 55 | # my $abbrev; | |||||
| 56 | # eval { $abbrev = Geo::Coder::Abbreviations->new; 1 } or $abbrev = undef; | |||||
| 57 | # if ($abbrev) { | |||||
| 58 | # eval { $text = $abbrev->expand($text) // $text; 1 } or do { /* keep original */ }; | |||||
| 59 | # } | |||||
| 60 | ||||||
| 61 | # Split into comma parts and trim | |||||
| 62 | 4 13 | 5 32 | my @parts = map { s/^\s+|\s+$//gr } split /,/, $text; | |||
| 63 | 4 13 | 5 11 | @parts = grep { length $_ } @parts; # drop empty parts | |||
| 64 | ||||||
| 65 | # Remove trailing explicit country token (Ireland/Éire) | |||||
| 66 | 4 | 13 | if (@parts and $parts[-1] =~ /^(?:ireland|éire)$/i) { | |||
| 67 | 2 | 1 | pop @parts; | |||
| 68 | } | |||||
| 69 | ||||||
| 70 | # Try to extract an Eircode from the last part (or anywhere in last part) | |||||
| 71 | 4 | 4 | my $postal_code; | |||
| 72 | 4 | 38 | if (@parts and $parts[-1] =~ /($eircode_re)/) { | |||
| 73 | 1 | 2 | $postal_code = uc $1; | |||
| 74 | 1 | 6 | $parts[-1] =~ s/\Q$1\E//i; | |||
| 75 | 1 | 1 | $parts[-1] =~ s/^\s+|\s+$//g; | |||
| 76 | 1 | 3 | pop @parts if $parts[-1] eq ''; | |||
| 77 | } | |||||
| 78 | ||||||
| 79 | # Detect "Co. CountyName" in the last part | |||||
| 80 | 4 | 2 | my $region; | |||
| 81 | 4 | 9 | if (@parts and $parts[-1] =~ /^co\.?\s*(.+)$/i) { | |||
| 82 | 2 | 5 | $region = capitalize_title(lc $1); | |||
| 83 | 2 | 106 | pop @parts; | |||
| 84 | } | |||||
| 85 | ||||||
| 86 | # Prepare result fields | |||||
| 87 | 4 | 3 | my ($name, $road, $city); | |||
| 88 | 4 | 4 | my $n = scalar @parts; | |||
| 89 | ||||||
| 90 | 4 | 5 | if ($n == 0) { | |||
| 91 | # nothing left; return at least country/postal if present | |||||
| 92 | return { | |||||
| 93 | 0 | 0 | name => undef, | |||
| 94 | road => undef, | |||||
| 95 | city => undef, | |||||
| 96 | region => $region, | |||||
| 97 | postal_code => $postal_code, | |||||
| 98 | country => 'Ireland', | |||||
| 99 | }; | |||||
| 100 | } elsif ($n == 1) { | |||||
| 101 | # Single token: assume it's a road/locality | |||||
| 102 | 0 | 0 | $road = capitalize_title(lc $parts[0]); | |||
| 103 | 0 | 0 | $city = undef; | |||
| 104 | } elsif ($n == 2) { | |||||
| 105 | # Two tokens — ambiguous: decide if first is a building name | |||||
| 106 | 4 | 10 | if ($parts[0] =~ $BUILDING_RE) { | |||
| 107 | 1 | 2 | $name = capitalize_title(lc $parts[0]); | |||
| 108 | 1 | 71 | $road = capitalize_title(lc $parts[1]); # treat locality as road too | |||
| 109 | 1 | 46 | $city = $road; | |||
| 110 | } else { | |||||
| 111 | # likely "road, city" | |||||
| 112 | 3 | 4 | $road = capitalize_title(lc $parts[0]); | |||
| 113 | 3 | 246 | $city = capitalize_title(lc $parts[1]); | |||
| 114 | } | |||||
| 115 | } else { # n >= 3 | |||||
| 116 | # typical: [maybe-building-name..., road, city] | |||||
| 117 | 0 | 0 | $city = capitalize_title(lc $parts[-1]); | |||
| 118 | 0 | 0 | $road = capitalize_title(lc $parts[-2]); | |||
| 119 | ||||||
| 120 | # everything before that is the name (may be empty) | |||||
| 121 | 0 | 0 | my @name_parts = @parts[0 .. $n - 3]; | |||
| 122 | 0 0 | 0 0 | $name = join(', ', map { capitalize_title(lc $_) } @name_parts) if @name_parts; | |||
| 123 | } | |||||
| 124 | ||||||
| 125 | 4 | 174 | undef $road if($road eq $city); | |||
| 126 | ||||||
| 127 | # Fix Irish O' prefixes — e.g., O'connell => O'Connell | |||||
| 128 | 4 1 | 6 2 | $road =~ s/\bO'([a-z])/"O'" . uc($1)/ge if($road); | |||
| 129 | ||||||
| 130 | # Final result | |||||
| 131 | 4 | 10 | my %result = ( | |||
| 132 | name => $name, | |||||
| 133 | road => $road, | |||||
| 134 | city => $city, | |||||
| 135 | region => $region, | |||||
| 136 | postal_code => $postal_code, | |||||
| 137 | country => 'Ireland', | |||||
| 138 | ); | |||||
| 139 | ||||||
| 140 | 4 | 8 | return \%result; | |||
| 141 | } | |||||
| 142 | ||||||
| 143 | 1; | |||||