| File: | blib/lib/Geo/Address/Parser/Rules/UK.pm |
| Coverage: | 88.7% |
| line | stmt | bran | cond | sub | time | code |
|---|---|---|---|---|---|---|
| 1 | our @EXPORT_OK = qw(parse_address); | |||||
| 2 | package Geo::Address::Parser::Rules::UK; | |||||
| 3 | ||||||
| 4 | 1 1 1 | 211 1 15 | use strict; | |||
| 5 | 1 1 1 | 2 1 18 | use warnings; | |||
| 6 | ||||||
| 7 | 1 1 1 | 1 6 296 | use Exporter 'import'; | |||
| 8 | our @EXPORT_OK = qw(parse_address); | |||||
| 9 | ||||||
| 10 | our $VERSION = '0.07'; | |||||
| 11 | ||||||
| 12 - 38 | =head1 NAME Geo::Address::Parser::Rules::UK - Parsing rules for UK addresses =head1 DESCRIPTION Parses a flat UK address string into components: name, road, city, and postcode. =head1 EXPORTS =head2 parse_address($text) Returns a hashref with keys: =over =item * name =item * road =item * city =item * postcode =back =cut | |||||
| 39 | ||||||
| 40 | my $postcode_re = qr/\b([A-Z]{1,2}\d{1,2}[A-Z]?)\s*(\d[A-Z]{2})\b/i; | |||||
| 41 | ||||||
| 42 | my %uk_countries = map { $_ => 1 } qw(England Scotland Wales 'Northern Ireland'); | |||||
| 43 | ||||||
| 44 | my %uk_counties = map { $_ => 1 } ( | |||||
| 45 | # England | |||||
| 46 | 'Bedfordshire', 'Berkshire', 'Bristol', 'Buckinghamshire', 'Cambridgeshire', | |||||
| 47 | 'Cheshire', 'City of London', 'Cornwall', 'Cumbria', 'Derbyshire', | |||||
| 48 | 'Devon', 'Dorset', 'Durham', 'East Riding of Yorkshire', 'East Sussex', | |||||
| 49 | 'Essex', 'Gloucestershire', 'Greater London', 'Greater Manchester', 'Hampshire', | |||||
| 50 | 'Herefordshire', 'Hertfordshire', 'Isle of Wight', 'Kent', 'Lancashire', | |||||
| 51 | 'Leicestershire', 'Lincolnshire', 'Merseyside', 'Norfolk', 'North Yorkshire', | |||||
| 52 | 'Northamptonshire', 'Northumberland', 'Nottinghamshire', 'Oxfordshire', 'Rutland', | |||||
| 53 | 'Shropshire', 'Somerset', 'South Yorkshire', 'Staffordshire', 'Suffolk', | |||||
| 54 | 'Surrey', 'Tyne and Wear', 'Warwickshire', 'West Midlands', 'West Sussex', | |||||
| 55 | 'West Yorkshire', 'Wiltshire', 'Worcestershire', | |||||
| 56 | # Scotland | |||||
| 57 | 'Aberdeenshire', 'Angus', 'Argyll and Bute', 'Clackmannanshire', 'Dumfries and Galloway', | |||||
| 58 | 'Dundee', 'East Ayrshire', 'East Dunbartonshire', 'East Lothian', 'East Renfrewshire', | |||||
| 59 | 'Edinburgh', 'Falkirk', 'Fife', 'Glasgow', 'Highland', | |||||
| 60 | 'Inverclyde', 'Midlothian', 'Moray', 'Na h-Eileanan Siar', 'North Ayrshire', | |||||
| 61 | 'North Lanarkshire', 'Orkney Islands', 'Perth and Kinross', 'Renfrewshire', 'Scottish Borders', | |||||
| 62 | 'Shetland Islands', 'South Ayrshire', 'South Lanarkshire', 'Stirling', 'West Dunbartonshire', | |||||
| 63 | 'West Lothian', | |||||
| 64 | # Wales | |||||
| 65 | 'Blaenau Gwent', 'Bridgend', 'Caerphilly', 'Cardiff', 'Carmarthenshire', | |||||
| 66 | 'Ceredigion', 'Conwy', 'Denbighshire', 'Flintshire', 'Gwynedd', | |||||
| 67 | 'Isle of Anglesey', 'Merthyr Tydfil', 'Monmouthshire', 'Neath Port Talbot', 'Newport', | |||||
| 68 | 'Pembrokeshire', 'Powys', 'Rhondda Cynon Taf', 'Swansea', 'Torfaen', | |||||
| 69 | 'Vale of Glamorgan', 'Wrexham', | |||||
| 70 | # Northern Ireland | |||||
| 71 | 'Antrim', 'Armagh', 'Belfast', 'Castlereagh', 'Coleraine', | |||||
| 72 | 'Cookstown', 'Craigavon', 'Down', 'Dungannon', 'Fermanagh', | |||||
| 73 | 'Larne', 'Limavady', 'Lisburn', 'Londonderry', 'Magherafelt', | |||||
| 74 | 'Moyle', 'Newry and Mourne', 'Newtownabbey', 'North Down', 'Omagh', | |||||
| 75 | 'Strabane', 'Tyrone' | |||||
| 76 | ); | |||||
| 77 | ||||||
| 78 | sub parse_address { | |||||
| 79 | 2 | 2 | my ($class, $text) = @_; | |||
| 80 | 2 | 2 | return unless defined $text; | |||
| 81 | ||||||
| 82 | 2 9 | 3 19 | my @parts = map { s/^\s+|\s+$//gr } split /,/, $text; | |||
| 83 | 2 9 | 3 6 | @parts = grep { length $_ } @parts; | |||
| 84 | ||||||
| 85 | 2 | 2 | my ($name, $road, $city, $county, $postcode, $country); | |||
| 86 | ||||||
| 87 | # Remove trailing country if present | |||||
| 88 | 2 | 5 | if(@parts && exists $uk_countries{$parts[-1]}) { | |||
| 89 | 1 | 1 | $country = 'UK'; | |||
| 90 | 1 | 1 | pop @parts; | |||
| 91 | } | |||||
| 92 | ||||||
| 93 | # Look for postcode at end | |||||
| 94 | 2 | 6 | if(@parts && $parts[-1] =~ /$postcode_re/) { | |||
| 95 | 1 | 2 | $postcode = uc("$1 $2"); | |||
| 96 | 1 | 1 | pop @parts; | |||
| 97 | } | |||||
| 98 | ||||||
| 99 | # Check if last remaining token is a county | |||||
| 100 | 2 | 4 | if(@parts && exists $uk_counties{$parts[-1]}) { | |||
| 101 | 1 | 1 | $county = pop @parts; | |||
| 102 | } | |||||
| 103 | ||||||
| 104 | # Assign city: last remaining token | |||||
| 105 | 2 | 2 | if(@parts) { | |||
| 106 | 2 | 2 | $city = pop @parts; | |||
| 107 | } | |||||
| 108 | ||||||
| 109 | # Determine road and name | |||||
| 110 | 2 | 1 | if(@parts) { | |||
| 111 | # Heuristic: if first remaining token contains a number, treat it as road | |||||
| 112 | 2 | 7 | if($parts[-1] =~ /\d/) { | |||
| 113 | 1 | 1 | $road = pop @parts; | |||
| 114 | } | |||||
| 115 | } | |||||
| 116 | ||||||
| 117 | # Remaining tokens form name | |||||
| 118 | 2 | 4 | $name = join(', ', @parts) if @parts; | |||
| 119 | ||||||
| 120 | return { | |||||
| 121 | 2 | 8 | name => $name, | |||
| 122 | road => $road, | |||||
| 123 | city => $city, | |||||
| 124 | county => $county, | |||||
| 125 | postcode => $postcode, | |||||
| 126 | country => $country // 'UK', | |||||
| 127 | }; | |||||
| 128 | } | |||||
| 129 | ||||||
| 130 | 1; | |||||