File Coverage

File:blib/lib/Geo/Coder/Free/OpenAddresses.pm
Coverage:10.0%

linestmtbrancondsubpodtimecode
1package Geo::Coder::Free::OpenAddresses;
2
3# Includes both openaddresses and Whos On First data
4
5
8
8
8
15
7
122
use strict;
6
8
8
8
15
6
129
use warnings;
7
8
8
8
8
11
9
58
use Geo::Coder::Free;   # for _abbreviate
9
8
8
8
604
14
26
use Geo::Coder::Free::DB::OpenAddr;     # SQLite database
10
8
8
8
31585
11
27
use Geo::Coder::Free::DB::openaddresses;        # The original CSV files
11
8
8
8
30667
3030
96
use Geo::Hash;
12
8
8
8
17
4
58
use Geo::Location::Point;
13
8
8
8
15
5
52
use Module::Info;
14
8
8
8
12
6
147
use Carp;
15
8
8
8
13
6
48
use File::Spec;
16
8
8
8
11
7
80
use File::pfopen;
17
8
8
8
13
5
53
use Locale::CA;
18
8
8
8
14
5
50
use Locale::US;
19
8
8
8
175
36217
474
use Locale::SubCountry;
20
8
8
8
15
5
69
use CHI;
21
8
8
8
263
17033
80
use Lingua::EN::AddressParse;
22
8
8
8
17
5
303
use Locale::Country;
23
8
8
8
217
18321
105
use Geo::StreetAddress::US;
24
8
8
8
14
6
77
use Digest::MD5;
25
8
8
8
12
6
228
use Encode;
26
8
8
8
12
5
252
use Storable;
27
# Coordinates for places that are known to exist but which lookups miss
# because of inconsistencies in the way the underlying data is stored.
# FIXME: Should be in a configuration file
our %known_locations = (
        'Newport Pagnell, Buckinghamshire, England' => { 'latitude' => 52.08675, 'longitude' => -0.72270 },
);

# Declared empty here; nothing in this part of the file populates it
our %unknown_locations;

# The three states of $libpostal_is_installed, which starts out as
# "unknown" when the module is loaded
use constant {
        LIBPOSTAL_UNKNOWN => 0,
        LIBPOSTAL_INSTALLED => 1,
        LIBPOSTAL_NOT_INSTALLED => -1,
};
our $libpostal_is_installed = LIBPOSTAL_UNKNOWN;
43
44 - 53
=head1 NAME

Geo::Coder::Free::OpenAddresses -
Provides geocoding functionality using a local SQLite database containing geocoding data.

=head1 VERSION

Version 0.41

=cut
54
55our $VERSION = '0.41';
56
57 - 98
=head1 SYNOPSIS

    use Geo::Coder::Free::OpenAddresses;

    # Use a local download of http://results.openaddresses.io/
    my $geocoder;
    if(my $openaddr = $ENV{'OPENADDR_HOME'}) {
        $geocoder = Geo::Coder::Free::OpenAddresses->new(openaddr => $openaddr);
    } else {
        $geocoder = Geo::Coder::Free::OpenAddresses->new(openaddr => '/usr/share/geo-coder-free/data');
    }
    my $location = $geocoder->geocode(location => '1600 Pennsylvania Avenue NW, Washington DC, USA');

    my @matches = $geocoder->geocode({ scantext => 'arbitrary text', region => 'GB' });

=head1 DESCRIPTION

Geo::Coder::Free::OpenAddresses provides an interface to the free geolocation databases at
L<http://results.openaddresses.io>,
L<https://github.com/whosonfirst-data>,
L<https://github.com/dr5hn/countries-states-cities-database.git> and
L<https://download.geofabrik.de/europe-latest.osm.bz2>.
The SQLite database is in a file held in $OPENADDR_HOME/openaddresses.sql.

Refer to the source URL for licencing information for these files.

To install,
run the createdatabases.PL script which imports the data into an SQLite database.
This process will take some time.

=head1 METHODS

=head2 new

    $geocoder = Geo::Coder::Free::OpenAddresses->new(openaddr => $ENV{'OPENADDR_HOME'});

Takes a mandatory parameter "openaddr", which is the directory containing the file
openaddresses.sql.
new() croaks if "openaddr" is not given, or if it is not a readable directory.

Takes an optional parameter "cache", which points to an object that understands get() and set() messages, in which data is stored.

=cut
99
# Construct a geocoder bound to a directory of OpenAddresses data.
#
# Arguments, after the class name (or an existing object whose class is
# reused), given either as a flat key/value list or as one hash reference:
#   openaddr - mandatory; must name an existing, readable directory
#   cache    - optional; stored unchanged in the new object
#
# Returns the blessed object, or nothing when there is no class to bless
# into (called as a plain function rather than as a method).
# Croaks when openaddr is missing or is not a readable directory.
sub new {
        my $proto = shift;
        my $class = ref($proto) || $proto;

        # Geo::Coder::Free->new not Geo::Coder::Free::new
        return unless($class);

        # Accept new({ openaddr => ... }) as well as new(openaddr => ...),
        # in the same way that geocode() takes either form
        my %param = (ref($_[0]) eq 'HASH') ? %{$_[0]} : @_;

        my $openaddr = $param{'openaddr'}
                or Carp::croak(__PACKAGE__, ": Usage: new(openaddr => '/path/to/openaddresses')");

        # The directory has to be both present and readable to be of any use
        Carp::croak(__PACKAGE__, ": Can't find the directory $openaddr")
                if((!-d $openaddr) || (!-r $openaddr));

        return bless { openaddr => $openaddr, cache => $param{'cache'} }, $class;
}
114
115 - 139
=head2 geocode

    $location = $geocoder->geocode(location => $location);

    print 'Latitude: ', $location->lat(), "\n";
    print 'Longitude: ', $location->long(), "\n";

    # TODO:
    # @locations = $geocoder->geocode('Portland, USA');
    # diag 'There are Portlands in ', join (', ', map { $_->{'state'} } @locations);

    @locations = $geocoder->geocode(scantext => 'arbitrary text', region => 'US', ignore_words => [ 'foo', 'bar' ]);

When looking for a house number in a street, if that address isn't found but that
street is found, a place in the street is given.
So if "106 Wells Street, Fort Wayne, Allen, Indiana, USA" isn't found, a match for
"Wells Street, Fort Wayne, Allen, Indiana, USA" will be given instead.
Arguably that's incorrect, but it is the behaviour I want.
If "exact" is not given,
it will go on to look just for the town if the street isn't found.

The word "county" is removed from US county searches,
so that either C<Leesburg, Loudoun County, Virginia, US> or C<Leesburg, Loudoun, Virginia, US> will work.

=cut
140
141sub geocode
142{
143
0
1
        my $self = shift;
144
145
0
        my %param;
146
0
        if(ref($_[0]) eq 'HASH') {
147
0
0
                %param = %{$_[0]};
148        } elsif(ref($_[0])) {
149
0
                Carp::croak('Usage: geocode(location => $location|scantext => $text)');
150        } elsif(scalar(@_) % 2 == 0) {
151
0
                %param = @_;
152        } else {
153
0
                $param{location} = shift;
154        }
155
156
0
        my %ignore_words;
157
0
        if($param{'ignore_words'}) {
158
0
0
0
                %ignore_words = map { lc($_) => 1 } @{$param{'ignore_words'}};
159        }
160
161
0
        if(my $scantext = $param{'scantext'}) {
162
0
                return if(length($scantext) < 6);
163                # FIXME:  wow this is inefficient
164
0
                $scantext =~ s/[^\w']+/ /g;
165
0
                my @words = split(/\s/, $scantext);
166
0
                my $count = scalar(@words);
167
0
                my $offset = 0;
168
0
                my @rc;
169
0
                my $region = $param{'region'};
170
0
                if($region) {
171
0
                        $region = uc($region);
172                }
173
0
                while($offset < $count) {
174
0
                        if(length($words[$offset]) < 2) {
175
0
                                $offset++;
176
0
                                next;
177                        }
178
0
                        if(exists($ignore_words{lc($words[$offset])})) {
179
0
                                $offset++;
180
0
                                next;
181                        }
182
0
                        my $l;
183
0
                        if(($l = $self->geocode(location => $words[$offset])) && ref($l)) {
184
0
                                push @rc, $l;
185                        }
186
0
                        if($offset < $count - 1) {
187
0
                                my $addr = join(', ', $words[$offset], $words[$offset + 1]);
188
0
                                if(length($addr) == 0) {
189
0
                                        $offset++;
190                                }
191                                # https://stackoverflow.com/questions/11160192/how-to-parse-freeform-street-postal-address-out-of-text-and-into-components
192                                # TODO: Support longer addresses
193
0
                                if($addr =~ /\s+(\d{2,5}\s+)(?![a|p]m\b)(([a-zA-Z|\s+]{1,5}){1,2})?([\s|\,|.]+)?(([a-zA-Z|\s+]{1,30}){1,4})(court|ct|street|st|drive|dr|lane|ln|road|rd|blvd)([\s|\,|.|\;]+)?(([a-zA-Z|\s+]{1,30}){1,2})([\s|\,|.]+)?\b(AK|AL|AR|AZ|CA|CO|CT|DC|DE|FL|GA|GU|HI|IA|ID|IL|IN|KS|KY|LA|MA|MD|ME|MI|MN|MO|MS|MT|NC|ND|NE|NH|NJ|NM|NV|NY|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VA|VI|VT|WA|WI|WV|WY)([\s|\,|.]+)?(\s+\d{5})?([\s|\,|.]+)/i) {
194
0
                                        unless($region && ($region ne 'US')) {
195
0
                                                if(($l = $self->geocode(location => "$addr, US")) && ref($l)) {
196
0
                                                        $l->confidence(0.8);
197
0
                                                        $l->country('US');
198
0
                                                        $l->location("$addr, USA");
199
0
                                                        push @rc, $l;
200                                                }
201                                        }
202                                } elsif($addr =~ /\s+(\d{2,5}\s+)(?![a|p]m\b)(([a-zA-Z|\s+]{1,5}){1,2})?([\s|\,|.]+)?(([a-zA-Z|\s+]{1,30}){1,4})(court|ct|street|st|drive|dr|lane|ln|road|rd|blvd)([\s|\,|.|\;]+)?(([a-zA-Z|\s+]{1,30}){1,2})([\s|\,|.]+)?\b(AB|BC|MB|NB|NL|NT|NS|ON|PE|QC|SK|YT)([\s|\,|.]+)?(\s+\d{5})?([\s|\,|.]+)/i) {
203
0
                                        unless($region && ($region ne 'CA')) {
204
0
                                                if(($l = $self->geocode(location => "$addr, Canada")) && ref($l)) {
205
0
                                                        $l->confidence(0.8);
206
0
                                                        $l->country('CA');
207
0
                                                        $l->location("$addr, Canada");
208
0
                                                        push @rc, $l;
209                                                }
210                                        }
211                                } elsif($addr =~ /([a-zA-Z|\s+]{1,30}){1,2}([\s|\,|.]+)?\b(AK|AL|AR|AZ|CA|CO|CT|DC|DE|FL|GA|GU|HI|IA|ID|IL|IN|KS|KY|LA|MA|MD|ME|MI|MN|MO|MS|MT|NC|ND|NE|NH|NJ|NM|NV|NY|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VA|VI|VT|WA|WI|WV|WY)/i) {
212
0
                                        unless($region && ($region ne 'US')) {
213
0
                                                if(($l = $self->geocode(location => "$addr, US")) && ref($l)) {
214
0
                                                        $l->confidence(0.6);
215
0
                                                        $l->city(uc($1));
216
0
                                                        $l->state(uc($3));
217
0
                                                        $l->country('US');
218
0
                                                        $l->location(uc("$addr, USA"));
219
0
                                                        push @rc, $l;
220                                                }
221                                        }
222                                } elsif($addr =~ /([a-zA-Z|\s+]{1,30}){1,2}([\s|\,|.]+)?\b(AB|BC|MB|NB|NL|NT|NS|ON|PE|QC|SK|YT)/i) {
223
0
                                        unless($region && ($region ne 'CA')) {
224
0
                                                if(($l = $self->geocode(location => "$addr, Canada")) && ref($l)) {
225
0
                                                        $l->confidence(0.6);
226
0
                                                        $l->city(uc($1));
227
0
                                                        $l->state(uc($3));
228
0
                                                        $l->country('Canada');
229
0
                                                        $l->location(uc("$addr, Canada"));
230
0
                                                        push @rc, $l;
231                                                }
232                                        }
233                                }
234
0
                                if($region && (($l = $self->geocode(location => "$addr, $region")) && ref($l))) {
235
0
                                        $l->confidence(0.2);
236
0
                                        $l->location("$addr, $region");
237                                        # ::diag(__LINE__, ": $addr, $region");
238
0
                                        push @rc, $l;
239                                } elsif((!$region) && (($l = $self->geocode(location => $addr)) && ref($l))) {
240
0
                                        $l->confidence(0.1);
241
0
                                        $l->location($addr);
242                                        # ::diag(__LINE__, ": $addr");
243
0
                                        push @rc, $l;
244                                }
245
0
                                if($offset < $count - 2) {
246
0
                                        $addr = join(', ', $words[$offset], $words[$offset + 1], $words[$offset + 2]);
247
0
                                        if(($l = $self->geocode(location => $addr)) && ref($l)) {
248
0
                                                $l->confidence(1.0);
249
0
                                                $l->location($addr);
250
0
                                                push @rc, $l;
251                                        }
252                                }
253                        }
254
0
                        $offset++;
255                }
256
0
                return @rc;
257                # my @locations;
258
259                # foreach my $l(@rc) {
260                        # ::diag(__LINE__, ': ', Data::Dumper->new([$l])->Dump());
261                        # push @locations, Location::GeoTool->create_coord($l->{'latitude'}, $l->{'longitude'}, $l->{'location'}, 'Degree');
262                # }
263
264                # return @locations;
265        }
266
267        my $location = $param{location}
268
0
                or Carp::croak('Usage: geocode(location => $location|scantext => $text)');
269
270        # ::diag($location);
271
272
0
        $location =~ s/,\s+,\s+/, /g;
273
274
0
        if($location =~ /^,\s*(.+)/) {
275
0
                $location = $1;
276        }
277
278        # Fail when the input is just a set of numbers
279
0
        if($location !~ /\D/) {
280                # Carp::croak('Usage: ', __PACKAGE__, ": invalid input to geocode(), $location");
281
0
                return;
282        }
283
0
        return if(length($location) <= 1);
284
285
0
        if($location =~ /^(.+),?\s*Washington\s*DC$/i) {
286
0
                $location = "$1, Washington, DC, USA";
287        } elsif($location =~ /^(.*),?\s*Saint Louis, (Missouri|MO)(.*)$/) {
288                # createdatabase.PL also maps this
289
0
                $location = "$1, St. Louis, MO$3";
290        }
291
292
0
        if(my $rc = $known_locations{$location}) {
293                # return $known_locations{$location};
294                return Geo::Location::Point->new({
295                        'lat' => $rc->{'latitude'},
296                        'long' => $rc->{'longitude'},
297
0
                        'lng' => $rc->{'longitude'},
298                        'location' => $location,
299                        'database' => 'OpenAddresses'
300                });
301        }
302
303
0
        $self->{'location'} = $location;
304
305
0
        my $county;
306        my $state;
307
0
        my $country;
308
0
        my $street;
309
310
0
        $location =~ s/\.//g;
311
312
0
        if($location !~ /,/) {
313
0
                if($location =~ /^(.+?)\s+(United States|USA|US)$/i) {
314
0
                        my $l = $1;
315
0
                        if(my $rc = $self->_get($l, 'US')) {
316
0
                                $rc->{'country'} = 'US';
317
0
                                return $rc;
318                        }
319
0
                        $l =~ s/\s+//g;
320
0
                        if(my $rc = $self->_get($l, 'US')) {
321
0
                                $rc->{'country'} = 'US';
322
0
                                return $rc;
323                        }
324                } elsif($location =~ /^(.+?)\s+(England|Scotland|Wales|Northern Ireland|UK|GB)$/i) {
325
0
                        my $l = $1;
326
0
                        $l =~ s/\s+//g;
327
0
                        if(my $rc = $self->_get($l, 'GB')) {
328
0
                                $rc->{'country'} = 'GB';
329
0
                                return $rc;
330                        }
331                } elsif($location =~ /^(.+?)\s+Canada$/i) {
332
0
                        my $l = $1;
333
0
                        $l =~ s/\s+//g;
334
0
                        if(my $rc = $self->_get($l, 'CA')) {
335
0
                                $rc->{'country'} = 'CA';
336
0
                                return $rc;
337                        }
338                }
339        }
340
0
        my $ap;
341
0
        if(($location =~ /USA$/) || ($location =~ /United States$/)) {
342
0
                $ap = $self->{'ap'}->{'us'} // Lingua::EN::AddressParse->new(country => 'US', auto_clean => 1, force_case => 1, force_post_code => 0);
343
0
                $self->{'ap'}->{'us'} = $ap;
344        } elsif($location =~ /(England|Scotland|Wales|Northern Ireland|UK|GB)$/i) {
345
0
                $ap = $self->{'ap'}->{'gb'} // Lingua::EN::AddressParse->new(country => 'GB', auto_clean => 1, force_case => 1, force_post_code => 0);
346
0
                $self->{'ap'}->{'gb'} = $ap;
347        } elsif($location =~ /Canada$/) {
348
0
                $ap = $self->{'ap'}->{'ca'} // Lingua::EN::AddressParse->new(country => 'CA', auto_clean => 1, force_case => 1, force_post_code => 0);
349
0
                $self->{'ap'}->{'ca'} = $ap;
350        } elsif($location =~ /Australia$/) {
351
0
                $ap = $self->{'ap'}->{'au'} // Lingua::EN::AddressParse->new(country => 'AU', auto_clean => 1, force_case => 1, force_post_code => 0);
352
0
                $self->{'ap'}->{'au'} = $ap;
353        }
354
0
        if($ap) {
355
0
                my $l = $location;
356
0
                if($l =~ /(.+), (England|UK)$/i) {
357
0
                        $l = "$1, GB";
358                }
359
0
                if($ap->parse($l)) {
360                        # Carp::croak($ap->report());
361                        # ::diag('Address parse failed: ', $ap->report());
362                } else {
363
0
                        my %c = $ap->components();
364                        # ::diag(Data::Dumper->new([\%c])->Dump());
365
0
                        my %addr = ( 'location' => $l );
366
0
                        $street = $c{'street_name'};
367
0
                        if(my $type = $c{'street_type'}) {
368
0
                                if(my $a = Geo::Coder::Free::_abbreviate($type)) {
369
0
                                        $street .= " $a";
370                                } else {
371
0
                                        $street .= " $type";
372                                }
373
0
                                if(my $suffix = $c{'street_direction_suffix'}) {
374
0
                                        $street .= " $suffix";
375                                }
376
0
                                $street =~ s/^0+//;     # Turn 04th St into 4th St
377
0
                                $addr{'road'} = $street;
378                        }
379
0
                        if(length($c{'subcountry'}) == 2) {
380
0
                                $addr{'state'} = $c{'subcountry'};
381                        } else {
382
0
                                if($c{'country'} =~ /Canada/i) {
383
0
                                        $addr{'country'} = 'CA';
384
0
                                        if(my $twoletterstate = Locale::CA->new()->{province2code}{uc($c{'subcountry'})}) {
385
0
                                                $addr{'state'} = $twoletterstate;
386                                        }
387                                } elsif($c{'country'} =~ /^(United States|USA|US)$/i) {
388
0
                                        $addr{'country'} = 'US';
389
0
                                        if(my $twoletterstate = Locale::US->new()->{state2code}{uc($c{'subcountry'})}) {
390
0
                                                $addr{'state'} = $twoletterstate;
391                                        }
392                                } elsif($c{'country'}) {
393
0
                                        $addr{'country'} = $c{'country'};
394
0
                                        if($c{'subcountry'}) {
395
0
                                                $addr{'state'} = $c{'subcountry'};
396                                        }
397                                }
398                        }
399
0
                        $addr{'house_number'} = $c{'property_identifier'};
400
0
                        $addr{'city'} = $c{'suburb'};
401                        # ::diag(Data::Dumper->new([\%addr])->Dump());
402
0
                        if($addr{'house_number'}) {
403
0
                                if(my $rc = $self->_search(\%addr, ('house_number', 'road', 'city', 'state', 'country'))) {
404
0
                                        return $rc;
405                                }
406                        }
407
0
                        if((!$addr{'house_number'}) || !$param{'exact'}) {
408
0
                                if(my $rc = $self->_search(\%addr, ('road', 'city', 'state', 'country'))) {
409
0
                                        return $rc;
410                                }
411                        }
412                }
413        }
414
415
0
        if($location =~ /^(.+?)[,\s]+(United States|USA|US)$/i) {
416                # Try Geo::StreetAddress::US, which is rather buggy
417
418
0
                my $l = $1;
419
0
                $l =~ s/,/ /g;
420
0
                $l =~ s/\s\s+/ /g;
421
422                # Work around for RT#122617
423
0
                if(($location !~ /\sCounty,/i) && (my $href = (Geo::StreetAddress::US->parse_location($l) || Geo::StreetAddress::US->parse_address($l)))) {
424                        # ::diag(Data::Dumper->new([$href])->Dump());
425
0
                        if($state = $href->{'state'}) {
426
0
                                if(length($state) > 2) {
427
0
                                        if(my $twoletterstate = Locale::US->new()->{state2code}{uc($state)}) {
428
0
                                                $state = $twoletterstate;
429                                        }
430                                }
431
0
                                my $city;
432
0
                                if($href->{city}) {
433
0
                                        $city = uc($href->{city});
434                                }
435
0
                                if($street = $href->{street}) {
436
0
                                        if($href->{'type'} && (my $type = Geo::Coder::Free::_abbreviate($href->{'type'}))) {
437
0
                                                $street .= " $type";
438                                        }
439
0
                                        if($href->{suffix}) {
440
0
                                                $street .= ' ' . $href->{suffix};
441                                        }
442
0
                                        if(my $prefix = $href->{prefix}) {
443
0
                                                $street = "$prefix $street";
444                                        }
445
0
                                        if($href->{'number'}) {
446
0
                                                if(my $rc = $self->_get($href->{'number'}, "$street$city$state", 'US')) {
447
0
                                                        $rc->{'country'} = 'US';
448
0
                                                        return $rc;
449                                                }
450                                        }
451
0
                                        if(my $rc = $self->_get("$street$city$state", 'US')) {
452
0
                                                $rc->{'country'} = 'US';
453
0
                                                return $rc;
454                                        }
455                                }
456                        }
457                }
458
459                # Hack to find "name, street, town, state, US"
460
0
                my @addr = split(/,\s*/, $location);
461
0
                if(scalar(@addr) == 5) {
462                        # ::diag(__PACKAGE__, ': ', __LINE__, ": $location");
463
0
                        $state = $addr[3];
464
0
                        if(length($state) > 2) {
465
0
                                if(my $twoletterstate = Locale::US->new()->{state2code}{uc($state)}) {
466
0
                                        $state = $twoletterstate;
467                                }
468                        }
469
0
                        if(length($state) == 2) {
470
0
                                $addr[1] = Geo::Coder::Free::_normalize($addr[1]);
471                                # ::diag(Data::Dumper->new([\@addr])->Dump());
472
0
                                if(my $rc = $self->_get($addr[0], $addr[1], $addr[2], $state, 'US')) {
473                                        # ::diag(Data::Dumper->new([$rc])->Dump());
474
0
                                        $rc->{'country'} = 'US';
475
0
                                        return $rc;
476                                }
477                        }
478                        # Hack to find "street, town, county, state, US"
479
0
                        if(length($state) == 2) {
480
0
                                $addr[0] = Geo::Coder::Free::_normalize($addr[0]);
481
0
                                $addr[2] =~ s/\s+COUNTY$//i;
482                                # ::diag(Data::Dumper->new([\@addr])->Dump());
483
0
                                if(my $rc = $self->_get($addr[0], $addr[1], $addr[2], $state, 'US')) {
484                                        # ::diag(Data::Dumper->new([$rc])->Dump());
485
0
                                        $rc->{'country'} = 'US';
486
0
                                        return $rc;
487                                }
488
0
                                if(my $rc = $self->_get($addr[0], $addr[1], $state, 'US')) {
489                                        # ::diag(Data::Dumper->new([$rc])->Dump());
490
0
                                        $rc->{'country'} = 'US';
491
0
                                        return $rc;
492                                }
493                        }
494                }
495        }
496
497
0
        if($location =~ /(.+),\s*([\s\w]+),\s*([\w\s]+)$/) {
498
0
                my $city = $1;
499
0
                $state = $2;
500
0
                $country = $3;
501
0
                $state =~ s/\s$//g;
502
0
                $country =~ s/\s$//g;
503
504
0
                my $c;
505
506
0
                if((uc($country) eq 'ENGLAND') ||
507                   (uc($country) eq 'SCOTLAND') ||
508                   (uc($country) eq 'WALES')) {
509
0
                        $country = 'Great Britain';
510
0
                        $c = 'gb';
511                } else {
512
0
                        $c = country2code($country);
513                }
514
0
                if($c) {
515
0
                        if($c eq 'us') {
516
0
                                if(length($state) > 2) {
517
0
                                        if(my $twoletterstate = Locale::US->new()->{state2code}{uc($state)}) {
518
0
                                                $state = $twoletterstate;
519                                        }
520                                }
521
0
                                my $rc;
522
523
0
                                if($city !~ /,/) {
524
0
                                        $city = uc($city);
525
0
                                        if($city =~ /^(.+)\sCOUNTY$/) {
526                                                # Simple case looking up a county in a state in the US
527
0
                                                if($rc = $self->_get("$1$state", 'US')) {
528
0
                                                        $rc->{'country'} = 'US';
529
0
                                                        return $rc;
530                                                }
531                                        } else {
532                                                # Simple case looking up a city in a state in the US
533
0
                                                if($rc = $self->_get("$city$state", 'US')) {
534
0
                                                        $rc->{'country'} = 'US';
535
0
                                                        return $rc;
536                                                }
537                                        }
538                                } elsif(my $href = Geo::StreetAddress::US->parse_address("$city, $state")) {
539                                # warn __LINE__;
540                                # use Data::Dumper;
541                                # warn Dumper($href);
542                                        # Well formed, simple street address in the US
543                                        # ::diag(Data::Dumper->new([\$href])->Dump());
544
0
                                        $state = $href->{'state'};
545
0
                                        if(length($state) > 2) {
546
0
                                                if(my $twoletterstate = Locale::US->new()->{state2code}{uc($state)}) {
547
0
                                                        $state = $twoletterstate;
548                                                }
549                                        }
550
0
                                        if($href->{city}) {
551
0
                                                $city = uc($href->{city});
552                                        }
553                                        # Unabbreviated - look up both, helps with fallback to Maxmind
554
0
                                        my $fullstreet = $href->{'street'};
555                                # warn __LINE__;
556
0
                                        if($street = $fullstreet) {
557                                # warn __LINE__;
558
0
                                                $fullstreet .= ' ' . $href->{'type'};
559
0
                                                if(my $type = Geo::Coder::Free::_abbreviate($href->{'type'})) {
560
0
                                                        $street .= " $type";
561                                                }
562
0
                                                if($href->{suffix}) {
563
0
                                                        $street .= ' ' . $href->{suffix};
564
0
                                                        $fullstreet .= ' ' . $href->{suffix};
565                                                }
566                                        }
567
0
                                        if($street) {
568                                # warn __LINE__;
569
0
                                                if(my $prefix = $href->{prefix}) {
570
0
                                                        $street = "$prefix $street";
571
0
                                                        $fullstreet = "$prefix $fullstreet";
572                                                }
573
0
                                                if($href->{'number'}) {
574                                                        # ::diag($href->{'number'}, "$street$city$state", 'US');
575
0
                                                        if($rc = $self->_get($href->{'number'}, "$street$city$state", 'US')) {
576
0
                                                                $rc->{'country'} = 'US';
577
0
                                                                return $rc;
578                                                        }
579
0
                                                        if($rc = $self->_get($href->{'number'}, "$fullstreet$city$state", 'US')) {
580
0
                                                                $rc->{'country'} = 'US';
581
0
                                                                return $rc;
582                                                        }
583                                                }
584                                                # ::diag("$street$city$state", 'US');
585                                                # warn("$street$city$state", 'US');
586
0
                                                if($rc = $self->_get("$street$city$state", 'US')) {
587
0
                                                        $rc->{'country'} = 'US';
588
0
                                                        return $rc;
589                                                }
590
0
                                                $street =~ s/\s+//g;
591
0
                                                if($rc = $self->_get("$street$city$state", 'US')) {
592
0
                                                        $rc->{'country'} = 'US';
593
0
                                                        return $rc;
594                                                }
595                                                # ::diag("$fullstreet$city$state", 'US');
596                                                # warn("$fullstreet$city$state", 'US');
597
0
                                                if($rc = $self->_get("$fullstreet$city$state", 'US')) {
598
0
                                                        $rc->{'country'} = 'US';
599
0
                                                        return $rc;
600                                                }
601
0
                                                $fullstreet =~ s/\s+//g;
602                                        }
603
0
                                        warn "Fast lookup of US location '$location' failed";
604                                } else {
605
0
                                        if($city =~ /^(\d.+),\s*([\w\s]+),\s*([\w\s]+)/) {
606
0
                                                my $lookup = "$1, $2, $state";
607
0
                                                if(my $href = (Geo::StreetAddress::US->parse_address($lookup) || Geo::StreetAddress::US->parse_location($lookup))) {
608                                                        # Street, City, County
609                                                        # 105 S. West Street, Spencer, Owen, Indiana, USA
610                                                        # ::diag(Data::Dumper->new([\$href])->Dump());
611
0
                                                        $county = $3;
612
0
                                                        $county =~ s/\s*county$//i;
613
0
                                                        if($href->{'state'}) {
614
0
                                                                $state = $href->{'state'};
615                                                        } else {
616
0
                                                                Carp::croak(__PACKAGE__, ": Geo::StreetAddress::US couldn't find the state in '$lookup'");
617                                                        }
618
0
                                                        if(length($state) > 2) {
619
0
                                                                if(my $twoletterstate = Locale::US->new()->{state2code}{uc($state)}) {
620
0
                                                                        $state = $twoletterstate;
621                                                                }
622                                                        }
623
0
                                                        my %args = (county => uc($county), state => $state, country => 'US');
624
0
                                                        if($href->{city}) {
625
0
                                                                $city = $args{city} = uc($href->{city});
626                                                        }
627
0
                                                        if($href->{number}) {
628
0
                                                                $args{number} = $href->{number};
629                                                        }
630
0
                                                        if($street = $href->{street}) {
631
0
                                                                if(my $type = Geo::Coder::Free::_abbreviate($href->{'type'})) {
632
0
                                                                        $street .= " $type";
633                                                                }
634
0
                                                                if($href->{suffix}) {
635
0
                                                                        $street .= ' ' . $href->{suffix};
636                                                                }
637
0
                                                                if(my $prefix = $href->{prefix}) {
638
0
                                                                        $street = "$prefix $street";
639                                                                }
640
0
                                                                $args{street} = uc($street);
641
0
                                                                if($href->{'number'}) {
642
0
                                                                        if($county) {
643
0
                                                                                if($rc = $self->_get($href->{'number'}, "$street$city$county$state", 'US')) {
644
0
                                                                                        $rc->{'country'} = 'US';
645
0
                                                                                        return $rc;
646                                                                                }
647                                                                        }
648
0
                                                                        if($rc = $self->_get($href->{'number'}, "$street$city$state", 'US')) {
649
0
                                                                                $rc->{'country'} = 'US';
650
0
                                                                                return $rc;
651                                                                        }
652
0
                                                                        if($county) {
653
0
                                                                                if($rc = $self->_get("$street$city$county$state", 'US')) {
654
0
                                                                                        $rc->{'country'} = 'US';
655
0
                                                                                        return $rc;
656                                                                                }
657                                                                        }
658
0
                                                                        if($rc = $self->_get("$street$city$state", 'US')) {
659
0
                                                                                $rc->{'country'} = 'US';
660
0
                                                                                return $rc;
661                                                                        }
662                                                                }
663                                                        }
664
0
                                                        return; # Not found
665                                                }
666
0
                                                die $city;      # TODO: do something here
667                                        } elsif($city =~ /^(\w[\w\s]+),\s*([\w\s]+)/) {
668                                                # Perhaps it just has the street's name?
669                                                # Rockville Pike, Rockville, MD, USA
670
0
                                                my $first = uc($1);
671
0
                                                my $second = uc($2);
672
0
                                                if($second =~ /(\d+)\s+(.+)/) {
673
0
                                                        $second = "$1$2";
674                                                }
675
0
                                                if($rc = $self->_get("$first$second$state", 'US')) {
676
0
                                                        $rc->{'country'} = 'US';
677
0
                                                        return $rc;
678                                                }
679                                                # Perhaps it's a city in a county?
680                                                # Silver Spring, Montgomery County, MD, USA
681
0
                                                $second =~ s/\s+COUNTY$//;
682
0
                                                if($rc = $self->_get("$first$second$state", 'US')) {
683
0
                                                        $rc->{'country'} = 'US';
684
0
                                                        return $rc;
685                                                }
686                                                # Not all the database has the county
687
0
                                                if($rc = $self->_get("$first$state", 'US')) {
688
0
                                                        $rc->{'country'} = 'US';
689
0
                                                        return $rc;
690                                                }
691                                                # Brute force last ditch approach
692
0
                                                my $copy = uc($location);
693
0
                                                $copy =~ s/,\s+//g;
694
0
                                                $copy =~ s/\s*USA$//;
695
0
                                                if($rc = $self->_get($copy, 'US')) {
696
0
                                                        $rc->{'country'} = 'US';
697
0
                                                        return $rc;
698                                                }
699
0
                                                if($copy =~ s/(\d+)\s+/$1/) {
700
0
                                                        if($rc = $self->_get($copy, 'US')) {
701
0
                                                                $rc->{'country'} = 'US';
702
0
                                                                return $rc;
703                                                        }
704                                                }
705                                        }
706                                        # warn "Can't yet parse US location '$location'";
707                                }
708                        } elsif($c eq 'ca') {
709
0
                                if(length($state) > 2) {
710
0
                                        if(my $twoletterstate = Locale::CA->new()->{province2code}{uc($state)}) {
711
0
                                                $state = $twoletterstate;
712                                        }
713                                }
714
0
                                my $rc;
715
0
                                if($city !~ /,/) {
716                                        # Simple case looking up a city in a state in Canada
717
0
                                        $city = uc($city);
718
0
                                        if($rc = $self->_get("$city$state", 'CA')) {
719
0
                                                $rc->{'country'} = 'CA';
720
0
                                                return $rc;
721                                        }
722                                # } elsif(my $href = Geo::StreetAddress::Canada->parse_address("$city, $state")) {
723                                } elsif(my $href = 0) {
724                                        # Well formed, simple street address in Canada
725
0
                                        $state = $href->{'province'};
726
0
                                        if(length($state) > 2) {
727
0
                                                if(my $twoletterstate = Locale::CA->new()->{province2code}{uc($state)}) {
728
0
                                                        $state = $twoletterstate;
729                                                }
730                                        }
731
0
                                        my %args = (state => $state, country => 'CA');
732
0
                                        if($href->{city}) {
733
0
                                                $args{city} = uc($href->{city});
734                                        }
735
0
                                        if($href->{number}) {
736
0
                                                $args{number} = $href->{number};
737                                        }
738
0
                                        if($street = $href->{street}) {
739
0
                                                if(my $type = Geo::Coder::Free::_abbreviate($href->{'type'})) {
740
0
                                                        $street .= " $type";
741                                                }
742
0
                                                if($href->{suffix}) {
743
0
                                                        $street .= ' ' . $href->{suffix};
744                                                }
745                                        }
746
0
                                        if($street) {
747
0
                                                if(my $prefix = $href->{prefix}) {
748
0
                                                        $street = "$prefix $street";
749                                                }
750
0
                                                $args{street} = uc($street);
751                                        }
752
0
                                        warn "Fast lookup of Canadian location '$location' failed";
753                                } else {
754
0
                                        if($city =~ /^(\w[\w\s]+),\s*([\w\s]+)/) {
755                                                # Perhaps it just has the street's name?
756                                                # Rockville Pike, Rockville, MD, USA
757
0
                                                my $first = uc($1);
758
0
                                                my $second = uc($2);
759
0
                                                if($rc = $self->_get("$first$second$state", 'CA')) {
760
0
                                                        $rc->{'country'} = 'CA';
761
0
                                                        return $rc;
762                                                }
763                                                # Perhaps it's a city in a county?
764                                                # Silver Spring, Montgomery County, MD, USA
765
0
                                                $second =~ s/\s+COUNTY$//;
766
0
                                                if($rc = $self->_get("$first$second$state", 'CA')) {
767
0
                                                        $rc->{'country'} = 'CA';
768
0
                                                        return $rc;
769                                                }
770                                                # Not all the database has the county
771
0
                                                if($rc = $self->_get("$first$state", 'CA')) {
772
0
                                                        $rc->{'country'} = 'CA';
773
0
                                                        return $rc;
774                                                }
775                                                # Brute force last ditch approach
776
0
                                                my $copy = uc($location);
777
0
                                                $copy =~ s/,\s+//g;
778
0
                                                $copy =~ s/\s*Canada$//i;
779
0
                                                if($rc = $self->_get($copy, 'CA')) {
780
0
                                                        $rc->{'country'} = 'CA';
781
0
                                                        return $rc;
782                                                }
783
0
                                                if($copy =~ s/(\d+)\s+/$1/) {
784
0
                                                        if($rc = $self->_get($copy, 'CA')) {
785
0
                                                                $rc->{'country'} = 'CA';
786
0
                                                                return $rc;
787                                                        }
788                                                }
789                                        }
790                                        # warn "Can't yet parse Canadian location '$location'";
791                                }
792                        } else {
793                                # Currently only handles Town, Region, Country
794                                # TODO: add addresses support
795
0
                                if(($c eq 'au') && (length($state) > 3)) {
796
0
                                        if(my $abbrev = Locale::SubCountry->new('AU')->code(ucfirst(lc($state)))) {
797
0
                                                if($abbrev ne 'unknown') {
798
0
                                                        $state = $abbrev;
799                                                }
800                                        }
801                                }
802
0
                                if($city =~ /^(\w[\w\s]+),\s*([,\w\s]+)/) {
803                                        # City includes a street name
804
0
                                        $street = uc($1);
805
0
                                        $city = uc($2);
806
0
                                        my $number;
807
0
                                        if($street =~ /^(\d+)\s+(.+)/) {
808
0
                                                $number = $1;
809
0
                                                $street = $2;
810                                        }
811
812                                        # TODO: Configurable - or better still remove the need
813
0
                                        if($city eq 'MINSTER, THANET') {
814
0
                                                $city = 'RAMSGATE';
815                                        }
816
0
                                        $street = Geo::Coder::Free::_normalize($street);
817
0
                                        if($number) {
818
0
                                                if(my $rc = $self->_get("$number$street$city$state$c")) {
819
0
                                                        return $rc;
820                                                }
821                                                # If we can't find the number, at least find the road
822                                        }
823
0
                                        if(my $rc = $self->_get("$street$city$state$c")) {
824
0
                                                return $rc;
825                                        }
826                                }
827
0
                                if((!$street) || !$param{'exact'}) {
828
0
                                        if(my $rc = $self->_get("$city$state$c")) {
829                                                # return {
830                                                        # 'number' => undef,
831                                                        # 'street' => undef,
832                                                        # 'city' => $city,
833                                                        # 'state' => $state,
834                                                        # 'country' => $country,
835                                                        # %{$rc}
836                                                # };
837
0
                                                return $rc;
838                                        }
839                                }
840                        }
841                }
842        } elsif($location =~ /([a-z\s]+),?\s*(United States|USA|US|Canada)$/i) {
843                # Looking for a state/province in Canada or the US
844
0
                $state = $1;
845
0
                $country = $2;
846
0
                if($country =~ /Canada/i) {
847
0
                        $country = 'CA';
848
0
                        if(length($state) > 2) {
849
0
                                if(my $twoletterstate = Locale::CA->new()->{province2code}{uc($state)}) {
850
0
                                        $state = $twoletterstate;
851                                }
852                        }
853                } else {
854
0
                        $country = 'US';
855
0
                        if(length($state) > 2) {
856
0
                                if(my $twoletterstate = Locale::US->new()->{state2code}{uc($state)}) {
857
0
                                        $state = $twoletterstate;
858                                }
859                        }
860                }
861
0
                if(my $rc = $self->_get("$state$country")) {
862
0
                        $rc->{'country'} = $country;
863
0
                        return $rc;
864                }
865        }
866
867
0
        if($country) {
868
0
                require Geo::Address::Parser && Geo::Address::Parser->import() unless Geo::Address::Parser->can('parse');
869
870
0
                if($country eq 'US') {
871
0
                        my $addr_parser = Geo::Address::Parser->new(country => 'US');
872
0
                        if(my $fields = $addr_parser->parse($location)) {
873
0
0
                                for my $key (keys %{$fields}) {
874
0
                                        delete $fields->{$key} unless defined $fields->{$key};
875                                }
876
0
0
                                if(my $rc = $self->_search($fields, keys %{$fields})) {
877
0
                                        return $rc;
878                                }
879                        }
880                }
881        }
882
883        # Finally try libpostal,
884        # which is good but uses a lot of memory
885        # ::diag("try libpostal on $location");
886
0
        if($libpostal_is_installed == LIBPOSTAL_UNKNOWN) {
887
0
0
                if(eval { require Geo::libpostal; } ) {
888
0
                        Geo::libpostal->import();
889
0
                        $libpostal_is_installed = LIBPOSTAL_INSTALLED;
890                } else {
891
0
                        $libpostal_is_installed = LIBPOSTAL_NOT_INSTALLED;
892                }
893        }
894
895        # ::diag(__PACKAGE__, ': ', __LINE__, ": libpostal_is_installed = $libpostal_is_installed ($location)");
896        # print(__PACKAGE__, ': ', __LINE__, ": libpostal_is_installed = $libpostal_is_installed ($location)\n");
897
898
0
        if(($libpostal_is_installed == LIBPOSTAL_INSTALLED) && (my %addr = Geo::libpostal::parse_address($location))) {
899                # print Data::Dumper->new([\%addr])->Dump();
900
0
                if($addr{'country'} && $addr{'state'} && ($addr{'country'} =~ /^(Canada|United States|USA|US)$/i)) {
901
0
                        if($street = $addr{'road'}) {
902
0
                                $street = Geo::Coder::Free::_normalize($street);
903
0
                                $addr{'road'} = $street;
904                        }
905
0
                        if($addr{'country'} =~ /Canada/i) {
906
0
                                $addr{'country'} = 'Canada';
907
0
                                if(length($addr{'state'}) > 2) {
908
0
                                        if(my $twoletterstate = Locale::CA->new()->{province2code}{uc($addr{'state'})}) {
909
0
                                                $addr{'state'} = $twoletterstate;
910                                        }
911                                }
912                        } else {
913
0
                                $addr{'country'} = 'US';
914
0
                                if(length($addr{'state'}) > 2) {
915
0
                                        if(my $twoletterstate = Locale::US->new()->{state2code}{uc($addr{'state'})}) {
916
0
                                                $addr{'state'} = $twoletterstate;
917                                        }
918                                }
919                        }
920
0
                        if($addr{'state_district'}) {
921
0
                                $addr{'state_district'} =~ s/^(.+)\s+COUNTY/$1/i;
922
0
                                if(my $rc = $self->_search(\%addr, ('house_number', 'road', 'city', 'state_district', 'state', 'country'))) {
923
0
                                        return $rc;
924                                }
925                        }
926
0
                        if(my $rc = $self->_search(\%addr, ('house_number', 'road', 'city', 'state', 'country'))) {
927
0
                                return $rc;
928                        }
929
0
                        if($addr{'house_number'}) {
930
0
                                if(my $rc = $self->_search(\%addr, ('road', 'city', 'state', 'country'))) {
931
0
                                        return $rc;
932                                }
933                        }
934                }
935        }
936
0
        if($location =~ s/,//g) {
937
0
                return $self->geocode($location);
938        }
939
0
        undef;
940}
941
# Build a lookup key from selected fields of a parsed address and look it up.
#
# $data is a hashref to data such as returned by Geo::libpostal::parse_address
# @columns is the key names to use in $data, in the order in which their
#	values are to be concatenated
#
# Returns whatever _get() returns for the concatenated values,
# or nothing (undef in scalar context, the empty list in list context)
# when none of the columns holds a true value
sub _search {
	my ($self, $data, @columns) = @_;

	# Columns that are missing, empty or '0' contribute nothing to the key
	my $location = join('', grep { $_ } map { $data->{$_} } @columns);

	# Be explicit about "nothing to look up",
	# rather than letting the value of a failed test leak out to the caller
	return unless($location);

	return $self->_get($location);
}
957
# Look up a location in the database.
#
# @location is one or more strings which are joined together, stripped of
#	commas and white space, and upper-cased to form the lookup key.
#	State must be the abbreviated form
#
# Keys of up to 16 characters are used verbatim, longer keys are replaced by
# the first 16 characters of their base64 MD5 digest.
#
# Returns a Geo::Location::Point when the key is found, otherwise nothing.
# Keys that are not found are remembered in %unknown_locations so that the
# database isn't asked about them again
sub _get {
	my ($self, @location) = @_;

	my $location = join('', @location);

	$location =~ s/^\s+//;
	$location =~ s/,\s*//g;
	# NOTE(review): unless this file is compiled under "use utf8" the tr///
	# works on the individual bytes of the literal, not on one character -
	# confirm that this matches the way the database keys were built
	$location =~ tr/ž/z/;	# Remove wide characters
	$location =~ s/\xc5\xbe/z/g;
	$location =~ s/\N{U+017E}/z/g;
	$location =~ s/\s+//g;

	my $key = uc($location);
	my $digest;
	if(length($location) <= 16) {
		$digest = $key;
	} else {
		# The md5 functions work on bytes and croak with
		# "Wide character in subroutine entry" when given a code point
		# above 255, so encode such strings first.
		# Strings without wide characters are digested exactly as before
		if($key =~ /[^\x00-\xff]/) {
			$key = Encode::encode_utf8($key);
		}
		$digest = substr(Digest::MD5::md5_base64($key), 0, 16);
	}

	# Already known not to be in the database
	return if(defined($unknown_locations{$digest}));

	my $cache = $self->{'cache'};

	if($cache) {
		# NOTE(review): CHI documents get_object() as returning entries
		# even after they have expired - confirm that is intended
		if(my $rc = $cache->get_object($digest)) {
			return Storable::thaw($rc->value());
		}
	}

	# Open the database on first use and keep the handle for later calls
	$self->{openaddr_db} ||= Geo::Coder::Free::DB::openaddresses->new(
		cache => $cache || CHI->new(driver => 'Memory', datastore => {}),
		directory => $self->{openaddr},
		id => 'md5',
		no_entry => 1,
	);

	if(my $geohash = $self->{openaddr_db}->geohash(md5 => $digest)) {
		# The database holds a geohash, turn it back into coordinates
		$self->{'geo_hash'} ||= Geo::Hash->new();

		my ($latitude, $longitude) = $self->{'geo_hash'}->decode($geohash);

		my $rc = Geo::Location::Point->new({
			'lat' => $latitude,
			'long' => $longitude,
			'lng' => $longitude,
			'location' => $location,
			'database' => 'OpenAddresses'
		});

		if($cache) {
			$cache->set($digest, Storable::freeze($rc), '1 month');
		}

		return $rc;
	}

	# Remember the miss
	$unknown_locations{$digest} = 1;

	return;
}
1027
1028 - 1034
=head2  reverse_geocode

    $location = $geocoder->reverse_geocode(latlng => '37.778907,-122.39732');

To be done.

=cut
1035
# Reverse lookups can't be supported at the moment, as the DB only has a
# hash in it.  Always croaks.
sub reverse_geocode {
	my $reason = ': Reverse lookup is not yet supported';

	Carp::croak(__PACKAGE__ . $reason);
}
1040
1041 - 1045
=head2  ua

Does nothing, here for compatibility with other geocoders

=cut
1046
1047
1
# Deliberately a no-op: any arguments are ignored and nothing is returned
sub ua {
	return;
}
1049
1050 - 1108
=head1 AUTHOR

Nigel Horne <njh@bandsman.co.uk>

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

The contents of lib/Geo/Coder/Free/OpenAddresses/databases come from
the places listed in the synopsis.

=head1 BUGS

Lots of lookups fail at the moment.

There are die()s where the code path has yet to be written.

The openaddresses data doesn't cover the globe.

Can't parse and handle "London, England".

Currently only searches US and Canadian data.

If you do search in the UK, only look up towns; full addresses aren't
included, so these will print the same.

    use Geo::Coder::Free::OpenAddresses;

    $location = $geo_coder->geocode(location => '22 Central Road, Ramsgate, Kent, England');
    print $location->{latitude}, "\n";
    print $location->{longitude}, "\n";
    $location = $geo_coder->geocode(location => '7 Hillbrow Road, St Lawrence, Thanet, Kent, England');
    print $location->{latitude}, "\n";
    print $location->{longitude}, "\n";

When I added the WhosOnFirst data I should have renamed this as it contains
data from both sources.

The database shouldn't be called $OPENADDR_HOME/openaddresses.sql,
since the database now also includes data from WhosOnFirst.

The name openaddresses.sql shouldn't be hardcoded,
add support to "new" for the parameter "dbname".

The argument "openaddr"
would be less confusing if it were called "directory".

=head1 SEE ALSO

VWF, openaddresses.

=head1 LICENSE AND COPYRIGHT

Copyright 2017-2025 Nigel Horne.

The program code is released under the following licence: GPL for personal use on a single computer.
All other users (including Commercial, Charity, Educational, Government)
must apply in writing for a licence for use from Nigel Horne at `<njh at nigelhorne.com>`.

=cut
1109
11101;