File Coverage

File:blib/lib/Genealogy/Obituary/Parser.pm
Coverage:78.4%

linestmtbrancondsubpodtimecode
1package Genealogy::Obituary::Parser;
2
3
4
4
4
252863
4
50
use strict;
4
4
4
4
8
1
71
use warnings;
5
6
4
4
4
764
804111
70
use DateTime::Format::Text;
7
4
4
4
14
4
59
use Exporter 'import';
8
4
4
4
973
1182514
68
use Geo::Coder::Free;
9
4
4
4
869
17937
74
use Geo::Coder::List;
10
4
4
4
12
32
82
use Params::Get 0.13;
11
4
4
4
12
30
53
use Return::Set 0.03;
12
4
4
4
8
3
15390
use Params::Validate::Strict;
13
14our @EXPORT_OK = qw(parse_obituary);
15our $geocoder;
16
17# TODO: use Lingua::EN::Tagger;
18# TODO: add more general code, e.g. where it looks for father, also look for mother
19# TODO: parse https://funeral-notices.co.uk/notice/adams/5244000
20
21 - 29
=head1 NAME

Genealogy::Obituary::Parser - Extract structured family relationships from obituary text

=head1 VERSION

Version 0.03

=cut
30
31our $VERSION = '0.03';
32
33 - 88
=head1 SYNOPSIS

  use Genealogy::Obituary::Parse qw(parse_obituary);

  my $text = 'She is survived by her husband Paul, daughters Anna and Lucy, and grandchildren Jake and Emma.';
  my $data = parse_obituary($text);

  # $data = {
  #   spouse   => ['Paul'],
  #   children => ['Anna', 'Lucy'],
  #   grandchildren => ['Jake', 'Emma'],
  # };

=head1 DESCRIPTION

This module parses freeform obituary text and extracts structured family relationship data
for use in genealogical applications.
It parses obituary text and extract structured family relationship data, including details about children, parents, spouse, siblings, grandchildren, and other relatives.

=head1 FUNCTIONS

=head2 parse_obituary($text)

The routine processes the obituary content to identify and organize relevant family information into a clear, structured hash.
It returns a hash reference containing structured family information,
with each family member's data organized into distinct categories such as children, spouse, parents, siblings, etc.

Takes a string, or a ref to a string.

=head3 API SPECIFICATION

=head4 INPUT

  {
    'text' => {
      'type' => 'string',    # or stringref
      'min' => 1,
      'max' => 10000
    }
  }

=head4 OUTPUT

=over 4

=item * No matches: undef

=back

  {
    type => 'hashref',
    'min' => 1,
    'max' => 10
  }

=cut
89
90sub parse_obituary
91{
92
12
1
405548
        my $params = Params::Validate::Strict::validate_strict({
93                args => Params::Get::get_params('text', \@_),
94                schema => {
95                        'text' => {
96                                'type' => 'string',
97                                'min' => 1,
98                                'max' => 10000
99                        }
100                }
101        });
102
12
676
        my $text = $params->{'text'};
103
104
12
21
        if(ref($text) eq 'SCALAR') {
105
0
0
0
0
                $text = ${$text};
106        }
107
108        # Quick scan to get started
109        sub parse_obituary_quick {
110
0
0
0
                my $text = shift;
111
0
0
                my %data;
112
113
0
0
                my @patterns = (
114                        [ qr/\bdaughters?\s+([^.,;]+)/i,  'children' ],
115                        [ qr/\bsons?\s+([^.,;]+)/i, 'children' ],
116                        [ qr/\bchildren\s+([^.,;]+)/i, 'children' ],
117                        [ qr/\bgrandchildren\s+([^.;]+)/i, 'grandchildren' ],
118                        [ qr/\bwife\s+([^.,;]+)/i, 'spouse' ],
119                        [ qr/\bhusband\s+([^.,;]+)/i, 'spouse' ],
120                        [ qr/\bhis parents were\s+([^.,;]+)/i,'parents' ],
121                        [ qr/\bhis father was\s+([^.,;]+)/i, 'parents' ],
122                        [ qr/\bhis mother was\s+([^.,;]+)/i, 'parents' ],
123                        [ qr/\bsister(?:s)?\s+([^.,;]+)/i, 'siblings' ],
124                        [ qr/\bbrother(?:s)?\s+([^.,;]+)/i, 'siblings' ],
125                        [ qr/\bsiblings\s+([^.,;]+)/i, 'siblings' ],
126                );
127
128
0
0
                for my $p (@patterns) {
129
0
0
                        my ($re, $field) = @$p;
130
0
0
                        while ($text =~ /$re/g) {
131
0
0
                                my $list = $1;
132
0
0
                                next unless $list;
133
134                                # Robust splitting on commas and "and"
135
0
0
0
0
0
0
                                my @names = grep { length } map { s/^\s+|\s+$//gr } split /\s*(?:,|(?:\band\b))\s*/i, $list;
136
0
0
0
0
0
0
                                push @{$data{$field}}, map { { 'name' => $_ } } @names;
137                        }
138                }
139
140
0
0
                return \%data;
141        }
142
143        # my %family = %{parse_obituary_quick($text)};
144
12
11
        my %family;
145
146        # Helper to extract people from a specific section and remove empty entries
147        sub extract_people_section {
148
6
0
10
                my $section = shift;
149
6
8
                return unless $section;
150
151
6
19
                $section =~ s/\s+and\s+/, /g;   # Ensure "and" is treated as a separator
152
6
23
                $section =~ s/([A-Za-z]+),\s+([A-Z]{2})/$1<<COMMA>>$2/g;
153
6
18
                my @entries = split /\s*,\s*/, $section;
154
155
6
8
                my @people;
156
6
6
                foreach my $entry (@entries) {
157
16
17
                        $entry =~ s/<<COMMA>>/, /g;
158
159
16
15
                        my ($name, $spouse, $location) = ('', '', '');
160
161                        # Match "Ian (Terry) Girvan of Surrey, BC"
162
16
29
                        if ($entry =~ /^(\w+)\s+\(([^)]+)\)\s+(\w+)\s+of\s+(.+)$/) {
163
1
1
1
1
2
2
                                $name = "$1 $3"; $spouse = $2; $location = $4;
164                        }
165                        # Match "Gwen Steeves (Leslie) of Riverview, NB"
166                        elsif ($entry =~ /^(.+?)\s+\(([^)]+)\)\s+of\s+(.+)$/) {
167
1
1
1
2
1
2
                                $name = $1; $spouse = $2; $location = $3;
168                        }
169                        # Match "Carol Girvan of Dartmouth, NS"
170                        elsif ($entry =~ /^(.+?)\s+of\s+(.+)$/) {
171
1
1
1
0
                                $name = $1; $location = $2;
172                        } else {
173                                # Match names only (e.g. for siblings)
174
13
12
                                $name = $entry;
175                        }
176
177
16
12
                        next if !$name; # Skip if name is empty
178
14
15
                        next if($name =~ /^father-in-law\sto\s/);       # Skip follow ons
179
13
13
                        last if($name =~ /^devoted\s/i);
180
12
12
                        last if($name =~ /^loved\s/i);
181
182                        # Create a hash and filter out blank fields
183
12
19
                        my %person = (
184                                name     => $name,
185                                spouse => $spouse,
186                                location => $location,
187                        );
188
189                        # Remove blank fields
190
12
17
36
14
19
50
                        %person = map { $_ => $person{$_} } grep { defined $person{$_} && $person{$_} ne '' } keys %person;
191
192
12
17
                        push @people, \%person;
193                }
194
6
11
                return \@people;
195        }
196
197        sub extract_names_from_phrase {
198
1
0
1
                my $phrase = shift;
199
1
1
                my @names;
200
201
1
1
                $phrase =~ s/[.;]//g;
202
203                # Case: "Christopher, Thomas, and Marsha Cloud"
204
1
4
                if ($phrase =~ /^((?:\w+\s*,\s*)+\w+),?\s*and\s+(\w+)\s+(\w+)$/) {
205
0
0
                        my ($pre, $last_first, $last) = ($1, $2, $3);
206
0
0
                        my @firsts = split(/\s*,\s*/, $pre);
207
0
0
                        push @firsts, $last_first;
208
0
0
0
0
                        push @names, map { "$_ $last" } @firsts;
209
0
0
                        return @names;
210                }
211
212                # Case: "Christopher and Thomas Cloud"
213
1
4
                if ($phrase =~ /^([\w\s]+?)\s+and\s+(\w+)\s+(\w+)$/) {
214
0
0
                        my ($first_part, $second_first, $last) = ($1, $2, $3);
215
0
0
                        my @firsts = split(/\s*,\s*|\s+and\s+/, $first_part);
216
0
0
0
0
                        push @names, map { "$_ $last" } (@firsts, $second_first);
217
0
0
                        return @names;
218                }
219
220                # Fallback: Split by comma or 'and'
221
1
2
                $phrase =~ s/, and grandchildren.+//;   # Handle "Anna and Lucy, and grandchildren Jake and Emma"
222
1
4
                my @parts = split /\s*(?:,|and)\s*/, $phrase;
223
1
2
1
5
                push @names, grep { defined($_) && $_ ne '' } @parts;
224
1
2
                return @names;
225        }
226
227        # Correct extraction of children (skipping "his/her")
228
12
207
        if ($text =~ /survived by (his|her) children\s*([^\.;]+)/i) {
229
1
2
                my $children_text = $2;
230
1
2
                $family{children} = extract_people_section($children_text);
231        } elsif ($text =~ /Loving mum to\s*([^\.;]+)/i) {       # Look for the phrase "Loving mum to"
232
1
1
                my $children_text = $1;
233
1
2
                $family{children} = extract_people_section($children_text);
234        } elsif ($text =~ /Loving father of\s*([^\.;]+)/i) {    # Look for the phrase "Loving father of"
235
1
1
                my $children_text = $1;
236
1
3
                $family{children} = extract_people_section($children_text);
237        } elsif($text =~ /mother of\s*([^\.;]+)?,/i) {  # Look for the phrase "mother of"
238
1
2
                my $children_text = $1;
239
1
2
                $children_text =~ s/, grandmother.+//;
240
1
2
                $family{children} = extract_people_section($children_text);
241        } elsif($text =~ /sons,?\s*([a-z]+)\s+and\s+([a-z]+)/i) {
242
3
3
                my @children;
243                my @grandchildren;
244
245
3
11
                push @children, { name => $1, sex => 'M' }, { name => $2, sex => 'M' };
246
3
30
                if($text =~ /\bdaughter,?\s([a-z]+)/i) {
247
1
1
                        push @children, { 'name' => $1, 'sex' => 'F' }
248                }
249
3
27
                if($text =~ /\bgranddaughter,?\s([a-z]+)/i) {
250
1
2
                        push @grandchildren, { 'name' => $1, 'sex' => 'F' };
251                }
252
3
5
                $family{children} = \@children if @children;
253
3
5
                $family{grandchildren} = \@grandchildren if @grandchildren;
254        } else {
255
5
4
                my @children;
256
257                # my $tagger = Lingua::EN::Tagger->new(longest_noun_phrase => 0);
258                # my $tagged = $tagger->add_tags($text);
259
260
5
32
                if($text =~ /\ssons,\s*(.*?);/s) {
261
2
3
                        my $sons_text = $1;
262
2
6
                        if($sons_text =~ /, all of (.+)$/) {
263
1
1
                                my $location = $1;
264
1
3
                                while($sons_text =~ /([\w. ]+?),\s/g) {
265
3
3
                                        my $son = $1;
266
3
4
                                        if($son =~ /(\w+)\s+and\s+(\w+)/) {
267
1
4
                                                push @children, {
268                                                        name => $1,
269                                                        location => $location,
270                                                        sex => 'M',
271                                                }, {
272                                                        name => $2,
273                                                        location => $location,
274                                                        sex => 'M',
275                                                };
276
1
1
                                                last;
277                                        } else {
278
2
6
                                                push @children, {
279                                                        name => $son,
280                                                        location => $location,
281                                                        sex => 'M',
282                                                };
283                                        }
284                                }
285                        } else {
286
1
5
                                while($sons_text =~ /([\w. ]+?),\s*([\w. ]+?)(?:\s+and|\z)/g) {
287
2
6
                                        push @children, {
288                                                name => $1,
289                                                location => $2,
290                                                sex => 'M',
291                                        };
292                                }
293                        }
294                }
295
5
26
                if($text =~ /\sdaughters?,\s*Mrs\.\s+(.+?)\s+(\w+),\s+([^;]+)\sand/) {
296
1
4
                        push @children, {
297                                name => $1,
298                                location => $3,
299                                sex => 'F',
300                                spouse => { 'name' => $2, sex => 'M' }
301                        };
302                } elsif($text =~ /one daughter,\s*(.+?),\s*(.+?);/) {
303
1
2
                        my $name = $1;
304
1
1
                        my $location = $2;
305
1
2
                        if($name =~ /(\w+)\s+(\w+)/) {
306
1
3
                                push @children, {
307                                        name => $1,
308                                        location => $location,
309                                        sex => 'F',
310                                        spouse => { name => $2, sex => 'M' }
311                                };
312                        } else {
313
0
0
                                push @children, {
314                                        name => $1,
315                                        location => $location,
316                                        sex => 'F',
317                                };
318                        }
319                }
320
5
8
                $family{children} = \@children if @children;
321
322
5
8
                if(!$family{'children'}) {
323
3
23
                        while($text =~ /\b(son|daughter)s?,\s*([A-Z][a-z]+(?:\s+\([A-Z][a-z]+\))?)\s*(?:and their children ([^.;]+))?/g) {
324
2
3
                                my $sex = $1 eq 'son' ? 'M' : 'F';
325
2
2
                                my $child = $2;
326
2
3
                                my $grandkids = $3;
327
2
8
                                if(my @grandchildren = $grandkids ? split /\s*,\s*|\s+and\s+/, $grandkids : ()) {
328
0
0
                                        push @children, {
329                                                name => $child,
330                                                sex => $sex,
331                                                grandchildren => \@grandchildren,
332                                        };
333                                } elsif(($sex eq 'F') && ($child =~ /(.+)\s+\((.+)\)/)) {
334
1
7
                                        push @children, { name => $1, sex => 'F', spouse => { name => $2, sex => 'M' } }
335                                } elsif($child ne 'Mrs') {
336
1
14
                                        push @children, { name => $child, sex => $sex }
337                                }
338                        }
339                }
340
5
7
                $family{children} = \@children if @children;
341        }
342
343
12
19
        if(!$family{'children'}) {
344
2
5
                if($text =~ /\ssons?[,\s]\s*(.+?)[;\.]/) {
345
0
0
                        my $raw = $1;
346
0
0
                        $raw =~ s/\sand their .+//;
347
0
0
                        my @children = extract_names_from_phrase($raw);
348
0
0
0
0
0
0
                        push @{$family{children}}, map { { name => $_, sex => 'M' } } @children;
349                }
350
2
7
                if($text =~ /\sdaughters?[,\s]\s*(.+?)[;\.]/) {
351
1
1
                        my $raw = $1;
352
1
2
                        $raw =~ s/\sand their .+//;
353
1
3
                        my @children = extract_names_from_phrase($raw);
354
1
1
2
2
2
3
                        push @{$family{children}}, map { { name => $_, sex => 'F' } } @children;
355                }
356        }
357
358        # Extract grandchildren
359
12
17
        if(!$family{'grandchildren'}) {
360
11
26
                if($text =~ /grandchildren\s+([^\.;]+)/i) {
361
3
16
                        my @grandchildren = split /\s*(?:,|and)\s*/i, $1;
362
3
4
                        if(scalar(@grandchildren)) {
363
3
7
8
3
11
13
                                $family{'grandchildren'} = [ map { { 'name' => $_ } } grep { defined $_ && $_ ne '' } @grandchildren ];
364                        }
365                }
366        }
367
12
4
19
9
        if($family{'grandchildren'} && scalar @{$family{grandchildren}}) {
368
4
25
                while((exists $family{'grandchildren'}->[0]) && (length($family{'grandchildren'}->[0]) == 0)) {
369
0
0
0
0
                        shift @{$family{'grandchildren'}};
370                }
371
4
11
                if($family{'grandchildren'}->[0] =~ /brothers/) {
372
0
0
                        if(!exists $family{'brothers'}) {
373
0
0
0
0
                                shift @{$family{'grandchildren'}};
374
0
0
0
0
                                $family{'brothers'} = extract_people_section(join(', ', @{$family{'grandchildren'}}));
375                        }
376
0
0
                        delete $family{grandchildren};
377                }
378        } else {
379
8
8
                delete $family{grandchildren};
380        }
381
12
4
21
6
        if((!defined($family{'grandchildren'})) || (($#{$family{'grandchildren'}}) <= 0)) {
382                # handle devoted Grandma to Tom, Dick, and Harry and loved Mother-in-law to Jack and Jill"
383
9
20
                my ($grandchildren_str) = $text =~ /Grandma to (.*?)(?: and loved|$)/;
384                # Normalize and split into individual names
385
9
8
                my @grandchildren;
386
9
10
                if($grandchildren_str) {
387
1
5
                        @grandchildren = split /,\s*|\s+and\s+/, $grandchildren_str;
388                }
389
9
18
                if(scalar(@grandchildren)) {
390
1
1
                        $family{'grandchildren'} = \@grandchildren;
391                } elsif($text =~ /grandm\w+\s/) {
392
1
2
                        my $t = $text;
393
1
5
                        $t =~ s/.+(grandm\w+\s+.+?\sand\s[\w\.;,]+).+/$1/;
394
1
6
                        $family{grandchildren} = [ split /\s*(?:,|and)\s*/i, ($t =~ /grandm\w+\sto\s+([^\.;]+)/i)[0] || '' ];
395                }
396        }
397
398        # Extract siblings (sisters and brothers) correctly, skipping "her" or "his"
399
12
52
        if($text =~ /predeceased by (his|her) sisters?\s*([^;\.]+);?/i) {
400
2
2
                my $sisters_text = $2;
401
2
4
                $sisters_text =~ s/^,\s+//;
402
2
5
                $family{sisters} = extract_people_section($sisters_text);
403        } else {
404
10
30
                while($text =~ /\bsister[,\s]\s*([A-Z][a-z]+(?:\s+[A-Z][a-z.]+)*)(?:,\s*([A-Z][a-z]+))?/g) {
405
2
2
                        my $name = $1;
406
2
8
                        $family{'sisters'} ||= [];
407
2
2
                        if($name eq 'Mrs') {
408
1
4
                                if($text =~ / sister,\s*Mrs\.\s+([A-Z][a-zA-Z]+\s+[A-Z][a-zA-Z]+)/) {
409
1
1
                                        $name = $1;
410                                } else {
411
0
0
                                        undef $name;
412                                }
413                        }
414
2
18
                        if($name) {
415
2
2
1
32
                                push @{$family{sisters}}, {
416                                        name => $name,
417                                        status => ($text =~ /\bpredeceased by.*?$name/i) ? 'deceased' : 'living',
418                                };
419                        }
420                }
421
422
10
13
                if(!exists($family{'sisters'})) {
423
8
18
                        if($text =~ /\stwo\ssisters,\s*(.*?)\sand\s(.*?)[;:]/s) {
424
1
2
                                my($first, $second) = ($1, $2);
425
1
2
                                foreach my $sister($first, $second) {
426
2
4
                                        if($sister =~ /Mrs\.\s(.+?),\s(.+)/) {
427
2
3
                                                my $name = $1;
428
2
10
                                                my $location = $2;
429
2
3
                                                $location =~ s/,$//;
430
2
4
                                                if($name =~ /(\w+)\s+(\w+)/) {
431
2
2
2
5
                                                        push @{$family{sisters}}, {
432                                                                name => $1,
433                                                                location => $location,
434                                                                sex => 'F',
435                                                                spouse => { 'name' => $2, 'sex' => 'M' }
436                                                        };
437                                                } else {
438
0
0
0
0
                                                        push @{$family{sisters}}, {
439                                                                name => $name,
440                                                                location => $location,
441                                                                sex => 'F',
442                                                        };
443                                                }
444                                        } else {
445
0
0
0
0
                                                push @{$family{sisters}}, {
446                                                        name => $sister,
447                                                        sex => 'F',
448                                                };
449                                        }
450                                }
451                        }
452                }
453        }
454
455
12
55
        if($text =~ /predeceased by (his|her) brothers?\s*([^;\.]+);?/i) {
456
0
0
                my $brothers_text = $2;
457
0
0
                $brothers_text =~ s/^,\s+//;
458
0
0
                $family{brothers} = extract_people_section($brothers_text);
459                # TODO: mark all statuses to deceased
460        } else {
461
12
11
                my @siblings;
462
463
12
24
                while ($text =~ /\bbrother,\s*([A-Z][a-z]+(?:\s+[A-Z][a-z.]+)*)(?:,\s*([A-Z][a-z]+))?/g) {
464
1
3
                        $family{'brothers'} ||= [];
465
1
1
1
30
                        push @{$family{brothers}}, {
466                                name => $1,
467                                status => ($text =~ /\bpredeceased by.*?$1/i) ? 'deceased' : 'living',
468                        };
469                }
470
12
36
                if((!$family{'brothers'}) && (!$family{'sisters'}) && (!$family{'siblings'})) {
471
7
25
                        if($text =~ /sister of ([a-z]+) and ([a-z]+)/i) {
472
1
1
1
2
                                push @{$family{'siblings'}},
473                                        { 'name' => $1 },
474                                        { 'name' => $2 }
475                        }
476                }
477
478
12
14
                if(!exists($family{'brothers'})) {
479
11
25
                        if($text =~ /\sbrothers,\s*(.*?)[;\.]/s) {
480
1
1
                                my $brothers_text = $1;
481
1
2
                                if($brothers_text =~ /, all of (.+)$/) {
482
1
1
                                        my $location = $1;
483
1
3
                                        while($brothers_text =~ /([\w. ]+?),\s/g) {
484
2
2
                                                my $son = $1;
485
2
3
                                                if($son =~ /(\w+)\s+and\s+(\w+)/) {
486
1
1
2
3
                                                        push @{$family{brothers}}, {
487                                                                name => $1,
488                                                                location => $location,
489                                                                sex => 'M',
490                                                        }, {
491                                                                name => $2,
492                                                                location => $location,
493                                                                sex => 'M',
494                                                        };
495
1
1
                                                        last;
496                                                } else {
497
1
1
1
3
                                                        push @{$family{brothers}}, {
498                                                                name => $son,
499                                                                location => $location,
500                                                                sex => 'M',
501                                                        };
502                                                }
503                                        }
504                                } else {
505
0
0
                                        while($brothers_text =~ /([\w. ]+?),\s*([\w. ]+?)(?:\s+and|\z)/g) {
506
0
0
0
0
                                                push @{$family{brothers}}, {
507                                                        name => $1,
508                                                        location => $2,
509                                                        sex => 'M',
510                                                };
511                                        }
512                                }
513                        }
514                }
515        }
516
517        # Detect nieces/nephews
518
12
66
        $family{nieces_nephews} = ($text =~ /as well as several nieces and nephews/i) ? ['several nieces and nephews'] : [];
519
520        # Extract parents and clean the names by removing unnecessary details
521
12
336
        if($text =~ /(son|daughter) of the late\s+(.+?)\s+and\s+(.+?)\./i) {
522
2
3
                my $father = $2;
523
2
4
                my $mother = $3;
524
525                # Remove anything after the first comma in each parent's name
526
2
2
                $father =~ s/,.*//;
527
2
3
                $mother =~ s/,.*//;
528
529
2
7
                if($mother =~ /(.+)\s+\((.+)\)\s+(.+)/) {
530
2
4
                        $mother = "$1 $2";
531                }
532                $family{parents} = {
533
2
7
                        father => { name => $father },
534                        mother => { name => $mother },
535                };
536        } elsif($text =~ /parents were (\w+) and (\w+)/i) {
537                $family{parents} = {
538
1
4
                        father => { name => $1 },
539                        mother => { name => $2 },
540                };
541        }
542
543        # Extract spouse's death year and remove the "(year)" from the name
544
12
329
        if($text =~ /(wife|husband) of the late\s+([\w\s]+)\s+\((\d{4})\)/) {
545
1
2
                my $name = $2;
546
1
1
                my $death_year = $3;
547
548
1
3
                $family{'spouse'} ||= [];
549
550                # Remove the death year part from the spouse's name
551
1
1
                $name =~ s/\s*\(\d{4}\)//;
552
553
1
1
1
2
                push @{$family{'spouse'}}, {
554                        name => $name,
555                        death_year => $death_year
556                }
557        } elsif($text =~ /\bmarried ([^,]+),.*?\b(?:on\s+)?([A-Z][a-z]+ \d{1,2}, \d{4})(?:.*?\b(?:at|in)\s+([^.,]+))?/i) {
558
1
7
                $family{'spouse'} ||= [];
559
560
1
1
1
4
                push @{$family{'spouse'}}, {
561                        name => $1,
562                        married => {
563                                date => $2,
564                                place => $3 // '',
565                        }
566                };
567        } elsif($text =~ /husband (?:to|of) the late\s([\w\s]+)[\s\.]/i) {
568
1
3
                $family{'spouse'} ||= [];
569
570
1
1
1
3
                push @{$family{'spouse'}}, { name => $1, status => 'deceased' }
571        } elsif($text =~ /\b(?:wife|husband) of ([^.,;]+)/i) {
572
2
6
                $family{'spouse'} ||= [];
573
574
2
2
1
5
                push @{$family{'spouse'}}, { name => $1 }
575        } elsif($text =~ /\bsurvived by her husband ([^.,;]+)/i) {
576
1
1
1
5
                push @{$family{'spouse'}}, { name => $1, 'status' => 'living', 'sex' => 'M' }
577        } elsif($text =~ /\bsurvived by his wife[,\s]+([^.,;]+)/i) {
578
2
2
2
7
                push @{$family{'spouse'}}, { name => $1, 'status' => 'living', 'sex' => 'F' }
579        }
580
581        # Ensure spouse location is properly handled
582
12
29
        if(exists $family{spouse} && (ref $family{'spouse'} eq 'HASH') && defined $family{spouse}[0]{location} && $family{spouse}[0]{location} eq 'the late') {
583
0
0
                delete $family{spouse}[0]{location};
584        }
585
586        # Extract the funeral information
587
12
224
        if($text =~ /funeral service.*?at\s+(.+?),?\s+on\s+(.*?),?\s+at\s+(.+?)\./) {
588                $family{funeral} = {
589
1
3
                        location => $1,
590                        date     => $2,
591                        time     => $3,
592                };
593        } elsif($text =~ /funeral service.*?at\s+([^\n]+?)\s+on\s+([^\n]+)\s+at\s+([^\n]+)/i) {
594                $family{funeral} = {
595
1
4
                        location => $1,
596                        date     => $2,
597                        time     => $3,
598                };
599
1
3
                if($family{'funeral'}->{'date'} =~ /(.+?)\.\s{2,}/) {
600
1
2
                        $family{'funeral'}->{'date'} = $1;
601
1
3
                        if($family{'funeral'}->{'date'} =~ /(.+?)\sat\s(.+)/) {
602                                # Wednesday 9th March at 1.15pm.  Friends etc. etc.
603
1
2
                                $family{'funeral'}->{'date'} = $1;
604
1
1
                                $family{'funeral'}->{'time'} = $2;
605                        }
606                }
607        } elsif($text =~ /funeral services.+\sat\s(.+)\sat\s(.+),\swith\s/i) {
608                $family{funeral} = {
609
1
3
                        time     => $1,
610                        location => $2
611                };
612        } elsif($text =~ /funeral services.+\sat\s(.+),\swith\s/i) {
613
0
0
                $family{funeral} = { location => $1 }
614        } elsif($text =~ /services.+\sat\s(.+),\swith\s/i) {
615
3
5
                $family{funeral} = { location => $1 }
616        }
617
618        # Extract father-in-law and mother-in-law information (if present)
619
12
30
        if($text =~ /father-in-law to\s+([A-Za-z\s]+)/) {
620
1
2
                my $father_in_law = $1;
621
1
2
                $family{children_in_law} = [{ name => $father_in_law }];
622        } elsif($text =~ /mother-in-law to\s+([A-Za-z\s]+)/i) {
623
1
1
                my $mother_in_law = $1;
624
1
7
                $family{children_in_law} = [ split /\s*(?:,|and)\s*/i, ($text =~ /mother-in-law to\s+([^\.;]+)/i)[0] || '' ];
625
1
2
                if(scalar($family{children_in_law} == 0)) {
626
0
0
                        $family{children_in_law} = [{ name => $mother_in_law }];
627                }
628        }
629
630        # Extract aunt information
631
12
19
        if($text =~ /niece of\s+([A-Za-z]+)/) {
632
1
1
                my $aunt = $1;
633
1
2
                $family{aunt} = [{ 'name' => $aunt }];
634        }
635
636        # Birth info
637
12
1008
        if($text =~ /[^\b]Born in ([^,]+),.*?\b(?:on\s+)?([A-Z][a-z]+ \d{1,2}, \d{4})/i) {
638                $family{birth} = {
639
1
2
                        place => $1,
640                        date => $2,
641                }
642        } elsif($text =~ /[^\b]Born in ([a-z,\.\s]+)\s+on\s+(.+)/i) {
643
1
2
                $family{'birth'}->{'place'} = $1;
644
1
2
                if(my $location = _extract_location($1)) {
645
0
0
                        $family{'birth'}->{'location'} = $location;
646                }
647
1
11
                if(my $dt = _extract_date($2)) {
648
1
11
                        $family{'birth'}->{date} = $dt->ymd('/');
649                }
650
1
16
                $family{'birth'}->{'place'} =~ s/\s+$//;
651        } elsif($text =~ /S?he was born (.+)\sin ([a-z,\.\s]+)\s+to\s+(.+?)\sand\s(.+?)\./i) {
652
1
1
                $family{'birth'}->{'place'} = $2;
653
1
1
                my $father = $3;
654
1
2
                my $mother = $4;
655
1
1
                eval {
656
1
2
                        if(my $dt = DateTime::Format::Text->parse_datetime($1)) {
657
1
196
                                $family{'birth'}->{date} = $dt->ymd('/');
658                        }
659                };
660                # TODO
661                # if($verbose && $@) {
662                        # Carp::carp($@);
663                # }
664
1
13
                if($mother =~ /(.+)\s+\((.+)\)\s+(.+)/) {
665
1
2
                        $mother = "$1 $2";
666                }
667
1
2
                if($father =~ /(.+?)\.\s\s/) {
668
0
0
                        $father = $1;
669                }
670                $family{parents} = {
671
1
3
                        father => { name => $father },
672                        mother => { name => $mother }
673                };
674
1
5
                if($text =~ /survived by (his|her) (father|mother)[\s,;]/i) {
675
1
2
                        $family{parents}->{$2}->{'status'} = 'living';
676                }
677        } elsif($text =~ /[^\b]S?he was born\s*(?:on\s+)?([A-Z][a-z]+ \d{1,2}, \d{4})[,\s]+(?:in\s+)([^,]+)?/i) {
678
1
3
                if(my $dt = _extract_date($1)) {
679
1
8
                        $family{'birth'}->{date} = $dt->ymd('/');
680                }
681
1
10
                if($2) {
682
1
3
                        $family{'birth'}->{'location'} = $2;
683                }
684        }
685
686        # Date of death
687
12
133
        if($text =~ /\bpassed away\b.*?\b(?:on\s+)?([A-Z]+ \d{1,2}, \d{4})/i) {
688
2
3
                $family{death}->{date} = $1;
689
2
3
                $family{death}->{datetime} = _extract_date($1);
690        }
691
692        # Age at death
693
12
49
        if($text =~ /,\s(\d{1,3}), of\s/) {
694
2
5
                if($1 < 110) {
695
2
4
                        $family{'death'}->{'age'} = $1;
696                }
697        }
698
699        # Place of death
700
12
144
        if($text =~ /\b(?:passed away|died)\b([a-z0-9\s,]+)\sat\s+(.+?)\./i) {
701
3
4
                my $place = $2;
702
3
272
                if($place =~ /(.+)\s+on\s+([A-Z]+ \d{1,2}, \d{4})/i) {
703
0
0
                        $place = $1;
704
0
0
                        $family{death}->{date} = $2;
705                } elsif($place =~ /(.+)\son\s(.+)/) {
706
1
2
                        $place = $1;
707
1
16
                        if(my $dt = _extract_date($2)) {
708
1
17
                                $family{death}->{date} = $dt->ymd('/');
709                        }
710                }
711
3
16
                $place =~ s/^\bthe residence,\s//;
712
3
6
                $place =~ s/\bafter a.*$//;
713
3
3
                $place =~ s/,\s+$//;
714
3
5
                $family{death}->{place} = $place;
715        }
716
717        # Remove blank fields from the main family hash
718
12
67
67
21
65
93
        %family = map { $_ => $family{$_} } grep { defined $family{$_} && $family{$_} ne '' } keys %family;
719
720        # Remove empty arrays the family hash
721
12
22
        foreach my $key (keys %family) {
722
67
64
                if(ref($family{$key}) eq 'ARRAY') {
723
48
72
48
27
81
37
                        $family{$key} = [ grep { /\S/ } @{$family{$key}} ];
724
48
48
36
46
                        if(@{$family{$key}} == 0) {
725
11
13
                                delete $family{$key};
726                        }
727                }
728        }
729
730
12
21
        return if(!scalar keys(%family));
731
732
12
39
        return Return::Set::set_return(\%family, { type => 'hashref', 'min' => 1, 'max' => 10 });
733}
734
735sub _extract_date
736{
737
5
7
        my $text = shift;
738
5
16
        my $parser = DateTime::Format::Text->new();
739
5
19
        my $dt;
740
741
5
5
2
9
        eval { $dt = $parser->parse_datetime($text); };
742
5
1323
        return $dt if $dt && !$@;
743
0
0
        return undef;
744}
745
746sub _extract_location {
747
1
2
        my $place_text = shift;
748
749
1
6
        $geocoder ||= Geo::Coder::List->new()->push(Geo::Coder::Free->new());
750
1
47940
        my @locations = $geocoder->geocode(location => $place_text);      # Use array to improve caching
751
752
1
185
        return unless scalar(@locations);
753
754
0
        my $result = $locations[0];
755
756
0
        if(ref($result)) {
757                return {
758
0
                        raw => $place_text,
759                        # city => $result->{components}{city} || $result->{components}{town},
760                        # region => $result->{components}{state},
761                        # country => $result->{components}{country},
762                        latitude => $result->latitude(),
763                        longitude => $result->longitude()
764                };
765        }
766        return {
767                raw => $place_text,
768                # city => $result->{components}{city} || $result->{components}{town},
769                # region => $result->{components}{state},
770                # country => $result->{components}{country},
771                latitude => $result->{'latitude'},
772
0
                longitude => $result->{'longitude'}
773        };
774}
775
776
777 - 793
=head1 AUTHOR

Nigel Horne, C<< <njh at nigelhorne.com> >>

=head1 SEE ALSO

Test coverage report: L<https://nigelhorne.github.io/Genealogy-Obituary-Parser/coverage/>

=head1 SUPPORT

This module is provided as-is without any warranty.

=head1 LICENSE

This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself.

=cut
794
7951;