File Coverage

File:blib/lib/Genealogy/Obituary/Parser.pm
Coverage:75.6%

linestmtbrancondsubtimecode
1package Genealogy::Obituary::Parser;
2
3
5
5
5
299313
5
67
use strict;
4
5
5
5
5
5
81
use warnings;
5
6
5
5
5
8
4
108
use Carp;
7
5
5
5
938
951494
88
use DateTime::Format::Text;
8
5
5
5
15
5
64
use Exporter 'import';
9
5
5
5
9
4
157
use JSON::MaybeXS;
10
5
5
5
935
22243
107
use Params::Get 0.13;
11
5
5
5
822
15708
94
use Return::Set 0.03;
12
5
5
5
12
4
21181
use Params::Validate::Strict;
13
14our @EXPORT_OK = qw(parse_obituary);
15our $geocoder;
16
17# TODO: use Lingua::EN::Tagger;
18# TODO: add more general code, e.g., where it looks for father, also look for mother
19# TODO: parse https://funeral-notices.co.uk/notice/adams/5244000
20
21 - 29
=head1 NAME

Genealogy::Obituary::Parser - Extract structured family relationships from obituary text

=head1 VERSION

Version 0.04

=cut
30
31our $VERSION = '0.04';
32
33 - 93
=head1 SYNOPSIS

  use Genealogy::Obituary::Parser qw(parse_obituary);

  my $text = 'She is survived by her husband Paul, daughters Anna and Lucy, and grandchildren Jake and Emma.';
  my $data = parse_obituary($text);

  # $data = {
  #   spouse   => ['Paul'],
  #   children => ['Anna', 'Lucy'],
  #   grandchildren => ['Jake', 'Emma'],
  # };

=head1 DESCRIPTION

This module parses free-form obituary text and extracts structured family relationship data
for use in genealogical applications.
It parses obituary text and extracts structured family relationship data,
including details about children, parents, spouse, siblings, grandchildren, and other relatives.

=head1 FUNCTIONS

=head2 parse_obituary($text)

The routine processes the obituary content to identify and organize relevant family information into a clear, structured hash.
It returns a hash reference containing structured family information,
with each family member's data organized into distinct categories such as children, spouse, parents, siblings, etc.

Takes a string, or a ref to a string.

=head3 API SPECIFICATION

=head4 INPUT

  {
    'text' => {
      'type' => 'string',    # or stringref
      'min' => 1,
      'max' => 10000
    }, 'geocoder' => {       # used to geocode locations to verify they exist
      'type' => 'object',
      'can' => 'geocode',
      'optional' => 1,
    }
  }

=head4 OUTPUT

=over 4

=item * No matches: undef

=back

  {
    type => 'hashref',
    'min' => 1,
    'max' => 10
  }

=cut
94
95sub parse_obituary
96{
97
13
399880
        my $params = Params::Validate::Strict::validate_strict({
98                args => Params::Get::get_params('text', \@_),
99                schema => {
100                        'text' => {
101                                'type' => 'string',
102                                'min' => 1,
103                                'max' => 10000
104                        }, 'geocoder' => {
105                                'type' => 'object',
106                                'can' => 'geocode',
107                                'optional' => 1,
108                        }
109                }
110        });
111
112
13
1100
        my $text = $params->{'text'};
113
114
13
18
        if(my $geo = $params->{'geocoder'}) {
115
0
0
                $geocoder = $geo;
116        }
117
118
13
14
        Carp::croak(__PACKAGE__, ': Usage: parse_obituary($text)') unless defined($text);
119
120
13
14
        if(ref($text) eq 'SCALAR') {
121
0
0
0
0
                $text = ${$text};
122        }
123
124
13
132
        $text =~ s/\s+, +/, /g; # Collapse multiple commas
125
126        # Quick scan to get started
127        sub parse_obituary_quick {
128
0
0
                my $text = shift;
129
0
0
                my %data;
130
131
0
0
                my @patterns = (
132                        [ qr/\bdaughters?\s+([^.,;]+)/i, 'children' ],
133                        [ qr/\bsons?\s+([^.,;]+)/i, 'children' ],
134                        [ qr/\bchildren\s+([^.,;]+)/i, 'children' ],
135                        [ qr/\bgrandchildren\s+([^.;]+)/i, 'grandchildren' ],
136                        [ qr/\bwife\s+([^.,;]+)/i, 'spouse' ],
137                        [ qr/\bhusband\s+([^.,;]+)/i, 'spouse' ],
138                        [ qr/\bhis parents were\s+([^.,;]+)/i,'parents' ],
139                        [ qr/\bhis father was\s+([^.,;]+)/i, 'parents' ],
140                        [ qr/\bhis mother was\s+([^.,;]+)/i, 'parents' ],
141                        [ qr/\bsister(?:s)?\s+([^.,;]+)/i, 'siblings' ],
142                        [ qr/\bbrother(?:s)?\s+([^.,;]+)/i, 'siblings' ],
143                        [ qr/\bsiblings\s+([^.,;]+)/i, 'siblings' ],
144                );
145
146
0
0
                for my $p (@patterns) {
147
0
0
                        my ($re, $field) = @$p;
148
0
0
                        while ($text =~ /$re/g) {
149
0
0
                                my $list = $1;
150
0
0
                                next unless $list;
151
152                                # Robust splitting on commas and "and"
153
0
0
0
0
0
0
                                my @names = grep { length } map { s/^\s+|\s+$//gr } split /\s*(?:,|(?:\band\b))\s*/i, $list;
154
0
0
0
0
0
0
                                push @{$data{$field}}, map { { 'name' => $_ } } @names;
155                        }
156                }
157
158
0
0
                return \%data;
159        }
160
161        # my %family = %{parse_obituary_quick($text)};
162
13
8
        my %family;
163
164        # Helper to extract people from a specific section and remove empty entries
165        sub extract_people_section {
166
7
6
                my $section = shift;
167
7
16
                return unless $section;
168
169
7
21
                $section =~ s/\s+and\s+/, /g;   # Ensure "and" is treated as a separator
170
7
20
                $section =~ s/([A-Za-z]+),\s+([A-Z]{2})/$1<<COMMA>>$2/g;
171
7
20
                my @entries = split /\s*,\s*/, $section;
172
173
7
5
                my @people;
174
7
7
                foreach my $entry (@entries) {
175
19
16
                        $entry =~ s/<<COMMA>>/, /g;
176
177
19
18
                        my ($name, $spouse, $location) = ('', '', '');
178
179                        # Match "Ian (Terry) Girvan of Surrey, BC"
180
19
25
                        if ($entry =~ /^(\w+)\s+\(([^)]+)\)\s+(\w+)\s+of\s+(.+)$/) {
181
1
1
1
2
1
3
                                $name = "$1 $3"; $spouse = $2; $location = $4;
182                        }
183                        # Match "Gwen Steeves (Leslie) of Riverview, NB"
184                        elsif ($entry =~ /^(.+?)\s+\(([^)]+)\)\s+of\s+(.+)$/) {
185
1
1
1
2
1
1
                                $name = $1; $spouse = $2; $location = $3;
186                        }
187                        # Match "Carol Girvan of Dartmouth, NS"
188                        elsif ($entry =~ /^(.+?)\s+of\s+(.+)$/) {
189
1
1
1
1
                                $name = $1; $location = $2;
190                        } else {
191                                # Match names only (e.g. for siblings)
192
16
15
                                $name = $entry;
193                        }
194
195
19
13
                        next if !$name; # Skip if name is empty
196
17
16
                        next if($name =~ /^father-in-law\sto\s/);       # Skip follow ons
197
16
15
                        last if($name =~ /^devoted\s/i);
198
15
11
                        last if($name =~ /^loved\s/i);
199
200                        # Create a hash and filter out blank fields
201
15
17
                        my %person = (
202                                name     => $name,
203                                spouse => $spouse,
204                                location => $location,
205                        );
206
207                        # Remove blank fields
208
15
20
45
16
25
74
                        %person = map { $_ => $person{$_} } grep { defined $person{$_} && $person{$_} ne '' } keys %person;
209
210
15
17
                        push @people, \%person;
211                }
212
7
12
                return \@people;
213        }
214
215        sub extract_names_from_phrase {
216
1
1
                my $phrase = $_[0];
217
1
0
                my @names;
218
219
1
1
                $phrase =~ s/[.;]//g;
220
221                # Case: "Christopher, Thomas, and Marsha Cloud"
222
1
2
                if ($phrase =~ /^((?:\w+\s*,\s*)+\w+),?\s*and\s+(\w+)\s+(\w+)$/) {
223
0
0
                        my ($pre, $last_first, $last) = ($1, $2, $3);
224
0
0
                        my @firsts = split(/\s*,\s*/, $pre);
225
0
0
                        push @firsts, $last_first;
226
0
0
0
0
                        push @names, map { "$_ $last" } @firsts;
227
0
0
                        return @names;
228                }
229
230                # Case: "Christopher and Thomas Cloud"
231
1
3
                if ($phrase =~ /^([\w\s]+?)\s+and\s+(\w+)\s+(\w+)$/) {
232
0
0
                        my ($first_part, $second_first, $last) = ($1, $2, $3);
233
0
0
                        my @firsts = split(/\s*,\s*|\s+and\s+/, $first_part);
234
0
0
0
0
                        push @names, map { "$_ $last" } (@firsts, $second_first);
235
0
0
                        return @names;
236                }
237
238                # Fallback: Split by comma or 'and'
239
1
2
                $phrase =~ s/, and grandchildren.+//;   # Handle "Anna and Lucy, and grandchildren Jake and Emma"
240
241                # Handle "Name1, Name2 and Name3" correctly
242
1
3
                if($phrase =~ /(.+?),\s*(\w+)\s+and\s+(\w+)/) {
243
0
0
                        my ($firsts, $second, $third) = ($1, $2, $3);
244
0
0
                        my @firsts = split /\s*,\s*/, $firsts;
245
0
0
                        push @names, @firsts, $second, $third;
246                } else {
247
1
2
                        my @parts = split /\s*(?:,|and)\s*/, $phrase;
248
1
2
1
5
                        push @names, grep { defined($_) && $_ ne '' } @parts;
249                }
250
251
1
1
                return @names;
252        }
253
254        # Correct extraction of children (skipping "his/her")
255
13
206
        if ($text =~ /survived by (his|her) children\s*([^\.;]+)/i) {
256
2
2
                my $children_text = $2;
257
2
3
                $family{children} = extract_people_section($children_text);
258        } elsif ($text =~ /Loving mum to\s*([^\.;]+)/i) {       # Look for the phrase "Loving mum to"
259
1
2
                my $children_text = $1;
260
1
1
                $family{children} = extract_people_section($children_text);
261        } elsif ($text =~ /Loving father of\s*([^\.;]+)/i) {    # Look for the phrase "Loving father of"
262
1
2
                my $children_text = $1;
263
1
2
                $family{children} = extract_people_section($children_text);
264        } elsif($text =~ /mother of\s*([^\.;]+)?,/i) {  # Look for the phrase "mother of"
265
1
1
                my $children_text = $1;
266
1
2
                $children_text =~ s/, grandmother.+//;
267
1
2
                $family{children} = extract_people_section($children_text);
268        } elsif($text =~ /sons,?\s*([a-z]+)\s+and\s+([a-z]+)/i) {
269
3
3
                my @children;
270                my @grandchildren;
271
272
3
9
                push @children, { name => $1, sex => 'M' }, { name => $2, sex => 'M' };
273
3
27
                if($text =~ /\bdaughter,?\s([a-z]+)/i) {
274
1
2
                        push @children, { 'name' => $1, 'sex' => 'F' }
275                }
276
3
27
                if($text =~ /\bgranddaughter,?\s([a-z]+)/i) {
277
1
2
                        push @grandchildren, { 'name' => $1, 'sex' => 'F' };
278                }
279
3
6
                $family{children} = \@children if @children;
280
3
4
                $family{grandchildren} = \@grandchildren if @grandchildren;
281        } elsif($text =~ /Surviving are (?:a )?daughters?,\s*Mrs\.\s+(\w+)\s+\(([^)]+)\)\s+(\w+),\s+([^;]+?);/i) {
282                # Handle "Surviving are a daughter, Mrs. Walter (Ruth Ann) Gerke, Fort Wayne"
283
0
0
                my @children;
284
0
0
                my $spouse_first = $1;
285
0
0
                my $daughter_name = $2;
286
0
0
                my $spouse_last = $3;
287
0
0
                my $location = $4;
288
0
0
                $location =~ s/,\s*$//;
289
290
0
0
                push @children, {
291                        name => $daughter_name,
292                        location => $location,
293                        sex => 'F',
294                        spouse => {
295                                name => "$spouse_first $spouse_last",
296                                sex => 'M'
297                        }
298                };
299
0
0
                $family{children} = \@children;
300        } else {
301
5
4
                my @children;
302
303                # my $tagger = Lingua::EN::Tagger->new(longest_noun_phrase => 0);
304                # my $tagged = $tagger->add_tags($text);
305
306
5
12
                if($text =~ /\ssons,\s*(.*?);/s) {
307
2
4
                        my $sons_text = $1;
308
2
4
                        if($sons_text =~ /, all of (.+)$/) {
309
1
1
                                my $location = $1;
310
1
2
                                while($sons_text =~ /([\w. ]+?),\s/g) {
311
3
3
                                        my $son = $1;
312
3
4
                                        if($son =~ /(\w+)\s+and\s+(\w+)/) {
313
1
2
                                                push @children, {
314                                                        name => $1,
315                                                        location => $location,
316                                                        sex => 'M',
317                                                }, {
318                                                        name => $2,
319                                                        location => $location,
320                                                        sex => 'M',
321                                                };
322
1
2
                                                last;
323                                        } else {
324
2
5
                                                push @children, {
325                                                        name => $son,
326                                                        location => $location,
327                                                        sex => 'M',
328                                                };
329                                        }
330                                }
331                        } else {
332
1
3
                                while($sons_text =~ /([\w. ]+?),\s*([\w. ]+?)(?:\s+and|\z)/g) {
333
2
7
                                        push @children, {
334                                                name => $1,
335                                                location => $2,
336                                                sex => 'M',
337                                        };
338                                }
339                        }
340                }
341
5
22
                if($text =~ /\sdaughters?,\s*Mrs\.\s+(.+?)\s+(\w+),\s+([^;]+)\sand/) {
342
1
3
                        push @children, {
343                                name => $1,
344                                location => $3,
345                                sex => 'F',
346                                spouse => { 'name' => $2, sex => 'M' }
347                        };
348                } elsif($text =~ /one daughter,\s*(.+?),\s*(.+?);/) {
349
1
1
                        my $name = $1;
350
1
1
                        my $location = $2;
351
1
2
                        if($name =~ /(\w+)\s+(\w+)/) {
352
1
2
                                push @children, {
353                                        name => $1,
354                                        location => $location,
355                                        sex => 'F',
356                                        spouse => { name => $2, sex => 'M' }
357                                };
358                        } else {
359
0
0
                                push @children, {
360                                        name => $1,
361                                        location => $location,
362                                        sex => 'F',
363                                };
364                        }
365                }
366
5
9
                $family{children} = \@children if @children;
367
368
5
5
                if(!$family{'children'}) {
369
3
18
                        while($text =~ /\b(son|daughter)s?,\s*([A-Z][a-z]+(?:\s+\([A-Z][a-z]+\))?)\s*(?:and their children ([^.;]+))?/g) {
370
2
3
                                my $sex = $1 eq 'son' ? 'M' : 'F';
371
2
2
                                my $child = $2;
372
2
2
                                my $grandkids = $3;
373
2
7
                                if(my @grandchildren = $grandkids ? split /\s*,\s*|\s+and\s+/, $grandkids : ()) {
374
0
0
                                        push @children, {
375                                                name => $child,
376                                                sex => $sex,
377                                                grandchildren => \@grandchildren,
378                                        };
379                                } elsif(($sex eq 'F') && ($child =~ /(.+)\s+\((.+)\)/)) {
380
1
6
                                        push @children, { name => $1, sex => 'F', spouse => { name => $2, sex => 'M' } }
381                                } elsif($child ne 'Mrs') {
382
1
13
                                        push @children, { name => $child, sex => $sex }
383                                }
384                        }
385                }
386
5
16
                $family{children} = \@children if @children;
387        }
388
389
13
16
        if(!$family{'children'}) {
390
2
3
                if($text =~ /\ssons?[,\s]\s*(.+?)[;\.]/) {
391
0
0
                        my $raw = $1;
392
0
0
                        $raw =~ s/\sand their .+//;
393
0
0
                        my @children = extract_names_from_phrase($raw);
394
0
0
0
0
0
0
                        push @{$family{children}}, map { { name => $_, sex => 'M' } } @children;
395                }
396
2
6
                if($text =~ /\sdaughters?[,\s]\s*(.+?)[;\.]/) {
397
1
1
                        my $raw = $1;
398
1
1
                        $raw =~ s/\sand their .+//;
399
1
1
                        my @children = extract_names_from_phrase($raw);
400
1
1
2
1
1
3
                        push @{$family{children}}, map { { name => $_, sex => 'F' } } @children;
401                }
402        }
403
404        # Extract grandchildren
405
13
16
        if(!$family{'grandchildren'}) {
406
12
19
                if($text =~ /grandchildren\s+([^\.;]+)/i) {
407
3
11
                        my @grandchildren = split /\s*(?:,|and)\s*/i, $1;
408
3
3
                        if(scalar(@grandchildren)) {
409
3
7
8
3
9
15
                                $family{'grandchildren'} = [ map { { 'name' => $_ } } grep { defined $_ && $_ ne '' } @grandchildren ];
410                        }
411                }
412        }
413
13
4
19
5
        if($family{'grandchildren'} && scalar @{$family{grandchildren}}) {
414
4
16
                while((exists $family{'grandchildren'}->[0]) && (length($family{'grandchildren'}->[0]) == 0)) {
415
0
0
0
0
                        shift @{$family{'grandchildren'}};
416                }
417
4
7
                if($family{'grandchildren'}->[0] =~ /brothers/) {
418
0
0
                        if(!exists $family{'brothers'}) {
419
0
0
0
0
                                shift @{$family{'grandchildren'}};
420
0
0
0
0
                                $family{'brothers'} = extract_people_section(join(', ', @{$family{'grandchildren'}}));
421                        }
422
0
0
                        delete $family{grandchildren};
423                }
424        } else {
425
9
7
                delete $family{grandchildren};
426        }
427
13
4
17
6
        if((!defined($family{'grandchildren'})) || (($#{$family{'grandchildren'}}) <= 0)) {
428                # handle devoted Grandma to Tom, Dick, and Harry and loved Mother-in-law to Jack and Jill"
429
10
20
                my ($grandchildren_str) = $text =~ /Grandma to (.*?)(?: and loved|$)/;
430                # Normalize and split into individual names
431
10
6
                my @grandchildren;
432
10
11
                if($grandchildren_str) {
433
1
5
                        @grandchildren = split /,\s*|\s+and\s+/, $grandchildren_str;
434                }
435
10
17
                if(scalar(@grandchildren)) {
436
1
1
                        $family{'grandchildren'} = \@grandchildren;
437                } elsif($text =~ /grandm\w+\s/) {
438
1
1
                        my $t = $text;
439
1
5
                        $t =~ s/.+(grandm\w+\s+.+?\sand\s[\w\.;,]+).+/$1/;
440
1
6
                        $family{grandchildren} = [ split /\s*(?:,|and)\s*/i, ($t =~ /grandm\w+\sto\s+([^\.;]+)/i)[0] || '' ];
441                }
442        }
443
444        # Extract siblings (sisters and brothers) correctly, skipping "her" or "his"
445
13
47
        if($text =~ /predeceased by (his|her) sisters?\s*([^;\.]+);?/i) {
446
2
2
                my $sisters_text = $2;
447
2
2
                $sisters_text =~ s/^,\s+//;
448
2
4
                $family{sisters} = extract_people_section($sisters_text);
449        } else {
450
11
24
                while($text =~ /\bsister[,\s]\s*([A-Z][a-z]+(?:\s+[A-Z][a-z.]+)*)(?:,\s*([A-Z][a-z]+))?/g) {
451
2
3
                        my $name = $1;
452
2
9
                        $family{'sisters'} ||= [];
453
2
3
                        if($name eq 'Mrs') {
454
1
4
                                if($text =~ / sister,\s*Mrs\.\s+([A-Z][a-zA-Z]+\s+[A-Z][a-zA-Z]+)/) {
455
1
1
                                        $name = $1;
456                                } else {
457
0
0
                                        undef $name;
458                                }
459                        }
460
2
3
                        if($name) {
461
2
2
1
31
                                push @{$family{sisters}}, {
462                                        name => $name,
463                                        sex => 'F',
464                                        status => ($text =~ /\bpredeceased by.*?$name/i) ? 'deceased' : 'living',
465                                };
466                        }
467                }
468
469
11
16
                if(!exists($family{'sisters'})) {
470
9
13
                        if($text =~ /\stwo\ssisters,\s*(.*?)\sand\s(.*?)[;:]/s) {
471
1
2
                                my($first, $second) = ($1, $2);
472
1
1
                                foreach my $sister($first, $second) {
473
2
3
                                        if($sister =~ /Mrs\.\s(.+?),\s(.+)/) {
474
2
2
                                                my $name = $1;
475
2
2
                                                my $location = $2;
476
2
2
                                                $location =~ s/,$//;
477
2
3
                                                if($name =~ /(\w+)\s+(\w+)/) {
478
2
2
1
5
                                                        push @{$family{sisters}}, {
479                                                                name => $1,
480                                                                location => $location,
481                                                                sex => 'F',
482                                                                spouse => { 'name' => $2, 'sex' => 'M' }
483                                                        };
484                                                } else {
485
0
0
0
0
                                                        push @{$family{sisters}}, {
486                                                                name => $name,
487                                                                location => $location,
488                                                                sex => 'F',
489                                                        };
490                                                }
491                                        } else {
492
0
0
0
0
                                                push @{$family{sisters}}, {
493                                                        name => $sister,
494                                                        sex => 'F',
495                                                };
496                                        }
497                                }
498                        }
499                }
500
501
11
14
                if($family{'sisters'}) {
502                        # Deduplicate by serializing hashes for comparison
503
3
3
                        my %seen;
504                        my @sisters = grep {
505
4
41
                                my $key = JSON::MaybeXS->new->canonical(1)->encode($_);
506
4
11
                                !$seen{$key}++
507
3
3
3
4
                        } @{$family{sisters}};
508
509
3
6
                        $family{sisters} = \@sisters;
510                }
511        }
512
513
13
53
        if($text =~ /predeceased by (his|her) brothers?\s*([^;\.]+);?/i) {
514
0
0
                my $brothers_text = $2;
515
0
0
                $brothers_text =~ s/^,\s+//;
516
0
0
                $family{brothers} = extract_people_section($brothers_text);
517                # TODO: mark all statuses to deceased
518        } else {
519
13
21
                while ($text =~ /\bbrother,\s*([A-Z][a-z]+(?:\s+[A-Z][a-z.]+)*)(?:,\s*([A-Z][a-z]+))?/g) {
520
1
2
                        $family{'brothers'} ||= [];
521
1
1
1
33
                        push @{$family{brothers}}, {
522                                name => $1,
523                                status => ($text =~ /\bpredeceased by.*?$1/i) ? 'deceased' : 'living',
524                        };
525                }
526
13
34
                if((!$family{'brothers'}) && (!$family{'sisters'}) && (!$family{'siblings'})) {
527
8
24
                        if($text =~ /sister of ([a-z]+) and ([a-z]+)/i) {
528
1
1
1
2
                                push @{$family{'siblings'}},
529                                        { 'name' => $1 },
530                                        { 'name' => $2 }
531                        }
532                }
533
534
13
16
                if(!exists($family{'brothers'})) {
535
12
16
                        if($text =~ /\sbrothers,\s*(.*?)[;\.]/s) {
536
1
1
                                my $brothers_text = $1;
537
1
1
                                if($brothers_text =~ /, all of (.+)$/) {
538
1
1
                                        my $location = $1;
539
1
3
                                        while($brothers_text =~ /([\w. ]+?),\s/g) {
540
2
2
                                                my $son = $1;
541
2
2
                                                if($son =~ /(\w+)\s+and\s+(\w+)/) {
542
1
1
1
2
                                                        push @{$family{brothers}}, {
543                                                                name => $1,
544                                                                location => $location,
545                                                                sex => 'M',
546                                                        }, {
547                                                                name => $2,
548                                                                location => $location,
549                                                                sex => 'M',
550                                                        };
551
1
1
                                                        last;
552                                                } else {
553
1
1
0
3
                                                        push @{$family{brothers}}, {
554                                                                name => $son,
555                                                                location => $location,
556                                                                sex => 'M',
557                                                        };
558                                                }
559                                        }
560                                } else {
561
0
0
                                        while($brothers_text =~ /([\w. ]+?),\s*([\w. ]+?)(?:\s+and|\z)/g) {
562
0
0
0
0
                                                push @{$family{brothers}}, {
563                                                        name => $1,
564                                                        location => $2,
565                                                        sex => 'M',
566                                                };
567                                        }
568                                }
569                        }
570                }
571        }
572
573
13
141
        if(!exists($family{'brothers'}) && $text =~ /\b(?:two|three|four)\s+brothers?,\s*(.+?)(?:,\s*a\s+(?:sister|half-sister)|;)/i) {
574                # Pattern for "two brothers, Name and Name"
575
0
0
                my $brothers_text = $1;
576
0
0
                my @brothers;
577
578                # Handle "Charles F. Harris and Berton Harris"
579
0
0
                if($brothers_text =~ /\band\b/) {
580
0
0
                        my @names = split /\s+and\s+/, $brothers_text;
581
0
0
                        foreach my $name (@names) {
582
0
0
                                $name =~ s/^\s+|\s+$//g;
583
0
0
                                $name =~ s/,\s*$//;
584
0
0
                                push @brothers, {
585                                        name => $name,
586                                        sex => 'M',
587                                        status => 'living'
588                                };
589                        }
590                }
591
0
0
                $family{brothers} = \@brothers if(scalar @brothers);
592        }
593
594        # Detect nieces/nephews
595
13
67
        $family{nieces_nephews} = ($text =~ /as well as several nieces and nephews/i) ? ['several nieces and nephews'] : [];
596
597        # Extract parents and clean the names by removing unnecessary details
598
13
339
        if($text =~ /(son|daughter) of the late\s+(.+?)\s+and\s+(.+?)\./i) {
599
2
3
                my $father = $2;
600
2
1
                my $mother = $3;
601
602                # Remove anything after the first comma in each parent's name
603
2
2
                $father =~ s/,.*//;
604
2
2
                $mother =~ s/,.*//;
605
606
2
6
                if($mother =~ /(.+)\s+\((.+)\)\s+(.+)/) {
607
2
3
                        $mother = "$1 $2";
608                }
609                $family{parents} = {
610
2
4
                        father => { name => $father },
611                        mother => { name => $mother },
612                };
613        } elsif($text =~ /parents were (\w+) and (\w+)/i) {
614                $family{parents} = {
615
1
3
                        father => { name => $1 },
616                        mother => { name => $2 },
617                };
618        }
619
620        # Extract spouse's death year and remove the "(year)" from the name
621
13
301
        if($text =~ /(wife|husband) of the late\s+([\w\s]+)\s+\((\d{4})\)/) {
622
1
1
                my $name = $2;
623
1
1
                my $death_year = $3;
624
625
1
10
                $family{'spouse'} ||= [];
626
627                # Remove the death year part from the spouse's name
628
1
1
                $name =~ s/\s*\(\d{4}\)//;
629
630
1
1
0
2
                push @{$family{'spouse'}}, {
631                        name => $name,
632                        death_year => $death_year
633                }
634        } elsif($text =~ /\bmarried ([^,]+),.*?\b(?:on\s+)?([A-Z][a-z]+ \d{1,2}, \d{4})(?:.*?\b(?:at|in)\s+([^.,]+))?/i) {
635
1
3
                $family{'spouse'} ||= [];
636
637
1
2
                my($name, $date, $place) = ($1, $2, $3);
638
1
1
                $name =~ s/\s+on\s.+$//;
639
640
1
1
1
3
                push @{$family{'spouse'}}, {
641                        name => $name,
642                        married => {
643                                date => $date,
644                                place => $place // '',
645                        }
646                };
647        } elsif($text =~ /husband (?:to|of) the late\s([\w\s]+)[\s\.]/i) {
648
1
3
                $family{'spouse'} ||= [];
649
650
1
1
0
3
                push @{$family{'spouse'}}, { name => $1, status => 'deceased' }
651        } elsif($text =~ /\b(?:wife|husband) of ([^.,;]+)/i) {
652
2
5
                $family{'spouse'} ||= [];
653
654
2
2
1
15
                push @{$family{'spouse'}}, { name => $1 }
655        } elsif($text =~ /\bsurvived by her husband ([^.,;]+)/i) {
656
1
1
1
2
                push @{$family{'spouse'}}, { name => $1, 'status' => 'living', 'sex' => 'M' }
657        } elsif($text =~ /\bsurvived by his wife[,\s]+([^.,;]+)/i) {
658
2
2
2
5
                push @{$family{'spouse'}}, { name => $1, 'status' => 'living', 'sex' => 'F' }
659        }
660
661        # Ensure spouse location is properly handled
662
13
36
        if(exists $family{spouse} && (ref $family{'spouse'} eq 'HASH') && defined $family{spouse}[0]{location} && $family{spouse}[0]{location} eq 'the late') {
663
0
0
                delete $family{spouse}[0]{location};
664        }
665
666        # Extract the funeral information
667
13
215
        if($text =~ /funeral service.*?at\s+(.+?),?\s+on\s+(.*?),?\s+at\s+(.+?)\./) {
668                $family{funeral} = {
669
1
3
                        location => $1,
670                        date     => $2,
671                        time     => $3,
672                };
673        } elsif($text =~ /funeral service.*?at\s+([^\n]+?)\s+on\s+([^\n]+)\s+at\s+([^\n]+)/i) {
674                $family{funeral} = {
675
1
3
                        location => $1,
676                        date     => $2,
677                        time     => $3,
678                };
679
1
3
                if($family{'funeral'}->{'date'} =~ /(.+?)\.\s{2,}/) {
680
1
1
                        $family{'funeral'}->{'date'} = $1;
681
1
2
                        if($family{'funeral'}->{'date'} =~ /(.+?)\sat\s(.+)/) {
682                                # Wednesday 9th March at 1.15pm.  Friends etc. etc.
683
1
1
                                $family{'funeral'}->{'date'} = $1;
684
1
2
                                $family{'funeral'}->{'time'} = $2;
685                        }
686                }
687        } elsif($text =~ /funeral services.+\sat\s(.+)\sat\s(.+),\swith\s/i) {
688                $family{funeral} = {
689
1
3
                        time     => $1,
690                        location => $2
691                };
692        } elsif($text =~ /funeral services.+\sat\s(.+),\swith\s/i) {
693
0
0
                $family{funeral} = { location => $1 }
694        } elsif($text =~ /services.+\sat\s(.+),\swith\s/i) {
695
3
6
                $family{funeral} = { location => $1 }
696        }
697
698        # Extract father-in-law and mother-in-law information (if present)
699
13
25
        if($text =~ /father-in-law to\s+([A-Za-z\s]+)/) {
700
1
1
                my $father_in_law = $1;
701
1
2
                $family{children_in_law} = [{ name => $father_in_law }];
702        } elsif($text =~ /mother-in-law to\s+([A-Za-z\s]+)/i) {
703
1
1
                my $mother_in_law = $1;
704
1
6
                $family{children_in_law} = [ split /\s*(?:,|and)\s*/i, ($text =~ /mother-in-law to\s+([^\.;]+)/i)[0] || '' ];
705
1
3
                if(scalar($family{children_in_law} == 0)) {
706
0
0
                        $family{children_in_law} = [{ name => $mother_in_law }];
707                }
708        }
709
710        # Extract aunt information
711
13
27
        if($text =~ /niece of\s+([A-Za-z]+)/) {
712
1
1
                my $aunt = $1;
713
1
2
                $family{aunt} = [{ 'name' => $aunt }];
714        }
715
716        # Birth info
717
13
1024
        if($text =~ /[^\b]Born in ([^,]+),.*?\b(?:on\s+)?([A-Z][a-z]+ \d{1,2}, \d{4})/i) {
718                $family{birth} = {
719
1
2
                        place => $1,
720                        date => $2,
721                }
722        } elsif($text =~ /[^\b]Born in ([a-z,\.\s]+)\s+on\s+(.+)/i) {
723
1
7
                $family{'birth'}->{'place'} = $1;
724
1
2
                if(my $location = _extract_location($1)) {
725
0
0
                        $family{'birth'}->{'location'} = $location;
726                }
727
1
2
                if(my $dt = _extract_date($2)) {
728
1
13
                        $family{'birth'}->{date} = $dt->ymd('/');
729                }
730
1
18
                $family{'birth'}->{'place'} =~ s/\s+$//;
731        } elsif($text =~ /S?he was born (.+)\sin ([a-z,\.\s]+)\s+to\s+(.+?)\sand\s(.+?)\./i) {
732
1
2
                $family{'birth'}->{'place'} = $2;
733
1
1
                my $father = $3;
734
1
1
                my $mother = $4;
735
1
1
                eval {
736
1
2
                        if(my $dt = DateTime::Format::Text->parse_datetime($1)) {
737
1
172
                                $family{'birth'}->{date} = $dt->ymd('/');
738                        }
739                };
740                # TODO
741                # if($verbose && $@) {
742                        # Carp::carp($@);
743                # }
744
1
11
                if($mother =~ /(.+)\s+\((.+)\)\s+(.+)/) {
745
1
2
                        $mother = "$1 $2";
746                }
747
1
2
                if($father =~ /(.+?)\.\s\s/) {
748
0
0
                        $father = $1;
749                }
750                $family{parents} = {
751
1
2
                        father => { name => $father },
752                        mother => { name => $mother }
753                };
754
1
6
                if($text =~ /survived by (his|her) (father|mother)[\s,;]/i) {
755
1
2
                        $family{parents}->{$2}->{'status'} = 'living';
756                }
757        } elsif($text =~ /[^\b]S?he was born\s*(?:on\s+)?([A-Z][a-z]+ \d{1,2}, \d{4})[,\s]+(?:in\s+)([^,]+)?/i) {
758
1
2
                if(my $dt = _extract_date($1)) {
759
1
7
                        $family{'birth'}->{date} = $dt->ymd('/');
760                }
761
1
8
                if($2) {
762
1
2
                        $family{'birth'}->{'location'} = $2;
763                }
764        }
765
766        # Date of death
767
13
139
        if($text =~ /\bpassed away\b.*?\b(?:on\s+)?([A-Z]+ \d{1,2}, \d{4})/i) {
768
2
2
                $family{death}->{date} = $1;
769
2
3
                if(my $dt = _extract_date($1)) {
770
2
11
                        $family{death}->{datetime} = $dt;
771                }
772        }
773
774        # Age at death
775
13
21
        if($text =~ /,\s(\d{1,3}), of\s/) {
776
2
8
                if($1 < 110) {
777
2
3
                        $family{'death'}->{'age'} = $1;
778                }
779        }
780
781        # Place of death
782
13
163
        if($text =~ /\b(?:passed away|died)\b([a-z0-9\s,]+)\sat\s+(.+?)\./i) {
783
3
4
                my $place = $2;
784
3
284
                if($place =~ /(.+)\s+on\s+([A-Z]+ \d{1,2}, \d{4})/i) {
785
0
0
                        $place = $1;
786
0
0
                        $family{death}->{date} = $2;
787                } elsif($place =~ /(.+)\son\s(.+)/) {
788
1
1
                        $place = $1;
789
1
2
                        if(my $dt = _extract_date($2)) {
790
1
7
                                $family{death}->{date} = $dt->ymd('/');
791                        }
792                }
793
3
12
                $place =~ s/^\bthe residence,\s//;
794
3
4
                $place =~ s/\bafter a.*$//;
795
3
3
                $place =~ s/,\s+$//;
796
3
4
                $family{death}->{place} = $place;
797        }
798
799        # Remove blank fields from the main family hash
800
13
69
69
20
72
128
        %family = map { $_ => $family{$_} } grep { defined $family{$_} && $family{$_} ne '' } keys %family;
801
802        # Remove empty arrays the family hash
803
13
19
        foreach my $key (keys %family) {
804
69
69
                if(ref($family{$key}) eq 'ARRAY') {
805
50
75
50
32
88
39
                        $family{$key} = [ grep { /\S/ } @{$family{$key}} ];
806
50
50
36
57
                        if(@{$family{$key}} == 0) {
807
12
12
                                delete $family{$key};
808                        }
809                }
810        }
811
812
13
17
        return if(!scalar keys(%family));
813
814
13
28
        return Return::Set::set_return(\%family, { type => 'hashref', 'min' => 1, 'max' => 10 });
815}
816
817sub _extract_date
818{
819
5
5
        my $text = shift;
820
5
10
        my $parser = DateTime::Format::Text->new();
821
5
19
        my $dt;
822
823
5
5
6
8
        eval { $dt = $parser->parse_datetime($text); };
824
5
1218
        return $dt if $dt && !$@;
825
0
0
        return undef;
826}
827
828sub _extract_location {
829
1
3
        my $place_text = shift;
830
831
1
3
        unless($geocoder) {
832
1
1
1
85
                eval { require Geo::Coder::Free };
833
1
335
                if($@) {
834
1
13
                        Carp::carp(__PACKAGE__, ' (', __LINE__, "): geocoding locations disabled: $@");
835
1
559
                        return;
836                }
837
0
                $geocoder = Geo::Coder::Free->new();
838        }
839
840
0
        my @locations = $geocoder->geocode(location => $place_text);      # Use array to improve caching
841
842
0
        return unless scalar(@locations);
843
844
0
        my $result = $locations[0];
845
846
0
        if(ref($result)) {
847                return {
848
0
                        raw => $place_text,
849                        # city => $result->{components}{city} || $result->{components}{town},
850                        # region => $result->{components}{state},
851                        # country => $result->{components}{country},
852                        latitude => $result->latitude(),
853                        longitude => $result->longitude()
854                };
855        }
856        return {
857                raw => $place_text,
858                # city => $result->{components}{city} || $result->{components}{town},
859                # region => $result->{components}{state},
860                # country => $result->{components}{country},
861                latitude => $result->{'latitude'},
862
0
                longitude => $result->{'longitude'}
863        };
864}
865
866
867 - 897
=head1 AUTHOR

Nigel Horne, C<< <njh at nigelhorne.com> >>

=head1 SEE ALSO

Test coverage report: L<https://nigelhorne.github.io/Genealogy-Obituary-Parser/coverage/>

=head1 SUPPORT

This module is provided as-is without any warranty.

=head1 LICENCE AND COPYRIGHT

Copyright 2025 Nigel Horne.

Usage is subject to licence terms.

The licence terms of this software are as follows:

=over 4

=item * Personal single user, single computer use: GPL2

=item * All other users (including Commercial, Charity, Educational, Government)
  must apply in writing for a licence for use from Nigel Horne at the
  above e-mail.

=back

=cut
898
8991;