File Coverage

File:blib/lib/Genealogy/Obituary/Parser.pm
Coverage:78.1%

linestmtbrancondsubtimecode
1package Genealogy::Obituary::Parser;
2
3
10
10
10
648431
9
146
use strict;
4
10
10
10
19
4
228
use warnings;
5
6
10
10
10
17
6
214
use Carp;
7
10
10
10
1936
1994620
210
use DateTime::Format::Text;
8
10
10
10
44
10
171
use Exporter 'import';
9
10
10
10
48
9
371
use JSON::MaybeXS;
10
10
10
10
2493
46103
251
use Params::Get 0.13;
11
10
10
10
1834
152444
228
use Return::Set 0.03;
12
10
10
10
61
9
44753
use Params::Validate::Strict;
13
14our @EXPORT_OK = qw(parse_obituary);
15our $geocoder;
16
17# TODO: use Lingua::EN::Tagger;
18# TODO: add more general code, e.g., where it looks for father, also look for mother
19# TODO: parse https://funeral-notices.co.uk/notice/adams/5244000
20
21 - 29
=head1 NAME

Genealogy::Obituary::Parser - Extract structured family relationships from obituary text

=head1 VERSION

Version 0.04

=cut
30
31our $VERSION = '0.04';
32
33 - 93
=head1 SYNOPSIS

  use Genealogy::Obituary::Parser qw(parse_obituary);

  my $text = 'She is survived by her husband Paul, daughters Anna and Lucy, and grandchildren Jake and Emma.';
  my $data = parse_obituary($text);

  # $data = {
  #   spouse   => ['Paul'],
  #   children => ['Anna', 'Lucy'],
  #   grandchildren => ['Jake', 'Emma'],
  # };

=head1 DESCRIPTION

This module parses free-form obituary text and extracts structured family relationship data
for use in genealogical applications.
It parses obituary text and extracts structured family relationship data,
including details about children, parents, spouse, siblings, grandchildren, and other relatives.

=head1 FUNCTIONS

=head2 parse_obituary($text)

The routine processes the obituary content to identify and organize relevant family information into a clear, structured hash.
It returns a hash reference containing structured family information,
with each family member's data organized into distinct categories such as children, spouse, parents, siblings, etc.

Takes a string, or a ref to a string.

=head3 API SPECIFICATION

=head4 INPUT

  {
    'text' => {
      'type' => 'string',    # or stringref
      'min' => 1,
      'max' => 10000
    }, 'geocoder' => {       # used to geocode locations to verify they exist
      'type' => 'object',
      'can' => 'geocode',
      'optional' => 1,
    }
  }

=head4 OUTPUT

=over 4

=item * No matches: undef

=back

  {
    type => 'hashref',
    'min' => 1,
    'max' => 10
  }

=cut
94
95sub parse_obituary
96{
97
22
853766
        my $params = Params::Validate::Strict::validate_strict({
98                args => Params::Get::get_params('text', \@_),
99                schema => {
100                        'text' => {
101                                'type' => 'string',
102                                'min' => 1,
103                                'max' => 10000
104                        }, 'geocoder' => {
105                                'type' => 'object',
106                                'can' => 'geocode',
107                                'optional' => 1,
108                        }
109                }
110        });
111
112
22
2818
        my $text = $params->{'text'};
113
114
22
51
        if(my $geo = $params->{'geocoder'}) {
115
0
0
                $geocoder = $geo;
116        }
117
118
22
38
        Carp::croak(__PACKAGE__, ': Usage: parse_obituary($text)') unless defined($text);
119
120
22
38
        if(ref($text) eq 'SCALAR') {
121
0
0
0
0
                $text = ${$text};
122        }
123
124
22
232
        $text =~ s/\s+, +/, /g; # Collapse multiple commas
125
126        # Quick scan to get started
127        sub parse_obituary_quick {
128
0
0
                my $text = shift;
129
0
0
                my %data;
130
131
0
0
                my @patterns = (
132                        [ qr/\bdaughters?\s+([^.,;]+)/i, 'children' ],
133                        [ qr/\bsons?\s+([^.,;]+)/i, 'children' ],
134                        [ qr/\bchildren\s+([^.,;]+)/i, 'children' ],
135                        [ qr/\bgrandchildren\s+([^.;]+)/i, 'grandchildren' ],
136                        [ qr/\bwife\s+([^.,;]+)/i, 'spouse' ],
137                        [ qr/\bhusband\s+([^.,;]+)/i, 'spouse' ],
138                        [ qr/\bhis parents were\s+([^.,;]+)/i,'parents' ],
139                        [ qr/\bhis father was\s+([^.,;]+)/i, 'parents' ],
140                        [ qr/\bhis mother was\s+([^.,;]+)/i, 'parents' ],
141                        [ qr/\bsister(?:s)?\s+([^.,;]+)/i, 'siblings' ],
142                        [ qr/\bbrother(?:s)?\s+([^.,;]+)/i, 'siblings' ],
143                        [ qr/\bsiblings\s+([^.,;]+)/i, 'siblings' ],
144                );
145
146
0
0
                for my $p (@patterns) {
147
0
0
                        my ($re, $field) = @$p;
148
0
0
                        while ($text =~ /$re/g) {
149
0
0
                                my $list = $1;
150
0
0
                                next unless $list;
151
152                                # Robust splitting on commas and "and"
153
0
0
0
0
0
0
                                my @names = grep { length } map { s/^\s+|\s+$//gr } split /\s*(?:,|(?:\band\b))\s*/i, $list;
154
0
0
0
0
0
0
                                push @{$data{$field}}, map { { 'name' => $_ } } @names;
155                        }
156                }
157
158
0
0
                return \%data;
159        }
160
161        # my %family = %{parse_obituary_quick($text)};
162
22
25
        my %family;
163
164        # Helper to extract people from a specific section and remove empty entries
165        sub extract_people_section {
166
7
13
                my $section = shift;
167
7
14
                return unless $section;
168
169
7
30
                $section =~ s/\s+and\s+/, /g;   # Ensure "and" is treated as a separator
170
7
27
                $section =~ s/([A-Za-z]+),\s+([A-Z]{2})/$1<<COMMA>>$2/g;
171
7
26
                my @entries = split /\s*,\s*/, $section;
172
173
7
4
                my @people;
174
7
9
                foreach my $entry (@entries) {
175
19
16
                        $entry =~ s/<<COMMA>>/, /g;
176
177
19
21
                        my ($name, $spouse, $location) = ('', '', '');
178
179                        # Match "Ian (Terry) Girvan of Surrey, BC"
180
19
42
                        if ($entry =~ /^(\w+)\s+\(([^)]+)\)\s+(\w+)\s+of\s+(.+)$/) {
181
1
1
1
2
1
4
                                $name = "$1 $3"; $spouse = $2; $location = $4;
182                        }
183                        # Match "Gwen Steeves (Leslie) of Riverview, NB"
184                        elsif ($entry =~ /^(.+?)\s+\(([^)]+)\)\s+of\s+(.+)$/) {
185
1
1
1
2
1
2
                                $name = $1; $spouse = $2; $location = $3;
186                        }
187                        # Match "Carol Girvan of Dartmouth, NS"
188                        elsif ($entry =~ /^(.+?)\s+of\s+(.+)$/) {
189
1
1
1
1
                                $name = $1; $location = $2;
190                        } else {
191                                # Match names only (e.g. for siblings)
192
16
12
                                $name = $entry;
193                        }
194
195
19
19
                        next if !$name; # Skip if name is empty
196
17
23
                        next if($name =~ /^father-in-law\sto\s/);       # Skip follow ons
197
16
16
                        last if($name =~ /^devoted\s/i);
198
15
16
                        last if($name =~ /^loved\s/i);
199
200                        # Create a hash and filter out blank fields
201
202
15
15
                        my %person = ( name => $name );
203
204
15
30
                        $person{spouse} = $spouse if defined $spouse && $spouse ne '';
205
15
20
                        $person{location} = $location if defined $location && $location ne '';
206
207
15
18
                        push @people, \%person;
208                }
209
7
15
                return \@people;
210        }
211
212        sub extract_names_from_phrase {
213
3
3
                my $phrase = $_[0];
214
3
2
                my @names;
215
216
3
5
                $phrase =~ s/[.;]//g;
217
218                # Case: "Christopher, Thomas, and Marsha Cloud"
219
3
7
                if ($phrase =~ /^((?:\w+\s*,\s*)+\w+),?\s*and\s+(\w+)\s+(\w+)$/) {
220
0
0
                        my ($pre, $last_first, $last) = ($1, $2, $3);
221
0
0
                        my @firsts = split(/\s*,\s*/, $pre);
222
0
0
                        push @firsts, $last_first;
223
0
0
0
0
                        push @names, map { "$_ $last" } @firsts;
224
0
0
                        return @names;
225                }
226
227                # Case: "Christopher and Thomas Cloud"
228
3
11
                if ($phrase =~ /^([\w\s]+?)\s+and\s+(\w+)\s+(\w+)$/) {
229
0
0
                        my ($first_part, $second_first, $last) = ($1, $2, $3);
230
0
0
                        my @firsts = split(/\s*,\s*|\s+and\s+/, $first_part);
231
0
0
0
0
                        push @names, map { "$_ $last" } (@firsts, $second_first);
232
0
0
                        return @names;
233                }
234
235                # Fallback: Split by comma or 'and'
236
3
5
                $phrase =~ s/, and grandchildren.+//;   # Handle "Anna and Lucy, and grandchildren Jake and Emma"
237
238                # Handle "Name1, Name2 and Name3" correctly
239
3
12
                if($phrase =~ /(.+?),\s*(\w+)\s+and\s+(\w+)/) {
240
0
0
                        my ($firsts, $second, $third) = ($1, $2, $3);
241
0
0
                        my @firsts = split /\s*,\s*/, $firsts;
242
0
0
                        push @names, @firsts, $second, $third;
243                } else {
244
3
14
                        my @parts = split /\s*(?:,|and)\s*/, $phrase;
245
3
5
9
14
                        push @names, grep { defined($_) && $_ ne '' } @parts;
246                }
247
248
3
5
                return @names;
249        }
250
251        # Correct extraction of children (skipping "his/her")
252
22
453
        if ($text =~ /survived by (his|her) children\s*([^\.;]+)/i) {
253
2
4
                my $children_text = $2;
254
2
5
                $family{children} = extract_people_section($children_text);
255        } elsif ($text =~ /Loving mum to\s*([^\.;]+)/i) {       # Look for the phrase "Loving mum to"
256
1
2
                my $children_text = $1;
257
1
2
                $family{children} = extract_people_section($children_text);
258        } elsif ($text =~ /Loving father of\s*([^\.;]+)/i) {    # Look for the phrase "Loving father of"
259
1
3
                my $children_text = $1;
260
1
3
                $family{children} = extract_people_section($children_text);
261        } elsif($text =~ /mother of\s*([^\.;]+)?,/i) {  # Look for the phrase "mother of"
262
1
2
                my $children_text = $1;
263
1
4
                $children_text =~ s/, grandmother.+//;
264
1
2
                $family{children} = extract_people_section($children_text);
265        } elsif($text =~ /sons,?\s*([a-z]+)\s+and\s+([a-z]+)/i) {
266
3
5
                my @children;
267                my @grandchildren;
268
269
3
14
                push @children, { name => $1, sex => 'M' }, { name => $2, sex => 'M' };
270
3
32
                if($text =~ /\bdaughter,?\s([a-z]+)/i) {
271
1
3
                        push @children, { 'name' => $1, 'sex' => 'F' }
272                }
273
3
43
                if($text =~ /\bgranddaughter,?\s([a-z]+)/i) {
274
1
3
                        push @grandchildren, { 'name' => $1, 'sex' => 'F' };
275                }
276
3
8
                $family{children} = \@children if @children;
277
3
6
                if(@grandchildren) {
278
1
2
                        @grandchildren = sort @grandchildren;
279
1
2
                        $family{grandchildren} = \@grandchildren;
280                }
281        } elsif($text =~ /Surviving are (?:a )?daughters?,\s*Mrs\.\s+(\w+)\s+\(([^)]+)\)\s+(\w+),\s+([^;]+?);/i) {
282                # Handle "Surviving are a daughter, Mrs. Walter (Ruth Ann) Gerke, Fort Wayne"
283
0
0
                my @children;
284
0
0
                my $spouse_first = $1;
285
0
0
                my $daughter_name = $2;
286
0
0
                my $spouse_last = $3;
287
0
0
                my $location = $4;
288
0
0
                $location =~ s/,\s*$//;
289
290
0
0
                push @children, {
291                        name => $daughter_name,
292                        location => $location,
293                        sex => 'F',
294                        spouse => {
295                                name => "$spouse_first $spouse_last",
296                                sex => 'M'
297                        }
298                };
299
0
0
                $family{children} = \@children;
300        } else {
301
14
14
                my @children;
302
303                # my $tagger = Lingua::EN::Tagger->new(longest_noun_phrase => 0);
304                # my $tagged = $tagger->add_tags($text);
305
306
14
45
                if($text =~ /\ssons,\s*(.*?);/s) {
307
8
26
                        my $sons_text = $1;
308
8
19
                        if($sons_text =~ /, all of (.+)$/) {
309
6
8
                                my $location = $1;
310
6
22
                                while($sons_text =~ /([\w. ]+?),\s/g) {
311
18
21
                                        my $son = $1;
312
18
19
                                        if($son =~ /(\w+)\s+and\s+(\w+)/) {
313
6
20
                                                push @children, {
314                                                        name => $1,
315                                                        location => $location,
316                                                        sex => 'M',
317                                                }, {
318                                                        name => $2,
319                                                        location => $location,
320                                                        sex => 'M',
321                                                };
322
6
7
                                                last;
323                                        } else {
324
12
32
                                                push @children, {
325                                                        name => $son,
326                                                        location => $location,
327                                                        sex => 'M',
328                                                };
329                                        }
330                                }
331                        } else {
332
2
15
                                while($sons_text =~ /([\w. ]+?),\s*([\w. ]+?)(?:\s+and|\z)/g) {
333
3
12
                                        push @children, {
334                                                name => $1,
335                                                location => $2,
336                                                sex => 'M',
337                                        };
338                                }
339                        }
340                }
341
14
103
                if($text =~ /\sdaughters?,\s*Mrs\.\s+(.+?)\s+(\w+),\s+([^;]+)\sand/) {
342
1
7
                        push @children, {
343                                name => $1,
344                                location => $3,
345                                sex => 'F',
346                                spouse => { 'name' => $2, sex => 'M' }
347                        };
348                } elsif($text =~ /one daughter,\s*(.+?),\s*(.+?);/) {
349
7
10
                        my $name = $1;
350
7
5
                        my $location = $2;
351
7
18
                        if($name =~ /(\w+)\s+(\w+)/) {
352
7
20
                                push @children, {
353                                        name => $1,
354                                        location => $location,
355                                        sex => 'F',
356                                        spouse => { name => $2, sex => 'M' }
357                                };
358                        } else {
359
0
0
                                push @children, {
360                                        name => $1,
361                                        location => $location,
362                                        sex => 'F',
363                                };
364                        }
365                }
366
14
29
                $family{children} = \@children if @children;
367
368
14
29
                if(!$family{'children'}) {
369
6
55
                        while($text =~ /\b(son|daughter)s?,\s*([A-Z][a-z]+(?:\s+\([A-Z][a-z]+\))?)\s*(?:and their children ([^.;]+))?/g) {
370
2
3
                                my $sex = $1 eq 'son' ? 'M' : 'F';
371
2
3
                                my $child = $2;
372
2
2
                                my $grandkids = $3;
373
2
10
                                if(my @grandchildren = $grandkids ? split /\s*,\s*|\s+and\s+/, $grandkids : ()) {
374
0
0
                                        push @children, {
375                                                name => $child,
376                                                sex => $sex,
377                                                grandchildren => \@grandchildren,
378                                        };
379                                } elsif(($sex eq 'F') && ($child =~ /(.+)\s+\((.+)\)/)) {
380
1
7
                                        push @children, { name => $1, sex => 'F', spouse => { name => $2, sex => 'M' } }
381                                } elsif($child ne 'Mrs') {
382
1
14
                                        push @children, { name => $child, sex => $sex }
383                                }
384                        }
385                }
386
14
24
                $family{children} = \@children if @children;
387        }
388
389
22
61
        if(!$family{'children'}) {
390
5
51
                if($text =~ /\ssons?[,\s]\s*(.+?)[;\.]/) {
391
1
2
                        my $raw = $1;
392
1
2
                        $raw =~ s/\sand their .+//;
393
1
2
                        my @children = extract_names_from_phrase($raw);
394
1
1
2
1
3
3
                        push @{$family{children}}, map { { name => $_, sex => 'M' } } @children;
395                }
396
5
18
                if($text =~ /\sdaughters?[,\s]\s*(.+?)[;\.]/) {
397
2
2
                        my $raw = $1;
398
2
3
                        $raw =~ s/\sand their .+//;
399
2
5
                        my @children = extract_names_from_phrase($raw);
400
2
2
3
2
5
8
                        push @{$family{children}}, map { { name => $_, sex => 'F' } } @children;
401                }
402        }
403
404        # Extract grandchildren
405
22
35
        if(!$family{'grandchildren'}) {
406
21
51
                if($text =~ /grandchildren\s+([^\.;]+)/i) {
407
3
15
                        my @grandchildren = split /\s*(?:,|and)\s*/i, $1;
408
3
21
                        if(scalar(@grandchildren)) {
409
3
9
                                @grandchildren = sort @grandchildren;
410
3
7
8
5
12
13
                                $family{'grandchildren'} = [ map { { 'name' => $_ } } grep { defined $_ && $_ ne '' } @grandchildren ];
411                        }
412                }
413        }
414
22
4
49
7
        if($family{'grandchildren'} && scalar @{$family{grandchildren}}) {
415
4
26
                while((exists $family{'grandchildren'}->[0]) && (length($family{'grandchildren'}->[0]) == 0)) {
416
0
0
0
0
                        shift @{$family{'grandchildren'}};
417                }
418
4
13
                if($family{'grandchildren'}->[0] =~ /brothers/) {
419
0
0
                        if(!exists $family{'brothers'}) {
420
0
0
0
0
                                shift @{$family{'grandchildren'}};
421
0
0
0
0
                                $family{'brothers'} = extract_people_section(join(', ', @{$family{'grandchildren'}}));
422                        }
423
0
0
                        delete $family{grandchildren};
424                }
425        } else {
426
18
21
                delete $family{grandchildren};
427        }
428
22
4
55
11
        if((!defined($family{'grandchildren'})) || (($#{$family{'grandchildren'}}) <= 0)) {
429                # handle devoted Grandma to Tom, Dick, and Harry and loved Mother-in-law to Jack and Jill"
430
19
48
                my ($grandchildren_str) = $text =~ /Grandma to (.*?)(?: and loved|$)/;
431                # Normalize and split into individual names
432
19
32
                my @grandchildren;
433
19
31
                if($grandchildren_str) {
434
1
5
                        @grandchildren = split /,\s*|\s+and\s+/, $grandchildren_str;
435                }
436
19
46
                if(scalar(@grandchildren)) {
437
1
3
                        @grandchildren = sort @grandchildren;
438
1
2
                        $family{'grandchildren'} = \@grandchildren;
439                } elsif($text =~ /grandm\w+\s/) {
440
1
2
                        my $t = $text;
441
1
6
                        $t =~ s/.+(grandm\w+\s+.+?\sand\s[\w\.;,]+).+/$1/;
442
1
8
                        my @grandchildren = sort ( split /\s*(?:,|and)\s*/i, ($t =~ /grandm\w+\sto\s+([^\.;]+)/i)[0] || '' );
443
1
3
                        $family{grandchildren} = \@grandchildren;
444                }
445        }
446
447        # Extract siblings (sisters and brothers) correctly, skipping "her" or "his"
448
22
99
        if($text =~ /predeceased by (his|her) sisters?\s*([^;\.]+);?/i) {
449
2
8
                my $sisters_text = $2;
450
2
4
                $sisters_text =~ s/^,\s+//;
451
2
3
                $family{sisters} = extract_people_section($sisters_text);
452        } else {
453
20
62
                while($text =~ /\bsister[,\s]\s*([A-Z][a-z]+(?:\s+[A-Z][a-z.]+)*)(?:,\s*([A-Z][a-z]+))?/g) {
454
2
4
                        my $name = $1;
455
2
8
                        $family{'sisters'} ||= [];
456
2
4
                        if($name eq 'Mrs') {
457
1
4
                                if($text =~ / sister,\s*Mrs\.\s+([A-Z][a-zA-Z]+\s+[A-Z][a-zA-Z]+)/) {
458
1
1
                                        $name = $1;
459                                } else {
460
0
0
                                        undef $name;
461                                }
462                        }
463
2
3
                        if($name) {
464
2
2
2
40
                                push @{$family{sisters}}, {
465                                        name => $name,
466                                        sex => 'F',
467                                        status => ($text =~ /\bpredeceased by.*?$name/i) ? 'deceased' : 'living',
468                                };
469                        }
470                }
471
472
20
38
                if(!exists($family{'sisters'})) {
473
18
56
                        if($text =~ /\stwo\ssisters,\s*(.*?)\sand\s(.*?)[;:]/s) {
474
6
14
                                my($first, $second) = ($1, $2);
475
6
7
                                foreach my $sister($first, $second) {
476
12
22
                                        if($sister =~ /Mrs\.\s(.+?),\s(.+)/) {
477
12
13
                                                my $name = $1;
478
12
13
                                                my $location = $2;
479
12
16
                                                $location =~ s/,$//;
480
12
19
                                                if($name =~ /(\w+)\s+(\w+)/) {
481
12
12
11
40
                                                        push @{$family{sisters}}, {
482                                                                name => $1,
483                                                                location => $location,
484                                                                sex => 'F',
485                                                                spouse => { 'name' => $2, 'sex' => 'M' }
486                                                        };
487                                                } else {
488
0
0
0
0
                                                        push @{$family{sisters}}, {
489                                                                name => $name,
490                                                                location => $location,
491                                                                sex => 'F',
492                                                        };
493                                                }
494                                        } else {
495
0
0
0
0
                                                push @{$family{sisters}}, {
496                                                        name => $sister,
497                                                        sex => 'F',
498                                                };
499                                        }
500                                }
501                        }
502                }
503
504
20
36
                if($family{'sisters'}) {
505                        # Deduplicate by serializing hashes for comparison
506
8
10
                        my %seen;
507                        my @sisters = grep {
508
14
150
                                my $key = JSON::MaybeXS->new->canonical(1)->encode($_);
509
14
46
                                !$seen{$key}++
510
8
8
8
9
                        } @{$family{sisters}};
511
512
8
15
                        $family{sisters} = \@sisters;
513                }
514        }
515
516
22
104
        if($text =~ /predeceased by (his|her) brothers?\s*([^;\.]+);?/i) {
517
0
0
                my $brothers_text = $2;
518
0
0
                $brothers_text =~ s/^,\s+//;
519
0
0
                $family{brothers} = extract_people_section($brothers_text);
520                # TODO: mark all statuses to deceased
521        } else {
522
22
58
                while ($text =~ /\bbrother,\s*([A-Z][a-z]+(?:\s+[A-Z][a-z.]+)*)(?:,\s*([A-Z][a-z]+))?/g) {
523
1
4
                        $family{'brothers'} ||= [];
524
1
1
1
48
                        push @{$family{brothers}}, {
525                                name => $1,
526                                status => ($text =~ /\bpredeceased by.*?$1/i) ? 'deceased' : 'living',
527                        };
528                }
529
22
95
                if((!$family{'brothers'}) && (!$family{'sisters'}) && (!$family{'siblings'})) {
530
12
48
                        if($text =~ /sister of ([a-z]+) and ([a-z]+)/i) {
531
1
1
1
4
                                push @{$family{'siblings'}},
532                                        { 'name' => $1 },
533                                        { 'name' => $2 }
534                        }
535                }
536
537
22
32
                if(!exists($family{'brothers'})) {
538
21
54
                        if($text =~ /\sbrothers,\s*(.*?)[;\.]/s) {
539
7
9
                                my $brothers_text = $1;
540
7
14
                                if($brothers_text =~ /, all of (.+)$/) {
541
6
6
                                        my $location = $1;
542
6
14
                                        while($brothers_text =~ /([\w. ]+?),\s/g) {
543
12
10
                                                my $son = $1;
544
12
20
                                                if($son =~ /(\w+)\s+and\s+(\w+)/) {
545
6
6
5
23
                                                        push @{$family{brothers}}, {
546                                                                name => $1,
547                                                                location => $location,
548                                                                sex => 'M',
549                                                        }, {
550                                                                name => $2,
551                                                                location => $location,
552                                                                sex => 'M',
553                                                        };
554
6
10
                                                        last;
555                                                } else {
556
6
6
5
21
                                                        push @{$family{brothers}}, {
557                                                                name => $son,
558                                                                location => $location,
559                                                                sex => 'M',
560                                                        };
561                                                }
562                                        }
563                                } else {
564
1
3
                                        while($brothers_text =~ /([\w. ]+?),\s*([\w. ]+?)(?:\s+and|\z)/g) {
565
1
1
1
3
                                                push @{$family{brothers}}, {
566                                                        name => $1,
567                                                        location => $2,
568                                                        sex => 'M',
569                                                };
570                                        }
571                                }
572                        }
573                }
574        }
575
576
22
201
        if(!exists($family{'brothers'}) && $text =~ /\b(?:two|three|four)\s+brothers?,\s*(.+?)(?:,\s*a\s+(?:sister|half-sister)|;)/i) {
577                # Pattern for "two brothers, Name and Name"
578
0
0
                my $brothers_text = $1;
579
0
0
                my @brothers;
580
581                # Handle "Charles F. Harris and Berton Harris"
582
0
0
                if($brothers_text =~ /\band\b/) {
583
0
0
                        my @names = split /\s+and\s+/, $brothers_text;
584
0
0
                        foreach my $name (@names) {
585
0
0
                                $name =~ s/^\s+|\s+$//g;
586
0
0
                                $name =~ s/,\s*$//;
587
0
0
                                push @brothers, {
588                                        name => $name,
589                                        sex => 'M',
590                                        status => 'living'
591                                };
592                        }
593                }
594
0
0
                $family{brothers} = \@brothers if(scalar @brothers);
595        }
596
597        # Detect nieces/nephews
598
22
121
        $family{nieces_nephews} = ($text =~ /as well as several nieces and nephews/i) ? ['several nieces and nephews'] : [];
599
600        # Extract parents and clean the names by removing unnecessary details
601
22
766
        if($text =~ /(son|daughter) of the late\s+(.+?)\s+and\s+(.+?)\./i) {
602
2
2
                my $father = $2;
603
2
3
                my $mother = $3;
604
605                # Remove anything after the first comma in each parent's name
606
2
3
                $father =~ s/,.*//;
607
2
2
                $mother =~ s/,.*//;
608
609
2
9
                if($mother =~ /(.+)\s+\((.+)\)\s+(.+)/) {
610
2
3
                        $mother = "$1 $2";
611                }
612                $family{parents} = {
613
2
24
                        father => { name => $father },
614                        mother => { name => $mother },
615                };
616        } elsif($text =~ /parents were (\w+) and (\w+)/i) {
617                $family{parents} = {
618
1
4
                        father => { name => $1 },
619                        mother => { name => $2 },
620                };
621        }
622
623        # Extract spouse's death year and remove the "(year)" from the name
624
22
695
        if($text =~ /(wife|husband) of the late\s+([\w\s]+)\s+\((\d{4})\)/) {
625
1
1
                my $name = $2;
626
1
2
                my $death_year = $3;
627
628
1
3
                $family{'spouse'} ||= [];
629
630                # Remove the death year part from the spouse's name
631
1
2
                $name =~ s/\s*\(\d{4}\)//;
632
633
1
1
1
5
                push @{$family{'spouse'}}, {
634                        name => $name,
635                        death_year => $death_year
636                }
637        } elsif($text =~ /\bmarried ([^,]+),.*?\b(?:on\s+)?([A-Z][a-z]+ \d{1,2}, \d{4})(?:.*?\b(?:at|in)\s+([^.,]+))?/i) {
638
1
4
                $family{'spouse'} ||= [];
639
640
1
2
                my($name, $date, $place) = ($1, $2, $3);
641
1
1
                $name =~ s/\s+on\s.+$//;
642
643
1
1
2
4
                push @{$family{'spouse'}}, {
644                        name => $name,
645                        married => {
646                                date => $date,
647                                place => $place // '',
648                        }
649                };
650        } elsif($text =~ /husband (?:to|of) the late\s([\w\s]+)[\s\.]/i) {
651
1
4
                $family{'spouse'} ||= [];
652
653
1
1
1
3
                push @{$family{'spouse'}}, { name => $1, status => 'deceased' }
654        } elsif($text =~ /\b(?:wife|husband) of ([^.,;]+)/i) {
655
3
9
                $family{'spouse'} ||= [];
656
657
3
3
2
7
                push @{$family{'spouse'}}, { name => $1 }
658        } elsif($text =~ /\bsurvived by her husband ([^.,;]+)/i) {
659
1
1
1
3
                push @{$family{'spouse'}}, { name => $1, 'status' => 'living', 'sex' => 'M' }
660        } elsif($text =~ /\bsurvived by his wife[,\s]+([^.,;]+)/i) {
661
2
2
5
6
                push @{$family{'spouse'}}, { name => $1, 'status' => 'living', 'sex' => 'F' }
662        }
663
664        # Ensure spouse location is properly handled
665
22
58
        if(exists $family{spouse} && (ref $family{'spouse'} eq 'HASH') && defined $family{spouse}[0]{location} && $family{spouse}[0]{location} eq 'the late') {
666
0
0
                delete $family{spouse}[0]{location};
667        }
668
669        # Extract the funeral information
670
22
445
        if($text =~ /funeral service.*?at\s+(.+?),?\s+on\s+(.*?),?\s+at\s+(.+?)\./) {
671                $family{funeral} = {
672
1
5
                        location => $1,
673                        date     => $2,
674                        time     => $3,
675                };
676        } elsif($text =~ /funeral service.*?at\s+([^\n]+?)\s+on\s+([^\n]+)\s+at\s+([^\n]+)/i) {
677                $family{funeral} = {
678
1
6
                        location => $1,
679                        date     => $2,
680                        time     => $3,
681                };
682
1
3
                if($family{'funeral'}->{'date'} =~ /(.+?)\.\s{2,}/) {
683
1
2
                        $family{'funeral'}->{'date'} = $1;
684
1
4
                        if($family{'funeral'}->{'date'} =~ /(.+?)\sat\s(.+)/) {
685                                # Wednesday 9th March at 1.15pm.  Friends etc. etc.
686
1
1
                                $family{'funeral'}->{'date'} = $1;
687
1
2
                                $family{'funeral'}->{'time'} = $2;
688                        }
689                }
690        } elsif($text =~ /funeral services.+\sat\s(.+)\sat\s(.+),\swith\s/i) {
691                $family{funeral} = {
692
1
3
                        time     => $1,
693                        location => $2
694                };
695        } elsif($text =~ /funeral services.+\sat\s(.+),\swith\s/i) {
696
0
0
                $family{funeral} = { location => $1 }
697        } elsif($text =~ /services.+\sat\s(.+),\swith\s/i) {
698
9
16
                $family{funeral} = { location => $1 }
699        }
700
701        # Extract father-in-law and mother-in-law information (if present)
702
22
83
        if($text =~ /father-in-law to\s+([A-Za-z\s]+)/) {
703
1
1
                my $father_in_law = $1;
704
1
3
                $family{children_in_law} = [{ name => $father_in_law }];
705        } elsif($text =~ /mother-in-law to\s+([A-Za-z\s]+)/i) {
706
1
1
                my $mother_in_law = $1;
707
1
8
                $family{children_in_law} = [ split /\s*(?:,|and)\s*/i, ($text =~ /mother-in-law to\s+([^\.;]+)/i)[0] || '' ];
708
1
2
                if(scalar($family{children_in_law} == 0)) {
709
0
0
                        $family{children_in_law} = [{ name => $mother_in_law }];
710                }
711        }
712
713        # Extract aunt information
714
22
50
        if($text =~ /niece of\s+([A-Za-z]+)/) {
715
1
1
                my $aunt = $1;
716
1
3
                $family{aunt} = [{ 'name' => $aunt }];
717        }
718
719        # Birth info
720
22
2106
        if($text =~ /[^\b]Born in ([^,]+),.*?\b(?:on\s+)?([A-Z][a-z]+ \d{1,2}, \d{4})/i) {
721                $family{birth} = {
722
1
4
                        place => $1,
723                        date => $2,
724                }
725        } elsif($text =~ /[^\b]Born in ([a-z,\.\s]+)\s+on\s+(.+)/i) {
726
1
2
                $family{'birth'}->{'place'} = $1;
727
1
3
                if(my $location = _extract_location($1)) {
728
0
0
                        $family{'birth'}->{'location'} = $location;
729                }
730
1
3
                if(my $dt = _extract_date($2)) {
731
1
21
                        $family{'birth'}->{date} = $dt->ymd('/');
732                }
733
1
38
                $family{'birth'}->{'place'} =~ s/\s+$//;
734        } elsif($text =~ /S?he was born (.+)\sin ([a-z,\.\s]+)\s+to\s+(.+?)\sand\s(.+?)\./i) {
735
1
2
                $family{'birth'}->{'place'} = $2;
736
1
2
                my $father = $3;
737
1
2
                my $mother = $4;
738
1
1
                eval {
739
1
4
                        if(my $dt = DateTime::Format::Text->parse_datetime($1)) {
740
1
312
                                $family{'birth'}->{date} = $dt->ymd('/');
741                        }
742                };
743                # TODO
744                # if($verbose && $@) {
745                        # Carp::carp($@);
746                # }
747
1
16
                if($mother =~ /(.+)\s+\((.+)\)\s+(.+)/) {
748
1
2
                        $mother = "$1 $2";
749                }
750
1
3
                if($father =~ /(.+?)\.\s\s/) {
751
0
0
                        $father = $1;
752                }
753                $family{parents} = {
754
1
4
                        father => { name => $father },
755                        mother => { name => $mother }
756                };
757
1
6
                if($text =~ /survived by (his|her) (father|mother)[\s,;]/i) {
758
1
2
                        $family{parents}->{$2}->{'status'} = 'living';
759                }
760        } elsif($text =~ /[^\b]S?he was born\s*(?:on\s+)?([A-Z][a-z]+ \d{1,2}, \d{4})[,\s]+(?:in\s+)([^,]+)?/i) {
761
2
5
                if(my $dt = _extract_date($1)) {
762
2
22
                        $family{'birth'}->{date} = $dt->ymd('/');
763                }
764
2
25
                if($2) {
765
2
7
                        $family{'birth'}->{'location'} = $2;
766                }
767        }
768
769        # Date of death
770
22
260
        if($text =~ /\bpassed away\b.*?\b(?:on\s+)?([A-Z]+ \d{1,2}, \d{4})/i) {
771
2
5
                $family{death}->{date} = $1;
772
2
5
                if(my $dt = _extract_date($1)) {
773
2
17
                        $family{death}->{datetime} = $dt;
774                }
775        }
776
777        # Age at death
778
22
47
        if($text =~ /,\s(\d{1,3}), of\s/) {
779
2
5
                if($1 < 110) {
780
2
5
                        $family{'death'}->{'age'} = $1;
781                }
782        }
783
784        # Place of death
785
22
336
        if($text =~ /\b(?:passed away|died)\b([a-z0-9\s,]+)\sat\s+(.+?)\./i) {
786
4
5
                my $place = $2;
787
4
359
                if($place =~ /(.+)\s+on\s+([A-Z]+ \d{1,2}, \d{4})/i) {
788
0
0
                        $place = $1;
789
0
0
                        $family{death}->{date} = $2;
790                } elsif($place =~ /(.+)\son\s(.+)/) {
791
1
1
                        $place = $1;
792
1
2
                        if(my $dt = _extract_date($2)) {
793
1
9
                                $family{death}->{date} = $dt->ymd('/');
794                        }
795                }
796
4
20
                $place =~ s/^\bthe residence,\s//;
797
4
6
                $place =~ s/\bafter a.*$//;
798
4
7
                $place =~ s/,\s+$//;
799
4
6
                $family{death}->{place} = $place;
800        }
801
802        # Remove blank fields from the main family hash
803
22
105
105
41
118
178
        %family = map { $_ => $family{$_} } grep { defined $family{$_} && $family{$_} ne '' } keys %family;
804
805        # Remove empty arrays the family hash
806
22
41
        foreach my $key (keys %family) {
807
105
110
                if(ref($family{$key}) eq 'ARRAY') {
808
78
132
78
54
153
77
                        $family{$key} = [ grep { /\S/ } @{$family{$key}} ];
809
78
78
57
102
                        if(@{$family{$key}} == 0) {
810
21
35
                                delete $family{$key};
811                        }
812                }
813        }
814
815
22
39
        return if(!scalar keys(%family));
816
817
21
21
19
36
        %family = %{_canonicalize(\%family)};
818
819
21
83
        return Return::Set::set_return(\%family, { type => 'hashref', 'min' => 1, 'max' => 10 });
820}
821
822
823sub _canonicalize {
824
197
143
    my ($data) = @_;
825
826
197
170
    return $data unless ref $data eq 'HASH';
827
828
197
156
    for my $key (keys %$data) {
829
489
336
        my $val = $data->{$key};
830
831
489
446
        if (ref $val eq 'ARRAY') {
832
57
41
            my @hashes;
833            my @non_hashes;
834
835
57
51
            for my $item (@$val) {
836
132
184
                if (ref $item eq 'HASH' && exists $item->{name}) {
837
119
101
                    push @hashes, _canonicalize($item);  # canonicalize recursively
838                } else {
839
13
11
                    push @non_hashes, $item;
840                }
841            }
842
843            # Sort people hashes by name
844
57
101
85
133
            @hashes = sort { ($a->{name} // '') cmp ($b->{name} // '') } @hashes;
845
846            # Replace array in place
847
57
74
            $data->{$key} = [ @hashes, @non_hashes ];
848        }
849        elsif (ref $val eq 'HASH') {
850            # Normalize whitespace in names
851
57
52
            if (exists $val->{name}) {
852
29
35
                $val->{name} =~ s/\s+/ /g;
853
29
48
                $val->{name} =~ s/^\s+|\s+$//g;
854            }
855
57
53
            $data->{$key} = _canonicalize($val);
856        }
857    }
858
859
197
236
    return $data;
860}
861
862
863sub _extract_date
864{
865
6
10
        my $text = shift;
866
6
38
        my $parser = DateTime::Format::Text->new();
867
6
28
        my $dt;
868
869
6
6
24
15
        eval { $dt = $parser->parse_datetime($text); };
870
6
2312
        return $dt if $dt && !$@;
871
0
0
        return undef;
872}
873
874sub _extract_location {
875
1
1
        my $place_text = shift;
876
877
1
2
        unless($geocoder) {
878
1
1
1
119
                eval { require Geo::Coder::Free };
879
1
363
                if($@) {
880
1
15
                        Carp::carp(__PACKAGE__, ' (', __LINE__, "): geocoding locations disabled: $@");
881
1
645
                        return;
882                }
883
0
                $geocoder = Geo::Coder::Free->new();
884        }
885
886
0
        my @locations = $geocoder->geocode(location => $place_text);      # Use array to improve caching
887
888
0
        return unless scalar(@locations);
889
890
0
        my $result = $locations[0];
891
892
0
        if(ref($result)) {
893                return {
894
0
                        raw => $place_text,
895                        # city => $result->{components}{city} || $result->{components}{town},
896                        # region => $result->{components}{state},
897                        # country => $result->{components}{country},
898                        latitude => $result->latitude(),
899                        longitude => $result->longitude()
900                };
901        }
902        return {
903                raw => $place_text,
904                # city => $result->{components}{city} || $result->{components}{town},
905                # region => $result->{components}{state},
906                # country => $result->{components}{country},
907                latitude => $result->{'latitude'},
908
0
                longitude => $result->{'longitude'}
909        };
910}
911
912
913 - 943
=head1 AUTHOR

Nigel Horne, C<< <njh at nigelhorne.com> >>

=head1 SEE ALSO

Test coverage report: L<https://nigelhorne.github.io/Genealogy-Obituary-Parser/coverage/>

=head1 SUPPORT

This module is provided as-is without any warranty.

=head1 LICENCE AND COPYRIGHT

Copyright 2025 Nigel Horne.

Usage is subject to licence terms.

The licence terms of this software are as follows:

=over 4

=item * Personal single user, single computer use: GPL2

=item * All other users (including Commercial, Charity, Educational, Government)
  must apply in writing for a licence for use from Nigel Horne at the
  above e-mail.

=back

=cut
944
9451;