File Coverage

File:	blib/lib/Genealogy/Obituary/Parser.pm
Coverage:	75.6%

line	stmt	bran	cond	sub	time	code
1						package Genealogy::Obituary::Parser;
2
3	5 5 5			5	299313 5 67	use strict;
4	5 5 5			5	5 5 81	use warnings;
5
6	5 5 5			5	8 4 108	use Carp;
7	5 5 5			5	938 951494 88	use DateTime::Format::Text;
8	5 5 5			5	15 5 64	use Exporter 'import';
9	5 5 5			5	9 4 157	use JSON::MaybeXS;
10	5 5 5			5	935 22243 107	use Params::Get 0.13;
11	5 5 5			5	822 15708 94	use Return::Set 0.03;
12	5 5 5			5	12 4 21181	use Params::Validate::Strict;
13
14						our @EXPORT_OK = qw(parse_obituary);
15						our $geocoder;
16
17						# TODO: use Lingua::EN::Tagger;
18						# TODO: add more general code, e.g., where it looks for father, also look for mother
19						# TODO: parse https://funeral-notices.co.uk/notice/adams/5244000
20
21 - 29						=head1 NAME Genealogy::Obituary::Parser - Extract structured family relationships from obituary text =head1 VERSION Version 0.04 =cut
30
31						our $VERSION = '0.04';
32
33 - 93						=head1 SYNOPSIS use Genealogy::Obituary::Parser qw(parse_obituary); my $text = 'She is survived by her husband Paul, daughters Anna and Lucy, and grandchildren Jake and Emma.'; my $data = parse_obituary($text); # $data = { # spouse => ['Paul'], # children => ['Anna', 'Lucy'], # grandchildren => ['Jake', 'Emma'], # }; =head1 DESCRIPTION This module parses free-form obituary text and extracts structured family relationship data for use in genealogical applications. It parses obituary text and extracts structured family relationship data, including details about children, parents, spouse, siblings, grandchildren, and other relatives. =head1 FUNCTIONS =head2 parse_obituary($text) The routine processes the obituary content to identify and organize relevant family information into a clear, structured hash. It returns a hash reference containing structured family information, with each family member's data organized into distinct categories such as children, spouse, parents, siblings, etc. Takes a string, or a ref to a string. =head3 API SPECIFICATION =head4 INPUT { 'text' => { 'type' => 'string', # or stringref 'min' => 1, 'max' => 10000 }, 'geocoder' => { # used to geocode locations to verify they exist 'type' => 'object', 'can' => 'geocode', 'optional' => 1, } } =head4 OUTPUT =over 4 =item * No matches: undef =back { type => 'hashref', 'min' => 1, 'max' => 10 } =cut
94
95						sub parse_obituary
96						{
97	13			13	399880	my $params = Params::Validate::Strict::validate_strict({
98						args => Params::Get::get_params('text', \@_),
99						schema => {
100						'text' => {
101						'type' => 'string',
102						'min' => 1,
103						'max' => 10000
104						}, 'geocoder' => {
105						'type' => 'object',
106						'can' => 'geocode',
107						'optional' => 1,
108						}
109						}
110						});
111
112	13				1100	my $text = $params->{'text'};
113
114	13	50			18	if(my $geo = $params->{'geocoder'}) {
115	0				0	$geocoder = $geo;
116						}
117
118	13	50			14	Carp::croak(__PACKAGE__, ': Usage: parse_obituary($text)') unless defined($text);
119
120	13	50			14	if(ref($text) eq 'SCALAR') {
121	0 0				0 0	$text = ${$text};
122						}
123
124	13				132	$text =~ s/\s+, +/, /g; # Collapse multiple commas
125
126						# Quick scan to get started
127						sub parse_obituary_quick {
128	0			0	0	my $text = shift;
129	0				0	my %data;
130
131	0				0	my @patterns = (
132						[ qr/\bdaughters?\s+([^.,;]+)/i, 'children' ],
133						[ qr/\bsons?\s+([^.,;]+)/i, 'children' ],
134						[ qr/\bchildren\s+([^.,;]+)/i, 'children' ],
135						[ qr/\bgrandchildren\s+([^.;]+)/i, 'grandchildren' ],
136						[ qr/\bwife\s+([^.,;]+)/i, 'spouse' ],
137						[ qr/\bhusband\s+([^.,;]+)/i, 'spouse' ],
138						[ qr/\bhis parents were\s+([^.,;]+)/i,'parents' ],
139						[ qr/\bhis father was\s+([^.,;]+)/i, 'parents' ],
140						[ qr/\bhis mother was\s+([^.,;]+)/i, 'parents' ],
141						[ qr/\bsister(?:s)?\s+([^.,;]+)/i, 'siblings' ],
142						[ qr/\bbrother(?:s)?\s+([^.,;]+)/i, 'siblings' ],
143						[ qr/\bsiblings\s+([^.,;]+)/i, 'siblings' ],
144						);
145
146	0				0	for my $p (@patterns) {
147	0				0	my ($re, $field) = @$p;
148	0				0	while ($text =~ /$re/g) {
149	0				0	my $list = $1;
150	0	0			0	next unless $list;
151
152						# Robust splitting on commas and "and"
153	0 0 0				0 0 0	my @names = grep { length } map { s/^\s+\|\s+$//gr } split /\s(?:,\|(?:\band\b))\s/i, $list;
154	0 0 0				0 0 0	push @{$data{$field}}, map { { 'name' => $_ } } @names;
155						}
156						}
157
158	0				0	return \%data;
159						}
160
161						# my %family = %{parse_obituary_quick($text)};
162	13				8	my %family;
163
164						# Helper to extract people from a specific section and remove empty entries
165						sub extract_people_section {
166	7			7	6	my $section = shift;
167	7	50			16	return unless $section;
168
169	7				21	$section =~ s/\s+and\s+/, /g; # Ensure "and" is treated as a separator
170	7				20	$section =~ s/([A-Za-z]+),\s+([A-Z]{2})/$1<<COMMA>>$2/g;
171	7				20	my @entries = split /\s,\s/, $section;
172
173	7				5	my @people;
174	7				7	foreach my $entry (@entries) {
175	19				16	$entry =~ s/<<COMMA>>/, /g;
176
177	19				18	my ($name, $spouse, $location) = ('', '', '');
178
179						# Match "Ian (Terry) Girvan of Surrey, BC"
180	19	100 100 100			25	if ($entry =~ /^(\w+)\s+$([^)]+)$\s+(\w+)\s+of\s+(.+)$/) {
181	1 1 1				2 1 3	$name = "$1 $3"; $spouse = $2; $location = $4;
182						}
183						# Match "Gwen Steeves (Leslie) of Riverview, NB"
184						elsif ($entry =~ /^(.+?)\s+$([^)]+)$\s+of\s+(.+)$/) {
185	1 1 1				2 1 1	$name = $1; $spouse = $2; $location = $3;
186						}
187						# Match "Carol Girvan of Dartmouth, NS"
188						elsif ($entry =~ /^(.+?)\s+of\s+(.+)$/) {
189	1 1				1 1	$name = $1; $location = $2;
190						} else {
191						# Match names only (e.g. for siblings)
192	16				15	$name = $entry;
193						}
194
195	19	100			13	next if !$name; # Skip if name is empty
196	17	100			16	next if($name =~ /^father-in-law\sto\s/); # Skip follow ons
197	16	100			15	last if($name =~ /^devoted\s/i);
198	15	50			11	last if($name =~ /^loved\s/i);
199
200						# Create a hash and filter out blank fields
201	15				17	my %person = (
202						name => $name,
203						spouse => $spouse,
204						location => $location,
205						);
206
207						# Remove blank fields
208	15 20 45	50			16 25 74	%person = map { $_ => $person{$_} } grep { defined $person{$_} && $person{$_} ne '' } keys %person;
209
210	15				17	push @people, \%person;
211						}
212	7				12	return \@people;
213						}
214
215						sub extract_names_from_phrase {
216	1			1	1	my $phrase = $_[0];
217	1				0	my @names;
218
219	1				1	$phrase =~ s/[.;]//g;
220
221						# Case: "Christopher, Thomas, and Marsha Cloud"
222	1	50			2	if ($phrase =~ /^((?:\w+\s,\s)+\w+),?\s*and\s+(\w+)\s+(\w+)$/) {
223	0				0	my ($pre, $last_first, $last) = ($1, $2, $3);
224	0				0	my @firsts = split(/\s,\s/, $pre);
225	0				0	push @firsts, $last_first;
226	0 0				0 0	push @names, map { "$_ $last" } @firsts;
227	0				0	return @names;
228						}
229
230						# Case: "Christopher and Thomas Cloud"
231	1	50			3	if ($phrase =~ /^([\w\s]+?)\s+and\s+(\w+)\s+(\w+)$/) {
232	0				0	my ($first_part, $second_first, $last) = ($1, $2, $3);
233	0				0	my @firsts = split(/\s,\s\|\s+and\s+/, $first_part);
234	0 0				0 0	push @names, map { "$_ $last" } (@firsts, $second_first);
235	0				0	return @names;
236						}
237
238						# Fallback: Split by comma or 'and'
239	1				2	$phrase =~ s/, and grandchildren.+//; # Handle "Anna and Lucy, and grandchildren Jake and Emma"
240
241						# Handle "Name1, Name2 and Name3" correctly
242	1	50			3	if($phrase =~ /(.+?),\s*(\w+)\s+and\s+(\w+)/) {
243	0				0	my ($firsts, $second, $third) = ($1, $2, $3);
244	0				0	my @firsts = split /\s,\s/, $firsts;
245	0				0	push @names, @firsts, $second, $third;
246						} else {
247	1				2	my @parts = split /\s(?:,\|and)\s/, $phrase;
248	1 2	50			1 5	push @names, grep { defined($_) && $_ ne '' } @parts;
249						}
250
251	1				1	return @names;
252						}
253
254						# Correct extraction of children (skipping "his/her")
255	13	100 100 100 100 100 50			206	if ($text =~ /survived by (his\|her) children\s*([^\.;]+)/i) {
256	2				2	my $children_text = $2;
257	2				3	$family{children} = extract_people_section($children_text);
258						} elsif ($text =~ /Loving mum to\s*([^\.;]+)/i) { # Look for the phrase "Loving mum to"
259	1				2	my $children_text = $1;
260	1				1	$family{children} = extract_people_section($children_text);
261						} elsif ($text =~ /Loving father of\s*([^\.;]+)/i) { # Look for the phrase "Loving father of"
262	1				2	my $children_text = $1;
263	1				2	$family{children} = extract_people_section($children_text);
264						} elsif($text =~ /mother of\s*([^\.;]+)?,/i) { # Look for the phrase "mother of"
265	1				1	my $children_text = $1;
266	1				2	$children_text =~ s/, grandmother.+//;
267	1				2	$family{children} = extract_people_section($children_text);
268						} elsif($text =~ /sons,?\s*([a-z]+)\s+and\s+([a-z]+)/i) {
269	3				3	my @children;
270						my @grandchildren;
271
272	3				9	push @children, { name => $1, sex => 'M' }, { name => $2, sex => 'M' };
273	3	100			27	if($text =~ /\bdaughter,?\s([a-z]+)/i) {
274	1				2	push @children, { 'name' => $1, 'sex' => 'F' }
275						}
276	3	100			27	if($text =~ /\bgranddaughter,?\s([a-z]+)/i) {
277	1				2	push @grandchildren, { 'name' => $1, 'sex' => 'F' };
278						}
279	3	50			6	$family{children} = \@children if @children;
280	3	100			4	$family{grandchildren} = \@grandchildren if @grandchildren;
281						} elsif($text =~ /Surviving are (?:a )?daughters?,\s*Mrs\.\s+(\w+)\s+$([^)]+)$\s+(\w+),\s+([^;]+?);/i) {
282						# Handle "Surviving are a daughter, Mrs. Walter (Ruth Ann) Gerke, Fort Wayne"
283	0				0	my @children;
284	0				0	my $spouse_first = $1;
285	0				0	my $daughter_name = $2;
286	0				0	my $spouse_last = $3;
287	0				0	my $location = $4;
288	0				0	$location =~ s/,\s*$//;
289
290	0				0	push @children, {
291						name => $daughter_name,
292						location => $location,
293						sex => 'F',
294						spouse => {
295						name => "$spouse_first $spouse_last",
296						sex => 'M'
297						}
298						};
299	0				0	$family{children} = \@children;
300						} else {
301	5				4	my @children;
302
303						# my $tagger = Lingua::EN::Tagger->new(longest_noun_phrase => 0);
304						# my $tagged = $tagger->add_tags($text);
305
306	5	100			12	if($text =~ /\ssons,\s(.?);/s) {
307	2				4	my $sons_text = $1;
308	2	100			4	if($sons_text =~ /, all of (.+)$/) {
309	1				1	my $location = $1;
310	1				2	while($sons_text =~ /([\w. ]+?),\s/g) {
311	3				3	my $son = $1;
312	3	100			4	if($son =~ /(\w+)\s+and\s+(\w+)/) {
313	1				2	push @children, {
314						name => $1,
315						location => $location,
316						sex => 'M',
317						}, {
318						name => $2,
319						location => $location,
320						sex => 'M',
321						};
322	1				2	last;
323						} else {
324	2				5	push @children, {
325						name => $son,
326						location => $location,
327						sex => 'M',
328						};
329						}
330						}
331						} else {
332	1				3	while($sons_text =~ /([\w. ]+?),\s*([\w. ]+?)(?:\s+and\|\z)/g) {
333	2				7	push @children, {
334						name => $1,
335						location => $2,
336						sex => 'M',
337						};
338						}
339						}
340						}
341	5	100 100			22	if($text =~ /\sdaughters?,\s*Mrs\.\s+(.+?)\s+(\w+),\s+([^;]+)\sand/) {
342	1				3	push @children, {
343						name => $1,
344						location => $3,
345						sex => 'F',
346						spouse => { 'name' => $2, sex => 'M' }
347						};
348						} elsif($text =~ /one daughter,\s(.+?),\s(.+?);/) {
349	1				1	my $name = $1;
350	1				1	my $location = $2;
351	1	50			2	if($name =~ /(\w+)\s+(\w+)/) {
352	1				2	push @children, {
353						name => $1,
354						location => $location,
355						sex => 'F',
356						spouse => { name => $2, sex => 'M' }
357						};
358						} else {
359	0				0	push @children, {
360						name => $1,
361						location => $location,
362						sex => 'F',
363						};
364						}
365						}
366	5	100			9	$family{children} = \@children if @children;
367
368	5	100			5	if(!$family{'children'}) {
369	3				18	while($text =~ /\b(son\|daughter)s?,\s([A-Z][a-z]+(?:\s+$[A-Z][a-z]+$)?)\s(?:and their children ([^.;]+))?/g) {
370	2	100			3	my $sex = $1 eq 'son' ? 'M' : 'F';
371	2				2	my $child = $2;
372	2				2	my $grandkids = $3;
373	2	50 50 100 50	67		7	if(my @grandchildren = $grandkids ? split /\s,\s\|\s+and\s+/, $grandkids : ()) {
374	0				0	push @children, {
375						name => $child,
376						sex => $sex,
377						grandchildren => \@grandchildren,
378						};
379						} elsif(($sex eq 'F') && ($child =~ /(.+)\s+$(.+)$/)) {
380	1				6	push @children, { name => $1, sex => 'F', spouse => { name => $2, sex => 'M' } }
381						} elsif($child ne 'Mrs') {
382	1				13	push @children, { name => $child, sex => $sex }
383						}
384						}
385						}
386	5	100			16	$family{children} = \@children if @children;
387						}
388
389	13	100			16	if(!$family{'children'}) {
390	2	50			3	if($text =~ /\ssons?[,\s]\s*(.+?)[;\.]/) {
391	0				0	my $raw = $1;
392	0				0	$raw =~ s/\sand their .+//;
393	0				0	my @children = extract_names_from_phrase($raw);
394	0 0 0				0 0 0	push @{$family{children}}, map { { name => $_, sex => 'M' } } @children;
395						}
396	2	100			6	if($text =~ /\sdaughters?[,\s]\s*(.+?)[;\.]/) {
397	1				1	my $raw = $1;
398	1				1	$raw =~ s/\sand their .+//;
399	1				1	my @children = extract_names_from_phrase($raw);
400	1 1 2				1 1 3	push @{$family{children}}, map { { name => $_, sex => 'F' } } @children;
401						}
402						}
403
404						# Extract grandchildren
405	13	100			16	if(!$family{'grandchildren'}) {
406	12	100			19	if($text =~ /grandchildren\s+([^\.;]+)/i) {
407	3				11	my @grandchildren = split /\s(?:,\|and)\s/i, $1;
408	3	50			3	if(scalar(@grandchildren)) {
409	3 7 8	50			3 9 15	$family{'grandchildren'} = [ map { { 'name' => $_ } } grep { defined $_ && $_ ne '' } @grandchildren ];
410						}
411						}
412						}
413	13 4	100	100		19 5	if($family{'grandchildren'} && scalar @{$family{grandchildren}}) {
414	4		33		16	while((exists $family{'grandchildren'}->[0]) && (length($family{'grandchildren'}->[0]) == 0)) {
415	0 0				0 0	shift @{$family{'grandchildren'}};
416						}
417	4	50			7	if($family{'grandchildren'}->[0] =~ /brothers/) {
418	0	0			0	if(!exists $family{'brothers'}) {
419	0 0				0 0	shift @{$family{'grandchildren'}};
420	0 0				0 0	$family{'brothers'} = extract_people_section(join(', ', @{$family{'grandchildren'}}));
421						}
422	0				0	delete $family{grandchildren};
423						}
424						} else {
425	9				7	delete $family{grandchildren};
426						}
427	13 4	100	100		17 6	if((!defined($family{'grandchildren'})) \|\| (($#{$family{'grandchildren'}}) <= 0)) {
428						# handle devoted Grandma to Tom, Dick, and Harry and loved Mother-in-law to Jack and Jill"
429	10				20	my ($grandchildren_str) = $text =~ /Grandma to (.*?)(?: and loved\|$)/;
430						# Normalize and split into individual names
431	10				6	my @grandchildren;
432	10	100			11	if($grandchildren_str) {
433	1				5	@grandchildren = split /,\s*\|\s+and\s+/, $grandchildren_str;
434						}
435	10	100 100			17	if(scalar(@grandchildren)) {
436	1				1	$family{'grandchildren'} = \@grandchildren;
437						} elsif($text =~ /grandm\w+\s/) {
438	1				1	my $t = $text;
439	1				5	$t =~ s/.+(grandm\w+\s+.+?\sand\s[\w\.;,]+).+/$1/;
440	1		50		6	$family{grandchildren} = [ split /\s(?:,\|and)\s/i, ($t =~ /grandm\w+\sto\s+([^\.;]+)/i)[0] \|\| '' ];
441						}
442						}
443
444						# Extract siblings (sisters and brothers) correctly, skipping "her" or "his"
445	13	100			47	if($text =~ /predeceased by (his\|her) sisters?\s*([^;\.]+);?/i) {
446	2				2	my $sisters_text = $2;
447	2				2	$sisters_text =~ s/^,\s+//;
448	2				4	$family{sisters} = extract_people_section($sisters_text);
449						} else {
450	11				24	while($text =~ /\bsister[,\s]\s([A-Z][a-z]+(?:\s+[A-Z][a-z.]+))(?:,\s*([A-Z][a-z]+))?/g) {
451	2				3	my $name = $1;
452	2		50		9	$family{'sisters'} \|\|= [];
453	2	100			3	if($name eq 'Mrs') {
454	1	50			4	if($text =~ / sister,\s*Mrs\.\s+([A-Z][a-zA-Z]+\s+[A-Z][a-zA-Z]+)/) {
455	1				1	$name = $1;
456						} else {
457	0				0	undef $name;
458						}
459						}
460	2	50			3	if($name) {
461	2 2	50			1 31	push @{$family{sisters}}, {
462						name => $name,
463						sex => 'F',
464						status => ($text =~ /\bpredeceased by.*?$name/i) ? 'deceased' : 'living',
465						};
466						}
467						}
468
469	11	100			16	if(!exists($family{'sisters'})) {
470	9	100			13	if($text =~ /\stwo\ssisters,\s(.?)\sand\s(.*?)[;:]/s) {
471	1				2	my($first, $second) = ($1, $2);
472	1				1	foreach my $sister($first, $second) {
473	2	50			3	if($sister =~ /Mrs\.\s(.+?),\s(.+)/) {
474	2				2	my $name = $1;
475	2				2	my $location = $2;
476	2				2	$location =~ s/,$//;
477	2	50			3	if($name =~ /(\w+)\s+(\w+)/) {
478	2 2				1 5	push @{$family{sisters}}, {
479						name => $1,
480						location => $location,
481						sex => 'F',
482						spouse => { 'name' => $2, 'sex' => 'M' }
483						};
484						} else {
485	0 0				0 0	push @{$family{sisters}}, {
486						name => $name,
487						location => $location,
488						sex => 'F',
489						};
490						}
491						} else {
492	0 0				0 0	push @{$family{sisters}}, {
493						name => $sister,
494						sex => 'F',
495						};
496						}
497						}
498						}
499						}
500
501	11	100			14	if($family{'sisters'}) {
502						# Deduplicate by serializing hashes for comparison
503	3				3	my %seen;
504						my @sisters = grep {
505	4				41	my $key = JSON::MaybeXS->new->canonical(1)->encode($_);
506	4				11	!$seen{$key}++
507	3 3				3 4	} @{$family{sisters}};
508
509	3				6	$family{sisters} = \@sisters;
510						}
511						}
512
513	13	50			53	if($text =~ /predeceased by (his\|her) brothers?\s*([^;\.]+);?/i) {
514	0				0	my $brothers_text = $2;
515	0				0	$brothers_text =~ s/^,\s+//;
516	0				0	$family{brothers} = extract_people_section($brothers_text);
517						# TODO: mark all statuses to deceased
518						} else {
519	13				21	while ($text =~ /\bbrother,\s([A-Z][a-z]+(?:\s+[A-Z][a-z.]+))(?:,\s*([A-Z][a-z]+))?/g) {
520	1		50		2	$family{'brothers'} \|\|= [];
521	1 1	50			1 33	push @{$family{brothers}}, {
522						name => $1,
523						status => ($text =~ /\bpredeceased by.*?$1/i) ? 'deceased' : 'living',
524						};
525						}
526	13	50	75		34	if((!$family{'brothers'}) && (!$family{'sisters'}) && (!$family{'siblings'})) {
527	8	100			24	if($text =~ /sister of ([a-z]+) and ([a-z]+)/i) {
528	1 1				1 2	push @{$family{'siblings'}},
529						{ 'name' => $1 },
530						{ 'name' => $2 }
531						}
532						}
533
534	13	100			16	if(!exists($family{'brothers'})) {
535	12	100			16	if($text =~ /\sbrothers,\s(.?)[;\.]/s) {
536	1				1	my $brothers_text = $1;
537	1	50			1	if($brothers_text =~ /, all of (.+)$/) {
538	1				1	my $location = $1;
539	1				3	while($brothers_text =~ /([\w. ]+?),\s/g) {
540	2				2	my $son = $1;
541	2	100			2	if($son =~ /(\w+)\s+and\s+(\w+)/) {
542	1 1				1 2	push @{$family{brothers}}, {
543						name => $1,
544						location => $location,
545						sex => 'M',
546						}, {
547						name => $2,
548						location => $location,
549						sex => 'M',
550						};
551	1				1	last;
552						} else {
553	1 1				0 3	push @{$family{brothers}}, {
554						name => $son,
555						location => $location,
556						sex => 'M',
557						};
558						}
559						}
560						} else {
561	0				0	while($brothers_text =~ /([\w. ]+?),\s*([\w. ]+?)(?:\s+and\|\z)/g) {
562	0 0				0 0	push @{$family{brothers}}, {
563						name => $1,
564						location => $2,
565						sex => 'M',
566						};
567						}
568						}
569						}
570						}
571						}
572
573	13	50	67		141	if(!exists($family{'brothers'}) && $text =~ /\b(?:two\|three\|four)\s+brothers?,\s(.+?)(?:,\sa\s+(?:sister\|half-sister)\|;)/i) {
574						# Pattern for "two brothers, Name and Name"
575	0				0	my $brothers_text = $1;
576	0				0	my @brothers;
577
578						# Handle "Charles F. Harris and Berton Harris"
579	0	0			0	if($brothers_text =~ /\band\b/) {
580	0				0	my @names = split /\s+and\s+/, $brothers_text;
581	0				0	foreach my $name (@names) {
582	0				0	$name =~ s/^\s+\|\s+$//g;
583	0				0	$name =~ s/,\s*$//;
584	0				0	push @brothers, {
585						name => $name,
586						sex => 'M',
587						status => 'living'
588						};
589						}
590						}
591	0	0			0	$family{brothers} = \@brothers if(scalar @brothers);
592						}
593
594						# Detect nieces/nephews
595	13	100			67	$family{nieces_nephews} = ($text =~ /as well as several nieces and nephews/i) ? ['several nieces and nephews'] : [];
596
597						# Extract parents and clean the names by removing unnecessary details
598	13	100 100			339	if($text =~ /(son\|daughter) of the late\s+(.+?)\s+and\s+(.+?)\./i) {
599	2				3	my $father = $2;
600	2				1	my $mother = $3;
601
602						# Remove anything after the first comma in each parent's name
603	2				2	$father =~ s/,.*//;
604	2				2	$mother =~ s/,.*//;
605
606	2	50			6	if($mother =~ /(.+)\s+$(.+)$\s+(.+)/) {
607	2				3	$mother = "$1 $2";
608						}
609						$family{parents} = {
610	2				4	father => { name => $father },
611						mother => { name => $mother },
612						};
613						} elsif($text =~ /parents were (\w+) and (\w+)/i) {
614						$family{parents} = {
615	1				3	father => { name => $1 },
616						mother => { name => $2 },
617						};
618						}
619
620						# Extract spouse's death year and remove the "(year)" from the name
621	13	100 100 100 100 100 100			301	if($text =~ /(wife\|husband) of the late\s+([\w\s]+)\s+$(\d{4})$/) {
622	1				1	my $name = $2;
623	1				1	my $death_year = $3;
624
625	1		50		10	$family{'spouse'} \|\|= [];
626
627						# Remove the death year part from the spouse's name
628	1				1	$name =~ s/\s*$\d{4}$//;
629
630	1 1				0 2	push @{$family{'spouse'}}, {
631						name => $name,
632						death_year => $death_year
633						}
634						} elsif($text =~ /\bmarried ([^,]+),.?\b(?:on\s+)?([A-Z][a-z]+ \d{1,2}, \d{4})(?:.?\b(?:at\|in)\s+([^.,]+))?/i) {
635	1		50		3	$family{'spouse'} \|\|= [];
636
637	1				2	my($name, $date, $place) = ($1, $2, $3);
638	1				1	$name =~ s/\s+on\s.+$//;
639
640	1 1		50		1 3	push @{$family{'spouse'}}, {
641						name => $name,
642						married => {
643						date => $date,
644						place => $place // '',
645						}
646						};
647						} elsif($text =~ /husband (?:to\|of) the late\s([\w\s]+)[\s\.]/i) {
648	1		50		3	$family{'spouse'} \|\|= [];
649
650	1 1				0 3	push @{$family{'spouse'}}, { name => $1, status => 'deceased' }
651						} elsif($text =~ /\b(?:wife\|husband) of ([^.,;]+)/i) {
652	2		50		5	$family{'spouse'} \|\|= [];
653
654	2 2				1 15	push @{$family{'spouse'}}, { name => $1 }
655						} elsif($text =~ /\bsurvived by her husband ([^.,;]+)/i) {
656	1 1				1 2	push @{$family{'spouse'}}, { name => $1, 'status' => 'living', 'sex' => 'M' }
657						} elsif($text =~ /\bsurvived by his wife[,\s]+([^.,;]+)/i) {
658	2 2				2 5	push @{$family{'spouse'}}, { name => $1, 'status' => 'living', 'sex' => 'F' }
659						}
660
661						# Ensure spouse location is properly handled
662	13	0	40		36	if(exists $family{spouse} && (ref $family{'spouse'} eq 'HASH') && defined $family{spouse}[0]{location} && $family{spouse}[0]{location} eq 'the late') {
663	0				0	delete $family{spouse}[0]{location};
664						}
665
666						# Extract the funeral information
667	13	100 100 100 50 100			215	if($text =~ /funeral service.?at\s+(.+?),?\s+on\s+(.?),?\s+at\s+(.+?)\./) {
668						$family{funeral} = {
669	1				3	location => $1,
670						date => $2,
671						time => $3,
672						};
673						} elsif($text =~ /funeral service.*?at\s+([^\n]+?)\s+on\s+([^\n]+)\s+at\s+([^\n]+)/i) {
674						$family{funeral} = {
675	1				3	location => $1,
676						date => $2,
677						time => $3,
678						};
679	1	50			3	if($family{'funeral'}->{'date'} =~ /(.+?)\.\s{2,}/) {
680	1				1	$family{'funeral'}->{'date'} = $1;
681	1	50			2	if($family{'funeral'}->{'date'} =~ /(.+?)\sat\s(.+)/) {
682						# Wednesday 9th March at 1.15pm. Friends etc. etc.
683	1				1	$family{'funeral'}->{'date'} = $1;
684	1				2	$family{'funeral'}->{'time'} = $2;
685						}
686						}
687						} elsif($text =~ /funeral services.+\sat\s(.+)\sat\s(.+),\swith\s/i) {
688						$family{funeral} = {
689	1				3	time => $1,
690						location => $2
691						};
692						} elsif($text =~ /funeral services.+\sat\s(.+),\swith\s/i) {
693	0				0	$family{funeral} = { location => $1 }
694						} elsif($text =~ /services.+\sat\s(.+),\swith\s/i) {
695	3				6	$family{funeral} = { location => $1 }
696						}
697
698						# Extract father-in-law and mother-in-law information (if present)
699	13	100 100			25	if($text =~ /father-in-law to\s+([A-Za-z\s]+)/) {
700	1				1	my $father_in_law = $1;
701	1				2	$family{children_in_law} = [{ name => $father_in_law }];
702						} elsif($text =~ /mother-in-law to\s+([A-Za-z\s]+)/i) {
703	1				1	my $mother_in_law = $1;
704	1		50		6	$family{children_in_law} = [ split /\s(?:,\|and)\s/i, ($text =~ /mother-in-law to\s+([^\.;]+)/i)[0] \|\| '' ];
705	1	50			3	if(scalar($family{children_in_law} == 0)) {
706	0				0	$family{children_in_law} = [{ name => $mother_in_law }];
707						}
708						}
709
710						# Extract aunt information
711	13	100			27	if($text =~ /niece of\s+([A-Za-z]+)/) {
712	1				1	my $aunt = $1;
713	1				2	$family{aunt} = [{ 'name' => $aunt }];
714						}
715
716						# Birth info
717	13	100 100 100 100			1024	if($text =~ /[^\b]Born in ([^,]+),.*?\b(?:on\s+)?([A-Z][a-z]+ \d{1,2}, \d{4})/i) {
718						$family{birth} = {
719	1				2	place => $1,
720						date => $2,
721						}
722						} elsif($text =~ /[^\b]Born in ([a-z,\.\s]+)\s+on\s+(.+)/i) {
723	1				7	$family{'birth'}->{'place'} = $1;
724	1	50			2	if(my $location = _extract_location($1)) {
725	0				0	$family{'birth'}->{'location'} = $location;
726						}
727	1	50			2	if(my $dt = _extract_date($2)) {
728	1				13	$family{'birth'}->{date} = $dt->ymd('/');
729						}
730	1				18	$family{'birth'}->{'place'} =~ s/\s+$//;
731						} elsif($text =~ /S?he was born (.+)\sin ([a-z,\.\s]+)\s+to\s+(.+?)\sand\s(.+?)\./i) {
732	1				2	$family{'birth'}->{'place'} = $2;
733	1				1	my $father = $3;
734	1				1	my $mother = $4;
735	1				1	eval {
736	1	50			2	if(my $dt = DateTime::Format::Text->parse_datetime($1)) {
737	1				172	$family{'birth'}->{date} = $dt->ymd('/');
738						}
739						};
740						# TODO
741						# if($verbose && $@) {
742						# Carp::carp($@);
743						# }
744	1	50			11	if($mother =~ /(.+)\s+$(.+)$\s+(.+)/) {
745	1				2	$mother = "$1 $2";
746						}
747	1	50			2	if($father =~ /(.+?)\.\s\s/) {
748	0				0	$father = $1;
749						}
750						$family{parents} = {
751	1				2	father => { name => $father },
752						mother => { name => $mother }
753						};
754	1	50			6	if($text =~ /survived by (his\|her) (father\|mother)[\s,;]/i) {
755	1				2	$family{parents}->{$2}->{'status'} = 'living';
756						}
757						} elsif($text =~ /[^\b]S?he was born\s*(?:on\s+)?([A-Z][a-z]+ \d{1,2}, \d{4})[,\s]+(?:in\s+)([^,]+)?/i) {
758	1	50			2	if(my $dt = _extract_date($1)) {
759	1				7	$family{'birth'}->{date} = $dt->ymd('/');
760						}
761	1	50			8	if($2) {
762	1				2	$family{'birth'}->{'location'} = $2;
763						}
764						}
765
766						# Date of death
767	13	100			139	if($text =~ /\bpassed away\b.*?\b(?:on\s+)?([A-Z]+ \d{1,2}, \d{4})/i) {
768	2				2	$family{death}->{date} = $1;
769	2	50			3	if(my $dt = _extract_date($1)) {
770	2				11	$family{death}->{datetime} = $dt;
771						}
772						}
773
774						# Age at death
775	13	100			21	if($text =~ /,\s(\d{1,3}), of\s/) {
776	2	50			8	if($1 < 110) {
777	2				3	$family{'death'}->{'age'} = $1;
778						}
779						}
780
781						# Place of death
782	13	100			163	if($text =~ /\b(?:passed away\|died)\b([a-z0-9\s,]+)\sat\s+(.+?)\./i) {
783	3				4	my $place = $2;
784	3	50 100			284	if($place =~ /(.+)\s+on\s+([A-Z]+ \d{1,2}, \d{4})/i) {
785	0				0	$place = $1;
786	0				0	$family{death}->{date} = $2;
787						} elsif($place =~ /(.+)\son\s(.+)/) {
788	1				1	$place = $1;
789	1	50			2	if(my $dt = _extract_date($2)) {
790	1				7	$family{death}->{date} = $dt->ymd('/');
791						}
792						}
793	3				12	$place =~ s/^\bthe residence,\s//;
794	3				4	$place =~ s/\bafter a.*$//;
795	3				3	$place =~ s/,\s+$//;
796	3				4	$family{death}->{place} = $place;
797						}
798
799						# Remove blank fields from the main family hash
800	13 69 69	50			20 72 128	%family = map { $_ => $family{$_} } grep { defined $family{$_} && $family{$_} ne '' } keys %family;
801
802						# Remove empty arrays the family hash
803	13				19	foreach my $key (keys %family) {
804	69	100			69	if(ref($family{$key}) eq 'ARRAY') {
805	50 75 50				32 88 39	$family{$key} = [ grep { /\S/ } @{$family{$key}} ];
806	50 50	100			36 57	if(@{$family{$key}} == 0) {
807	12				12	delete $family{$key};
808						}
809						}
810						}
811
812	13	50			17	return if(!scalar keys(%family));
813
814	13				28	return Return::Set::set_return(\%family, { type => 'hashref', 'min' => 1, 'max' => 10 });
815						}
816
817						sub _extract_date
818						{
819	5			5	5	my $text = shift;
820	5				10	my $parser = DateTime::Format::Text->new();
821	5				19	my $dt;
822
823	5 5				6 8	eval { $dt = $parser->parse_datetime($text); };
824	5	50	33		1218	return $dt if $dt && !$@;
825	0				0	return undef;
826						}
827
828						sub _extract_location {
829	1			1	3	my $place_text = shift;
830
831	1	50			3	unless($geocoder) {
832	1 1				1 85	eval { require Geo::Coder::Free };
833	1	50			335	if($@) {
834	1				13	Carp::carp(__PACKAGE__, ' (', __LINE__, "): geocoding locations disabled: $@");
835	1				559	return;
836						}
837	0					$geocoder = Geo::Coder::Free->new();
838						}
839
840	0					my @locations = $geocoder->geocode(location => $place_text); # Use array to improve caching
841
842	0	0				return unless scalar(@locations);
843
844	0					my $result = $locations[0];
845
846	0	0				if(ref($result)) {
847						return {
848	0					raw => $place_text,
849						# city => $result->{components}{city} \|\| $result->{components}{town},
850						# region => $result->{components}{state},
851						# country => $result->{components}{country},
852						latitude => $result->latitude(),
853						longitude => $result->longitude()
854						};
855						}
856						return {
857						raw => $place_text,
858						# city => $result->{components}{city} \|\| $result->{components}{town},
859						# region => $result->{components}{state},
860						# country => $result->{components}{country},
861						latitude => $result->{'latitude'},
862	0					longitude => $result->{'longitude'}
863						};
864						}
865
866
867 - 897						=head1 AUTHOR Nigel Horne, C<< <njh at nigelhorne.com> >> =head1 SEE ALSO Test coverage report: L<https://nigelhorne.github.io/Genealogy-Obituary-Parser/coverage/> =head1 SUPPORT This module is provided as-is without any warranty. =head1 LICENCE AND COPYRIGHT Copyright 2025 Nigel Horne. Usage is subject to licence terms. The licence terms of this software are as follows: =over 4 =item * Personal single user, single computer use: GPL2 =item * All other users (including Commercial, Charity, Educational, Government) must apply in writing for a licence for use from Nigel Horne at the above e-mail. =back =cut
898
899						1;