File: | blib/lib/Genealogy/Obituary/Parser.pm |
Coverage: | 78.4% |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | package Genealogy::Obituary::Parser; | ||||||
2 | |||||||
3 | 4 4 4 | 252863 4 50 | use strict; | ||||
4 | 4 4 4 | 8 1 71 | use warnings; | ||||
5 | |||||||
6 | 4 4 4 | 764 804111 70 | use DateTime::Format::Text; | ||||
7 | 4 4 4 | 14 4 59 | use Exporter 'import'; | ||||
8 | 4 4 4 | 973 1182514 68 | use Geo::Coder::Free; | ||||
9 | 4 4 4 | 869 17937 74 | use Geo::Coder::List; | ||||
10 | 4 4 4 | 12 32 82 | use Params::Get 0.13; | ||||
11 | 4 4 4 | 12 30 53 | use Return::Set 0.03; | ||||
12 | 4 4 4 | 8 3 15390 | use Params::Validate::Strict; | ||||
13 | |||||||
14 | our @EXPORT_OK = qw(parse_obituary); | ||||||
15 | our $geocoder; | ||||||
16 | |||||||
17 | # TODO: use Lingua::EN::Tagger; | ||||||
18 | # TODO: add more general code, e.g. where it looks for father, also look for mother | ||||||
19 | # TODO: parse https://funeral-notices.co.uk/notice/adams/5244000 | ||||||
20 | |||||||
21 - 29 | =head1 NAME Genealogy::Obituary::Parser - Extract structured family relationships from obituary text =head1 VERSION Version 0.03 =cut | ||||||
30 | |||||||
31 | our $VERSION = '0.03'; | ||||||
32 | |||||||
33 - 88 | =head1 SYNOPSIS use Genealogy::Obituary::Parse qw(parse_obituary); my $text = 'She is survived by her husband Paul, daughters Anna and Lucy, and grandchildren Jake and Emma.'; my $data = parse_obituary($text); # $data = { # spouse => ['Paul'], # children => ['Anna', 'Lucy'], # grandchildren => ['Jake', 'Emma'], # }; =head1 DESCRIPTION This module parses freeform obituary text and extracts structured family relationship data for use in genealogical applications. It parses obituary text and extract structured family relationship data, including details about children, parents, spouse, siblings, grandchildren, and other relatives. =head1 FUNCTIONS =head2 parse_obituary($text) The routine processes the obituary content to identify and organize relevant family information into a clear, structured hash. It returns a hash reference containing structured family information, with each family member's data organized into distinct categories such as children, spouse, parents, siblings, etc. Takes a string, or a ref to a string. =head3 API SPECIFICATION =head4 INPUT { 'text' => { 'type' => 'string', # or stringref 'min' => 1, 'max' => 10000 } } =head4 OUTPUT =over 4 =item * No matches: undef =back { type => 'hashref', 'min' => 1, 'max' => 10 } =cut | ||||||
89 | |||||||
90 | sub parse_obituary | ||||||
91 | { | ||||||
92 | 12 | 1 | 405548 | my $params = Params::Validate::Strict::validate_strict({ | |||
93 | args => Params::Get::get_params('text', \@_), | ||||||
94 | schema => { | ||||||
95 | 'text' => { | ||||||
96 | 'type' => 'string', | ||||||
97 | 'min' => 1, | ||||||
98 | 'max' => 10000 | ||||||
99 | } | ||||||
100 | } | ||||||
101 | }); | ||||||
102 | 12 | 676 | my $text = $params->{'text'}; | ||||
103 | |||||||
104 | 12 | 21 | if(ref($text) eq 'SCALAR') { | ||||
105 | 0 0 | 0 0 | $text = ${$text}; | ||||
106 | } | ||||||
107 | |||||||
108 | # Quick scan to get started | ||||||
109 | sub parse_obituary_quick { | ||||||
110 | 0 | 0 | 0 | my $text = shift; | |||
111 | 0 | 0 | my %data; | ||||
112 | |||||||
113 | 0 | 0 | my @patterns = ( | ||||
114 | [ qr/\bdaughters?\s+([^.,;]+)/i, 'children' ], | ||||||
115 | [ qr/\bsons?\s+([^.,;]+)/i, 'children' ], | ||||||
116 | [ qr/\bchildren\s+([^.,;]+)/i, 'children' ], | ||||||
117 | [ qr/\bgrandchildren\s+([^.;]+)/i, 'grandchildren' ], | ||||||
118 | [ qr/\bwife\s+([^.,;]+)/i, 'spouse' ], | ||||||
119 | [ qr/\bhusband\s+([^.,;]+)/i, 'spouse' ], | ||||||
120 | [ qr/\bhis parents were\s+([^.,;]+)/i,'parents' ], | ||||||
121 | [ qr/\bhis father was\s+([^.,;]+)/i, 'parents' ], | ||||||
122 | [ qr/\bhis mother was\s+([^.,;]+)/i, 'parents' ], | ||||||
123 | [ qr/\bsister(?:s)?\s+([^.,;]+)/i, 'siblings' ], | ||||||
124 | [ qr/\bbrother(?:s)?\s+([^.,;]+)/i, 'siblings' ], | ||||||
125 | [ qr/\bsiblings\s+([^.,;]+)/i, 'siblings' ], | ||||||
126 | ); | ||||||
127 | |||||||
128 | 0 | 0 | for my $p (@patterns) { | ||||
129 | 0 | 0 | my ($re, $field) = @$p; | ||||
130 | 0 | 0 | while ($text =~ /$re/g) { | ||||
131 | 0 | 0 | my $list = $1; | ||||
132 | 0 | 0 | next unless $list; | ||||
133 | |||||||
134 | # Robust splitting on commas and "and" | ||||||
135 | 0 0 0 | 0 0 0 | my @names = grep { length } map { s/^\s+|\s+$//gr } split /\s*(?:,|(?:\band\b))\s*/i, $list; | ||||
136 | 0 0 0 | 0 0 0 | push @{$data{$field}}, map { { 'name' => $_ } } @names; | ||||
137 | } | ||||||
138 | } | ||||||
139 | |||||||
140 | 0 | 0 | return \%data; | ||||
141 | } | ||||||
142 | |||||||
143 | # my %family = %{parse_obituary_quick($text)}; | ||||||
144 | 12 | 11 | my %family; | ||||
145 | |||||||
146 | # Helper to extract people from a specific section and remove empty entries | ||||||
147 | sub extract_people_section { | ||||||
148 | 6 | 0 | 10 | my $section = shift; | |||
149 | 6 | 8 | return unless $section; | ||||
150 | |||||||
151 | 6 | 19 | $section =~ s/\s+and\s+/, /g; # Ensure "and" is treated as a separator | ||||
152 | 6 | 23 | $section =~ s/([A-Za-z]+),\s+([A-Z]{2})/$1<<COMMA>>$2/g; | ||||
153 | 6 | 18 | my @entries = split /\s*,\s*/, $section; | ||||
154 | |||||||
155 | 6 | 8 | my @people; | ||||
156 | 6 | 6 | foreach my $entry (@entries) { | ||||
157 | 16 | 17 | $entry =~ s/<<COMMA>>/, /g; | ||||
158 | |||||||
159 | 16 | 15 | my ($name, $spouse, $location) = ('', '', ''); | ||||
160 | |||||||
161 | # Match "Ian (Terry) Girvan of Surrey, BC" | ||||||
162 | 16 | 29 | if ($entry =~ /^(\w+)\s+\(([^)]+)\)\s+(\w+)\s+of\s+(.+)$/) { | ||||
163 | 1 1 1 | 1 2 2 | $name = "$1 $3"; $spouse = $2; $location = $4; | ||||
164 | } | ||||||
165 | # Match "Gwen Steeves (Leslie) of Riverview, NB" | ||||||
166 | elsif ($entry =~ /^(.+?)\s+\(([^)]+)\)\s+of\s+(.+)$/) { | ||||||
167 | 1 1 1 | 2 1 2 | $name = $1; $spouse = $2; $location = $3; | ||||
168 | } | ||||||
169 | # Match "Carol Girvan of Dartmouth, NS" | ||||||
170 | elsif ($entry =~ /^(.+?)\s+of\s+(.+)$/) { | ||||||
171 | 1 1 | 1 0 | $name = $1; $location = $2; | ||||
172 | } else { | ||||||
173 | # Match names only (e.g. for siblings) | ||||||
174 | 13 | 12 | $name = $entry; | ||||
175 | } | ||||||
176 | |||||||
177 | 16 | 12 | next if !$name; # Skip if name is empty | ||||
178 | 14 | 15 | next if($name =~ /^father-in-law\sto\s/); # Skip follow ons | ||||
179 | 13 | 13 | last if($name =~ /^devoted\s/i); | ||||
180 | 12 | 12 | last if($name =~ /^loved\s/i); | ||||
181 | |||||||
182 | # Create a hash and filter out blank fields | ||||||
183 | 12 | 19 | my %person = ( | ||||
184 | name => $name, | ||||||
185 | spouse => $spouse, | ||||||
186 | location => $location, | ||||||
187 | ); | ||||||
188 | |||||||
189 | # Remove blank fields | ||||||
190 | 12 17 36 | 14 19 50 | %person = map { $_ => $person{$_} } grep { defined $person{$_} && $person{$_} ne '' } keys %person; | ||||
191 | |||||||
192 | 12 | 17 | push @people, \%person; | ||||
193 | } | ||||||
194 | 6 | 11 | return \@people; | ||||
195 | } | ||||||
196 | |||||||
197 | sub extract_names_from_phrase { | ||||||
198 | 1 | 0 | 1 | my $phrase = shift; | |||
199 | 1 | 1 | my @names; | ||||
200 | |||||||
201 | 1 | 1 | $phrase =~ s/[.;]//g; | ||||
202 | |||||||
203 | # Case: "Christopher, Thomas, and Marsha Cloud" | ||||||
204 | 1 | 4 | if ($phrase =~ /^((?:\w+\s*,\s*)+\w+),?\s*and\s+(\w+)\s+(\w+)$/) { | ||||
205 | 0 | 0 | my ($pre, $last_first, $last) = ($1, $2, $3); | ||||
206 | 0 | 0 | my @firsts = split(/\s*,\s*/, $pre); | ||||
207 | 0 | 0 | push @firsts, $last_first; | ||||
208 | 0 0 | 0 0 | push @names, map { "$_ $last" } @firsts; | ||||
209 | 0 | 0 | return @names; | ||||
210 | } | ||||||
211 | |||||||
212 | # Case: "Christopher and Thomas Cloud" | ||||||
213 | 1 | 4 | if ($phrase =~ /^([\w\s]+?)\s+and\s+(\w+)\s+(\w+)$/) { | ||||
214 | 0 | 0 | my ($first_part, $second_first, $last) = ($1, $2, $3); | ||||
215 | 0 | 0 | my @firsts = split(/\s*,\s*|\s+and\s+/, $first_part); | ||||
216 | 0 0 | 0 0 | push @names, map { "$_ $last" } (@firsts, $second_first); | ||||
217 | 0 | 0 | return @names; | ||||
218 | } | ||||||
219 | |||||||
220 | # Fallback: Split by comma or 'and' | ||||||
221 | 1 | 2 | $phrase =~ s/, and grandchildren.+//; # Handle "Anna and Lucy, and grandchildren Jake and Emma" | ||||
222 | 1 | 4 | my @parts = split /\s*(?:,|and)\s*/, $phrase; | ||||
223 | 1 2 | 1 5 | push @names, grep { defined($_) && $_ ne '' } @parts; | ||||
224 | 1 | 2 | return @names; | ||||
225 | } | ||||||
226 | |||||||
227 | # Correct extraction of children (skipping "his/her") | ||||||
228 | 12 | 207 | if ($text =~ /survived by (his|her) children\s*([^\.;]+)/i) { | ||||
229 | 1 | 2 | my $children_text = $2; | ||||
230 | 1 | 2 | $family{children} = extract_people_section($children_text); | ||||
231 | } elsif ($text =~ /Loving mum to\s*([^\.;]+)/i) { # Look for the phrase "Loving mum to" | ||||||
232 | 1 | 1 | my $children_text = $1; | ||||
233 | 1 | 2 | $family{children} = extract_people_section($children_text); | ||||
234 | } elsif ($text =~ /Loving father of\s*([^\.;]+)/i) { # Look for the phrase "Loving father of" | ||||||
235 | 1 | 1 | my $children_text = $1; | ||||
236 | 1 | 3 | $family{children} = extract_people_section($children_text); | ||||
237 | } elsif($text =~ /mother of\s*([^\.;]+)?,/i) { # Look for the phrase "mother of" | ||||||
238 | 1 | 2 | my $children_text = $1; | ||||
239 | 1 | 2 | $children_text =~ s/, grandmother.+//; | ||||
240 | 1 | 2 | $family{children} = extract_people_section($children_text); | ||||
241 | } elsif($text =~ /sons,?\s*([a-z]+)\s+and\s+([a-z]+)/i) { | ||||||
242 | 3 | 3 | my @children; | ||||
243 | my @grandchildren; | ||||||
244 | |||||||
245 | 3 | 11 | push @children, { name => $1, sex => 'M' }, { name => $2, sex => 'M' }; | ||||
246 | 3 | 30 | if($text =~ /\bdaughter,?\s([a-z]+)/i) { | ||||
247 | 1 | 1 | push @children, { 'name' => $1, 'sex' => 'F' } | ||||
248 | } | ||||||
249 | 3 | 27 | if($text =~ /\bgranddaughter,?\s([a-z]+)/i) { | ||||
250 | 1 | 2 | push @grandchildren, { 'name' => $1, 'sex' => 'F' }; | ||||
251 | } | ||||||
252 | 3 | 5 | $family{children} = \@children if @children; | ||||
253 | 3 | 5 | $family{grandchildren} = \@grandchildren if @grandchildren; | ||||
254 | } else { | ||||||
255 | 5 | 4 | my @children; | ||||
256 | |||||||
257 | # my $tagger = Lingua::EN::Tagger->new(longest_noun_phrase => 0); | ||||||
258 | # my $tagged = $tagger->add_tags($text); | ||||||
259 | |||||||
260 | 5 | 32 | if($text =~ /\ssons,\s*(.*?);/s) { | ||||
261 | 2 | 3 | my $sons_text = $1; | ||||
262 | 2 | 6 | if($sons_text =~ /, all of (.+)$/) { | ||||
263 | 1 | 1 | my $location = $1; | ||||
264 | 1 | 3 | while($sons_text =~ /([\w. ]+?),\s/g) { | ||||
265 | 3 | 3 | my $son = $1; | ||||
266 | 3 | 4 | if($son =~ /(\w+)\s+and\s+(\w+)/) { | ||||
267 | 1 | 4 | push @children, { | ||||
268 | name => $1, | ||||||
269 | location => $location, | ||||||
270 | sex => 'M', | ||||||
271 | }, { | ||||||
272 | name => $2, | ||||||
273 | location => $location, | ||||||
274 | sex => 'M', | ||||||
275 | }; | ||||||
276 | 1 | 1 | last; | ||||
277 | } else { | ||||||
278 | 2 | 6 | push @children, { | ||||
279 | name => $son, | ||||||
280 | location => $location, | ||||||
281 | sex => 'M', | ||||||
282 | }; | ||||||
283 | } | ||||||
284 | } | ||||||
285 | } else { | ||||||
286 | 1 | 5 | while($sons_text =~ /([\w. ]+?),\s*([\w. ]+?)(?:\s+and|\z)/g) { | ||||
287 | 2 | 6 | push @children, { | ||||
288 | name => $1, | ||||||
289 | location => $2, | ||||||
290 | sex => 'M', | ||||||
291 | }; | ||||||
292 | } | ||||||
293 | } | ||||||
294 | } | ||||||
295 | 5 | 26 | if($text =~ /\sdaughters?,\s*Mrs\.\s+(.+?)\s+(\w+),\s+([^;]+)\sand/) { | ||||
296 | 1 | 4 | push @children, { | ||||
297 | name => $1, | ||||||
298 | location => $3, | ||||||
299 | sex => 'F', | ||||||
300 | spouse => { 'name' => $2, sex => 'M' } | ||||||
301 | }; | ||||||
302 | } elsif($text =~ /one daughter,\s*(.+?),\s*(.+?);/) { | ||||||
303 | 1 | 2 | my $name = $1; | ||||
304 | 1 | 1 | my $location = $2; | ||||
305 | 1 | 2 | if($name =~ /(\w+)\s+(\w+)/) { | ||||
306 | 1 | 3 | push @children, { | ||||
307 | name => $1, | ||||||
308 | location => $location, | ||||||
309 | sex => 'F', | ||||||
310 | spouse => { name => $2, sex => 'M' } | ||||||
311 | }; | ||||||
312 | } else { | ||||||
313 | 0 | 0 | push @children, { | ||||
314 | name => $1, | ||||||
315 | location => $location, | ||||||
316 | sex => 'F', | ||||||
317 | }; | ||||||
318 | } | ||||||
319 | } | ||||||
320 | 5 | 8 | $family{children} = \@children if @children; | ||||
321 | |||||||
322 | 5 | 8 | if(!$family{'children'}) { | ||||
323 | 3 | 23 | while($text =~ /\b(son|daughter)s?,\s*([A-Z][a-z]+(?:\s+\([A-Z][a-z]+\))?)\s*(?:and their children ([^.;]+))?/g) { | ||||
324 | 2 | 3 | my $sex = $1 eq 'son' ? 'M' : 'F'; | ||||
325 | 2 | 2 | my $child = $2; | ||||
326 | 2 | 3 | my $grandkids = $3; | ||||
327 | 2 | 8 | if(my @grandchildren = $grandkids ? split /\s*,\s*|\s+and\s+/, $grandkids : ()) { | ||||
328 | 0 | 0 | push @children, { | ||||
329 | name => $child, | ||||||
330 | sex => $sex, | ||||||
331 | grandchildren => \@grandchildren, | ||||||
332 | }; | ||||||
333 | } elsif(($sex eq 'F') && ($child =~ /(.+)\s+\((.+)\)/)) { | ||||||
334 | 1 | 7 | push @children, { name => $1, sex => 'F', spouse => { name => $2, sex => 'M' } } | ||||
335 | } elsif($child ne 'Mrs') { | ||||||
336 | 1 | 14 | push @children, { name => $child, sex => $sex } | ||||
337 | } | ||||||
338 | } | ||||||
339 | } | ||||||
340 | 5 | 7 | $family{children} = \@children if @children; | ||||
341 | } | ||||||
342 | |||||||
343 | 12 | 19 | if(!$family{'children'}) { | ||||
344 | 2 | 5 | if($text =~ /\ssons?[,\s]\s*(.+?)[;\.]/) { | ||||
345 | 0 | 0 | my $raw = $1; | ||||
346 | 0 | 0 | $raw =~ s/\sand their .+//; | ||||
347 | 0 | 0 | my @children = extract_names_from_phrase($raw); | ||||
348 | 0 0 0 | 0 0 0 | push @{$family{children}}, map { { name => $_, sex => 'M' } } @children; | ||||
349 | } | ||||||
350 | 2 | 7 | if($text =~ /\sdaughters?[,\s]\s*(.+?)[;\.]/) { | ||||
351 | 1 | 1 | my $raw = $1; | ||||
352 | 1 | 2 | $raw =~ s/\sand their .+//; | ||||
353 | 1 | 3 | my @children = extract_names_from_phrase($raw); | ||||
354 | 1 1 2 | 2 2 3 | push @{$family{children}}, map { { name => $_, sex => 'F' } } @children; | ||||
355 | } | ||||||
356 | } | ||||||
357 | |||||||
358 | # Extract grandchildren | ||||||
359 | 12 | 17 | if(!$family{'grandchildren'}) { | ||||
360 | 11 | 26 | if($text =~ /grandchildren\s+([^\.;]+)/i) { | ||||
361 | 3 | 16 | my @grandchildren = split /\s*(?:,|and)\s*/i, $1; | ||||
362 | 3 | 4 | if(scalar(@grandchildren)) { | ||||
363 | 3 7 8 | 3 11 13 | $family{'grandchildren'} = [ map { { 'name' => $_ } } grep { defined $_ && $_ ne '' } @grandchildren ]; | ||||
364 | } | ||||||
365 | } | ||||||
366 | } | ||||||
367 | 12 4 | 19 9 | if($family{'grandchildren'} && scalar @{$family{grandchildren}}) { | ||||
368 | 4 | 25 | while((exists $family{'grandchildren'}->[0]) && (length($family{'grandchildren'}->[0]) == 0)) { | ||||
369 | 0 0 | 0 0 | shift @{$family{'grandchildren'}}; | ||||
370 | } | ||||||
371 | 4 | 11 | if($family{'grandchildren'}->[0] =~ /brothers/) { | ||||
372 | 0 | 0 | if(!exists $family{'brothers'}) { | ||||
373 | 0 0 | 0 0 | shift @{$family{'grandchildren'}}; | ||||
374 | 0 0 | 0 0 | $family{'brothers'} = extract_people_section(join(', ', @{$family{'grandchildren'}})); | ||||
375 | } | ||||||
376 | 0 | 0 | delete $family{grandchildren}; | ||||
377 | } | ||||||
378 | } else { | ||||||
379 | 8 | 8 | delete $family{grandchildren}; | ||||
380 | } | ||||||
381 | 12 4 | 21 6 | if((!defined($family{'grandchildren'})) || (($#{$family{'grandchildren'}}) <= 0)) { | ||||
382 | # handle devoted Grandma to Tom, Dick, and Harry and loved Mother-in-law to Jack and Jill" | ||||||
383 | 9 | 20 | my ($grandchildren_str) = $text =~ /Grandma to (.*?)(?: and loved|$)/; | ||||
384 | # Normalize and split into individual names | ||||||
385 | 9 | 8 | my @grandchildren; | ||||
386 | 9 | 10 | if($grandchildren_str) { | ||||
387 | 1 | 5 | @grandchildren = split /,\s*|\s+and\s+/, $grandchildren_str; | ||||
388 | } | ||||||
389 | 9 | 18 | if(scalar(@grandchildren)) { | ||||
390 | 1 | 1 | $family{'grandchildren'} = \@grandchildren; | ||||
391 | } elsif($text =~ /grandm\w+\s/) { | ||||||
392 | 1 | 2 | my $t = $text; | ||||
393 | 1 | 5 | $t =~ s/.+(grandm\w+\s+.+?\sand\s[\w\.;,]+).+/$1/; | ||||
394 | 1 | 6 | $family{grandchildren} = [ split /\s*(?:,|and)\s*/i, ($t =~ /grandm\w+\sto\s+([^\.;]+)/i)[0] || '' ]; | ||||
395 | } | ||||||
396 | } | ||||||
397 | |||||||
398 | # Extract siblings (sisters and brothers) correctly, skipping "her" or "his" | ||||||
399 | 12 | 52 | if($text =~ /predeceased by (his|her) sisters?\s*([^;\.]+);?/i) { | ||||
400 | 2 | 2 | my $sisters_text = $2; | ||||
401 | 2 | 4 | $sisters_text =~ s/^,\s+//; | ||||
402 | 2 | 5 | $family{sisters} = extract_people_section($sisters_text); | ||||
403 | } else { | ||||||
404 | 10 | 30 | while($text =~ /\bsister[,\s]\s*([A-Z][a-z]+(?:\s+[A-Z][a-z.]+)*)(?:,\s*([A-Z][a-z]+))?/g) { | ||||
405 | 2 | 2 | my $name = $1; | ||||
406 | 2 | 8 | $family{'sisters'} ||= []; | ||||
407 | 2 | 2 | if($name eq 'Mrs') { | ||||
408 | 1 | 4 | if($text =~ / sister,\s*Mrs\.\s+([A-Z][a-zA-Z]+\s+[A-Z][a-zA-Z]+)/) { | ||||
409 | 1 | 1 | $name = $1; | ||||
410 | } else { | ||||||
411 | 0 | 0 | undef $name; | ||||
412 | } | ||||||
413 | } | ||||||
414 | 2 | 18 | if($name) { | ||||
415 | 2 2 | 1 32 | push @{$family{sisters}}, { | ||||
416 | name => $name, | ||||||
417 | status => ($text =~ /\bpredeceased by.*?$name/i) ? 'deceased' : 'living', | ||||||
418 | }; | ||||||
419 | } | ||||||
420 | } | ||||||
421 | |||||||
422 | 10 | 13 | if(!exists($family{'sisters'})) { | ||||
423 | 8 | 18 | if($text =~ /\stwo\ssisters,\s*(.*?)\sand\s(.*?)[;:]/s) { | ||||
424 | 1 | 2 | my($first, $second) = ($1, $2); | ||||
425 | 1 | 2 | foreach my $sister($first, $second) { | ||||
426 | 2 | 4 | if($sister =~ /Mrs\.\s(.+?),\s(.+)/) { | ||||
427 | 2 | 3 | my $name = $1; | ||||
428 | 2 | 10 | my $location = $2; | ||||
429 | 2 | 3 | $location =~ s/,$//; | ||||
430 | 2 | 4 | if($name =~ /(\w+)\s+(\w+)/) { | ||||
431 | 2 2 | 2 5 | push @{$family{sisters}}, { | ||||
432 | name => $1, | ||||||
433 | location => $location, | ||||||
434 | sex => 'F', | ||||||
435 | spouse => { 'name' => $2, 'sex' => 'M' } | ||||||
436 | }; | ||||||
437 | } else { | ||||||
438 | 0 0 | 0 0 | push @{$family{sisters}}, { | ||||
439 | name => $name, | ||||||
440 | location => $location, | ||||||
441 | sex => 'F', | ||||||
442 | }; | ||||||
443 | } | ||||||
444 | } else { | ||||||
445 | 0 0 | 0 0 | push @{$family{sisters}}, { | ||||
446 | name => $sister, | ||||||
447 | sex => 'F', | ||||||
448 | }; | ||||||
449 | } | ||||||
450 | } | ||||||
451 | } | ||||||
452 | } | ||||||
453 | } | ||||||
454 | |||||||
455 | 12 | 55 | if($text =~ /predeceased by (his|her) brothers?\s*([^;\.]+);?/i) { | ||||
456 | 0 | 0 | my $brothers_text = $2; | ||||
457 | 0 | 0 | $brothers_text =~ s/^,\s+//; | ||||
458 | 0 | 0 | $family{brothers} = extract_people_section($brothers_text); | ||||
459 | # TODO: mark all statuses to deceased | ||||||
460 | } else { | ||||||
461 | 12 | 11 | my @siblings; | ||||
462 | |||||||
463 | 12 | 24 | while ($text =~ /\bbrother,\s*([A-Z][a-z]+(?:\s+[A-Z][a-z.]+)*)(?:,\s*([A-Z][a-z]+))?/g) { | ||||
464 | 1 | 3 | $family{'brothers'} ||= []; | ||||
465 | 1 1 | 1 30 | push @{$family{brothers}}, { | ||||
466 | name => $1, | ||||||
467 | status => ($text =~ /\bpredeceased by.*?$1/i) ? 'deceased' : 'living', | ||||||
468 | }; | ||||||
469 | } | ||||||
470 | 12 | 36 | if((!$family{'brothers'}) && (!$family{'sisters'}) && (!$family{'siblings'})) { | ||||
471 | 7 | 25 | if($text =~ /sister of ([a-z]+) and ([a-z]+)/i) { | ||||
472 | 1 1 | 1 2 | push @{$family{'siblings'}}, | ||||
473 | { 'name' => $1 }, | ||||||
474 | { 'name' => $2 } | ||||||
475 | } | ||||||
476 | } | ||||||
477 | |||||||
478 | 12 | 14 | if(!exists($family{'brothers'})) { | ||||
479 | 11 | 25 | if($text =~ /\sbrothers,\s*(.*?)[;\.]/s) { | ||||
480 | 1 | 1 | my $brothers_text = $1; | ||||
481 | 1 | 2 | if($brothers_text =~ /, all of (.+)$/) { | ||||
482 | 1 | 1 | my $location = $1; | ||||
483 | 1 | 3 | while($brothers_text =~ /([\w. ]+?),\s/g) { | ||||
484 | 2 | 2 | my $son = $1; | ||||
485 | 2 | 3 | if($son =~ /(\w+)\s+and\s+(\w+)/) { | ||||
486 | 1 1 | 2 3 | push @{$family{brothers}}, { | ||||
487 | name => $1, | ||||||
488 | location => $location, | ||||||
489 | sex => 'M', | ||||||
490 | }, { | ||||||
491 | name => $2, | ||||||
492 | location => $location, | ||||||
493 | sex => 'M', | ||||||
494 | }; | ||||||
495 | 1 | 1 | last; | ||||
496 | } else { | ||||||
497 | 1 1 | 1 3 | push @{$family{brothers}}, { | ||||
498 | name => $son, | ||||||
499 | location => $location, | ||||||
500 | sex => 'M', | ||||||
501 | }; | ||||||
502 | } | ||||||
503 | } | ||||||
504 | } else { | ||||||
505 | 0 | 0 | while($brothers_text =~ /([\w. ]+?),\s*([\w. ]+?)(?:\s+and|\z)/g) { | ||||
506 | 0 0 | 0 0 | push @{$family{brothers}}, { | ||||
507 | name => $1, | ||||||
508 | location => $2, | ||||||
509 | sex => 'M', | ||||||
510 | }; | ||||||
511 | } | ||||||
512 | } | ||||||
513 | } | ||||||
514 | } | ||||||
515 | } | ||||||
516 | |||||||
517 | # Detect nieces/nephews | ||||||
518 | 12 | 66 | $family{nieces_nephews} = ($text =~ /as well as several nieces and nephews/i) ? ['several nieces and nephews'] : []; | ||||
519 | |||||||
520 | # Extract parents and clean the names by removing unnecessary details | ||||||
521 | 12 | 336 | if($text =~ /(son|daughter) of the late\s+(.+?)\s+and\s+(.+?)\./i) { | ||||
522 | 2 | 3 | my $father = $2; | ||||
523 | 2 | 4 | my $mother = $3; | ||||
524 | |||||||
525 | # Remove anything after the first comma in each parent's name | ||||||
526 | 2 | 2 | $father =~ s/,.*//; | ||||
527 | 2 | 3 | $mother =~ s/,.*//; | ||||
528 | |||||||
529 | 2 | 7 | if($mother =~ /(.+)\s+\((.+)\)\s+(.+)/) { | ||||
530 | 2 | 4 | $mother = "$1 $2"; | ||||
531 | } | ||||||
532 | $family{parents} = { | ||||||
533 | 2 | 7 | father => { name => $father }, | ||||
534 | mother => { name => $mother }, | ||||||
535 | }; | ||||||
536 | } elsif($text =~ /parents were (\w+) and (\w+)/i) { | ||||||
537 | $family{parents} = { | ||||||
538 | 1 | 4 | father => { name => $1 }, | ||||
539 | mother => { name => $2 }, | ||||||
540 | }; | ||||||
541 | } | ||||||
542 | |||||||
543 | # Extract spouse's death year and remove the "(year)" from the name | ||||||
544 | 12 | 329 | if($text =~ /(wife|husband) of the late\s+([\w\s]+)\s+\((\d{4})\)/) { | ||||
545 | 1 | 2 | my $name = $2; | ||||
546 | 1 | 1 | my $death_year = $3; | ||||
547 | |||||||
548 | 1 | 3 | $family{'spouse'} ||= []; | ||||
549 | |||||||
550 | # Remove the death year part from the spouse's name | ||||||
551 | 1 | 1 | $name =~ s/\s*\(\d{4}\)//; | ||||
552 | |||||||
553 | 1 1 | 1 2 | push @{$family{'spouse'}}, { | ||||
554 | name => $name, | ||||||
555 | death_year => $death_year | ||||||
556 | } | ||||||
557 | } elsif($text =~ /\bmarried ([^,]+),.*?\b(?:on\s+)?([A-Z][a-z]+ \d{1,2}, \d{4})(?:.*?\b(?:at|in)\s+([^.,]+))?/i) { | ||||||
558 | 1 | 7 | $family{'spouse'} ||= []; | ||||
559 | |||||||
560 | 1 1 | 1 4 | push @{$family{'spouse'}}, { | ||||
561 | name => $1, | ||||||
562 | married => { | ||||||
563 | date => $2, | ||||||
564 | place => $3 // '', | ||||||
565 | } | ||||||
566 | }; | ||||||
567 | } elsif($text =~ /husband (?:to|of) the late\s([\w\s]+)[\s\.]/i) { | ||||||
568 | 1 | 3 | $family{'spouse'} ||= []; | ||||
569 | |||||||
570 | 1 1 | 1 3 | push @{$family{'spouse'}}, { name => $1, status => 'deceased' } | ||||
571 | } elsif($text =~ /\b(?:wife|husband) of ([^.,;]+)/i) { | ||||||
572 | 2 | 6 | $family{'spouse'} ||= []; | ||||
573 | |||||||
574 | 2 2 | 1 5 | push @{$family{'spouse'}}, { name => $1 } | ||||
575 | } elsif($text =~ /\bsurvived by her husband ([^.,;]+)/i) { | ||||||
576 | 1 1 | 1 5 | push @{$family{'spouse'}}, { name => $1, 'status' => 'living', 'sex' => 'M' } | ||||
577 | } elsif($text =~ /\bsurvived by his wife[,\s]+([^.,;]+)/i) { | ||||||
578 | 2 2 | 2 7 | push @{$family{'spouse'}}, { name => $1, 'status' => 'living', 'sex' => 'F' } | ||||
579 | } | ||||||
580 | |||||||
581 | # Ensure spouse location is properly handled | ||||||
582 | 12 | 29 | if(exists $family{spouse} && (ref $family{'spouse'} eq 'HASH') && defined $family{spouse}[0]{location} && $family{spouse}[0]{location} eq 'the late') { | ||||
583 | 0 | 0 | delete $family{spouse}[0]{location}; | ||||
584 | } | ||||||
585 | |||||||
586 | # Extract the funeral information | ||||||
587 | 12 | 224 | if($text =~ /funeral service.*?at\s+(.+?),?\s+on\s+(.*?),?\s+at\s+(.+?)\./) { | ||||
588 | $family{funeral} = { | ||||||
589 | 1 | 3 | location => $1, | ||||
590 | date => $2, | ||||||
591 | time => $3, | ||||||
592 | }; | ||||||
593 | } elsif($text =~ /funeral service.*?at\s+([^\n]+?)\s+on\s+([^\n]+)\s+at\s+([^\n]+)/i) { | ||||||
594 | $family{funeral} = { | ||||||
595 | 1 | 4 | location => $1, | ||||
596 | date => $2, | ||||||
597 | time => $3, | ||||||
598 | }; | ||||||
599 | 1 | 3 | if($family{'funeral'}->{'date'} =~ /(.+?)\.\s{2,}/) { | ||||
600 | 1 | 2 | $family{'funeral'}->{'date'} = $1; | ||||
601 | 1 | 3 | if($family{'funeral'}->{'date'} =~ /(.+?)\sat\s(.+)/) { | ||||
602 | # Wednesday 9th March at 1.15pm. Friends etc. etc. | ||||||
603 | 1 | 2 | $family{'funeral'}->{'date'} = $1; | ||||
604 | 1 | 1 | $family{'funeral'}->{'time'} = $2; | ||||
605 | } | ||||||
606 | } | ||||||
607 | } elsif($text =~ /funeral services.+\sat\s(.+)\sat\s(.+),\swith\s/i) { | ||||||
608 | $family{funeral} = { | ||||||
609 | 1 | 3 | time => $1, | ||||
610 | location => $2 | ||||||
611 | }; | ||||||
612 | } elsif($text =~ /funeral services.+\sat\s(.+),\swith\s/i) { | ||||||
613 | 0 | 0 | $family{funeral} = { location => $1 } | ||||
614 | } elsif($text =~ /services.+\sat\s(.+),\swith\s/i) { | ||||||
615 | 3 | 5 | $family{funeral} = { location => $1 } | ||||
616 | } | ||||||
617 | |||||||
618 | # Extract father-in-law and mother-in-law information (if present) | ||||||
619 | 12 | 30 | if($text =~ /father-in-law to\s+([A-Za-z\s]+)/) { | ||||
620 | 1 | 2 | my $father_in_law = $1; | ||||
621 | 1 | 2 | $family{children_in_law} = [{ name => $father_in_law }]; | ||||
622 | } elsif($text =~ /mother-in-law to\s+([A-Za-z\s]+)/i) { | ||||||
623 | 1 | 1 | my $mother_in_law = $1; | ||||
624 | 1 | 7 | $family{children_in_law} = [ split /\s*(?:,|and)\s*/i, ($text =~ /mother-in-law to\s+([^\.;]+)/i)[0] || '' ]; | ||||
625 | 1 | 2 | if(scalar($family{children_in_law} == 0)) { | ||||
626 | 0 | 0 | $family{children_in_law} = [{ name => $mother_in_law }]; | ||||
627 | } | ||||||
628 | } | ||||||
629 | |||||||
630 | # Extract aunt information | ||||||
631 | 12 | 19 | if($text =~ /niece of\s+([A-Za-z]+)/) { | ||||
632 | 1 | 1 | my $aunt = $1; | ||||
633 | 1 | 2 | $family{aunt} = [{ 'name' => $aunt }]; | ||||
634 | } | ||||||
635 | |||||||
636 | # Birth info | ||||||
637 | 12 | 1008 | if($text =~ /[^\b]Born in ([^,]+),.*?\b(?:on\s+)?([A-Z][a-z]+ \d{1,2}, \d{4})/i) { | ||||
638 | $family{birth} = { | ||||||
639 | 1 | 2 | place => $1, | ||||
640 | date => $2, | ||||||
641 | } | ||||||
642 | } elsif($text =~ /[^\b]Born in ([a-z,\.\s]+)\s+on\s+(.+)/i) { | ||||||
643 | 1 | 2 | $family{'birth'}->{'place'} = $1; | ||||
644 | 1 | 2 | if(my $location = _extract_location($1)) { | ||||
645 | 0 | 0 | $family{'birth'}->{'location'} = $location; | ||||
646 | } | ||||||
647 | 1 | 11 | if(my $dt = _extract_date($2)) { | ||||
648 | 1 | 11 | $family{'birth'}->{date} = $dt->ymd('/'); | ||||
649 | } | ||||||
650 | 1 | 16 | $family{'birth'}->{'place'} =~ s/\s+$//; | ||||
651 | } elsif($text =~ /S?he was born (.+)\sin ([a-z,\.\s]+)\s+to\s+(.+?)\sand\s(.+?)\./i) { | ||||||
652 | 1 | 1 | $family{'birth'}->{'place'} = $2; | ||||
653 | 1 | 1 | my $father = $3; | ||||
654 | 1 | 2 | my $mother = $4; | ||||
655 | 1 | 1 | eval { | ||||
656 | 1 | 2 | if(my $dt = DateTime::Format::Text->parse_datetime($1)) { | ||||
657 | 1 | 196 | $family{'birth'}->{date} = $dt->ymd('/'); | ||||
658 | } | ||||||
659 | }; | ||||||
660 | # TODO | ||||||
661 | # if($verbose && $@) { | ||||||
662 | # Carp::carp($@); | ||||||
663 | # } | ||||||
664 | 1 | 13 | if($mother =~ /(.+)\s+\((.+)\)\s+(.+)/) { | ||||
665 | 1 | 2 | $mother = "$1 $2"; | ||||
666 | } | ||||||
667 | 1 | 2 | if($father =~ /(.+?)\.\s\s/) { | ||||
668 | 0 | 0 | $father = $1; | ||||
669 | } | ||||||
670 | $family{parents} = { | ||||||
671 | 1 | 3 | father => { name => $father }, | ||||
672 | mother => { name => $mother } | ||||||
673 | }; | ||||||
674 | 1 | 5 | if($text =~ /survived by (his|her) (father|mother)[\s,;]/i) { | ||||
675 | 1 | 2 | $family{parents}->{$2}->{'status'} = 'living'; | ||||
676 | } | ||||||
677 | } elsif($text =~ /[^\b]S?he was born\s*(?:on\s+)?([A-Z][a-z]+ \d{1,2}, \d{4})[,\s]+(?:in\s+)([^,]+)?/i) { | ||||||
678 | 1 | 3 | if(my $dt = _extract_date($1)) { | ||||
679 | 1 | 8 | $family{'birth'}->{date} = $dt->ymd('/'); | ||||
680 | } | ||||||
681 | 1 | 10 | if($2) { | ||||
682 | 1 | 3 | $family{'birth'}->{'location'} = $2; | ||||
683 | } | ||||||
684 | } | ||||||
685 | |||||||
686 | # Date of death | ||||||
687 | 12 | 133 | if($text =~ /\bpassed away\b.*?\b(?:on\s+)?([A-Z]+ \d{1,2}, \d{4})/i) { | ||||
688 | 2 | 3 | $family{death}->{date} = $1; | ||||
689 | 2 | 3 | $family{death}->{datetime} = _extract_date($1); | ||||
690 | } | ||||||
691 | |||||||
692 | # Age at death | ||||||
693 | 12 | 49 | if($text =~ /,\s(\d{1,3}), of\s/) { | ||||
694 | 2 | 5 | if($1 < 110) { | ||||
695 | 2 | 4 | $family{'death'}->{'age'} = $1; | ||||
696 | } | ||||||
697 | } | ||||||
698 | |||||||
699 | # Place of death | ||||||
700 | 12 | 144 | if($text =~ /\b(?:passed away|died)\b([a-z0-9\s,]+)\sat\s+(.+?)\./i) { | ||||
701 | 3 | 4 | my $place = $2; | ||||
702 | 3 | 272 | if($place =~ /(.+)\s+on\s+([A-Z]+ \d{1,2}, \d{4})/i) { | ||||
703 | 0 | 0 | $place = $1; | ||||
704 | 0 | 0 | $family{death}->{date} = $2; | ||||
705 | } elsif($place =~ /(.+)\son\s(.+)/) { | ||||||
706 | 1 | 2 | $place = $1; | ||||
707 | 1 | 16 | if(my $dt = _extract_date($2)) { | ||||
708 | 1 | 17 | $family{death}->{date} = $dt->ymd('/'); | ||||
709 | } | ||||||
710 | } | ||||||
711 | 3 | 16 | $place =~ s/^\bthe residence,\s//; | ||||
712 | 3 | 6 | $place =~ s/\bafter a.*$//; | ||||
713 | 3 | 3 | $place =~ s/,\s+$//; | ||||
714 | 3 | 5 | $family{death}->{place} = $place; | ||||
715 | } | ||||||
716 | |||||||
717 | # Remove blank fields from the main family hash | ||||||
718 | 12 67 67 | 21 65 93 | %family = map { $_ => $family{$_} } grep { defined $family{$_} && $family{$_} ne '' } keys %family; | ||||
719 | |||||||
720 | # Remove empty arrays the family hash | ||||||
721 | 12 | 22 | foreach my $key (keys %family) { | ||||
722 | 67 | 64 | if(ref($family{$key}) eq 'ARRAY') { | ||||
723 | 48 72 48 | 27 81 37 | $family{$key} = [ grep { /\S/ } @{$family{$key}} ]; | ||||
724 | 48 48 | 36 46 | if(@{$family{$key}} == 0) { | ||||
725 | 11 | 13 | delete $family{$key}; | ||||
726 | } | ||||||
727 | } | ||||||
728 | } | ||||||
729 | |||||||
730 | 12 | 21 | return if(!scalar keys(%family)); | ||||
731 | |||||||
732 | 12 | 39 | return Return::Set::set_return(\%family, { type => 'hashref', 'min' => 1, 'max' => 10 }); | ||||
733 | } | ||||||
734 | |||||||
735 | sub _extract_date | ||||||
736 | { | ||||||
737 | 5 | 7 | my $text = shift; | ||||
738 | 5 | 16 | my $parser = DateTime::Format::Text->new(); | ||||
739 | 5 | 19 | my $dt; | ||||
740 | |||||||
741 | 5 5 | 2 9 | eval { $dt = $parser->parse_datetime($text); }; | ||||
742 | 5 | 1323 | return $dt if $dt && !$@; | ||||
743 | 0 | 0 | return undef; | ||||
744 | } | ||||||
745 | |||||||
746 | sub _extract_location { | ||||||
747 | 1 | 2 | my $place_text = shift; | ||||
748 | |||||||
749 | 1 | 6 | $geocoder ||= Geo::Coder::List->new()->push(Geo::Coder::Free->new()); | ||||
750 | 1 | 47940 | my @locations = $geocoder->geocode(location => $place_text); # Use array to improve caching | ||||
751 | |||||||
752 | 1 | 185 | return unless scalar(@locations); | ||||
753 | |||||||
754 | 0 | my $result = $locations[0]; | |||||
755 | |||||||
756 | 0 | if(ref($result)) { | |||||
757 | return { | ||||||
758 | 0 | raw => $place_text, | |||||
759 | # city => $result->{components}{city} || $result->{components}{town}, | ||||||
760 | # region => $result->{components}{state}, | ||||||
761 | # country => $result->{components}{country}, | ||||||
762 | latitude => $result->latitude(), | ||||||
763 | longitude => $result->longitude() | ||||||
764 | }; | ||||||
765 | } | ||||||
766 | return { | ||||||
767 | raw => $place_text, | ||||||
768 | # city => $result->{components}{city} || $result->{components}{town}, | ||||||
769 | # region => $result->{components}{state}, | ||||||
770 | # country => $result->{components}{country}, | ||||||
771 | latitude => $result->{'latitude'}, | ||||||
772 | 0 | longitude => $result->{'longitude'} | |||||
773 | }; | ||||||
774 | } | ||||||
775 | |||||||
776 | |||||||
777 - 793 | =head1 AUTHOR Nigel Horne, C<< <njh at nigelhorne.com> >> =head1 SEE ALSO Test coverage report: L<https://nigelhorne.github.io/Genealogy-Obituary-Parser/coverage/> =head1 SUPPORT This module is provided as-is without any warranty. =head1 LICENSE This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut | ||||||
794 | |||||||
795 | 1; |