| File: | blib/lib/Genealogy/Occupation.pm |
| Coverage: | 98.1% |
| line | stmt | bran | cond | sub | time | code |
|---|---|---|---|---|---|---|
| 1 | package Genealogy::Occupation; | |||||
| 2 | ||||||
| 3 | # TODO: railway/railroad = le chemin de fer | |||||
| 4 | ||||||
| 5 | 6 6 6 | 814860 5 78 | use strict; | |||
| 6 | 6 6 6 | 10 4 115 | use warnings; | |||
| 7 | 6 6 | 39 9 | use 5.014; | |||
| 8 | ||||||
| 9 | 6 6 6 | 8 6 166 | use Carp qw(croak); | |||
| 10 | 6 6 6 | 1133 12324 77 | use I18N::LangTags::Detect; | |||
| 11 | 6 6 6 | 1084 87319 390 | use Lingua::EN::ABC; | |||
| 12 | 6 6 6 | 1022 25900 114 | use Params::Get; | |||
| 13 | 6 6 6 | 1212 9967 152 | use Readonly; | |||
| 14 | 6 6 6 | 1692 70623 164 | use Params::Validate::Strict qw(validate_strict); | |||
| 15 | 6 6 6 | 945 1534 6226 | use Return::Set qw(set_return); | |||
| 16 | ||||||
| 17 | our $VERSION = '0.02'; | |||||
| 18 | ||||||
| 19 | # Schema for new() arguments | |||||
| 20 | Readonly my $NEW_SCHEMA => { | |||||
| 21 | warn_on_error => { | |||||
| 22 | type => 'boolean', | |||||
| 23 | optional => 1, | |||||
| 24 | default => 0, | |||||
| 25 | }, | |||||
| 26 | }; | |||||
| 27 | ||||||
| 28 | # Schema for normalise() arguments. | |||||
| 29 | # Note: occupation is extracted and normalised to an arrayref BEFORE | |||||
| 30 | # validate_strict is called, because Params::Validate::Strict does not | |||||
| 31 | # yet support union types. Only the remaining named arguments (sex) | |||||
| 32 | # are validated here. | |||||
| 33 | Readonly my $NORMALISE_SCHEMA => { | |||||
| 34 | sex => { | |||||
| 35 | type => 'string', | |||||
| 36 | optional => 1, | |||||
| 37 | memberof => ['M', 'F'], | |||||
| 38 | }, | |||||
| 39 | }; | |||||
| 40 | ||||||
| 41 | # Occupations to filter out entirely - these are not real occupations | |||||
| 42 | # but rather descriptions of status or domestic roles | |||||
| 43 | Readonly my %FILTER => map { lc($_) => 1 } qw( | |||||
| 44 | unemployed | |||||
| 45 | retired | |||||
| 46 | ); | |||||
| 47 | ||||||
| 48 | # Filter patterns - matched case-insensitively against the occupation | |||||
| 49 | Readonly my @FILTER_PATTERNS => ( | |||||
| 50 | qr/^scho(?:ol|lar)/i, | |||||
| 51 | qr/wife$/i, | |||||
| 52 | qr/seeking work/i, | |||||
| 53 | qr/domestic duties/i, | |||||
| 54 | qr/home duties/i, | |||||
| 55 | qr/house\s?hold duties/i, | |||||
| 56 | qr/^at school$/i, | |||||
| 57 | ); | |||||
| 58 | ||||||
| 59 | # Direct lookup table for exact normalisation matches. | |||||
| 60 | # Keyed on lowercase occupation string. | |||||
| 61 | Readonly my %DIRECT => ( | |||||
| 62 | 'ag lab' => 'Agricultural Labourer', | |||||
| 63 | 'ag labourer' => 'Agricultural Labourer', | |||||
| 64 | 'ag labourer pauper' => 'Agricultural Labourer', | |||||
| 65 | 'agric labourer' => 'Agricultural Labourer', | |||||
| 66 | 'ag lab pauper' => 'Agricultural Labourer', | |||||
| 67 | 'farm labourer' => 'Agricultural Labourer', | |||||
| 68 | 'agricultural farm labourer' => 'Agricultural Labourer', | |||||
| 69 | 'ordinary agricultural labourer' => 'Agricultural Labourer', | |||||
| 70 | 'work on farm' => 'Agricultural Labourer', | |||||
| 71 | 'agricultural lab' => 'Agricultural Labourer', | |||||
| 72 | 'agril labourer' => 'Agricultural Labourer', | |||||
| 73 | 'labourer ag' => 'Agricultural Labourer', | |||||
| 74 | 'labourer (ag)' => 'Agricultural Labourer', | |||||
| 75 | 'platelayer railway' => 'Railway Platelayer', | |||||
| 76 | 'general servant domestic' => 'Domestic Servant', | |||||
| 77 | 'domestic servant' => 'Domestic Servant', | |||||
| 78 | 'lorry driver heavy worker' => 'Lorry Driver', | |||||
| 79 | 'laundry man' => 'Laundryman', | |||||
| 80 | "brewer's labourer" => 'Brewery Labourer', | |||||
| 81 | 'labourer builders' => "Builder's Labourer", | |||||
| 82 | 'gardener domestic' => 'Gardener and Domestic', | |||||
| 83 | 'gardner and domestic servant' => 'Gardener and Domestic', # sic | |||||
| 84 | 'under gardener domestic' => 'Domestic Gardener', | |||||
| 85 | 'domestic under gardner' => 'Domestic Gardener', # sic | |||||
| 86 | 'market gardener' => 'Market Gardener', | |||||
| 87 | 'plate glass cutter' => 'Plate Glass Cutter', | |||||
| 88 | 'pfc us army' => 'Private First Class', | |||||
| 89 | 'labourer gas stoker' => 'Gas Stoker', | |||||
| 90 | ); | |||||
| 91 | ||||||
| 92 | # Pattern that matches "general serv*dom*" variants | |||||
| 93 | Readonly my $GENERAL_SERVANT_RE => qr/^general serv.+dom/i; | |||||
| 94 | ||||||
| 95 | # French translations keyed on lowercase English occupation. | |||||
| 96 | # Values are either a plain string or a hashref with M/F keys | |||||
| 97 | # for gendered translations. | |||||
| 98 | Readonly my %FRENCH => ( | |||||
| 99 | 'postman' => { M => 'Facteur', F => 'Factrice' }, | |||||
| 100 | 'farmer' => { M => 'Agriculteur', F => 'Agricultrice' }, | |||||
| 101 | 'teacher' => 'Professeur', | |||||
| 102 | 'nurse' => { M => 'Infirmier', F => 'Infirmière' }, | |||||
| 103 | ); | |||||
| 104 | ||||||
| 105 | # German translations keyed on lowercase English occupation. | |||||
| 106 | # Values are either a plain string or a hashref with M/F keys. | |||||
| 107 | Readonly my %GERMAN => ( | |||||
| 108 | 'teacher' => { M => 'Lehrer', F => 'Lehrerin' }, | |||||
| 109 | 'farmer' => { M => 'Bauer', F => 'Bauerin' }, | |||||
| 110 | 'bus driver' => { M => 'Busfahrer', F => 'Busfahrerin' }, | |||||
| 111 | 'doctor' => 'Arzt', # simplified non-gendered form for fallback | |||||
| 112 | ); | |||||
| 113 | ||||||
| 114 - 220 | =head1 NAME
Genealogy::Occupation - Normalise and translate genealogical occupation strings
=head1 VERSION
Version 0.02
=head1 SYNOPSIS
use Genealogy::Occupation;
my $normaliser = Genealogy::Occupation->new();
my @occupations = $normaliser->normalise(
occupation => 'Ag Lab',
sex => 'M',
);
# Returns ('Agricultural Labourer')
# Or pass an arrayref
my @more = $normaliser->normalise(
occupation => ['Ag Lab', 'Ag Lab', 'Retired'],
sex => 'M',
);
# Returns ('Agricultural Labourer') - deduplicated and filtered
=head1 DESCRIPTION
Normalises occupation strings found in genealogical records, handling
common abbreviations, malformed entries, locale-specific spellings and
translations into French and German.
Designed to handle poor-quality data from genealogy software imports
where occupation strings may be abbreviated, inconsistent or use
archaic terminology.
Processing steps applied in order:
=over 4
=item 1. Filter out non-occupations (Scholar, Retired, Domestic Duties etc)
=item 2. Normalise abbreviations and malformed entries to canonical forms
=item 3. Deduplicate consecutive identical or equivalent entries (compared on pre-translation normalised forms)
=item 4. Apply locale-specific spellings via C<Lingua::EN::ABC>
=item 5. Translate to French or German if system locale requires it
=back
=head1 METHODS
=head2 new
=head3 Purpose
Constructs a new normaliser object.
=head3 API Specification
=head4 Input
{
warn_on_error => {
type => 'boolean',
optional => 1,
default => 0,
},
}
=head4 Output
{ type => 'object', isa => 'Genealogy::Occupation' }
=head3 Arguments
=over 4
=item * C<warn_on_error> - If true, unknown occupations that cannot be
translated will emit a warning via C<carp> rather than silently falling
back to English. Optional, defaults to 0.
=back
=head3 Returns
A blessed C<Genealogy::Occupation> object.
=head3 Side Effects
None.
=head3 Notes
The system locale is detected once at construction time and cached for
the lifetime of the object.
=head3 Example
my $normaliser = Genealogy::Occupation->new({
warn_on_error => 1,
});
=cut | |||||
| 221 | ||||||
| 222 | sub new { | |||||
| 223 | 141 | 211216 | my $class = shift; | |||
| 224 | ||||||
| 225 | # Accept both hashref and flat list, defaulting to empty hashref | |||||
| 226 | # since all constructor arguments are optional | |||||
| 227 | 141 | 209 | my $args = Params::Get::get_params(undef, @_) // {}; | |||
| 228 | ||||||
| 229 | # Validate constructor arguments | |||||
| 230 | 141 | 1466 | validate_strict({ | |||
| 231 | description => 'Genealogy::Occupation::new', | |||||
| 232 | input => $args, | |||||
| 233 | schema => $NEW_SCHEMA, | |||||
| 234 | }); | |||||
| 235 | ||||||
| 236 | # Detect and cache the system language at construction time | |||||
| 237 | 138 | 7122 | my $language = _get_language(); | |||
| 238 | ||||||
| 239 | return bless { | |||||
| 240 | 138 | 398 | warn_on_error => $args->{warn_on_error} // 0, | |||
| 241 | language => $language, | |||||
| 242 | }, $class; | |||||
| 243 | } | |||||
| 244 | ||||||
| 245 - 324 | =head2 normalise
=head3 Purpose
Normalises one or more occupation strings, applying filtering,
deduplication, abbreviation expansion, locale spelling and
translation in order.
=head3 API Specification
=head4 Input
{
occupation => {
type => ['string', 'arrayref'],
},
sex => {
type => 'string',
optional => 1,
memberof => ['M', 'F'],
},
}
=head4 Output
{
type => 'arrayref',
element_type => 'string',
}
=head3 Arguments
=over 4
=item * C<occupation> - A single occupation string or an arrayref of
occupation strings. Required.
=item * C<sex> - The sex of the person, C<'M'> or C<'F'>. Optional
but required for correct gendered translations in French and German.
Defaults to C<'M'> if not provided when a gendered translation is
needed.
=back
=head3 Returns
An arrayref of normalised occupation strings. May be empty if all
occupations were filtered out.
=head3 Side Effects
If C<warn_on_error> was set at construction and an occupation cannot
be translated, emits a warning via C<carp>.
=head3 Notes
Deduplication operates across the full list of occupations passed in.
Processing a single occupation at a time will not deduplicate across
multiple calls.
Deduplication compares the pre-translation normalised English forms, not
the translated output. This means two consecutive identical English
occupations correctly collapse to one entry even in French or German
locales, where the translated results stored in the output array would
otherwise never match the incoming English string.
=head3 Example
my $result = $normaliser->normalise(
occupation => ['Ag Lab', 'Ag Lab', 'Retired'],
sex => 'M',
);
# Returns ['Agricultural Labourer']
my $result = $normaliser->normalise(
occupation => 'Platelayer Railway',
);
# Returns ['Railway Platelayer']
=cut | |||||
| 325 | ||||||
| 326 | sub normalise { | |||||
| 327 | 233 | 33966 | my $self = shift; | |||
| 328 | ||||||
| 329 | # Accept both hashref and flat list | |||||
| 330 | 233 | 269 | my $args = Params::Get::get_params(undef, @_); | |||
| 331 | ||||||
| 332 | # Extract and normalise occupation to an arrayref BEFORE calling | |||||
| 333 | # validate_strict. Params::Validate::Strict does not yet support | |||||
| 334 | # union types, so passing an arrayref against a 'string' schema | |||||
| 335 | # would die. We handle the type check here explicitly instead. | |||||
| 336 | 233 | 2439 | my $raw = delete $args->{occupation}; | |||
| 337 | 233 | 245 | croak 'Genealogy::Occupation::normalise: occupation is required' | |||
| 338 | unless defined $raw; | |||||
| 339 | 229 | 249 | my $occupations = ref($raw) eq 'ARRAY' ? $raw : [ $raw ]; | |||
| 340 | ||||||
| 341 | # Validate remaining named arguments (sex) strictly | |||||
| 342 | 229 | 392 | validate_strict({ | |||
| 343 | description => 'Genealogy::Occupation::normalise', | |||||
| 344 | input => $args, | |||||
| 345 | schema => $NORMALISE_SCHEMA, | |||||
| 346 | }); | |||||
| 347 | ||||||
| 348 | # Default sex to M if not provided, needed for gendered translations | |||||
| 349 | 227 | 12416 | my $sex = $args->{sex} // 'M'; | |||
| 350 | ||||||
| 351 | 227 | 223 | my $language = $self->{language} // 'en'; | |||
| 352 | 227 | 164 | my @result; | |||
| 353 | # Track the last normalised English form for deduplication. We cannot | |||||
| 354 | # use $result[-1] for this because @result stores the translated output; | |||||
| 355 | # comparing a translated value against a pre-translation English string | |||||
| 356 | # means consecutive identical occupations are never deduplicated in | |||||
| 357 | # French or German locales. | |||||
| 358 | 227 | 144 | my $last_normalised = ''; | |||
| 359 | ||||||
| 360 | 227 227 | 116 187 | foreach my $occupation (@{$occupations}) { | |||
| 361 | # Clean up whitespace and punctuation artifacts | |||||
| 362 | 1866 | 1566 | $occupation =~ tr/\r\n/ /; | |||
| 363 | 1866 | 1161 | $occupation =~ s/\.+$//; | |||
| 364 | 1866 | 1224 | $occupation =~ s/[\(\)]//g; | |||
| 365 | 1866 | 1245 | $occupation =~ s/\s\s+/ /g; | |||
| 366 | 1866 | 1266 | $occupation =~ s/\s+$//; | |||
| 367 | 1866 | 1081 | $occupation =~ s/\./;/g; | |||
| 368 | ||||||
| 369 | # Step 1: filter out non-occupations | |||||
| 370 | 1866 | 1963 | next if $FILTER{lc($occupation)}; | |||
| 371 | 1652 | 3952 | my $filtered = 0; | |||
| 372 | 1652 | 1338 | foreach my $pattern (@FILTER_PATTERNS) { | |||
| 373 | 9670 | 34159 | if($occupation =~ $pattern) { | |||
| 374 | 423 | 1064 | $filtered = 1; | |||
| 375 | 423 | 258 | last; | |||
| 376 | } | |||||
| 377 | } | |||||
| 378 | 1652 | 4807 | next if $filtered; | |||
| 379 | ||||||
| 380 | # Step 2: normalise the occupation string | |||||
| 381 | 1229 | 801 | $occupation = _normalise_single($occupation); | |||
| 382 | 1229 | 996 | next unless length($occupation); | |||
| 383 | ||||||
| 384 | # Step 3: deduplicate against the previous normalised (pre-translation) | |||||
| 385 | # entry, not against $result[-1] which holds the translated form | |||||
| 386 | 1221 | 1023 | next if lc($last_normalised) eq lc($occupation); | |||
| 387 | 213 | 130 | $last_normalised = $occupation; | |||
| 388 | ||||||
| 389 | # Step 4: apply locale-specific spellings for English variants | |||||
| 390 | 213 | 174 | if($language eq 'en') { | |||
| 391 | 141 | 101 | $occupation = _apply_locale($occupation); | |||
| 392 | } | |||||
| 393 | ||||||
| 394 | # Step 5: translate to target language if not English | |||||
| 395 | 213 | 3142 | if($language eq 'fr') { | |||
| 396 | $occupation = _translate_french($occupation, $sex, | |||||
| 397 | 41 | 43 | $self->{warn_on_error}); | |||
| 398 | } elsif($language eq 'de') { | |||||
| 399 | $occupation = _translate_german($occupation, $sex, | |||||
| 400 | 31 | 38 | $self->{warn_on_error}); | |||
| 401 | } | |||||
| 402 | ||||||
| 403 | 213 | 379 | push @result, ucfirst($occupation); | |||
| 404 | } | |||||
| 405 | ||||||
| 406 | 227 | 409 | return set_return(\@result, { type => 'arrayref', element_type => 'string' }); | |||
| 407 | } | |||||
| 408 | ||||||
| 409 | # _normalise_single | |||||
| 410 | # | |||||
| 411 | # Purpose: | |||||
| 412 | # Normalises a single occupation string by expanding abbreviations, | |||||
| 413 | # fixing malformed entries and applying pattern-based corrections. | |||||
| 414 | # | |||||
| 415 | # Entry criteria: | |||||
| 416 | # $occupation - a non-empty string, already cleaned of whitespace | |||||
| 417 | # | |||||
| 418 | # Exit status: | |||||
| 419 | # Returns the normalised occupation string, or empty string if the | |||||
| 420 | # entry should be discarded after normalisation. | |||||
| 421 | # | |||||
| 422 | # Side effects: | |||||
| 423 | # None. | |||||
| 424 | # | |||||
| 425 | # Notes: | |||||
| 426 | # Checks the direct lookup table first for exact matches, | |||||
| 427 | # then applies pattern-based rules for more complex cases. | |||||
| 428 | # The "sic" comments mark known data quality issues in real | |||||
| 429 | # genealogy records that we intentionally handle. | |||||
| 430 | ||||||
| 431 | sub _normalise_single { | |||||
| 432 | 1254 | 2157 | my $occupation = shift; | |||
| 433 | ||||||
| 434 | # Check direct lookup table first for exact matches | |||||
| 435 | 1254 | 1308 | if(my $match = $DIRECT{lc($occupation)}) { | |||
| 436 | 1078 | 2813 | return $match; | |||
| 437 | } | |||||
| 438 | ||||||
| 439 | # Handle "general servant domestic" pattern variants | |||||
| 440 | 176 | 566 | if($occupation =~ $GENERAL_SERVANT_RE) { | |||
| 441 | 3 | 14 | return 'Domestic Servant'; | |||
| 442 | } | |||||
| 443 | ||||||
| 444 | # Remove common suffixes that add no occupational meaning | |||||
| 445 | 173 | 433 | $occupation =~ s/\s+own account$//i; | |||
| 446 | 173 | 112 | $occupation =~ s/^formerly //i; | |||
| 447 | 173 | 149 | $occupation =~ s/\s+retired$//i; | |||
| 448 | 173 | 111 | $occupation =~ s/\s+heavy worker$//i; | |||
| 449 | 173 | 113 | $occupation =~ s/\s+own business$//i; | |||
| 450 | 173 | 137 | $occupation =~ s/Labor/Labour/ig; | |||
| 451 | ||||||
| 452 | # Reorder "X domestic" and "X dom" patterns to "Domestic X" | |||||
| 453 | 173 | 393 | if($occupation =~ /^(.+)\s(?:domestic|dom)$/i) { | |||
| 454 | 5 | 11 | return "Domestic $1"; | |||
| 455 | } | |||||
| 456 | ||||||
| 457 | # Convert "works on/for X" to "X worker" | |||||
| 458 | 168 | 140 | if($occupation =~ /works?\s+(?:on|for)\s+(.+)/i) { | |||
| 459 | 4 | 9 | return "$1 worker"; | |||
| 460 | } | |||||
| 461 | ||||||
| 462 | # Convert "Cleaner X" prefix form to "X cleaner" | |||||
| 463 | 164 | 149 | if($occupation =~ /^Cleaner\s+(.+)/i) { | |||
| 464 | 2 | 4 | return "$1 cleaner"; | |||
| 465 | } | |||||
| 466 | ||||||
| 467 | # Reorder clerk, salesman, foreman, manager patterns | |||||
| 468 | 162 | 136 | if($occupation =~ /^Clerk\s+(.+)/i) { | |||
| 469 | 2 | 6 | return "$1 Clerk"; | |||
| 470 | } | |||||
| 471 | 160 | 126 | if($occupation =~ /^Salesman\s+(.*)/i) { | |||
| 472 | 3 | 6 | return "$1 Salesman"; | |||
| 473 | } | |||||
| 474 | 157 | 112 | if($occupation =~ /^Foreman\s+(.*)/i) { | |||
| 475 | 4 | 6 | my $of = $1; | |||
| 476 | 4 | 4 | $of =~ s/^of the //i; | |||
| 477 | 4 | 7 | return "$of Foreman"; | |||
| 478 | } | |||||
| 479 | 153 | 171 | if($occupation =~ /^Manager\s+(.*)/i | |||
| 480 | && $occupation !~ /^Manager of /i | |||||
| 481 | && $occupation !~ /Manager & /i) { | |||||
| 482 | 2 | 5 | return "$1 Manager"; | |||
| 483 | } | |||||
| 484 | ||||||
| 485 | # Convert "Shop Assistant X" to "X's Shop Assistant" | |||||
| 486 | 151 | 125 | if($occupation =~ /^Shop Assistant\s+(.+)/i) { | |||
| 487 | 2 | 3 | return "$1's Shop Assistant"; | |||
| 488 | } | |||||
| 489 | ||||||
| 490 | # Convert "X Assistant" to "X's Assistant" for known trade forms | |||||
| 491 | 149 | 722 | if($occupation =~ /^(.+)\s+Assistant$/i) { | |||
| 492 | 7 | 8 | my $trade = $1; | |||
| 493 | 7 | 8 | if(lc($trade) eq 'bakers') { | |||
| 494 | 2 | 4 | return "Baker's Assistant"; | |||
| 495 | } | |||||
| 496 | 5 | 7 | if(lc($trade) eq 'butchers') { | |||
| 497 | 2 | 4 | return "Butcher's Assistant"; | |||
| 498 | } | |||||
| 499 | 3 | 6 | unless($trade =~ /'s$/ || lc($trade) eq 'shop') { | |||
| 500 | 1 | 2 | return "${trade}'s Assistant"; | |||
| 501 | } | |||||
| 502 | } | |||||
| 503 | ||||||
| 504 | # Convert police* to "police officer" | |||||
| 505 | 144 | 132 | if($occupation =~ /police$/i) { | |||
| 506 | 3 | 4 | return "$occupation officer"; | |||
| 507 | } | |||||
| 508 | ||||||
| 509 | # Convert pluralised trade forms e.g. "Builders Labourer" | |||||
| 510 | # to possessive "Builder's Labourer". | |||||
| 511 | # | |||||
| 512 | # The regex captures: | |||||
| 513 | # $base ($1) - the word stem without the trailing 's' | |||||
| 514 | # $last ($2) - the final character of $base (used to reconstruct | |||||
| 515 | # the stem; not used in the guard comparison) | |||||
| 516 | # $role ($3) - the following word | |||||
| 517 | # | |||||
| 518 | # Guard against false positives: "Bus Driver" ($base eq 'Bu') and | |||||
| 519 | # "Harness Maker" ($base eq 'Harnes') must not be rewritten. | |||||
| 520 | # Compare against $base directly - using "$base$last" is wrong | |||||
| 521 | # because it appends the final character a second time. | |||||
| 522 | 141 | 542 | if($occupation !~ /gas works/i | |||
| 523 | && $occupation =~ /^(.+([a-z]))s\s+([a-z]+)$/i) { | |||||
| 524 | 10 | 18 | my ($base, $last, $role) = ($1, $2, $3); | |||
| 525 | 10 | 19 | unless(lc($base) eq 'bu' || lc($base) eq 'harnes') { | |||
| 526 | 2 | 2 | return "${base}'s $role"; | |||
| 527 | } | |||||
| 528 | } | |||||
| 529 | ||||||
| 530 | # Handle "on farm" pattern | |||||
| 531 | 139 | 156 | if($occupation =~ /^(.+)\s+on farm$/i) { | |||
| 532 | 2 | 3 | return "$1 on a farm"; | |||
| 533 | } | |||||
| 534 | ||||||
| 535 | 137 | 144 | return $occupation; | |||
| 536 | } | |||||
| 537 | ||||||
| 538 | # _apply_locale | |||||
| 539 | # | |||||
| 540 | # Purpose: | |||||
| 541 | # Applies locale-specific English spelling variants using Lingua::EN::ABC. | |||||
| 542 | # Handles en_US (labour->labor), en_CA, and en_GB (default) variants. | |||||
| 543 | # | |||||
| 544 | # Entry criteria: | |||||
| 545 | # $occupation - a normalised English occupation string (may be title-cased) | |||||
| 546 | # | |||||
| 547 | # Exit status: | |||||
| 548 | # Returns the occupation string with locale-appropriate spellings. | |||||
| 549 | # Original capitalisation is preserved; Lingua::EN::ABC performs | |||||
| 550 | # case-insensitive substitutions and does not require lowercased input. | |||||
| 551 | # | |||||
| 552 | # Side effects: | |||||
| 553 | # None. | |||||
| 554 | # | |||||
| 555 | # Notes: | |||||
| 556 | # Reads $ENV{'LANG'} to determine the locale. | |||||
| 557 | # Defaults to British English if no locale is detected. | |||||
| 558 | # Do NOT pass lc($occupation) here - doing so strips title case that | |||||
| 559 | # cannot be fully recovered by ucfirst() alone. | |||||
| 560 | ||||||
| 561 | sub _apply_locale { | |||||
| 562 | 144 | 2748 | my $occupation = shift; | |||
| 563 | ||||||
| 564 | # Apply American English spelling variants | |||||
| 565 | 144 | 251 | if(defined($ENV{'LANG'}) && ($ENV{'LANG'} =~ /^en_US/)) { | |||
| 566 | 2 | 3 | $occupation = Lingua::EN::ABC::b2a($occupation); | |||
| 567 | 2 | 296 | $occupation =~ s/labour/labor/ig; | |||
| 568 | 2 | 3 | return $occupation; | |||
| 569 | } | |||||
| 570 | ||||||
| 571 | # Apply Canadian English spelling variants | |||||
| 572 | 142 | 191 | if(defined($ENV{'LANG'}) && ($ENV{'LANG'} =~ /^en_CA/)) { | |||
| 573 | 2 | 5 | return Lingua::EN::ABC::b2c($occupation); | |||
| 574 | } | |||||
| 575 | ||||||
| 576 | # Default to British English spelling | |||||
| 577 | 140 | 151 | return Lingua::EN::ABC::a2b($occupation); | |||
| 578 | } | |||||
| 579 | ||||||
| 580 | # _translate_french | |||||
| 581 | # | |||||
| 582 | # Purpose: | |||||
| 583 | # Translates a normalised English occupation string to French, | |||||
| 584 | # applying gendered forms where appropriate. | |||||
| 585 | # | |||||
| 586 | # Entry criteria: | |||||
| 587 | # $occupation - normalised English occupation string | |||||
| 588 | # $sex - 'M' or 'F' | |||||
| 589 | # $warn_on_error - boolean, if true carp on unknown occupation | |||||
| 590 | # | |||||
| 591 | # Exit status: | |||||
| 592 | # Returns the French occupation string, or the original English | |||||
| 593 | # string if no translation is available and warn_on_error is false. | |||||
| 594 | # | |||||
| 595 | # Side effects: | |||||
| 596 | # Carps if warn_on_error is true and no translation is found. | |||||
| 597 | # | |||||
| 598 | # Notes: | |||||
| 599 | # Only a subset of occupations have French translations. | |||||
| 600 | # The retired/teaching special cases are handled via regex | |||||
| 601 | # rather than the lookup table since they modify rather than | |||||
| 602 | # replace the occupation string. | |||||
| 603 | ||||||
| 604 | sub _translate_french { | |||||
| 605 | 50 | 1519 | my ($occupation, $sex, $warn_on_error) = @_; | |||
| 606 | ||||||
| 607 | # Handle teaching as a special regex case | |||||
| 608 | 50 | 54 | if($occupation =~ /teaching/i) { | |||
| 609 | 3 | 4 | return 'professeur'; | |||
| 610 | } | |||||
| 611 | ||||||
| 612 | # Handle retired as a suffix replacement | |||||
| 613 | 47 | 42 | $occupation =~ s/retired/\x{00E0} la retraite/i; | |||
| 614 | ||||||
| 615 | # Handle X Farmer pattern | |||||
| 616 | 47 | 55 | if($occupation =~ /^(.+)\sFarmer$/i) { | |||
| 617 | 6 | 7 | my $type = $1; | |||
| 618 | 6 | 12 | return $sex eq 'F' | |||
| 619 | ? "Agricultrice de $type" | |||||
| 620 | : "Agriculteur de $type"; | |||||
| 621 | } | |||||
| 622 | ||||||
| 623 | # Check the French translation lookup table | |||||
| 624 | 41 | 57 | if(my $translation = $FRENCH{lc($occupation)}) { | |||
| 625 | 30 | 116 | if(ref($translation) eq 'HASH') { | |||
| 626 | 24 | 30 | return $translation->{$sex} // $translation->{'M'}; | |||
| 627 | } | |||||
| 628 | 6 | 9 | return $translation; | |||
| 629 | } | |||||
| 630 | ||||||
| 631 | # Fall back to English with optional warning | |||||
| 632 | 11 | 40 | if($warn_on_error) { | |||
| 633 | 5 | 8 | Carp::carp "Genealogy::Occupation: no French translation for '$occupation'"; | |||
| 634 | } | |||||
| 635 | ||||||
| 636 | 11 | 16 | return $occupation; | |||
| 637 | } | |||||
| 638 | ||||||
| 639 | # _translate_german | |||||
| 640 | # | |||||
| 641 | # Purpose: | |||||
| 642 | # Translates a normalised English occupation string to German, | |||||
| 643 | # applying gendered forms where appropriate. | |||||
| 644 | # | |||||
| 645 | # Entry criteria: | |||||
| 646 | # $occupation - normalised English occupation string | |||||
| 647 | # $sex - 'M' or 'F' | |||||
| 648 | # $warn_on_error - boolean, if true carp on unknown occupation | |||||
| 649 | # | |||||
| 650 | # Exit status: | |||||
| 651 | # Returns the German occupation string, or the original English | |||||
| 652 | # string if no translation is available and warn_on_error is false. | |||||
| 653 | # | |||||
| 654 | # Side effects: | |||||
| 655 | # Carps if warn_on_error is true and no translation is found. | |||||
| 656 | # | |||||
| 657 | # Notes: | |||||
| 658 | # Only a subset of occupations have German translations. | |||||
| 659 | # The retired and self-employed special cases are handled via | |||||
| 660 | # regex rather than the lookup table. | |||||
| 661 | ||||||
| 662 | sub _translate_german { | |||||
| 663 | 43 | 1393 | my ($occupation, $sex, $warn_on_error) = @_; | |||
| 664 | ||||||
| 665 | # Handle teaching as a special regex case | |||||
| 666 | 43 | 53 | if($occupation =~ /teaching/i) { | |||
| 667 | 5 | 10 | return $sex eq 'F' ? 'Lehrerin' : 'Lehrer'; | |||
| 668 | } | |||||
| 669 | ||||||
| 670 | # Handle X Farmer pattern | |||||
| 671 | 38 | 66 | if($occupation =~ /^(.+)\sFarmer$/i) { | |||
| 672 | 5 | 9 | return $sex eq 'F' ? 'Landwirtin' : 'Landwirt'; | |||
| 673 | } | |||||
| 674 | ||||||
| 675 | # Handle retired and self-employed as suffix replacements | |||||
| 676 | 33 | 28 | $occupation =~ s/retired/im ruhestand/i; | |||
| 677 | 33 | 30 | $occupation =~ s/self-employed/selbstst\x{00E4}ndig/i; | |||
| 678 | ||||||
| 679 | # Check the German translation lookup table | |||||
| 680 | 33 | 48 | if(my $translation = $GERMAN{lc($occupation)}) { | |||
| 681 | 21 | 72 | if(ref($translation) eq 'HASH') { | |||
| 682 | 19 | 23 | return $translation->{$sex} // $translation->{'M'}; | |||
| 683 | } | |||||
| 684 | 2 | 2 | return $translation; | |||
| 685 | } | |||||
| 686 | ||||||
| 687 | # Fall back to English with optional warning | |||||
| 688 | 12 | 43 | if($warn_on_error) { | |||
| 689 | 4 | 9 | Carp::carp "Genealogy::Occupation: no German translation for '$occupation'"; | |||
| 690 | } | |||||
| 691 | ||||||
| 692 | 12 | 17 | return $occupation; | |||
| 693 | } | |||||
| 694 | ||||||
| 695 | # _get_language | |||||
| 696 | # | |||||
| 697 | # Purpose: | |||||
| 698 | # Determines the system's default language using environment variables. | |||||
| 699 | # | |||||
| 700 | # Entry criteria: | |||||
| 701 | # None. Reads environment variables directly. | |||||
| 702 | # | |||||
| 703 | # Exit status: | |||||
| 704 | # Returns a two-letter language code string e.g. 'en', 'fr', 'de', | |||||
| 705 | # or undef if no language can be determined. | |||||
| 706 | # | |||||
| 707 | # Side effects: | |||||
| 708 | # None. | |||||
| 709 | # | |||||
| 710 | # Notes: | |||||
| 711 | # Checks in order: I18N::LangTags::Detect, LANGUAGE, LC_ALL, | |||||
| 712 | # LC_MESSAGES, LANG environment variables. | |||||
| 713 | # Returns 'en' for C locale. | |||||
| 714 | # See https://www.gnu.org/software/gettext/manual/html_node/Locale-Environment-Variables.html | |||||
| 715 | ||||||
| 716 | sub _get_language { | |||||
| 717 | # Try I18N::LangTags::Detect first for most accurate detection | |||||
| 718 | 147 | 3440 | for my $tag (I18N::LangTags::Detect::detect()) { | |||
| 719 | 65 | 1667 | if($tag =~ /^([a-z]{2})/i) { | |||
| 720 | 64 | 123 | return lc($1); | |||
| 721 | } | |||||
| 722 | } | |||||
| 723 | ||||||
| 724 | # Fall back to checking environment variables in order | |||||
| 725 | 83 | 639 | if(($ENV{'LANGUAGE'}) && ($ENV{'LANGUAGE'} =~ /^([a-z]{2})/i)) { | |||
| 726 | 1 | 2 | return lc($1); | |||
| 727 | } | |||||
| 728 | 82 | 61 | foreach my $variable ('LC_ALL', 'LC_MESSAGES', 'LANG') { | |||
| 729 | 243 | 178 | my $val = $ENV{$variable}; | |||
| 730 | 243 | 196 | next unless defined($val); | |||
| 731 | 79 | 126 | if($val =~ /^([a-z]{2})/i) { | |||
| 732 | 71 | 98 | return lc($1); | |||
| 733 | } | |||||
| 734 | } | |||||
| 735 | ||||||
| 736 | # Handle C locale explicitly | |||||
| 737 | 11 | 33 | return 'en' if(defined($ENV{'LANG'}) && $ENV{'LANG'} =~ /^C(\.|$)/); | |||
| 738 | ||||||
| 739 | 3 | 4 | return; | |||
| 740 | } | |||||
| 741 | ||||||
| 742 - 786 | =head1 AUTHOR Nigel Horne C<< <njh@bandsman.co.uk> >> =head1 BUGS Please report bugs via the GitHub issue tracker: L<https://github.com/nigelhorne/Genealogy-Occupation/issues> =head1 TODO =over 4 =item * Expand French and German translation tables =item * Add support for additional languages =item * Add C<normalise_place()> equivalent for occupation place strings =back =head1 SEE ALSO =over 4 =item * L<Test Dashboard|https://nigelhorne.github.io/Genealogy-Occupation/coverage/> =item * L<Lingua::EN::ABC> =item * L<Params::Get> =item * L<Params::Validate::Strict> =item * L<Return::Set> =back =head1 LICENSE AND COPYRIGHT Copyright 2026 Nigel Horne. This program is released under the following licence: GPL2 If you use it, please let me know. =cut | |||||
| 787 | ||||||
| 788 | 1; | |||||