File Coverage

File:blib/lib/Email/Abuse/Investigator.pm
Coverage:78.3%

linestmtbrancondsubtimecode
1package Email::Abuse::Investigator;
2
3
15
15
15
750166
14
204
use strict;
4
15
15
15
28
7
303
use warnings;
5
15
15
15
2697
87209
27
use autodie qw(:all);
6
15
15
15
104579
56364
33
use Time::Piece;
7
8 - 17
=head1 NAME

Email::Abuse::Investigator - Analyse spam email to identify originating hosts,
hosted URLs, and suspicious domains

=head1 VERSION

Version 0.08

=cut
18
19our $VERSION = '0.08';
20
21 - 102
=head1 SYNOPSIS

    use Email::Abuse::Investigator;

    my $analyser = Email::Abuse::Investigator->new( verbose => 1 );
    $analyser->parse_email($raw_email_text);

    # Originating IP and its network owner
    my $origin = $analyser->originating_ip();

    # All HTTP/HTTPS URLs found in the body
    my @urls  = $analyser->embedded_urls();

    # All domains extracted from mailto: links and bare addresses in the body
    my @mdoms = $analyser->mailto_domains();

    # All domains mentioned anywhere (union of the above)
    my @adoms = $analyser->all_domains();

    # Full printable report
    print $analyser->report();

=head1 DESCRIPTION

C<Email::Abuse::Investigator> examines the raw source of a spam/phishing e-mail
and answers the questions manual abuse investigators ask:

=over 4

=item 1. Where did the message really come from?

Walks the C<Received:> chain, skips private/trusted IPs, and identifies the
first external hop.  Enriches with rDNS, WHOIS/RDAP org name and abuse
contact.  Both IPv4 and IPv6 addresses are supported.

=item 2. Who hosts the advertised web sites?

Extracts every C<http://> and C<https://> URL from both plain-text and HTML
parts, resolves each hostname to an IP, and looks up the network owner.

=item 3. Who owns the reply-to / contact domains?

Extracts domains from C<mailto:> links, bare e-mail addresses in the body,
the C<From:>/C<Reply-To:>/C<Sender:>/C<Return-Path:> headers, C<DKIM-Signature: d=>
(the signing domain), C<List-Unsubscribe:> (the ESP or bulk-sender domain), and the
C<Message-ID:> domain.  For each unique domain it gathers:

=over 8

=item * Domain registrar and registrant (WHOIS)

=item * Web-hosting IP and network owner (A record -> RDAP)

=item * Mail-hosting IP and network owner (MX record -> RDAP)

=item * DNS nameserver operator (NS record -> RDAP)

=item * Whether the domain was recently registered (potential flag)

=back

=back

=head1 REQUIRED MODULES

The following modules are mandatory:

    Readonly::Values::Months
    Socket              (core since Perl 5)
    IO::Socket::INET    (core since Perl 5)
    MIME::QuotedPrint   (core since Perl 5.8)
    MIME::Base64        (core since Perl 5.8)

The following are optional but strongly recommended:

    Net::DNS            -- enables MX, NS, AAAA record lookups
    LWP::UserAgent      -- enables RDAP (faster and richer than raw WHOIS)
    HTML::LinkExtor     -- enables structural HTML link extraction
    CHI                 -- enables cross-message IP/domain result caching
    IO::Socket::IP      -- enables IPv6 WHOIS connections

=cut
103
104
15
15
15
769
16
318
use Carp qw(croak carp);
105
15
15
15
2894
9702
331
use IO::Select;
106
15
15
15
1674
71289
61
use IO::Socket::INET;
107
15
15
15
5274
2565
382
use MIME::QuotedPrint qw( decode_qp );
108
15
15
15
34
11
215
use MIME::Base64 qw( decode_base64 );
109
15
15
15
3101
637275
240
use Object::Configure;
110
15
15
15
48
30
223
use Params::Get;
111
15
15
15
27
18
167
use Params::Validate::Strict;
112
15
15
15
25
12
225
use Readonly;
113
15
15
15
2646
8585
1072
use Readonly::Values::Months;
114
15
15
15
48
15
1519
use Socket qw( inet_aton inet_ntoa AF_INET );
115
116# -----------------------------------------------------------------------
117# Optional modules -- gracefully degraded when absent
118# -----------------------------------------------------------------------
119
120# Net::DNS enables MX, NS, AAAA lookups; falls back to gethostbyname
121my $HAS_NET_DNS;
122
123# LWP::UserAgent enables RDAP queries; falls back to raw WHOIS
124my $HAS_LWP;
125my $HAS_CONN_CACHE;
126
127# HTML::LinkExtor enables structural HTML link extraction
128my $HAS_HTML_LINKEXTOR;
129
130# CHI enables a persistent cross-message cache for IP/domain data
131my $HAS_CHI;
132
133# IO::Socket::IP provides dual-stack (IPv4+IPv6) socket support
134my $HAS_IO_SOCKET_IP;
135
136# Domain::PublicSuffix enables accurate eTLD+1 normalisation
137my $HAS_PUBLIC_SUFFIX;
138
139# AnyEvent::DNS enables parallel DNS queries
140my $HAS_ANYEVENT_DNS;
141
142BEGIN {
143
15
15
0
20
821
0
        $HAS_NET_DNS       = eval { require Net::DNS;           1 };
144
15
15
0
3712
766
0
        $HAS_LWP           = eval { require LWP::UserAgent;     1 };
145
15
15
0
2925
678
0
        $HAS_CONN_CACHE    = eval { require LWP::ConnCache;     1 };
146
15
15
15
2760
2911
52565
        $HAS_HTML_LINKEXTOR= eval { require HTML::LinkExtor;    1 };
147
15
15
0
16
891
0
        $HAS_CHI           = eval { require CHI;                1 };
148
15
15
15
3201
2218
25591
        $HAS_IO_SOCKET_IP  = eval { require IO::Socket::IP;     1 };
149
15
15
0
15
786
0
        $HAS_PUBLIC_SUFFIX = eval { require Domain::PublicSuffix; 1 };
150
15
15
0
2971
20369
0
        $HAS_ANYEVENT_DNS  = eval { require AnyEvent::DNS;      1 };
151}
152
153# -----------------------------------------------------------------------
154# Constants -- all magic numbers and strings live here
155# -----------------------------------------------------------------------
156
157# WHOIS protocol port (IANA-assigned)
158Readonly::Scalar my $WHOIS_PORT        => 43;
159
160# Bytes to read per sysread() call from a WHOIS socket
161Readonly::Scalar my $WHOIS_READ_CHUNK  => 4096;
162
163# Maximum WHOIS response bytes stored in whois_raw (keep reports compact)
164Readonly::Scalar my $WHOIS_RAW_MAX     => 2048;
165
166# Maximum multipart nesting depth (recursion guard -- RFC 2046 has no limit
167# but we cap it to prevent stack exhaustion on crafted messages)
168Readonly::Scalar my $MAX_MULTIPART_DEPTH => 20;
169
170# Number of days before registration that triggers recently_registered flag
171Readonly::Scalar my $RECENT_REG_DAYS   => 180;
172
173# Number of days ahead of expiry that triggers domain_expires_soon flag
174Readonly::Scalar my $EXPIRY_WARN_DAYS  => 30;
175
176# Seconds in a day -- used in date arithmetic throughout
177Readonly::Scalar my $SECS_PER_DAY      => 86400;
178
179# Suspicious date window: dates outside +/- 7 days raise a flag
180Readonly::Scalar my $DATE_SKEW_DAYS    => 7;
181
182# Maximum positive timezone offset in minutes (+14:00 = Line Islands)
183Readonly::Scalar my $TZ_MAX_POS_MINS   => 840;
184
185# Maximum negative timezone offset in minutes (-12:00 = Baker Island)
186Readonly::Scalar my $TZ_MAX_NEG_MINS   => 720;
187
188# High-risk score threshold
189Readonly::Scalar my $SCORE_HIGH        => 9;
190
191# Medium-risk score threshold
192Readonly::Scalar my $SCORE_MEDIUM      => 5;
193
194# Low-risk score threshold
195Readonly::Scalar my $SCORE_LOW         => 2;
196
197# Flag severity weights (contribute to the numeric risk score)
198Readonly::Hash my %FLAG_WEIGHT => (
199        HIGH   => 3,
200        MEDIUM => 2,
201        LOW    => 1,
202        INFO   => 0,
203);
204
205# Maximum merged-role display string length before summarisation kicks in
206Readonly::Scalar my $ROLE_MAX_LEN      => 80;
207
208# CHI cache TTL in seconds (1 hour -- IP allocations change slowly)
209Readonly::Scalar my $CACHE_TTL_SECS    => 3600;
210
211# Default constructor timeout for network operations (seconds)
212Readonly::Scalar my $DEFAULT_TIMEOUT   => 10;
213
214# Maximum role string length before truncation
215Readonly::Scalar my $ROLE_WRAP_LEN     => 66;
216
217# -----------------------------------------------------------------------
218# Private ranges -- IPs that are never actionable abuse targets
219# -----------------------------------------------------------------------
220
221# Both IPv4 and IPv6 private/reserved ranges.  Each entry is a compiled
222# regex; _is_private() iterates over them and returns true on first match.
223my @PRIVATE_RANGES = (
224        # IPv4 ranges
225        qr/^0\./,                         # 0.0.0.0/8  this-network (RFC 1122)
226        qr/^127\./,                       # 127.0.0.0/8 loopback
227        qr/^10\./,                        # 10.0.0.0/8  RFC 1918
228        qr/^192\.168\./,                  # 192.168.0.0/16 RFC 1918
229        qr/^172\.(?:1[6-9]|2\d|3[01])\./, # 172.16.0.0/12  RFC 1918
230        qr/^169\.254\./,                  # 169.254.0.0/16 link-local
231        qr/^100\.(?:6[4-9]|[7-9]\d|1(?:[01]\d|2[0-7]))\./,  # 100.64.0.0/10 CGN (RFC 6598)
232        qr/^192\.0\.0\./,                 # 192.0.0.0/24  IETF protocol (RFC 6890)
233        qr/^192\.0\.2\./,                 # 192.0.2.0/24  TEST-NET-1 (RFC 5737)
234        qr/^198\.51\.100\./,              # 198.51.100.0/24 TEST-NET-2 (RFC 5737)
235        qr/^203\.0\.113\./,               # 203.0.113.0/24 TEST-NET-3 (RFC 5737)
236        qr/^255\./,                       # 255.0.0.0/8 broadcast
237        # IPv6 ranges
238        qr/^::1$/,                         # IPv6 loopback
239        qr/^fe80:/i,                       # IPv6 link-local (fe80::/10)
240        qr/^fc/i,                          # IPv6 ULA fc00::/7
241        qr/^fd/i,                          # IPv6 ULA fd00::/8
242        qr/^2001:db8:/i,                   # IPv6 documentation range (RFC 3849)
243        qr/^64:ff9b:/i,                    # IPv6 NAT64 well-known prefix
244);
245
246# Priority-ordered patterns for extracting IPs from Received: headers.
247# Covers bracketed IPv4, bracketed IPv6, parenthesised address, and bare dotted-quad.
248my @RECEIVED_IP_RE = (
249        qr/\[\s*([\d.]+)\s*\]/,                          # [1.2.3.4]
250        qr/\[\s*([0-9a-fA-F:]+)\s*\]/,                  # [IPv6 address]
251        qr/\(\s*[\w.-]*\s*\[?\s*([\d.]+)\s*\]?\s*\)/,   # (hostname [1.2.3.4])
252        qr/from\s+[\w.-]+\s+([\d.]+)/,                  # from hostname addr
253        qr/([\d]{1,3}\.[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3})/, # bare dotted-quad fallback
254);
255
256# -----------------------------------------------------------------------
257# Default configuration -- overridable via Object::Configure
258# -----------------------------------------------------------------------
259
260# Object::Configure may overlay
261# values from a file before new() uses them.  Use Readonly for constants
262# that should never be overridden at runtime.
263
264# -----------------------------------------------------------------------
265# Trusted domains (infrastructure -- never report these as abuse targets)
266# Can be overrideen at runtime by Object::Configure
267# -----------------------------------------------------------------------
268
269my %TRUSTED_DOMAINS = map { $_ => 1 } qw(
270        gmail.com googlemail.com yahoo.com outlook.com hotmail.com
271        google.com microsoft.com apple.com amazon.com
272        googlegroups.com groups.google.com
273        w3.org
274        fedex.com ups.com dhl.com usps.com royalmail.com
275);
276
277# -----------------------------------------------------------------------
278# URL shortener domains (real destination is hidden behind these)
279# -----------------------------------------------------------------------
280
281my %URL_SHORTENERS = map { $_ => 1 } qw(
282        bit.ly      bitly.com   tinyurl.com  t.co        ow.ly
283        goo.gl      is.gd       buff.ly      ift.tt       dlvr.it
284        short.link  rebrand.ly  tiny.cc      cutt.ly      rb.gy
285        shorturl.at bl.ink      smarturl.it  yourls.org   clicky.me
286        snip.ly     adf.ly      bc.vc        lnkd.in      fb.me
287        youtu.be
288);
289
290# -----------------------------------------------------------------------
291# Well-known provider abuse contacts
292# Can be overrideen at runtime by Object::Configure
293# -----------------------------------------------------------------------
294
295# Curated table of provider abuse contacts.  Entries with only a 'form'
296# key (no 'email') require web-form submission; abuse_contacts() suppresses
297# email addresses for those providers and form_contacts() surfaces them.
298my %PROVIDER_ABUSE = (
299        # Google / Gmail
300        'google.com'        => { email => 'abuse@google.com',      note => 'Also report Gmail accounts via https://support.google.com/mail/contact/abuse' },
301        'gmail.com'         => { email => 'abuse@google.com',      note => 'Report Gmail spam via https://support.google.com/mail/contact/abuse' },
302        'googlemail.com'    => { email => 'abuse@google.com',      note => 'Report via https://support.google.com/mail/contact/abuse' },
303        '1e100.net'         => { email => 'abuse@google.com',      note => 'Google infrastructure' },
304        'blogspot.com'      => { email => 'abuse@google.com',      note => 'Blogger/Blogspot -- report via https://support.google.com/blogger/answer/76315' },
305        'blogger.com'       => { email => 'abuse@google.com',      note => 'Blogger platform abuse' },
306        'sites.google.com'  => { email => 'abuse@google.com',      note => 'Google Sites hosted content' },
307        # Microsoft
308        'microsoft.com'     => { email => 'abuse@microsoft.com',   note => 'Also report via https://www.microsoft.com/en-us/wdsi/support/report-unsafe-site' },
309        'outlook.com'       => { email => 'abuse@microsoft.com',   note => 'Report Outlook spam: https://support.microsoft.com/en-us/office/report-phishing' },
310        'hotmail.com'       => { email => 'abuse@microsoft.com',   note => 'Report via https://support.microsoft.com/en-us/office/report-phishing' },
311        'live.com'          => { email => 'abuse@microsoft.com',   note => 'Microsoft consumer mail' },
312        'office365.com'     => { email => 'abuse@microsoft.com',   note => 'Microsoft 365 infrastructure' },
313        'protection.outlook.com' => { email => 'abuse@microsoft.com', note => 'Microsoft EOP gateway' },
314        # Yahoo
315        'yahoo.com'         => { email => 'abuse@yahoo-inc.com',   note => 'Also use https://io.help.yahoo.com/contact/index' },
316        'yahoo.co.uk'       => { email => 'abuse@yahoo-inc.com',   note => 'Yahoo UK' },
317        # Apple
318        'apple.com'         => { email => 'reportphishing@apple.com', note => 'iCloud / Apple Mail abuse' },
319        'icloud.com'        => { email => 'reportphishing@apple.com', note => 'iCloud abuse' },
320        'me.com'            => { email => 'reportphishing@apple.com', note => 'Apple legacy mail' },
321        # Amazon / AWS
322        'amazon.com'        => { email => 'abuse@amazonaws.com',   note => 'Also https://aws.amazon.com/forms/report-abuse' },
323        'amazonaws.com'     => { email => 'abuse@amazonaws.com',   note => 'AWS abuse form: https://aws.amazon.com/forms/report-abuse' },
324        'amazonses.com'     => { email => 'abuse@amazonaws.com',   note => 'Amazon SES sending infrastructure' },
325        # Cloudflare
326        'cloudflare.com'    => { email => 'abuse@cloudflare.com',  note => 'Report via https://www.cloudflare.com/abuse/' },
327        # Fastly / Akamai
328        'fastly.net'        => { email => 'abuse@fastly.com',      note => 'Fastly CDN' },
329        'akamai.com'        => { email => 'abuse@akamai.com',      note => 'Akamai CDN' },
330        'akamaitechnologies.com' => { email => 'abuse@akamai.com', note => 'Akamai CDN' },
331        # Namecheap
332        'namecheap.com'     => { email => 'abuse@namecheap.com',   note => 'Registrar abuse' },
333        # GoDaddy -- web form only; email bounces
334        'godaddy.com'       => {
335                form        => 'https://supportcenter.godaddy.com/AbuseReport',
336                form_paste  => 'Select the abuse type (spam, phishing, malware etc). '
337                             . 'Enter the domain name in the Domain field. '
338                             . 'Paste the originating IP, risk flags, and the relevant '
339                             . 'Received: headers from the report below.',
340                form_upload => 'Take a screenshot of the report as a .png or .jpg, '
341                             . 'or export it as a .pdf.',
342                note        => 'Registrar/host -- email reports not monitored, use web form',
343        },
344        # SendGrid / Twilio
345        'sendgrid.net'      => { email => 'abuse@sendgrid.com',    note => 'ESP -- include full headers' },
346        'sendgrid.com'      => { email => 'abuse@sendgrid.com',    note => 'ESP -- include full headers' },
347        # Mailchimp / Mandrill
348        'mailchimp.com'     => { email => 'abuse@mailchimp.com',   note => 'ESP abuse' },
349        'mandrillapp.com'   => { email => 'abuse@mailchimp.com',   note => 'Mandrill transactional ESP' },
350        # OVH
351        'ovh.net'           => { email => 'abuse@ovh.net',         note => 'OVH hosting' },
352        'ovh.com'           => { email => 'abuse@ovh.com',         note => 'OVH hosting' },
353        # Hetzner
354        'hetzner.com'       => { email => 'abuse@hetzner.com',     note => 'Hetzner hosting' },
355        # Digital Ocean
356        'digitalocean.com'  => { email => 'abuse@digitalocean.com',note => 'DO abuse form: https://www.digitalocean.com/company/contact/#abuse' },
357        # Linode / Akamai
358        'linode.com'        => { email => 'abuse@linode.com',      note => 'Linode/Akamai Cloud' },
359        # Constant Contact
360        'constantcontact.com' => { email => 'abuse@constantcontact.com', note => 'ESP abuse' },
361        'r.constantcontact.com' => { email => 'abuse@constantcontact.com', note => 'Constant Contact sending infrastructure' },
362        # HubSpot
363        'hubspot.com'         => { email => 'abuse@hubspot.com',       note => 'ESP abuse' },
364        'hs-analytics.net'    => { email => 'abuse@hubspot.com',       note => 'HubSpot analytics infrastructure' },
365        # Campaign Monitor
366        'createsend.com'      => { email => 'abuse@campaignmonitor.com', note => 'Campaign Monitor ESP' },
367        'cmail20.com'         => { email => 'abuse@campaignmonitor.com', note => 'Campaign Monitor sending infrastructure' },
368        # Klaviyo
369        'klaviyo.com'         => { email => 'abuse@klaviyo.com',       note => 'ESP abuse' },
370        # Brevo (formerly Sendinblue)
371        'sendinblue.com'      => { email => 'abuse@sendinblue.com',    note => 'ESP abuse' },
372        'brevo.com'           => { email => 'abuse@brevo.com',         note => 'ESP abuse' },
373        # Mailgun
374        'mailgun.com'         => { email => 'abuse@mailgun.com',       note => 'ESP abuse' },
375        'mailgun.org'         => { email => 'abuse@mailgun.com',       note => 'Mailgun sending infrastructure' },
376        # Postmark
377        'postmarkapp.com'     => { email => 'abuse@postmarkapp.com',   note => 'ESP abuse' },
378        # WordPress.com
379        'wordpress.com'       => { email => 'abuse@wordpress.com',     note => 'WordPress.com hosted blog -- report via https://en.wordpress.com/abuse/' },
380        'wp.com'              => { email => 'abuse@wordpress.com',     note => 'WordPress.com short domain' },
381        # Substack
382        'substack.com'        => { email => 'abuse@substack.com',      note => 'Substack newsletter platform abuse' },
383        # ActiveCampaign
384        'activecampaign.com'  => { email => 'abuse@activecampaign.com', note => 'ActiveCampaign ESP' },
385        'ac-tinker.com'       => { email => 'abuse@activecampaign.com', note => 'ActiveCampaign tracking infrastructure' },
386        # Salesforce Marketing Cloud
387        'salesforce.com'      => { email => 'abuse@salesforce.com',    note => 'Salesforce Marketing Cloud / ExactTarget ESP' },
388        'mc.salesforce.com'   => { email => 'abuse@salesforce.com',    note => 'Salesforce Marketing Cloud sending infrastructure' },
389        'exacttarget.com'     => { email => 'abuse@salesforce.com',    note => 'ExactTarget / Salesforce Marketing Cloud ESP' },
390        'et.exacttarget.com'  => { email => 'abuse@salesforce.com',    note => 'ExactTarget sending infrastructure' },
391        # Vultr
392        'vultr.com'           => { email => 'abuse@vultr.com',         note => 'Vultr hosting' },
393        # Contabo
394        'contabo.com'         => { email => 'abuse@contabo.com',       note => 'Contabo hosting' },
395        # Leaseweb
396        'leaseweb.com'        => { email => 'abuse@leaseweb.com',      note => 'Leaseweb hosting' },
397        # M247
398        'm247.com'            => { email => 'abuse@m247.com',          note => 'M247 hosting' },
399        # MarkMonitor -- web form only
400        'markmonitor.com'       => {
401                form        => 'https://corp.markmonitor.com/domain/ui/abuse-report',
402                form_paste  => 'Complete all fields including the domain name and your '
403                             . 'description of the abuse.  Paste the originating IP, '
404                             . 'risk flags, and the relevant Received: headers from the '
405                             . 'report below.',
406                form_upload => 'Take a screenshot of the report as a .png or .jpg, or export it as a .pdf.  MarkMonitor does not accept .eml files.',
407                note        => 'Brand-protection registrar -- email reports not processed',
408        },
409        # URL shortener operators
410        'is.gd'             => { email => 'abuse@is.gd',           note => 'URL shortener -- report via https://is.gd/contact.php' },
411        'bitly.com'         => { email => 'abuse@bitly.com',        note => 'URL shortener abuse' },
412        'bit.ly'            => { email => 'abuse@bitly.com',        note => 'URL shortener abuse' },
413        'tinyurl.com'       => { email => 'abuse@tinyurl.com',      note => 'URL shortener abuse' },
414        'ow.ly'             => { email => 'abuse@hootsuite.com',    note => 'Hootsuite URL shortener' },
415        'buff.ly'           => { email => 'abuse@buffer.com',       note => 'Buffer URL shortener' },
416        'rb.gy'             => { email => 'abuse@rb.gy',            note => 'URL shortener abuse' },
417        'cutt.ly'           => { email => 'abuse@cutt.ly',          note => 'URL shortener abuse' },
418        'shorturl.at'       => { email => 'abuse@shorturl.at',      note => 'URL shortener abuse' },
419        # Dynadot -- web form only
420        'dynadot.com'           => {
421                form        => 'https://www.dynadot.com/report-abuse',
422                form_paste  => 'Complete all fields including the domain name and your '
423                             . 'description of the abuse.  Paste the originating IP, '
424                             . 'risk flags, and the relevant Received: headers from the '
425                             . 'report below.',
426                form_upload => 'Take a screenshot of the report as a .png or .jpg, '
427                             . 'or export it as a .pdf.',
428                note        => 'Registrar -- email reports not monitored, use web form',
429        },
430        # Global Domain Group -- web form only
431        'globaldomaingroup.com' => {
432                form        => 'https://globaldomaingroup.com/report-abuse',
433                form_paste  => 'Complete all fields including the domain name and your '
434                             . 'description of the abuse.  Paste the originating IP, '
435                             . 'risk flags, and the relevant Received: headers from the '
436                             . 'report below.',
437                form_upload => 'Attach the original spam message as an .eml file.',
438                note        => 'Registrar -- email reports explicitly not accepted',
439        },
440        # TPG / Internode (Australia)
441        'tpgi.com.au'       => { email => 'abuse@tpg.com.au',      note => 'TPG Telecom Australia' },
442        'tpg.com.au'        => { email => 'abuse@tpg.com.au',      note => 'TPG Telecom Australia' },
443        'internode.on.net'  => { email => 'abuse@internode.on.net',note => 'Internode Australia' },
444);
445
446# -----------------------------------------------------------------------
447# Constructor
448# -----------------------------------------------------------------------
449
450 - 585
=head1 METHODS

=head2 new( %options )

Constructs and returns a new C<Email::Abuse::Investigator> analyser object.  The
object is stateless until C<parse_email()> is called; all analysis results
are stored on the object and retrieved via the public accessor methods
documented below.

A single object may be reused for multiple emails by calling C<parse_email()>
again: all per-message cached state from the previous message is discarded
automatically.  Cross-message IP and domain lookup results are retained
in a shared CHI cache (if C<CHI> is installed) to avoid redundant network
queries across messages processed in the same process.

=head3 Usage

    # Minimal -- all options take safe defaults
    my $analyser = Email::Abuse::Investigator->new();

    # With options
    my $analyser = Email::Abuse::Investigator->new(
        timeout        => 15,
        trusted_relays => ['203.0.113.0/24', '10.0.0.0/8'],
        verbose        => 0,
    );

    $analyser->parse_email($raw_rfc2822_text);
    my $origin   = $analyser->originating_ip();
    my @urls     = $analyser->embedded_urls();
    my @domains  = $analyser->mailto_domains();
    my $risk     = $analyser->risk_assessment();
    my @contacts = $analyser->abuse_contacts();
    print $analyser->report();

=head3 Arguments

All arguments are optional named parameters passed as a flat key-value list.

=over 4

=item C<timeout> (integer, default 10)

Maximum seconds to wait for any single network operation.  Set to 0 to
disable timeouts (not recommended for production use).

=item C<trusted_relays> (arrayref of strings, default [])

IP addresses or CIDR blocks to skip during Received: chain analysis.
Each element may be an exact IPv4 address (C<'192.0.2.1'>) or a CIDR
block (C<'192.0.2.0/24'>).

=item C<verbose> (boolean, default 0)

When true, diagnostic messages are written to STDERR.

=back

=head3 Returns

A blessed C<Email::Abuse::Investigator> object.  No network I/O is performed
during construction.

=head3 Side Effects

If C<CHI> is installed, a shared in-memory cache is initialised (or
re-used if a cache was already created by a prior call to C<new()>).
This cache persists for the lifetime of the process.

=head3 Notes

=over 4

=item *

Unknown option keys are silently ignored.

=item *

The object is not thread-safe.  Use a separate object per thread.

=item *

WHOIS read timeouts use C<IO::Select> rather than C<alarm()>, so they
work correctly on Windows and in threaded Perl interpreters.

=back

=head3 API Specification

=head4 Input

    {
        timeout => {
            type     => 'integer',
            optional => 1,
            min      => 0,
            default  => 10,
        },
        trusted_relays => {
            type          => 'arrayref',
            element_type  => 'string',
            optional      => 1,
            default       => [],
        },
        verbose => {
            type     => 'boolean',
            optional => 1,
            default  => 0,
        },
    }

=head4 Output

    {
        type => 'Email::Abuse::Investigator',
        isa  => 'Email::Abuse::Investigator',
    }

=head3 FORMAL SPECIFICATION

    -- Z notation (simplified)
    new == [
      timeout        : N;
      trusted_relays : seq STRING;
      verbose        : BOOL;
      _raw           : STRING;
      _headers       : seq (STRING x STRING);
      _origin?       : IP_INFO | undefined;
      _urls?         : seq URL_INFO | undefined;
      _risk?         : RISK_INFO | undefined
    ]
    pre: timeout >= 0
    post: self.timeout = params.timeout /\ self._raw = ''

=cut
586
587# Class-level cross-message CHI cache (shared across all instances).
588# Populated lazily on first call to new() when CHI is available.
589my $_cache;
590
591sub new {
592
2608
747139
        my $class = shift;
593
594        # Accept hash or hashref arguments uniformly
595
2608
4145
        my $params = Params::Validate::Strict::validate_strict({
596                args => Params::Get::get_params(undef, \@_) || {},
597                schema => {
598                        timeout => {
599                                'type'     => 'integer',
600                                'optional' => 1,
601                                'min'      => 0,
602                        },
603                        trusted_relays => {
604                                'type'         => 'arrayref',
605                                'element_type' => 'string',
606                                'optional'     => 1,
607                        },
608                        verbose => {
609                                'type'     => 'boolean',
610                                'optional' => 1,
611                        },
612                },
613        });
614
615        # Merge in any file-based configuration via Object::Configure
616
1630
161484
        $params = Object::Configure::configure($class, $params);
617
618        # Initialise the cross-message CHI cache on first construction
619
1630
2716946
        if ($HAS_CHI && !$_cache) {
620
0
0
                $_cache = CHI->new(
621                        driver     => 'Memory',
622                        global     => 1,
623                        expires_in => $CACHE_TTL_SECS,
624                );
625        }
626
627        # Build and bless the object with default slot values
628        return bless {
629                timeout        => $DEFAULT_TIMEOUT,
630                trusted_relays => [],
631                verbose        => 0,
632                _raw           => '',
633                _headers       => [],
634                _body_plain    => '',
635                _body_html     => '',
636                _received      => [],
637                _origin        => undef,
638                _urls          => undef,     # lazy-computed by embedded_urls()
639                _mailto_domains=> undef,     # lazy-computed by mailto_domains()
640                _domain_info   => {},        # per-message domain analysis cache
641                _sending_sw    => [],        # X-Mailer / X-PHP-Originating-Script etc.
642                _rcvd_tracking => [],        # per-hop tracking IDs from Received: headers
643
1630
1630
2114
8196
                %{$params},             # Override the defaults with Object:Configure and the values passed in
644        }, $class;
645}
646
647# -----------------------------------------------------------------------
648# Public: parse
649# -----------------------------------------------------------------------
650
651 - 740
=head2 parse_email( $text )

Feeds a raw RFC 2822 email message to the analyser and prepares it for
subsequent interrogation.  This is the only method that must be called
before any other public method.

If the same object is used for a second message, calling C<parse_email()>
again completely replaces all per-message state from the first message.
The cross-message CHI cache is B<not> flushed; IP and domain lookups
cached from prior messages are retained.

=head3 Usage

    my $raw = do { local $/; <STDIN> };
    $analyser->parse_email($raw);

    # Scalar reference (avoids copying large messages)
    $analyser->parse_email(\$raw);

    # Chained
    my $analyser = Email::Abuse::Investigator->new()->parse_email($raw);

=head3 Arguments

=over 4

=item C<$text> (scalar or scalar reference, required)

Complete raw RFC 2822 email message, including all headers and the body.
Both LF-only and CRLF line endings are accepted.

=back

=head3 Returns

The object itself (C<$self>), enabling method chaining.

=head3 Side Effects

Parses headers, decodes the body (quoted-printable, base64, multipart),
extracts sending-software fingerprints, and populates per-hop tracking
data.  All previously computed lazy results are discarded.

=head3 Notes

=over 4

=item *

If C<$text> is empty or contains no header/body separator, all public
methods will return empty/safe values.

=item *

Decoding errors in base64 or quoted-printable payloads are silenced; raw
bytes are used in place of correct output to prevent exceptions.

=back

=head3 API Specification

=head4 Input

    [
        {
            type => 'scalar | scalarref',
        },
    ]

=head4 Output

    {
        type => 'Email::Abuse::Investigator',
        isa  => 'Email::Abuse::Investigator',
    }

=head3 FORMAL SPECIFICATION

    -- Z notation
    parse_email == [
      Delta Email::Abuse::Investigator;
      text? : STRING | ref STRING
    ]
    pre:  defined text?
    post: self._raw = deref(text?) /\
          self._origin = undefined /\
          self._urls   = undefined /\
          self._risk   = undefined

=cut
741
742# TODO: Allow a Mail::Message object to be passed in
743sub parse_email {
744
485
55921
        my $self = shift;
745
746        # Accept both positional string and named 'text' argument
747
485
691
        my $args = Params::Get::get_params('text', \@_);
748
485
4791
        my $text = $args->{text};
749
750        # Dereference if a scalar reference was supplied
751
485
604
        $text = $$text if ref $text eq 'SCALAR';
752
753        # Sanitise: strip control characters that could affect terminal output.
754        # Keep \t (tabs in headers), \n (line endings), \r (CRLF mail format).
755
485
2216
        $text =~ s/[^\x09\x0A\x0D\x20-\x7E\x80-\xFF]//g if defined $text;
756
757        # Store the sanitised raw text for later reproduction in reports
758
485
1082
        $self->{_raw} = $text // '';
759
760        # Invalidate all per-message lazy caches
761
485
421
        $self->{_origin}         = undef;
762
485
382
        $self->{_urls}           = undef;
763
485
349
        $self->{_mailto_domains} = undef;
764
485
484
        $self->{_domain_info}    = {};
765
485
423
        $self->{_risk}           = undef;
766
485
376
        $self->{_auth_results}   = undef;
767
485
414
        $self->{_sending_sw}     = [];
768
485
385
        $self->{_rcvd_tracking}  = [];
769
770        # Perform synchronous header/body parsing (no network I/O)
771
485
1686
        $self->_split_message($text) if defined $text && $text =~ /\S/;
772
485
799
        return $self;
773}
774
775# -----------------------------------------------------------------------
776# Public: originating host
777# -----------------------------------------------------------------------
778
779 - 851
=head2 originating_ip()

Identifies the IP address of the machine that originally injected the
message into the mail system by walking the C<Received:> chain, skipping
private/trusted hops, and enriching the first external hop with rDNS,
WHOIS/RDAP organisation name, abuse contact, and country code.

Both IPv4 and IPv6 addresses are extracted and evaluated.

The result is cached; subsequent calls return the same hashref without
repeating network I/O.

=head3 Usage

    my $orig = $analyser->originating_ip();
    if (defined $orig) {
        printf "Origin: %s (%s)\n", $orig->{ip}, $orig->{rdns};
        printf "Owner:  %s\n",      $orig->{org};
    }

=head3 Arguments

None.  C<parse_email()> must have been called first.

=head3 Returns

A hashref with keys C<ip>, C<rdns>, C<org>, C<abuse>, C<confidence>,
C<note>, and C<country> (may be undef).  Returns C<undef> if no suitable
originating IP can be determined.

=head3 Side Effects

On first call: one PTR lookup and one RDAP/WHOIS query.  Results are cached
in the object and in the cross-message CHI cache (if available).

=head3 Notes

Only the first (oldest) external IP in the chain is reported.  See
C<received_trail()> for the full chain.

=head3 API Specification

=head4 Input

    []

=head4 Output

    {
        type => 'hashref | undef',
        keys => {
            ip         => { type => 'scalar', regex => qr/[\d.:a-fA-F]/ },
            rdns       => { type => 'scalar' },
            org        => { type => 'scalar' },
            abuse      => { type => 'scalar' },
            confidence => { type => 'scalar', regex => qr/^(?:high|medium|low)$/ },
            note       => { type => 'scalar' },
            country    => { type => 'scalar', optional => 1 },
        },
    }

=head3 FORMAL SPECIFICATION

    -- Z notation
    originating_ip == [
      Xi Email::Abuse::Investigator;
      result! : IP_INFO | undefined
    ]
    pre:  self._raw /= ''
    post: result! = self._origin /\
          (result! /= undefined => result!.ip in EXTERNAL_IPS)

=cut
852
853sub originating_ip {
854
591
5397
        my $self = $_[0];
855
856        # Return the cached result if we already have it
857
591
810
        $self->{_origin} //= $self->_find_origin();
858
591
563
        return $self->{_origin};
859}
860
861# -----------------------------------------------------------------------
862# Public: HTTP/HTTPS URLs
863# -----------------------------------------------------------------------
864
865 - 940
=head2 embedded_urls()

Extracts every HTTP and HTTPS URL from the message body and enriches each
one with the hosting IP address, network organisation name, abuse contact,
and country code.  Both IPv4 and IPv6 host addresses are supported.

URL extraction runs across both plain-text and HTML body parts.  DNS
lookups for each unique hostname are optionally parallelised via
C<AnyEvent::DNS> if that module is installed.

The result is cached; subsequent calls return the same list without
repeating network I/O.

=head3 Usage

    my @urls = $analyser->embedded_urls();
    for my $u (@urls) {
        printf "URL: %s  host: %s  org: %s\n",
            $u->{url}, $u->{host}, $u->{org};
    }

=head3 Arguments

None.  C<parse_email()> must have been called first.

=head3 Returns

A list of hashrefs, one per unique URL, in first-seen order.  Returns an
empty list if no HTTP/HTTPS URLs are present.  Each hashref has keys
C<url>, C<host>, C<ip>, C<org>, C<abuse>, C<country>.

=head3 Side Effects

Per unique hostname: one A/AAAA lookup and one RDAP/WHOIS query.  Results
are cached in the object and in the cross-message CHI cache.

=head3 Notes

Only C<http://> and C<https://> URLs are extracted.  URL shortener hosts
are included in the returned list (they are flagged by C<risk_assessment()>).

=head3 API Specification

=head4 Input

    []

=head4 Output

    (
        {
            type => 'hashref',
            keys => {
                url     => { type => 'scalar', regex => qr{^https?://}i },
                host    => { type => 'scalar' },
                ip      => { type => 'scalar' },
                org     => { type => 'scalar' },
                abuse   => { type => 'scalar' },
                country => { type => 'scalar', optional => 1 },
            },
        },
        ...
    )

=head3 FORMAL SPECIFICATION

    -- Z notation
    embedded_urls == [
      Xi Email::Abuse::Investigator;
      result! : seq URL_INFO
    ]
    pre:  self._raw /= ''
    post: result! = self._urls /\
          forall u : result! @ u.url =~ m{^https?://}i

=cut
941
942sub embedded_urls {
943
604
7101
        my $self = $_[0];
944
945
604
741
        $self->{_urls} //= $self->_extract_and_resolve_urls();
946
604
604
404
708
        return @{ $self->{_urls} };
947}
948
949# -----------------------------------------------------------------------
950# Public: mailto / reply-to / from domains
951# -----------------------------------------------------------------------
952
953 - 1022
=head2 mailto_domains()

Identifies every domain associated with the message as a contact, reply,
or delivery address, then runs a full intelligence pipeline on each one
(A record, MX, NS, WHOIS) to determine hosting and registration details.

The result is cached; subsequent calls return the same list without
repeating network I/O.

=head3 Usage

    my @domains = $analyser->mailto_domains();
    for my $d (@domains) {
        printf "Domain: %s  registrar: %s\n",
            $d->{domain}, $d->{registrar} // 'unknown';
    }

=head3 Arguments

None.  C<parse_email()> must have been called first.

=head3 Returns

A list of hashrefs, one per unique domain.  See the main POD for the full
set of possible keys.  Returns an empty list if no qualifying domains are
found.

=head3 Side Effects

Per unique domain: up to three A lookups, one MX lookup, one NS lookup,
and two WHOIS queries.  Results are cached in the object and in the
cross-message CHI cache.

=head3 Notes

MX and NS lookups require C<Net::DNS>.  Without it those keys are absent
from every returned hashref.

=head3 API Specification

=head4 Input

    []

=head4 Output

    (
        {
            type => 'hashref',
            keys => {
                domain  => { type => 'scalar' },
                source  => { type => 'scalar' },
                # All other keys optional -- see main POD
            },
        },
        ...
    )

=head3 FORMAL SPECIFICATION

    -- Z notation
    mailto_domains == [
      Xi Email::Abuse::Investigator;
      result! : seq DOMAIN_INFO
    ]
    pre:  self._raw /= ''
    post: result! = self._mailto_domains /\
          forall d : result! @ d.domain =~ /\.[a-zA-Z]{2,}$/

=cut
1023
1024sub mailto_domains {
1025
567
2864
        my $self = $_[0];
1026
1027
567
717
        $self->{_mailto_domains} //= $self->_extract_and_analyse_domains();
1028
567
567
381
593
        return @{ $self->{_mailto_domains} };
1029}
1030
1031 - 1086
=head2 all_domains()

Returns the deduplicated union of every registrable domain seen anywhere
in the message -- URL hosts from C<embedded_urls()> and contact domains
from C<mailto_domains()> -- normalised to eTLD+1 form.

Triggers C<embedded_urls()> and C<mailto_domains()> lazily.

=head3 Usage

    my @domains = $analyser->all_domains();
    print "$_\n" for @domains;

=head3 Arguments

None.

=head3 Returns

A list of plain strings (registrable domain names), lower-cased, no
duplicates, in first-seen order.

=head3 Side Effects

Triggers C<embedded_urls()> and C<mailto_domains()> if not already cached.

=head3 Notes

Normalisation to eTLD+1 uses C<Domain::PublicSuffix> if installed, falling
back to a built-in heuristic otherwise.

=head3 API Specification

=head4 Input

    []

=head4 Output

    (
        { type => 'scalar', regex => qr/^[a-z0-9][a-z0-9.-]*\.[a-z]{2,}$/ },
        ...
    )

=head3 FORMAL SPECIFICATION

    -- Z notation
    all_domains == [
      Xi Email::Abuse::Investigator;
      result! : seq STRING
    ]
    post: result! = deduplicate(
                      map(_registrable, url_hosts union mailto_domains)
                    )

=cut
1087
1088sub all_domains {
1089
11
3425
        my $self = $_[0];
1090
11
9
        my (%seen, @out);
1091
1092        # Collect registrable domains from URL hosts first
1093
11
18
        for my $u ($self->embedded_urls()) {
1094
9
14
                my $dom = _registrable($u->{host});
1095
9
28
                push @out, $dom if $dom && !$seen{$dom}++;
1096        }
1097
1098        # Then from contact domains (normalise subdomains to registrable parent)
1099
11
15
        for my $d ($self->mailto_domains()) {
1100
16
20
                my $dom = _registrable($d->{domain}) // $d->{domain};
1101
16
31
                push @out, $dom if $dom && !$seen{$dom}++;
1102        }
1103
11
19
        return @out;
1104}
1105
1106 - 1169
=head2 unresolved_contacts()

Returns a list of domains and URL hosts found in the message for which no
abuse contact could be determined.  Useful for surfacing parties that may
warrant manual investigation.

=head3 Usage

    my @unresolved = $analyser->unresolved_contacts();
    for my $u (@unresolved) {
        printf "Unresolved: %s (%s) via %s\n",
            $u->{domain}, $u->{type}, $u->{source};
    }

=head3 Arguments

None.

=head3 Returns

A list of hashrefs, each with keys C<domain>, C<type> (C<'url_host'> or
C<'domain'>), and C<source> (where the domain was found).

=head3 Side Effects

Triggers C<embedded_urls()>, C<mailto_domains()>, C<abuse_contacts()>,
and C<form_contacts()> if not already cached.

=head3 Notes

Domains sourced only from spoofable sending headers (C<From:>,
C<Return-Path:>, C<Sender:>) are excluded.

=head3 API Specification

=head4 Input

    []

=head4 Output

    (
        {
            type => 'hashref',
            keys => {
                domain => { type => 'scalar' },
                type   => { type => 'scalar', regex => qr/^(?:url_host|domain)$/ },
                source => { type => 'scalar' },
            },
        },
        ...
    )

=head3 FORMAL SPECIFICATION

    -- Z notation
    unresolved_contacts == [
      Xi Email::Abuse::Investigator;
      result! : seq UNRESOLVED_INFO
    ]
    post: forall u : result! @
            u.domain not_in covered_domains(abuse_contacts, form_contacts)

=cut
1170
1171sub unresolved_contacts {
1172
10
40
        my $self = $_[0];
1173
1174        # Build a set of domains already covered by email or form contacts
1175
10
10
        my %covered;
1176
10
16
        for my $c ($self->abuse_contacts(), $self->form_contacts()) {
1177
3
4
                my $dom = $c->{form_domain};
1178
3
6
                unless ($dom) {
1179                        # Extract domain from abuse email address
1180
3
8
                        ($dom) = ($c->{address} // '') =~ /\@([\w.-]+)/;
1181                }
1182
3
7
                $covered{lc $dom}++ if $dom;
1183        }
1184
1185        # Also mark URL hosts that already have a resolved abuse address
1186
10
16
        for my $u ($self->embedded_urls()) {
1187
6
14
                (my $bare = lc $u->{host}) =~ s/^www\.//;
1188
6
23
                $covered{$bare}++ if $u->{abuse} && $u->{abuse} ne '(unknown)';
1189        }
1190
1191
10
11
        my (@out, %seen);
1192
1193        # Check URL hosts first
1194
10
11
        for my $u ($self->embedded_urls()) {
1195
6
12
                (my $bare = lc $u->{host}) =~ s/^www\.//;
1196
6
12
                next if $covered{$bare};
1197
6
29
                next if $seen{"url:$bare"}++;
1198                push @out, {
1199                        domain => $u->{host},
1200
6
23
                        type   => 'url_host',
1201                        source => 'URL in body',
1202                };
1203        }
1204
1205        # Then check contact domains, skipping spoofable-header-only sources
1206
10
15
        for my $d ($self->mailto_domains()) {
1207
15
17
                my $dom    = $d->{domain};
1208
15
15
                my $source = $d->{source} // '';
1209
15
32
                next if $source =~ /^(?:From:|Return-Path:|Sender:) header$/;
1210
3
6
                next if $covered{lc $dom};
1211
3
7
                next if $seen{"dom:$dom"}++;
1212
3
7
                push @out, {
1213                        domain => $dom,
1214                        type   => 'domain',
1215                        source => $source,
1216                };
1217        }
1218
1219
10
19
        return @out;
1220}
1221
1222# -----------------------------------------------------------------------
1223# Public: sending software fingerprint
1224# -----------------------------------------------------------------------
1225
1226 - 1289
=head2 sending_software()

Returns information extracted from headers that identify the software or
server-side infrastructure used to compose or inject the message.  Headers
such as C<X-PHP-Originating-Script> reveal the exact PHP script and Unix
account responsible on shared-hosting platforms.

Data is extracted during C<parse_email()> with no network I/O.

=head3 Usage

    my @sw = $analyser->sending_software();
    for my $s (@sw) {
        printf "%-30s : %s\n", $s->{header}, $s->{value};
    }

=head3 Arguments

None.  C<parse_email()> must have been called first.

=head3 Returns

A list of hashrefs in alphabetical header-name order.  Returns an empty
list if none of the watched headers are present.  Each hashref has keys
C<header>, C<value>, and C<note>.

=head3 Side Effects

None.  Data is pre-collected during C<parse_email()>.

=head3 Notes

Header names are lower-cased.  Header values are stored verbatim.

=head3 API Specification

=head4 Input

    []

=head4 Output

    (
        {
            type => 'hashref',
            keys => {
                header => { type => 'scalar' },
                value  => { type => 'scalar' },
                note   => { type => 'scalar' },
            },
        },
        ...
    )

=head3 FORMAL SPECIFICATION

    -- Z notation
    sending_software == [
      Xi Email::Abuse::Investigator;
      result! : seq SW_INFO
    ]
    post: result! = self._sending_sw

=cut
1290
1291sub sending_software {
1292
53
50
        my $self = $_[0];
1293
1294
53
53
33
62
        return @{ $self->{_sending_sw} };
1295}
1296
1297# -----------------------------------------------------------------------
1298# Public: per-hop tracking IDs
1299# -----------------------------------------------------------------------
1300
1301 - 1366
=head2 received_trail()

Returns per-hop tracking data extracted from the C<Received:> header chain:
the IP address, envelope recipient address, and server session ID for each
relay.  ISP postmasters use these identifiers to locate the SMTP session in
their logs.

=head3 Usage

    my @trail = $analyser->received_trail();
    for my $hop (@trail) {
        printf "IP: %s  ID: %s\n",
            $hop->{ip} // '?', $hop->{id} // '?';
    }

=head3 Arguments

None.  C<parse_email()> must have been called first.

=head3 Returns

A list of hashrefs in oldest-first order.  Returns an empty list if no
C<Received:> headers are present or none yielded extractable data.  Each
hashref has keys C<received>, C<ip> (may be undef), C<for> (may be undef),
C<id> (may be undef).

=head3 Side Effects

None.  Data is pre-collected during C<parse_email()>.

=head3 Notes

Private IPs are NOT filtered here; all IPs including RFC 1918 addresses
are returned as found.  Filtering is applied only by C<originating_ip()>.

=head3 API Specification

=head4 Input

    []

=head4 Output

    (
        {
            type => 'hashref',
            keys => {
                received => { type => 'scalar' },
                ip       => { type => 'scalar', optional => 1 },
                for      => { type => 'scalar', optional => 1 },
                id       => { type => 'scalar', optional => 1 },
            },
        },
        ...
    )

=head3 FORMAL SPECIFICATION

    -- Z notation
    received_trail == [
      Xi Email::Abuse::Investigator;
      result! : seq HOP_INFO
    ]
    post: result! = self._rcvd_tracking

=cut
1367
1368sub received_trail {
1369
53
40
        my $self = $_[0];
1370
1371
53
53
32
84
        return @{ $self->{_rcvd_tracking} };
1372}
1373
1374# -----------------------------------------------------------------------
1375# Public: risk assessment
1376# -----------------------------------------------------------------------
1377
1378 - 1449
=head2 risk_assessment()

Evaluates the message against heuristic checks and returns an overall risk
level, a weighted numeric score, and a list of every specific red flag.

The assessment covers five categories: originating IP, email authentication,
Date: header validity, identity/header consistency, and URL/domain properties.

The result is cached; subsequent calls return the same hashref without
repeating any analysis.

=head3 Usage

    my $risk = $analyser->risk_assessment();
    printf "Risk: %s (score: %d)\n", $risk->{level}, $risk->{score};
    for my $f (@{ $risk->{flags} }) {
        printf "  [%s] %s\n", $f->{severity}, $f->{detail};
    }

=head3 Arguments

None.  C<parse_email()> must have been called first.

=head3 Returns

A hashref with keys C<level> (HIGH/MEDIUM/LOW/INFO), C<score> (integer),
and C<flags> (arrayref of hashrefs with C<severity>, C<flag>, C<detail>).

=head3 Side Effects

Triggers C<originating_ip()>, C<embedded_urls()>, and C<mailto_domains()>
if not already cached.

=head3 Notes

Scores: HIGH >= 9, MEDIUM >= 5, LOW >= 2, INFO < 2.
Flag weights: HIGH=3, MEDIUM=2, LOW=1, INFO=0.

=head3 API Specification

=head4 Input

    []

=head4 Output

    {
        type => 'hashref',
        keys => {
            level => { type => 'scalar', regex => qr/^(?:HIGH|MEDIUM|LOW|INFO)$/ },
            score => { type => 'scalar', regex => qr/^\d+$/ },
            flags => { type => 'arrayref' },
        },
    }

=head3 FORMAL SPECIFICATION

    -- Z notation
    risk_assessment == [
      Xi Email::Abuse::Investigator;
      result! : RISK_INFO
    ]
    post: result!.score = sum({ w(f.severity) | f in result!.flags }) /\
          result!.level = classify(result!.score)
    where:
      w(HIGH) = 3; w(MEDIUM) = 2; w(LOW) = 1; w(INFO) = 0
      classify(s) = HIGH   if s >= 9
                  | MEDIUM if s >= 5
                  | LOW    if s >= 2
                  | INFO   otherwise

=cut
1450
1451sub risk_assessment {
1452
261
6828
        my $self = $_[0];
1453
1454
261
301
        return $self->{_risk} if $self->{_risk};
1455
1456
241
174
        my (@flags, $score);
1457
241
162
        $score = 0;
1458
1459        # Closure to record a flag and accumulate its weight
1460        my $flag = sub {
1461
256
273
                my ($severity, $name, $detail) = @_;
1462
256
509
                $score += $FLAG_WEIGHT{$severity} // 1;
1463
256
1100
                push @flags, { severity => $severity, flag => $name, detail => $detail };
1464
241
458
        };
1465
1466        # --- Group 1: Originating IP checks ---
1467
241
259
        my $orig = $self->originating_ip();
1468
241
261
        if ($orig) {
1469                # Residential / broadband rDNS patterns suggest a compromised host
1470
198
776
                if ($orig->{rdns} && $orig->{rdns} =~ /
1471                        \d+[-_.]\d+[-_.]\d+[-_.]\d+   # dotted-quad in rDNS
1472                        | (?:dsl|adsl|cable|broad|dial|dynamic|dhcp|ppp|
1473                             residential|cust|home|pool|client|user|
1474                             static\d|host\d)
1475                /xi) {
1476
27
43
                        $flag->('HIGH', 'residential_sending_ip',
1477                                "Sending IP $orig->{ip} rDNS '$orig->{rdns}' looks like a broadband/residential line, not a legitimate mail server");
1478                }
1479
1480                # Absence of rDNS is a strong spam indicator
1481
198
345
                if (!$orig->{rdns} || $orig->{rdns} eq '(no reverse DNS)') {
1482
8
35
                        $flag->('HIGH', 'no_reverse_dns',
1483                                "Sending IP $orig->{ip} has no reverse DNS -- legitimate mail servers always have rDNS");
1484                }
1485
1486                # Low-confidence origin means the IP came from an unverifiable header
1487
198
206
                if ($orig->{confidence} eq 'low') {
1488
3
7
                        $flag->('MEDIUM', 'low_confidence_origin',
1489                                "Originating IP taken from unverified header ($orig->{note})");
1490                }
1491
1492                # Statistically high-volume spam countries (informational only)
1493
198
281
                if ($orig->{country} && $orig->{country} =~ /^(?:CN|RU|NG|VN|IN|PK|BD)$/) {
1494                        $flag->('INFO', 'high_spam_country',
1495
18
37
                                'Sending IP is in ' . _country_name($orig->{country}) .
1496                                " ($orig->{country}) -- statistically high spam volume country");
1497                }
1498        }
1499
1500        # --- Group 2: Email authentication checks ---
1501
241
269
        my $auth = $self->_parse_auth_results_cached();
1502
241
245
        if (defined $auth->{spf}) {
1503
19
52
                if ($auth->{spf} =~ /^fail/i) {
1504
6
10
                        $flag->('HIGH', 'spf_fail',
1505                                "SPF result: $auth->{spf} -- sending IP not authorised by domain's SPF record");
1506                } elsif ($auth->{spf} =~ /^softfail/i) {
1507
1
2
                        $flag->('MEDIUM', 'spf_softfail',
1508                                "SPF result: softfail (~all) -- sending IP not explicitly authorised");
1509                } elsif ($auth->{spf} !~ /^pass/i) {
1510
0
0
                        $flag->('HIGH', 'spf_fail',
1511                                "SPF result: $auth->{spf} -- sending IP not authorised");
1512                }
1513        }
1514
241
298
        if (defined $auth->{dkim} && $auth->{dkim} !~ /^pass/i) {
1515
7
15
                $flag->('HIGH', 'dkim_fail',
1516                        "DKIM result: $auth->{dkim} -- message signature invalid or absent");
1517        }
1518
241
301
        if (defined $auth->{dmarc} && $auth->{dmarc} !~ /^pass/i) {
1519
5
8
                $flag->('HIGH', 'dmarc_fail', "DMARC result: $auth->{dmarc}");
1520        }
1521
1522        # DKIM signing domain vs From: domain mismatch check
1523
241
210
        if ($auth->{dkim_domain}) {
1524
4
6
                my ($from_domain) = ($self->_header_value('from') // '') =~ /\@([\w.-]+)/;
1525
4
6
                if ($from_domain) {
1526
4
6
                        my $reg_dkim = _registrable($auth->{dkim_domain}) // $auth->{dkim_domain};
1527
4
6
                        my $reg_from = _registrable(lc $from_domain)     // lc $from_domain;
1528
4
5
                        if ($reg_dkim ne $reg_from) {
1529                                # Passing DKIM with a different domain is normal for ESPs
1530
2
5
                                if ($auth->{dkim} && $auth->{dkim} =~ /^pass/i) {
1531
1
3
                                        $flag->('INFO', 'dkim_domain_mismatch',
1532                                                "DKIM signed by '$auth->{dkim_domain}' but From: domain is '$from_domain'"
1533                                                . ' -- message sent via third-party sender (normal for bulk/ESP mail)');
1534                                } else {
1535                                        # Failing DKIM plus mismatched domain is more suspicious
1536
1
2
                                        $flag->('MEDIUM', 'dkim_domain_mismatch',
1537                                                "DKIM signed by '$auth->{dkim_domain}' but From: domain is '$from_domain'"
1538                                                . ' and DKIM did not pass -- possible impersonation');
1539                                }
1540                        }
1541                }
1542        }
1543
1544        # --- Group 3: Date: header checks ---
1545
241
246
        my $date_raw = $self->_header_value('date');
1546
241
468
        if (!$date_raw || $date_raw !~ /\S/) {
1547
35
34
                $flag->('MEDIUM', 'missing_date',
1548                        'No Date: header -- violates RFC 5322; common in spam');
1549        } else {
1550                # Check for an implausible timezone offset (outside real-world bounds)
1551
206
442
                if ($date_raw =~ /([+-])(\d{2})(\d{2})\s*$/) {
1552
206
309
                        my ($sign, $hh, $mm) = ($1, $2, $3);
1553
206
229
                        my $offset_mins = $hh * 60 + $mm;
1554
206
545
                        my $implausible = $mm >= 60
1555                                || ($sign eq '+' && $offset_mins > $TZ_MAX_POS_MINS)
1556                                || ($sign eq '-' && $offset_mins > $TZ_MAX_NEG_MINS);
1557
206
217
                        if ($implausible) {
1558
8
17
                                $flag->('MEDIUM', 'implausible_timezone',
1559                                        "Date: '$date_raw' contains an implausible timezone offset "
1560                                        . "($sign$hh$mm) -- header is likely forged");
1561                        }
1562                }
1563
1564                # Check for dates more than DATE_SKEW_DAYS outside the analysis window
1565
206
202
                my $date_epoch = _parse_rfc2822_date($date_raw);
1566
206
3220
                if (defined $date_epoch) {
1567
206
186
                        my $delta = time() - $date_epoch;
1568
206
272
                        if ($delta > $DATE_SKEW_DAYS * $SECS_PER_DAY) {
1569
25
43
                                $flag->('LOW', 'suspicious_date',
1570                                        "Date: '$date_raw' is more than $DATE_SKEW_DAYS days in the past");
1571                        } elsif ($delta < -($DATE_SKEW_DAYS * $SECS_PER_DAY)) {
1572
2
3
                                $flag->('LOW', 'suspicious_date',
1573                                        "Date: '$date_raw' is more than $DATE_SKEW_DAYS days in the future");
1574                        }
1575                }
1576        }
1577
1578        # --- Group 4: Header identity checks ---
1579
241
230
        my $from_raw     = $self->_header_value('from') // '';
1580
241
273
        my $from_decoded = $self->_decode_mime_words($from_raw);
1581
1582        # Display-name domain spoofing: "PayPal paypal.com" <phish@evil.example>
1583
241
522
        if ($from_decoded =~ /^"?([^"<]+?)"?\s*<([^>]+)>/) {
1584
130
156
                my ($display, $addr) = ($1, $2);
1585
130
179
                while ($display =~ /\b([\w-]+\.(?:com|net|org|io|co|uk|au|gov|edu))\b/gi) {
1586
5
7
                        my $disp_domain = lc $1;
1587
5
9
                        my ($addr_domain) = $addr =~ /\@([\w.-]+)/;
1588
5
9
                        $addr_domain = lc($addr_domain // '');
1589
5
8
                        my $reg_disp = _registrable($disp_domain);
1590
5
8
                        my $reg_addr = _registrable($addr_domain);
1591
5
17
                        if ($reg_disp && $reg_addr && $reg_disp ne $reg_addr) {
1592
5
9
                                $flag->('HIGH', 'display_name_domain_spoof',
1593                                        "From: display name mentions '$disp_domain' but actual address is <$addr>");
1594                        }
1595                }
1596        }
1597
1598        # Free webmail sender flag (no corporate infrastructure)
1599
241
515
        if ($from_raw =~ /\@(gmail|yahoo|hotmail|outlook|live|aol|protonmail|yandex)\./i
1600         || $from_raw =~ /\@mail\.ru(?:[\s>]|$)/i) {
1601
26
60
                $flag->('MEDIUM', 'free_webmail_sender',
1602                        "Message sent from free webmail address ($from_raw)");
1603        }
1604
1605        # Reply-To differs from From: -- replies harvested by different address
1606
241
213
        my $reply_to = $self->_header_value('reply-to');
1607
241
212
        if ($reply_to) {
1608
6
13
                my ($from_addr)  = $from_raw =~ /([\w.+%-]+\@[\w.-]+)/;
1609
6
11
                my ($reply_addr) = $reply_to =~ /([\w.+%-]+\@[\w.-]+)/;
1610
6
27
                if ($from_addr && $reply_addr && lc($from_addr) ne lc($reply_addr)) {
1611
4
6
                        $flag->('MEDIUM', 'reply_to_differs_from_from',
1612                                "Reply-To ($reply_addr) differs from From: ($from_addr)");
1613                }
1614        }
1615
1616        # Undisclosed or absent To: header
1617
241
204
        my $to = $self->_header_value('to') // '';
1618
241
531
        if ($to =~ /undisclosed|:;/ || $to eq '') {
1619
37
45
                $flag->('MEDIUM', 'undisclosed_recipients',
1620                        "To: header is '$to' -- message was bulk-sent with hidden recipient list");
1621        }
1622
1623        # MIME-encoded Subject (potential filter evasion)
1624
241
216
        my $subj_raw = $self->_header_value('subject') // '';
1625
241
275
        if ($subj_raw =~ /=\?[^?]+\?[BQ]\?/i) {
1626
7
12
                $flag->('LOW', 'encoded_subject',
1627                        "Subject line is MIME-encoded: '$subj_raw' (decoded: '"
1628                        . $self->_decode_mime_words($subj_raw) . "')");
1629        }
1630
1631        # --- Group 5: URL and domain checks ---
1632
241
170
        my (%shortener_seen, %url_host_seen);
1633
241
261
        for my $u ($self->embedded_urls()) {
1634                # Skip trusted infrastructure -- these are not spam indicators
1635
49
52
                my $bare = lc $u->{host};
1636
49
47
                $bare =~ s/^www\.//;
1637
49
58
                next if $self->{trusted_domains}->{$bare};
1638
49
55
                next if $TRUSTED_DOMAINS{$bare};
1639
1640                # URL shortener hides real destination
1641
47
120
                if(($URL_SHORTENERS{$bare} || $self->{url_shorteners}->{$bare}) && !$shortener_seen{$bare}++) {
1642
6
9
                        $flag->('MEDIUM', 'url_shortener',
1643                                "$u->{host} is a URL shortener -- the real destination is hidden");
1644                }
1645                # Plain HTTP provides no encryption
1646
47
91
                if ($u->{url} =~ m{^http://}i && !$url_host_seen{ $u->{host} }++) {
1647
5
11
                        $flag->('LOW', 'http_not_https',
1648                                "$u->{host} linked over plain HTTP -- no encryption");
1649                }
1650        }
1651
1652        # Domain-level checks against contact/reply domains
1653
241
233
        for my $d ($self->mailto_domains()) {
1654                # Recently registered domain is a common phishing indicator
1655
109
107
                if ($d->{recently_registered}) {
1656
5
35
                        $flag->('HIGH', 'recently_registered_domain',
1657
5
12
                                "$d->{domain} was registered $d->{registered} (less than ${\$RECENT_REG_DAYS} days ago)");
1658                }
1659
1660                # Domain expiry checks
1661
109
100
                if ($d->{expires}) {
1662
13
16
                        if(my $exp = $self->_parse_date_to_epoch($d->{expires})) {
1663
13
176
                                my $now      = time();
1664
13
11
                                my $remaining = $exp - $now;
1665
13
38
                                if ($remaining > 0 && $remaining < $EXPIRY_WARN_DAYS * $SECS_PER_DAY) {
1666
3
6
                                        $flag->('HIGH', 'domain_expires_soon',
1667                                                "$d->{domain} expires $d->{expires} -- may be a throwaway domain");
1668                                } elsif ($remaining <= 0) {
1669
3
6
                                        $flag->('HIGH', 'domain_expired',
1670                                                "$d->{domain} expired $d->{expires} -- domain has lapsed");
1671                                }
1672                        }
1673                }
1674
1675                # Lookalike domain check (brand name in a non-brand domain)
1676
109
109
                for my $brand (qw(paypal apple google amazon microsoft netflix ebay
1677                                  instagram facebook twitter linkedin bankofamerica
1678                                  wellsfargo chase barclays hsbc lloyds santander)) {
1679
1833
4688
                        if ($d->{domain} =~ /\Q$brand\E/i &&
1680                            $d->{domain} !~ /^\Q$brand\E\.(?:com|co\.uk|net|org)$/) {
1681
8
11
                                $flag->('HIGH', 'lookalike_domain',
1682                                        "$d->{domain} contains brand name '$brand' but is not the real domain -- possible phishing");
1683
8
11
                                last;
1684                        }
1685                }
1686        }
1687
1688        # Determine overall risk level from accumulated score
1689
241
331
        my $level = $score >= $SCORE_HIGH   ? 'HIGH'
1690                  : $score >= $SCORE_MEDIUM ? 'MEDIUM'
1691                  : $score >= $SCORE_LOW    ? 'LOW'
1692                  :                           'INFO';
1693
1694
241
378
        $self->{_risk} = { level => $level, score => $score, flags => \@flags };
1695
241
839
        return $self->{_risk};
1696}
1697
1698# -----------------------------------------------------------------------
1699# Public: abuse report text
1700# -----------------------------------------------------------------------
1701
1702 - 1758
=head2 abuse_report_text()

Produces a compact, plain-text string suitable for sending as the body of
an abuse report email.  It summarises risk level, red flags, originating IP,
abuse contacts, and original message headers.  The message body is omitted
to keep the report concise.

Use C<abuse_contacts()> to get the recipient addresses and this method for
the body text.

=head3 Usage

    my $text     = $analyser->abuse_report_text();
    my @contacts = $analyser->abuse_contacts();
    for my $c (@contacts) {
        send_email(to => $c->{address}, body => $text);
    }

=head3 Arguments

None.  C<parse_email()> must have been called first.

=head3 Returns

A plain scalar string, newline-terminated, Unix line endings.  Never empty
or undef.

=head3 Side Effects

Calls C<risk_assessment()>, C<originating_ip()>, and C<abuse_contacts()>
if not already cached.

=head3 Notes

Output text is sanitised: control characters that could affect terminal or
HTML rendering are stripped from all user-derived content before inclusion.

=head3 API Specification

=head4 Input

    []

=head4 Output

    { type => 'scalar' }

=head3 FORMAL SPECIFICATION

    -- Z notation
    abuse_report_text == [
      Xi Email::Abuse::Investigator;
      result! : STRING
    ]
    post: result! /= '' /\ result! ends_with '\n'

=cut
1759
1760sub abuse_report_text {
1761
14
3874
        my $self = $_[0];
1762
14
12
        my @out;
1763
1764
14
18
        push @out, 'This is an automated abuse report generated by Email::Abuse::Investigator.',
1765                 'Please investigate the following spam/phishing message.',
1766                '';
1767
1768
14
47
        my $risk = $self->risk_assessment();
1769
14
29
        push @out, "RISK LEVEL: $risk->{level} (score: $risk->{score})",
1770                '';
1771
1772        # List each red flag with its severity prefix
1773
14
14
8
21
        if (@{ $risk->{flags} }) {
1774
9
10
                push @out, 'RED FLAGS IDENTIFIED:';
1775
9
9
9
11
                for my $f (@{ $risk->{flags} }) {
1776
17
24
                        push @out, "  [$f->{severity}] " . _sanitise_output($f->{detail});
1777                }
1778
9
9
                push @out, '';
1779        }
1780
1781        # Originating IP summary block
1782
14
16
        my $orig = $self->originating_ip();
1783
14
17
        if ($orig) {
1784                push @out, 'ORIGINATING IP: ' . _sanitise_output("$orig->{ip} ($orig->{rdns})"),
1785
11
16
                        'NETWORK OWNER:  ' . _sanitise_output($orig->{org}),
1786                        '';
1787        }
1788
1789        # Email abuse contacts
1790
14
27
        my @contacts = $self->abuse_contacts();
1791
14
17
        if (@contacts) {
1792
12
12
                push @out, 'ABUSE CONTACTS:';
1793
12
25
                push @out, '  ' . _sanitise_output("$_->{address} ($_->{role})") for @contacts;
1794
12
11
                push @out, '';
1795        }
1796
1797        # Web-form contacts (providers that reject email)
1798
14
21
        if(my @form_cs = $self->form_contacts()) {
1799
0
0
                push @out, 'WEB-FORM REPORTS REQUIRED:',
1800                        '  The following parties do not accept email -- submit manually:';
1801
0
0
                for my $c (@form_cs) {
1802                        push @out, "  [$c->{role}]",
1803
0
0
                                '    Form   : ' . _sanitise_output($c->{form});
1804
0
0
                        push @out, '    Domain : ' . _sanitise_output($c->{form_domain}) if $c->{form_domain};
1805
0
0
                        push @out, '    Paste  : ' . _sanitise_output($c->{form_paste})  if $c->{form_paste};
1806
0
0
                        push @out, '    Upload : ' . _sanitise_output($c->{form_upload}) if $c->{form_upload};
1807                }
1808
0
0
                push @out, '';
1809        }
1810
1811        # Separator and raw headers (body excluded for brevity)
1812
14
24
        push @out, '-' x 72,
1813                'ORIGINAL MESSAGE HEADERS:',
1814                '-' x 72;
1815
1816
14
14
11
15
        for my $h (@{ $self->{_headers} }) {
1817
114
105
                push @out, _sanitise_output("$h->{name}: $h->{value}");
1818        }
1819
14
14
        push @out, '';
1820
1821
14
67
        return join("\n", @out);
1822}
1823
1824# -----------------------------------------------------------------------
1825# Public: abuse contacts
1826# -----------------------------------------------------------------------
1827
1828 - 1895
=head2 abuse_contacts()

Collates the complete set of parties that should receive an abuse report:
the sending ISP, URL host operators, contact domain web/mail/DNS/registrar
contacts, account providers identified from key headers, the DKIM signer,
and the ESP identified via List-Unsubscribe.

Addresses are deduplicated globally; if the same address is found via
multiple routes, a single entry is kept and role strings are merged.

=head3 Usage

    my @contacts = $analyser->abuse_contacts();
    my @addrs    = map { $_->{address} } @contacts;

=head3 Arguments

None.  C<parse_email()> must have been called first.

=head3 Returns

A list of hashrefs, one per unique abuse address, in discovery order.
Each hashref has keys C<role>, C<roles> (arrayref), C<address>, C<note>,
C<via>.  Returns an empty list if no contacts can be determined.

=head3 Side Effects

Triggers C<originating_ip()>, C<embedded_urls()>, and C<mailto_domains()>
if not already cached.

=head3 Notes

The result is not independently cached; each call recomputes the contact
list from the cached results of the underlying methods.

=head3 API Specification

=head4 Input

    []

=head4 Output

    (
        {
            type => 'hashref',
            keys => {
                role    => { type => 'scalar' },
                roles   => { type => 'arrayref' },
                address => { type => 'scalar', regex => qr/\@/ },
                note    => { type => 'scalar' },
                via     => { type => 'scalar', regex => qr/^(?:provider-table|ip-whois|domain-whois)$/ },
            },
        },
        ...
    )

=head3 FORMAL SPECIFICATION

    -- Z notation
    abuse_contacts == [
      Xi Email::Abuse::Investigator;
      result! : seq CONTACT_INFO
    ]
    post: forall c : result! @ c.address contains '@' /\
          forall c1, c2 : result! @ c1 /= c2 => c1.address /= c2.address

=cut
1896
1897sub abuse_contacts {
1898
136
3008
        my $self = $_[0];
1899
1900
136
104
        my (@contacts, %seen_idx);
1901
1902        # Inner closure: add one contact entry, merging roles for duplicate addresses
1903        my $add = sub {
1904
296
473
                my (%args) = @_;
1905
296
346
                my $addr = lc($args{address} // '');
1906
296
442
                return unless $addr && $addr =~ /\@/;
1907
1908                # Suppress addresses belonging to form-only providers (no email accepted)
1909
294
428
                if ($addr =~ /\@([\w.-]+)$/) {
1910
294
248
                        my $dom = $1;
1911
294
258
                        my $pa  = $self->_provider_abuse_for_host($dom);
1912
294
373
                        return if $pa && $pa->{form} && !$pa->{email};
1913                }
1914
1915
290
324
                if (exists $seen_idx{$addr}) {
1916                        # Merge the new role into the existing entry
1917
145
108
                        my $entry = $contacts[ $seen_idx{$addr} ];
1918
145
145
88
148
                        push @{ $entry->{roles} }, $args{role};
1919
1920                        # Collapse repeated role labels to avoid unreadable strings
1921
145
100
                        my (%role_counts, @ordered_roles);
1922
145
145
86
114
                        for my $r (@{ $entry->{roles} }) {
1923
1219
1155
                                push @ordered_roles, $r unless $role_counts{$r}++;
1924                        }
1925                        my @display = map {
1926
145
1212
131
1020
                                $role_counts{$_} > 1 ? "$_ (x$role_counts{$_})" : $_
1927                        } @ordered_roles;
1928
145
211
                        my $joined = join(' and ', @display);
1929
1930                        # Summarise if the merged string is too long to read
1931
145
157
                        if (length($joined) > $ROLE_MAX_LEN) {
1932                                my @short = map {
1933
73
1051
55
1103
                                        (my $s = $_) =~ s/[:(\d].*//;
1934
1051
980
                                        $s =~ s/\s+$//;
1935
1051
764
                                        $s;
1936                                } @display;
1937
73
142
                                $joined = scalar(@display) . ' routes: ' . join(', ', @short);
1938                        }
1939
145
111
                        $entry->{role} = $joined;
1940
145
315
                        return;
1941                }
1942
1943                # First time seeing this address -- record and store
1944
145
163
                $seen_idx{$addr} = scalar @contacts;
1945
145
173
                $args{roles} = [ $args{role} ];
1946
145
185
                push @contacts, \%args;
1947
136
369
        };
1948
1949        # Route 1 -- Sending ISP (originating IP)
1950
136
148
        my $orig = $self->originating_ip();
1951
136
161
        if ($orig) {
1952
71
111
                my $pa = $self->_provider_abuse_for_ip($orig->{ip}, $orig->{rdns});
1953
71
110
                if ($pa) {
1954                        $add->(
1955                                role    => 'Sending ISP',
1956                                address => $pa->{email},
1957
13
29
                                note    => "$orig->{ip} ($orig->{rdns}) -- $pa->{note}",
1958                                via     => 'provider-table',
1959                        );
1960                }
1961
71
156
                if ($orig->{abuse} && $orig->{abuse} ne '(unknown)') {
1962                        $add->(
1963                                role    => 'Sending ISP',
1964                                address => $orig->{abuse},
1965
64
124
                                note    => "Network owner of originating IP $orig->{ip} ($orig->{org})",
1966                                via     => 'ip-whois',
1967                        );
1968                }
1969        }
1970
1971        # Route 2 -- URL hosts
1972
136
108
        my %url_host_seen;
1973
136
150
        for my $u ($self->embedded_urls()) {
1974
78
120
                next if $url_host_seen{ $u->{host} }++;
1975
66
69
                my $bare_host = lc $u->{host};
1976
66
71
                $bare_host =~ s/^www\.//;
1977                # Skip trusted infrastructure (Google, W3C, etc.)
1978
66
79
                next if $self->{trusted_domains}->{$bare_host};
1979
66
86
                next if $TRUSTED_DOMAINS{$bare_host};
1980
63
73
                my $pa = $self->_provider_abuse_for_host($u->{host});
1981
63
64
                if ($pa) {
1982                        $add->(
1983                                role    => "URL host: $u->{host}",
1984                                address => $pa->{email},
1985
15
71
                                note    => "$u->{host} -- $pa->{note}",
1986                                via     => 'provider-table',
1987                        );
1988                }
1989
63
117
                if ($u->{abuse} && $u->{abuse} ne '(unknown)') {
1990                        $add->(
1991                                role    => "URL host: $u->{host}",
1992                                address => $u->{abuse},
1993
39
84
                                note    => "Hosting $u->{host} ($u->{ip}, $u->{org})",
1994                                via     => 'ip-whois',
1995                        );
1996                }
1997        }
1998
1999        # Route 3 -- Contact domain hosting and registration
2000
136
147
        for my $d ($self->mailto_domains()) {
2001
152
144
                my $dom = $d->{domain};
2002
2003                # Web host contact
2004
152
187
                if ($d->{web_abuse}) {
2005
57
58
                        my $pa = $self->_provider_abuse_for_host($dom);
2006
57
45
                        if ($pa) {
2007                                $add->(role => "Web host of $dom", address => $pa->{email},
2008
0
0
                                       note => $pa->{note}, via => 'provider-table');
2009                        }
2010                        $add->(
2011                                role    => "Web host of $dom",
2012                                address => $d->{web_abuse},
2013                                note    => sprintf('Hosting %s (%s, %s)',
2014                                             $dom             // '(unknown domain)',
2015                                             $d->{web_ip}     // '(unknown IP)',
2016
57
192
                                             $d->{web_org}    // '(unknown org)'),
2017                                via     => 'ip-whois',
2018                        );
2019                }
2020
2021                # MX (mail host) contact
2022
152
150
                if ($d->{mx_abuse}) {
2023                        $add->(
2024                                role    => "Mail host (MX) for $dom",
2025                                address => $d->{mx_abuse},
2026                                note    => sprintf('MX %s (%s, %s)',
2027                                             $d->{mx_host} // '(unknown host)',
2028                                             $d->{mx_ip}   // '(unknown IP)',
2029
23
93
                                             $d->{mx_org}  // '(unknown org)'),
2030                                via     => 'ip-whois',
2031                        );
2032                }
2033
2034                # NS (DNS host) contact
2035
152
150
                if ($d->{ns_abuse}) {
2036                        $add->(
2037                                role    => "DNS host (NS) for $dom",
2038                                address => $d->{ns_abuse},
2039                                note    => sprintf('NS %s (%s, %s)',
2040                                             $d->{ns_host} // '(unknown host)',
2041                                             $d->{ns_ip}   // '(unknown IP)',
2042
15
50
                                             $d->{ns_org}  // '(unknown org)'),
2043                                via     => 'ip-whois',
2044                        );
2045                }
2046
2047                # Domain registrar (skip if domain only seen in spoofable headers)
2048
152
151
                if ($d->{registrar_abuse}) {
2049                        my $spoofable_only =
2050                                $d->{source} =~ /^(?:From:|Return-Path:|Sender:) header$/ &&
2051                                !scalar(grep {
2052
36
67
                                        $_->{host} &&
2053                                        _registrable($_->{host}) eq (_registrable($dom) // $dom)
2054                                } $self->embedded_urls());
2055
36
38
                        unless ($spoofable_only) {
2056                                $add->(
2057                                        role    => "Domain registrar for $dom",
2058                                        address => $d->{registrar_abuse},
2059
28
66
                                        note    => 'Registrar: ' . ($d->{registrar} // '(unknown)'),
2060                                        via     => 'domain-whois',
2061                                );
2062                        }
2063                }
2064        }
2065
2066        # Route 4 -- From:/Reply-To:/Return-Path:/Sender: account provider
2067
136
132
        for my $hname (qw(from reply-to return-path sender)) {
2068
544
452
                my $val = $self->_header_value($hname) // next;
2069
2070                # Extract addr-spec from angle-bracket form to avoid display-name @-signs
2071
245
501
                my $addr_spec = ($val =~ /<([^>]*)>\s*$/) ? $1 : $val;
2072
245
379
                my ($addr_domain) = $addr_spec =~ /\@([\w.-]+)/;
2073
245
230
                next unless $addr_domain;
2074
2075                # Skip SRS-rewritten forwarder addresses (not the real sender)
2076
245
232
                next if $addr_spec =~ /\+SRS[0-9]?=/i;
2077
2078
243
201
                my $pa = $self->_provider_abuse_for_host($addr_domain);
2079
243
253
                if ($pa) {
2080
37
49
                        my $role_addr = $addr_spec =~ /\@/ ? $addr_spec : $val;
2081
37
103
                        $role_addr =~ s/^\s+|\s+$//g;
2082                        $add->(
2083                                role    => "Account provider ($hname: $role_addr)",
2084                                address => $pa->{email},
2085                                note    => $pa->{note},
2086
37
73
                                via     => 'provider-table',
2087                        );
2088                }
2089        }
2090
2091        # Route 5 -- DKIM signing organisation
2092
136
139
        my $auth = $self->_parse_auth_results_cached();
2093
136
138
        if ($auth->{dkim_domain}) {
2094
3
4
                my $pa = $self->_provider_abuse_for_host($auth->{dkim_domain});
2095
3
3
                if ($pa) {
2096                        $add->(
2097                                role    => "DKIM signer: $auth->{dkim_domain}",
2098                                address => $pa->{email},
2099                                note    => $pa->{note},
2100
1
3
                                via     => 'provider-table',
2101                        );
2102                }
2103        }
2104
2105        # Route 6 -- List-Unsubscribe ESP domain
2106
136
119
        my $unsub = $self->_header_value('list-unsubscribe');
2107
136
129
        if ($unsub) {
2108
0
0
                my @unsub_domains;
2109
0
0
                while ($unsub =~ m{https?://([^/:?\s>]+)}gi) {
2110
0
0
                        push @unsub_domains, lc $1;
2111                }
2112
0
0
                while ($unsub =~ m{mailto:[^@\s>]+\@([\w.-]+)}gi) {
2113
0
0
                        push @unsub_domains, lc $1;
2114                }
2115
0
0
                my %unsub_seen;
2116
0
0
0
0
                for my $dom (grep { !$unsub_seen{$_}++ } @unsub_domains) {
2117
0
0
                        my $pa = $self->_provider_abuse_for_host($dom);
2118
0
0
                        if ($pa) {
2119                                $add->(
2120                                        role    => "ESP / bulk sender (List-Unsubscribe: $dom)",
2121                                        address => $pa->{email},
2122
0
0
                                        note    => "$pa->{note} -- responsible for this bulk delivery",
2123                                        via     => 'provider-table',
2124                                );
2125                        }
2126                }
2127        }
2128
2129        # Route 7 -- Reply addresses embedded in the message body
2130
136
100
        my %body_addr_seen;
2131
136
142
        my $combined_body = $self->{_body_plain} . "\n" . $self->{_body_html};
2132
136
154
        for my $addr_dom ($self->_domains_from_text($combined_body)) {
2133
20
22
                next if $body_addr_seen{$addr_dom}++;
2134
20
26
                my $pa = $self->_provider_abuse_for_host($addr_dom);
2135
20
33
                next unless $pa && $pa->{email};
2136
4
55
                my ($example_addr) = $combined_body =~ /(\S+\@\Q$addr_dom\E)/i;
2137
4
4
                $example_addr //= "\@$addr_dom";
2138                $add->(
2139                        role    => "Reply address in body ($example_addr)",
2140                        address => $pa->{email},
2141                        note    => $pa->{note},
2142
4
6
                        via     => 'provider-table',
2143                );
2144        }
2145
2146
136
991
        return @contacts;
2147}
2148
2149# -----------------------------------------------------------------------
2150# Public: form contacts (providers that require web-form submission)
2151# -----------------------------------------------------------------------
2152
2153 - 2219
=head2 form_contacts()

Returns the list of parties that require abuse reports via a web form
rather than email.  These are providers whose C<%PROVIDER_ABUSE> entry
has a C<form> key.  Each hashref includes the form URL, paste
instructions, upload instructions, and the discovery role.

=head3 Usage

    my @forms = $analyser->form_contacts();
    for my $c (@forms) {
        printf "Open: %s\n", $c->{form};
    }

=head3 Arguments

None.  C<parse_email()> must have been called first.

=head3 Returns

A list of hashrefs, one per unique form contact.  Each hashref has keys
C<form>, C<role>, C<note>, C<form_paste> (optional), C<form_upload>
(optional), and C<via>.  Returns an empty list if no form contacts are found.

=head3 Side Effects

Triggers C<originating_ip()>, C<embedded_urls()>, and C<mailto_domains()>
if not already cached.

=head3 Notes

Deduplication is by form URL.

=head3 API Specification

=head4 Input

    []

=head4 Output

    (
        {
            type => 'hashref',
            keys => {
                form        => { type => 'scalar', regex => qr{^https?://} },
                role        => { type => 'scalar' },
                note        => { type => 'scalar' },
                form_paste  => { type => 'scalar', optional => 1 },
                form_upload => { type => 'scalar', optional => 1 },
                via         => { type => 'scalar' },
            },
        },
        ...
    )

=head3 FORMAL SPECIFICATION

    -- Z notation
    form_contacts == [
      Xi Email::Abuse::Investigator;
      result! : seq FORM_CONTACT_INFO
    ]
    post: forall c : result! @ c.form =~ m{^https?://} /\
          forall c1, c2 : result! @ c1 /= c2 => c1.form /= c2.form

=cut
2220
2221sub form_contacts {
2222
81
938
        my $self = $_[0];
2223
2224
81
64
        my (@contacts, %seen);
2225
2226        # Inner closure: add one form-contact entry, deduplicating by form URL
2227        my $add = sub {
2228
11
45
                my (%args) = @_;
2229
11
15
                my $form = $args{form} // '';
2230
11
12
                return unless $form;
2231
11
15
                return if $seen{$form}++;
2232
9
14
                push @contacts, \%args;
2233
81
133
        };
2234
2235        # Route 1 -- Sending ISP
2236
81
85
        my $orig = $self->originating_ip();
2237
81
80
        if ($orig) {
2238
49
65
                my $pa = $self->_provider_abuse_for_ip($orig->{ip}, $orig->{rdns});
2239
49
59
                if ($pa && $pa->{form}) {
2240                        $add->(
2241                                role        => 'Sending ISP',
2242                                form        => $pa->{form},
2243                                note        => $pa->{note} // '',
2244                                form_paste  => $pa->{form_paste}  // '',
2245
0
0
                                form_upload => $pa->{form_upload} // '',
2246                                via         => 'provider-table',
2247                        );
2248                }
2249        }
2250
2251        # Route 2 -- URL hosts
2252
81
53
        my %url_host_seen;
2253
81
76
        for my $u ($self->embedded_urls()) {
2254
48
65
                next if $url_host_seen{ $u->{host} }++;
2255
39
45
                my $pa = $self->_provider_abuse_for_host($u->{host});
2256
39
65
                if ($pa && $pa->{form}) {
2257                        $add->(
2258                                role        => "URL host: $u->{host}",
2259                                form        => $pa->{form},
2260                                form_domain => $u->{host},
2261                                note        => $pa->{note} // '',
2262                                form_paste  => $pa->{form_paste}  // '',
2263
0
0
                                form_upload => $pa->{form_upload} // '',
2264                                via         => 'provider-table',
2265                        );
2266                }
2267        }
2268
2269        # Route 3 -- Contact domains (web host + registrar)
2270
81
81
        for my $d ($self->mailto_domains()) {
2271
88
83
                my $dom = $d->{domain};
2272
88
72
                my $pa  = $self->_provider_abuse_for_host($dom);
2273
88
92
                if ($pa && $pa->{form}) {
2274                        $add->(
2275                                role        => "Web host of $dom",
2276                                form        => $pa->{form},
2277                                form_domain => $dom,
2278                                note        => $pa->{note} // '',
2279                                form_paste  => $pa->{form_paste}  // '',
2280
0
0
                                form_upload => $pa->{form_upload} // '',
2281                                via         => 'provider-table',
2282                        );
2283                }
2284
2285                # Registrar identified via WHOIS -- check for form-only registrar
2286
88
132
                if ($d->{registrar_abuse} && $d->{registrar_abuse} =~ /\@([\w.-]+)/) {
2287
18
21
                        my $reg_domain = lc $1;
2288
18
14
                        my $rpa = $self->_provider_abuse_for_host($reg_domain);
2289
18
32
                        if ($rpa && $rpa->{form}) {
2290                                $add->(
2291                                        role        => "Domain registrar for $dom (web form only)",
2292                                        form        => $rpa->{form},
2293                                        form_domain => $dom,
2294                                        note        => $rpa->{note} // '',
2295                                        form_paste  => $rpa->{form_paste}  // '',
2296
11
34
                                        form_upload => $rpa->{form_upload} // '',
2297                                        via         => 'provider-table',
2298                                );
2299                        }
2300                }
2301        }
2302
2303        # Route 4 -- Account provider headers
2304
81
79
        for my $hname (qw(from reply-to return-path sender)) {
2305
324
240
                my $val = $self->_header_value($hname) // next;
2306
142
226
                my $addr_spec = ($val =~ /<([^>]*)>\s*$/) ? $1 : $val;
2307
142
214
                my ($addr_domain) = $addr_spec =~ /\@([\w.-]+)/;
2308
142
121
                next unless $addr_domain;
2309                # Skip SRS forwarder rewrite addresses
2310
142
141
                next if $addr_spec =~ /\+SRS[0-9]?=/i;
2311
142
125
                my $pa = $self->_provider_abuse_for_host($addr_domain);
2312
142
150
                if ($pa && $pa->{form}) {
2313
0
0
                        my $role_addr = $addr_spec =~ /@/ ? $addr_spec : $val;
2314
0
0
                        $role_addr =~ s/^\s+|\s+$//g;
2315                        $add->(
2316                                role        => "Account provider ($hname: $role_addr)",
2317                                form        => $pa->{form},
2318                                note        => $pa->{note} // '',
2319                                form_paste  => $pa->{form_paste}  // '',
2320
0
0
                                form_upload => $pa->{form_upload} // '',
2321                                via         => 'provider-table',
2322                        );
2323                }
2324        }
2325
2326        # Route 5 -- DKIM signer
2327
81
74
        my $auth = $self->_parse_auth_results_cached();
2328
81
85
        if ($auth->{dkim_domain}) {
2329
1
2
                my $pa = $self->_provider_abuse_for_host($auth->{dkim_domain});
2330
1
2
                if ($pa && $pa->{form}) {
2331                        $add->(
2332                                role        => "DKIM signer: $auth->{dkim_domain}",
2333                                form        => $pa->{form},
2334                                note        => $pa->{note} // '',
2335                                form_paste  => $pa->{form_paste}  // '',
2336
0
0
                                form_upload => $pa->{form_upload} // '',
2337                                via         => 'provider-table',
2338                        );
2339                }
2340        }
2341
2342        # Route 6 -- List-Unsubscribe ESP domains
2343
81
71
        my $unsub = $self->_header_value('list-unsubscribe');
2344
81
79
        if ($unsub) {
2345
0
0
                my @unsub_domains;
2346
0
0
0
0
                while ($unsub =~ m{https?://([^/:?\s>]+)}gi) { push @unsub_domains, lc $1 }
2347
0
0
0
0
                while ($unsub =~ m{mailto:[^@\s>]+\@([\w.-]+)}gi) { push @unsub_domains, lc $1 }
2348
0
0
                my %useen;
2349
0
0
0
0
                for my $dom (grep { !$useen{$_}++ } @unsub_domains) {
2350
0
0
                        my $pa = $self->_provider_abuse_for_host($dom);
2351
0
0
                        if ($pa && $pa->{form}) {
2352                                $add->(
2353                                        role        => "ESP / bulk sender (List-Unsubscribe: $dom)",
2354                                        form        => $pa->{form},
2355                                        note        => $pa->{note} // '',
2356                                        form_paste  => $pa->{form_paste}  // '',
2357
0
0
                                        form_upload => $pa->{form_upload} // '',
2358                                        via         => 'provider-table',
2359                                );
2360                        }
2361                }
2362        }
2363
2364
81
233
        return @contacts;
2365}
2366
2367# -----------------------------------------------------------------------
2368# Public: full analyst report
2369# -----------------------------------------------------------------------
2370
2371 - 2427
=head2 report()

Produces a comprehensive, analyst-facing plain-text report covering all
findings: envelope fields, risk assessment, originating host, sending
software, received chain tracking IDs, embedded URLs, contact domain
intelligence, and recommended abuse contacts.

Use C<report()> for human review or ticketing systems.  Use
C<abuse_report_text()> for sending to ISP abuse desks.

=head3 Usage

    print $analyser->report();

    open my $fh, '>', 'report.txt' or croak "Cannot open: $!";
    print $fh $analyser->report();
    close $fh;

=head3 Arguments

None.  C<parse_email()> must have been called first.

=head3 Returns

A plain scalar string, newline-terminated, Unix line endings.  Never empty
or undef.

=head3 Side Effects

Triggers all analysis methods if not already cached.

=head3 Notes

The report is idempotent: calling it multiple times on the same object
always returns an identical string.  All user-derived content is sanitised
before output.

=head3 API Specification

=head4 Input

    []

=head4 Output

    { type => 'scalar' }

=head3 FORMAL SPECIFICATION

    -- Z notation
    report == [
      Xi Email::Abuse::Investigator;
      result! : STRING
    ]
    post: result! /= '' /\ result! ends_with '\n'

=cut
2428
2429sub report {
2430
48
6246
        my $self = $_[0];
2431
2432
48
37
        my @out;
2433
2434        # Banner header
2435
48
52
        push @out, '=' x 72;
2436
48
61
        push @out, "  Email::Abuse::Investigator Report  (v$VERSION)";
2437
48
38
        push @out, '=' x 72;
2438
48
33
        push @out, '';
2439
2440        # Envelope summary -- decode MIME encoded-words for readability
2441
48
51
        for my $f (qw(from reply-to return-path subject date message-id)) {
2442
288
266
                my $v = $self->_header_value($f);
2443
288
257
                next unless defined $v;
2444
207
174
                my $decoded = $self->_decode_mime_words($v);
2445
207
204
                my $label   = ucfirst($f);
2446
207
212
                push @out, sprintf('  %-14s : %s', $label,
2447                        _sanitise_output($decoded ne $v ? "$decoded  [encoded: $v]" : $v));
2448        }
2449
48
39
        push @out, '';
2450
2451        # Risk assessment section
2452
48
59
        my $risk = $self->risk_assessment();
2453
48
74
        push @out, "[ RISK ASSESSMENT: $risk->{level} (score: $risk->{score}) ]";
2454
48
48
35
59
        if (@{ $risk->{flags} }) {
2455
25
25
18
28
                for my $f (@{ $risk->{flags} }) {
2456
50
68
                        push @out, "  [$f->{severity}] " . _sanitise_output($f->{detail});
2457                }
2458        } else {
2459
23
19
                push @out, '  (no specific red flags detected)';
2460        }
2461
48
44
        push @out, '';
2462
2463        # Originating host section
2464
48
36
        push @out, '[ ORIGINATING HOST ]';
2465
48
74
        my $orig = $self->originating_ip();
2466
48
75
        if ($orig) {
2467
34
40
                push @out, '  IP           : ' . _sanitise_output($orig->{ip});
2468
34
58
                push @out, '  Reverse DNS  : ' . _sanitise_output($orig->{rdns})    if $orig->{rdns};
2469
34
44
                push @out, '  Country      : ' . _sanitise_output($orig->{country}) if $orig->{country};
2470
34
51
                push @out, '  Organisation : ' . _sanitise_output($orig->{org})     if $orig->{org};
2471
34
50
                push @out, '  Abuse addr   : ' . _sanitise_output($orig->{abuse})   if $orig->{abuse};
2472
34
35
                push @out, "  Confidence   : $orig->{confidence}";
2473
34
46
                push @out, '  Note         : ' . _sanitise_output($orig->{note})    if $orig->{note};
2474        } else {
2475
14
13
                push @out, '  (could not determine originating IP)';
2476        }
2477
48
38
        push @out, '';
2478
2479        # Sending software section (omitted if none found)
2480
48
61
        my @sw = $self->sending_software();
2481
48
54
        if (@sw) {
2482
0
0
                push @out, '[ SENDING SOFTWARE / INFRASTRUCTURE CLUES ]';
2483
0
0
                for my $s (@sw) {
2484
0
0
                        push @out, sprintf('  %-14s : %s', $s->{header}, _sanitise_output($s->{value}));
2485
0
0
                        push @out, "  Note           : $s->{note}";
2486
0
0
                        push @out, '';
2487                }
2488        }
2489
2490        # Received chain tracking IDs (only hops with id or for are shown)
2491
48
41
62
109
        my @trail = grep { defined $_->{id} || defined $_->{for} }
2492                    $self->received_trail();
2493
48
55
        if (@trail) {
2494
0
0
                push @out, '[ RECEIVED CHAIN TRACKING IDs ]';
2495
0
0
                push @out, '  (Supply these to the relevant ISP abuse team to trace the session)';
2496
0
0
                push @out, '';
2497
0
0
                for my $hop (@trail) {
2498
0
0
                        push @out, '  IP           : ' . (_sanitise_output($hop->{ip}) // '(unknown)');
2499
0
0
                        push @out, '  Envelope for : ' . _sanitise_output($hop->{for}) if $hop->{for};
2500
0
0
                        push @out, '  Server ID    : ' . _sanitise_output($hop->{id})  if $hop->{id};
2501
0
0
                        push @out, '';
2502                }
2503        }
2504
2505        # Embedded URLs section -- grouped by hostname
2506
48
39
        push @out, '[ EMBEDDED HTTP/HTTPS URLs ]';
2507
48
49
        my @urls = $self->embedded_urls();
2508
48
49
        if (@urls) {
2509
19
14
                my (%host_order, %host_meta, %host_paths);
2510
19
16
                my $seq = 0;
2511
19
19
                for my $u (@urls) {
2512
30
23
                        my $h = $u->{host};
2513
30
33
                        unless (exists $host_order{$h}) {
2514
21
21
                                $host_order{$h} = $seq++;
2515                                $host_meta{$h}  = {
2516                                        ip      => $u->{ip},
2517                                        org     => $u->{org},
2518                                        abuse   => $u->{abuse},
2519                                        country => $u->{country},
2520
21
44
                                };
2521                        }
2522
30
30
20
42
                        push @{ $host_paths{$h} }, $u->{url};
2523                }
2524
2525                # Output each host group in first-seen order
2526
19
2
26
4
                for my $h (sort { $host_order{$a} <=> $host_order{$b} } keys %host_order) {
2527
21
19
                        my $m    = $host_meta{$h};
2528
21
21
24
22
                        my $bare = lc $h; $bare =~ s/^www\.//;
2529                        push @out, '  Host         : ' . _sanitise_output($h) .
2530
21
24
                                   (($URL_SHORTENERS{$bare} || $self->{url_shorteners}->{$bare})
2531                                    ? '  *** URL SHORTENER -- real destination hidden ***' : '');
2532
21
35
                        push @out, '  IP           : ' . _sanitise_output($m->{ip})      if $m->{ip};
2533
21
29
                        push @out, '  Country      : ' . _sanitise_output($m->{country}) if $m->{country};
2534
21
32
                        push @out, '  Organisation : ' . _sanitise_output($m->{org})     if $m->{org};
2535
21
34
                        push @out, '  Abuse addr   : ' . _sanitise_output($m->{abuse})   if $m->{abuse};
2536
21
21
19
22
                        my @paths = @{ $host_paths{$h} };
2537
21
24
                        if (@paths == 1) {
2538
15
17
                                push @out, '  URL          : ' . _sanitise_output($paths[0]);
2539                        } else {
2540
6
6
                                push @out, '  URLs (' . scalar(@paths) . ')     :';
2541
6
11
                                push @out, '    ' . _sanitise_output($_) for @paths;
2542                        }
2543
21
48
                        push @out, '';
2544                }
2545        } else {
2546
29
25
                push @out, '  (none found)';
2547
29
23
                push @out, '';
2548        }
2549
2550        # Contact / reply-to domains section
2551
48
39
        push @out, '[ CONTACT / REPLY-TO DOMAINS ]';
2552
48
71
        my @mdoms = $self->mailto_domains();
2553
48
68
        if (@mdoms) {
2554
36
33
                for my $d (@mdoms) {
2555
49
55
                        push @out, '  Domain       : ' . _sanitise_output($d->{domain});
2556
49
74
                        push @out, '  Found in     : ' . _sanitise_output($d->{source});
2557
49
50
                        if ($d->{recently_registered}) {
2558
3
3
                                push @out, '  *** WARNING: RECENTLY REGISTERED - possible phishing domain ***';
2559                        }
2560
49
53
                        push @out, '  Registered   : ' . $d->{registered}       if $d->{registered};
2561
49
46
                        push @out, '  Expires      : ' . $d->{expires}           if $d->{expires};
2562
49
44
                        push @out, '  Registrar    : ' . _sanitise_output($d->{registrar})       if $d->{registrar};
2563
49
51
                        push @out, '  Reg. abuse   : ' . _sanitise_output($d->{registrar_abuse}) if $d->{registrar_abuse};
2564
49
49
                        if ($d->{web_ip}) {
2565
22
24
                                push @out, '  Web host IP  : ' . _sanitise_output($d->{web_ip});
2566
22
34
                                push @out, '  Web host org : ' . _sanitise_output($d->{web_org})   if $d->{web_org};
2567
22
25
                                push @out, '  Web abuse    : ' . _sanitise_output($d->{web_abuse}) if $d->{web_abuse};
2568                        } else {
2569
27
21
                                push @out, '  Web host     : (no A record / unreachable)';
2570                        }
2571
49
40
                        if ($d->{mx_host}) {
2572
3
4
                                push @out, '  MX host      : ' . _sanitise_output($d->{mx_host});
2573
3
7
                                push @out, '  MX IP        : ' . _sanitise_output($d->{mx_ip})    if $d->{mx_ip};
2574
3
6
                                push @out, '  MX org       : ' . _sanitise_output($d->{mx_org})   if $d->{mx_org};
2575
3
5
                                push @out, '  MX abuse     : ' . _sanitise_output($d->{mx_abuse}) if $d->{mx_abuse};
2576                        } else {
2577
46
35
                                push @out, '  MX host      : (none found)';
2578                        }
2579
49
45
                        if ($d->{ns_host}) {
2580
2
3
                                push @out, '  NS host      : ' . _sanitise_output($d->{ns_host});
2581
2
4
                                push @out, '  NS IP        : ' . _sanitise_output($d->{ns_ip})    if $d->{ns_ip};
2582
2
4
                                push @out, '  NS org       : ' . _sanitise_output($d->{ns_org})   if $d->{ns_org};
2583
2
4
                                push @out, '  NS abuse     : ' . _sanitise_output($d->{ns_abuse}) if $d->{ns_abuse};
2584                        }
2585
49
42
                        push @out, '';
2586                }
2587        } else {
2588
12
11
                push @out, '  (none found)';
2589
12
12
                push @out, '';
2590        }
2591
2592        # Abuse contacts summary
2593
48
38
        push @out, '[ WHERE TO SEND ABUSE REPORTS ]';
2594
48
54
        my @contacts = $self->abuse_contacts();
2595
48
51
        if (@contacts) {
2596
39
38
                for my $c (@contacts) {
2597
58
57
                        push @out, '  Role         : ' . _sanitise_output($c->{role});
2598
58
90
                        push @out, '  Send to      : ' . _sanitise_output($c->{address});
2599
58
70
                        push @out, '  Note         : ' . _sanitise_output($c->{note}) if $c->{note};
2600
58
86
                        push @out, "  Discovered   : $c->{via}";
2601
58
52
                        push @out, '';
2602                }
2603        } else {
2604
9
8
                push @out, '  (no abuse contacts could be determined)';
2605
9
7
                push @out, '';
2606        }
2607
2608        # Web-form contacts (providers that require manual form submission)
2609
48
51
        my @form_cs = $self->form_contacts();
2610
48
51
        if (@form_cs) {
2611
2
2
                push @out, '[ WHERE TO FILE WEB-FORM REPORTS ]';
2612
2
3
                push @out, '  The following parties require manual submission via a web form.';
2613
2
1
                push @out, '  Open each URL in a browser, then follow the instructions below it.';
2614
2
2
                push @out, '';
2615
2
2
                for my $c (@form_cs) {
2616
2
3
                        push @out, '  Role         : ' . _sanitise_output($c->{role});
2617
2
2
                        push @out, '  Form URL     : ' . _sanitise_output($c->{form});
2618
2
5
                        push @out, '  Domain/URL   : ' . _sanitise_output($c->{form_domain}) if $c->{form_domain};
2619
2
4
                        push @out, '  Note         : ' . _sanitise_output($c->{note})        if $c->{note};
2620
2
3
                        if ($c->{form_paste}) {
2621                                # Word-wrap the paste hint at ROLE_WRAP_LEN characters
2622
2
2
                                my $hint  = $c->{form_paste};
2623
2
6
                                my @words = split /\s+/, $hint;
2624
2
3
                                my (@lines, $line);
2625
2
2
                                for my $w (@words) {
2626
59
57
                                        if (defined $line && length("$line $w") > $ROLE_WRAP_LEN) {
2627
4
3
                                                push @lines, $line;
2628
4
4
                                                $line = $w;
2629                                        } else {
2630
55
49
                                                $line = defined $line ? "$line $w" : $w;
2631                                        }
2632                                }
2633
2
3
                                push @lines, $line if defined $line;
2634
2
4
                                push @out, '  Paste        : ' . shift @lines if @lines;
2635
2
5
                                push @out, '                 ' . $_ for @lines;
2636                        }
2637
2
5
                        push @out, '  Upload       : ' . _sanitise_output($c->{form_upload}) if $c->{form_upload};
2638
2
2
                        push @out, '';
2639                }
2640        }
2641
2642
48
42
        push @out, '=' x 72;
2643
48
353
        return join("\n", @out) . "\n";
2644}
2645
2646# -----------------------------------------------------------------------
2647# Private: output sanitisation
2648# -----------------------------------------------------------------------
2649
2650# _sanitise_output( $str ) -> $str
2651#
2652# Purpose:
2653#   Strip control characters that could affect terminal rendering or HTML
2654#   injection from any string that will appear in a report or abuse email.
2655#   Preserves printable ASCII, high-bytes (for UTF-8 content), tabs, and
2656#   line endings.
2657#
2658# Entry criteria:
2659#   $str -- a defined or undef scalar.
2660#
2661# Exit status:
2662#   Returns the sanitised string, or the empty string if $str is undef.
2663#
2664# Notes:
2665#   Only strips C0 control characters below 0x20 (except \t) and the DEL
2666#   character (0x7F).  High bytes (0x80-0xFF) are preserved because they
2667#   form valid UTF-8 multi-byte sequences in headers and body text.
2668
2669sub _sanitise_output {
2670
1153
20873
        my ($str) = @_;
2671
1153
835
        return '' unless defined $str;
2672        # Remove C0 controls (except tab) and DEL
2673
1149
893
        $str =~ s/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]//g;
2674
1149
1273
        return $str;
2675}
2676
2677# -----------------------------------------------------------------------
2678# Private: message parsing
2679# -----------------------------------------------------------------------
2680
2681# _split_message( $text )
2682#
2683# Purpose:
2684#   Split a raw RFC 2822 email into headers and body, parse all headers,
2685#   decode the body (including multipart), extract sending-software
2686#   fingerprints, and populate per-hop tracking data.
2687#
2688# Entry criteria:
2689#   $text -- defined scalar, already dereferenced by parse_email().
2690#   $self->{_sending_sw} and $self->{_rcvd_tracking} reset to [] by caller.
2691#
2692# Exit status:
2693#   Returns undef silently if the header block is empty/whitespace-only.
2694#   Otherwise all results are communicated via side effects on $self.
2695#
2696# Side effects:
2697#   Populates _headers, _received, _body_plain, _body_html, _sending_sw,
2698#   and _rcvd_tracking.
2699#
2700# Notes:
2701#   Delegates to _decode_multipart() for multipart/* content types.
2702#   Lines not matching the header pattern are silently discarded.
2703#   Boundary extraction uses a simple regex; missing boundary causes the
2704#   body to be skipped silently.
2705
2706sub _split_message {
2707
481
470
        my ($self, $text) = @_;
2708
2709        # Split at the first blank line (RFC 2822 header/body separator)
2710
481
2324
        my ($header_block, $body_raw) = split /\r?\n\r?\n/, $text, 2;
2711
2712
481
1008
        return unless defined $header_block && $header_block =~ /\S/;
2713
481
464
        $body_raw //= '';
2714
2715        # Unfold RFC 2822 continuation lines (s2.2.3)
2716
481
1382
        $header_block =~ s/\r?\n([ \t]+)/ $1/g;
2717
2718        # Parse each header line into a { name, value } pair
2719
481
326
        my @headers;
2720
481
1774
        for my $line (split /\r?\n/, $header_block) {
2721
4207
5647
                if ($line =~ /^([\w-]+)\s*:\s*(.*)/) {
2722
4204
6018
                        push @headers, { name => lc($1), value => $2 };
2723                }
2724        }
2725
481
615
        $self->{_headers}  = \@headers;
2726
2727        # Collect all Received: header values (most-recent first, as in message)
2728        $self->{_received} = [
2729
1416
1301
                map  { $_->{value} }
2730
481
4204
517
3342
                grep { $_->{name} eq 'received' } @headers
2731        ];
2732
2733        # Determine content type and transfer encoding from top-level headers
2734
481
4204
451
3050
        my ($ct_h)  = grep { $_->{name} eq 'content-type' }              @headers;
2735
481
4204
361
2892
        my ($cte_h) = grep { $_->{name} eq 'content-transfer-encoding' } @headers;
2736
481
554
        my $ct  = defined $ct_h  ? $ct_h->{value}  : '';
2737
481
421
        my $cte = defined $cte_h ? $cte_h->{value} : '';
2738
2739        # Decode multipart or single-part body as appropriate
2740
481
521
        if ($ct =~ /multipart/i) {
2741
17
38
                my ($boundary) = $ct =~ /boundary="?([^";]+)"?/i;
2742                # Pass depth=0 to enforce the MAX_MULTIPART_DEPTH recursion guard
2743
17
36
                $self->_decode_multipart($body_raw, $boundary, 0) if $boundary;
2744        } else {
2745
464
515
                my $decoded = $self->_decode_body($body_raw, $cte);
2746
464
8
495
9
                if ($ct =~ /html/i) { $self->{_body_html}  = $decoded }
2747
456
430
                else                 { $self->{_body_plain} = $decoded }
2748        }
2749
2750        $self->_debug(sprintf 'Parsed %d headers, %d Received lines',
2751
481
481
369
1169
                scalar @headers, scalar @{ $self->{_received} });
2752
2753        # --- Sending software fingerprints ---
2754        # These headers identify the mailer or shared-hosting script that sent
2755        # the message; invaluable for shared-hosting abuse reports.
2756
481
1513
        my %sw_notes = (
2757                'x-php-originating-script' => 'PHP script on shared hosting -- report to hosting abuse team',
2758                'x-source'                 => 'Source file on shared hosting -- report to hosting abuse team',
2759                'x-source-host'            => 'Sending hostname injected by shared hosting provider',
2760                'x-source-args'            => 'Command-line args injected by shared hosting provider',
2761                'x-mailer'                 => 'Email client or bulk-mailer identifier',
2762                'user-agent'               => 'Email client identifier',
2763        );
2764
481
1061
        for my $sw_hdr (sort keys %sw_notes) {
2765
2886
25224
1847
17097
                my ($h) = grep { $_->{name} eq $sw_hdr } @headers;
2766
2886
2327
                next unless $h;
2767
7
15
                push @{ $self->{_sending_sw} }, {
2768                        header => $sw_hdr,
2769                        value  => $h->{value},
2770
7
3
                        note   => $sw_notes{$sw_hdr},
2771                };
2772        }
2773
2774        # --- Per-hop tracking IDs from Received: chain ---
2775        # Walk oldest-first (reverse) so _rcvd_tracking is oldest-first
2776
481
481
403
521
        for my $rcvd (reverse @{ $self->{_received} }) {
2777
1416
1158
                my $ip = $self->_extract_ip_from_received($rcvd);
2778
1416
1117
                my ($for_addr) = $rcvd =~ /\bfor\s+<?([^\s>]+\@[\w.-]+\.[\w]+)>?/i;
2779
1416
2222
                my ($srv_id)   = $rcvd =~ /\bid\s+([\w.-]+)/i;
2780                # Skip hops with no actionable tracking data
2781
1416
1190
                next unless defined $ip || defined $for_addr || defined $srv_id;
2782
1415
1415
815
2678
                push @{ $self->{_rcvd_tracking} }, {
2783                        received => $rcvd,
2784                        ip       => $ip,
2785                        for      => $for_addr,
2786                        id       => $srv_id,
2787                };
2788        }
2789}
2790
2791# _decode_multipart( $body, $boundary, $depth )
2792#
2793# Purpose:
2794#   Recursively split a MIME multipart body on its boundary and decode each
2795#   text/plain and text/html part.  Nested multipart/* containers are
2796#   recursed into up to MAX_MULTIPART_DEPTH levels deep.
2797#
2798# Entry criteria:
2799#   $body     -- the raw body text of the multipart container.
2800#   $boundary -- the boundary string from the Content-Type header.
2801#   $depth    -- current recursion depth (starts at 0 from _split_message).
2802#
2803# Exit status:
2804#   Returns undef if $depth >= MAX_MULTIPART_DEPTH (recursion guard).
2805#   Otherwise all results via side effects.
2806#
2807# Side effects:
2808#   Appends decoded text to $self->{_body_plain} and $self->{_body_html}.
2809#
2810# Notes:
2811#   Whitespace-only MIME segments between boundaries are silently skipped.
2812#   Decoding errors are silenced; raw bytes are used as fallback.
2813
2814sub _decode_multipart {
2815
49
111
        my ($self, $body, $boundary, $depth) = @_;
2816
49
47
        $depth //= 0;
2817
2818        # Enforce the recursion depth limit to prevent stack exhaustion on
2819        # pathological crafted messages with deeply nested multipart structures.
2820
49
49
        if ($depth >= $MAX_MULTIPART_DEPTH) {
2821
7
15
                Carp::carp 'Email::Abuse::Investigator: multipart nesting depth limit',
2822                        "($MAX_MULTIPART_DEPTH) exceeded; stopping recursion";
2823
7
18
                return;
2824        }
2825
2826        # Split on the boundary marker; the (?:--)? suffix handles closing boundary
2827
42
332
        my @parts = split /--\Q$boundary\E(?:--)?/, $body;
2828
2829
42
53
        for my $part (@parts) {
2830                # Skip whitespace-only segments between boundaries
2831
135
160
                next unless $part =~ /\S/;
2832
2833
52
80
                $part =~ s/^\r?\n//;
2834
2835                # Each MIME part has its own headers separated from body by a blank line
2836
52
104
                my ($phdr_block, $pbody) = split /\r?\n\r?\n/, $part, 2;
2837
52
48
                next unless defined $pbody;
2838
2839                # Unfold continuation header lines within this part
2840
51
41
                $phdr_block =~ s/\r?\n([ \t]+)/ $1/g;
2841
2842                # Parse this part's headers into a simple hash
2843
51
38
                my %phdr;
2844
51
56
                for my $line (split /\r?\n/, $phdr_block) {
2845
54
124
                        $phdr{ lc($1) } = $2 if $line =~ /^([\w-]+)\s*:\s*(.*)/;
2846                }
2847
2848
51
58
                my $pct  = $phdr{'content-type'}              // '';
2849
51
72
                my $pcte = $phdr{'content-transfer-encoding'} // '';
2850
2851                # Nested multipart/* must be recursed into; without this URLs in
2852                # multipart/alternative inside multipart/mixed would be missed.
2853
51
59
                if ($pct =~ /multipart/i) {
2854
24
35
                        my ($inner_boundary) = $pct =~ /boundary\s*=\s*"?([^";]+)"?/i;
2855
24
24
                        if ($inner_boundary) {
2856
24
19
                                $inner_boundary =~ s/\s+$//;
2857                                # Increment depth counter for the recursion guard
2858
24
43
                                $self->_decode_multipart($pbody, $inner_boundary, $depth + 1);
2859                        }
2860
24
22
                        next;
2861                }
2862
2863                # Decode transfer encoding and accumulate by content type
2864
27
30
                my $decoded = $self->_decode_body($pbody, $pcte);
2865
27
10
69
17
                if    ($pct =~ /text\/html/i)    { $self->{_body_html}  .= $decoded }
2866
16
26
                elsif ($pct =~ /text/i || !$pct) { $self->{_body_plain} .= $decoded }
2867        }
2868}
2869
2870# _decode_body( $body, $cte ) -> string
2871#
2872# Purpose:
2873#   Decode a MIME body part according to its Content-Transfer-Encoding.
2874#
2875# Entry criteria:
2876#   $body -- raw body string (may be undef).
2877#   $cte  -- Content-Transfer-Encoding value string (may be undef).
2878#
2879# Exit status:
2880#   Returns the decoded string, or the original string if the encoding is
2881#   7bit/8bit/binary or unrecognised.
2882#
2883# Notes:
2884#   decode_qp and decode_base64 are imported from MIME:: modules; errors
2885#   from malformed content are silenced by the eval wrappers they provide.
2886
2887sub _decode_body {
2888
497
1124
        my ($self, $body, $cte) = @_;
2889
497
451
        $cte //= '';
2890
497
453
        return decode_qp($body)     if $cte =~ /quoted-printable/i;
2891
491
408
        return decode_base64($body) if $cte =~ /base64/i;
2892
488
587
        return $body // '';
2893}
2894
2895# -----------------------------------------------------------------------
2896# Private: Received-chain -> originating IP
2897# -----------------------------------------------------------------------
2898
2899# _find_origin()
2900#
2901# Purpose:
2902#   Walk the Received: chain (oldest-first) to find the first external IP,
2903#   or fall back to X-Originating-IP.  Enrich with rDNS and WHOIS.
2904#
2905# Entry criteria:
2906#   $self->{_received} populated by _split_message().
2907#   $self->{trusted_relays} set by new().
2908#
2909# Exit status:
2910#   Returns { ip, rdns, org, abuse, country, confidence, note } on success.
2911#   Returns undef if no usable IP can be identified.
2912#
2913# Side effects:
2914#   Network I/O via _enrich_ip(): one PTR lookup, one RDAP/WHOIS query.
2915#   Results are also stored in the CHI cross-message cache if available.
2916#
2917# Notes:
2918#   confidence 'high' = 2+ distinct external IPs;
2919#   'medium' = exactly one external IP;
2920#   'low' = taken from X-Originating-IP.
2921
2922sub _find_origin {
2923
267
191
        my $self = $_[0];
2924
2925
267
203
        my @candidates;
2926
2927        # Walk oldest-first (reverse) to collect external IPs
2928
267
267
219
247
        for my $hdr (reverse @{ $self->{_received} }) {
2929
1222
960
                my $ip = $self->_extract_ip_from_received($hdr) // next;
2930
1221
974
                next if $self->_is_private($ip);
2931
89
1078
                next if $self->_is_trusted($ip);
2932
84
86
                push @candidates, $ip;
2933        }
2934
2935        # Fall back to X-Originating-IP if no external IPs in Received: chain
2936
267
262
        unless (@candidates) {
2937
187
196
                my $xoip = $self->_header_value('x-originating-ip');
2938
187
178
                if ($xoip) {
2939
6
8
                        $xoip =~ s/[\[\]\s]//g;
2940
6
7
                        return $self->_enrich_ip($xoip, 'low',
2941                                'Taken from X-Originating-IP (webmail, unverified)')
2942                                unless $self->_is_private($xoip);
2943                }
2944
182
267
                return undef;
2945        }
2946
2947        # Report the oldest (first) external IP; confidence depends on count
2948
80
149
        return $self->_enrich_ip(
2949                $candidates[0],
2950                @candidates > 1 ? 'high' : 'medium',
2951                'First external hop in Received: chain',
2952        );
2953}
2954
2955# _extract_ip_from_received( $hdr ) -> ipv4_or_ipv6_string | undef
2956#
2957# Purpose:
2958#   Extract the most-significant IP address from a raw Received: header
2959#   value, trying patterns in priority order.  Supports both IPv4 dotted-
2960#   quad and IPv6 bracket notation.
2961#
2962# Entry criteria:
2963#   $hdr -- a defined Received: header value string.
2964#
2965# Exit status:
2966#   Returns the IP string on success, undef if no IP can be extracted.
2967#
2968# Notes:
2969#   IPv4 addresses are validated (all octets <= 255).
2970#   IPv6 addresses are returned as-is if they contain colons.
2971
2972sub _extract_ip_from_received {
2973
2653
3281
        my ($self, $hdr) = @_;
2974
2653
1749
        for my $re (@RECEIVED_IP_RE) {
2975
2719
4730
                if ($hdr =~ $re) {
2976
2659
2086
                        my $ip = $1;
2977
2978                        # Accept IPv6 addresses (contain colons) without further validation
2979
2659
2076
                        return $ip if $ip =~ /:/;
2980
2981                        # Validate IPv4 format and octet range
2982
2656
3089
                        next unless $ip =~ /^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$/;
2983
2650
10600
2457
8556
                        next if grep { $_ > 255 } split /\./, $ip;
2984
2642
2942
                        return $ip;
2985                }
2986        }
2987
8
15
        return undef;
2988}
2989
2990# _is_private( $ip ) -> bool
2991#
2992# Purpose:
2993#   Test whether an IP address falls in any private, reserved, or special-
2994#   use range (IPv4 or IPv6) that should never be reported as a spam origin.
2995#
2996# Entry criteria:
2997#   $ip -- a scalar IP string (IPv4 or IPv6); may be undef.
2998#
2999# Exit status:
3000#   Returns 1 (true) if the IP is private/reserved, 0 (false) otherwise.
3001#   Returns 1 for undef or empty strings.
3002#
3003# Notes:
3004#   Uses the module-level @PRIVATE_RANGES array of pre-compiled regexes.
3005#   Covers all ranges listed in RFC 1122, 1918, 5737, 6598, and RFC 4193.
3006
3007sub _is_private {
3008
1263
1089
        my ($self, $ip) = @_;
3009
1263
1404
        return 1 unless defined $ip && $ip ne '';
3010
1261
6320
829
7363
        for my $re (@PRIVATE_RANGES) { return 1 if $ip =~ $re }
3011
101
119
        return 0;
3012}
3013
3014# _is_trusted( $ip ) -> bool
3015#
3016# Purpose:
3017#   Test whether an IP address matches any entry in the caller-supplied
3018#   trusted_relays list (exact IP or CIDR block).
3019#
3020# Entry criteria:
3021#   $ip -- a defined IPv4 address string.
3022#   $self->{trusted_relays} -- arrayref of exact IPs or CIDR strings.
3023#
3024# Exit status:
3025#   Returns 1 (true) if the IP matches any trusted relay, 0 otherwise.
3026
3027sub _is_trusted {
3028
95
83
        my ($self, $ip) = @_;
3029
95
95
69
99
        for my $cidr (@{ $self->{trusted_relays} }) {
3030
19
17
                return 1 if $self->_ip_in_cidr($ip, $cidr);
3031        }
3032
87
98
        return 0;
3033}
3034
3035# -----------------------------------------------------------------------
3036# Private: HTTP/HTTPS URL extraction and resolution
3037# -----------------------------------------------------------------------
3038
3039# _extract_and_resolve_urls() -> arrayref of url hashrefs
3040#
3041# Purpose:
3042#   Extract all HTTP/HTTPS URLs from the decoded body, resolve each unique
3043#   hostname to an IP, and enrich with WHOIS/RDAP data.  Optionally uses
3044#   AnyEvent::DNS to parallelise the DNS resolution step.
3045#
3046# Entry criteria:
3047#   $self->{_body_plain} and $self->{_body_html} populated by _split_message().
3048#
3049# Exit status:
3050#   Returns an arrayref of url hashrefs (possibly empty).
3051#
3052# Side effects:
3053#   Network I/O per unique hostname: one A/AAAA lookup, one RDAP/WHOIS.
3054#   Results stored in the CHI cross-message cache if available.
3055
3056sub _extract_and_resolve_urls {
3057
144
105
        my $self = $_[0];
3058
144
182
        my (%url_seen, %host_cache);
3059
144
0
        my @results;
3060
144
165
        my $combined = $self->{_body_plain} . "\n" . $self->{_body_html};
3061
3062        # Collect unique URLs from body
3063
144
614
168
560
        my @urls = grep { !$url_seen{$_}++ } $self->_extract_http_urls($combined);
3064
3065        # Extract unique hostnames for parallel DNS resolution
3066
144
105
        my %hostname_needed;
3067
144
122
        for my $url (@urls) {
3068
614
805
                my ($host) = $url =~ m{https?://([^/:?\s#]+)}i;
3069
614
586
                $hostname_needed{$host}++ if $host;
3070        }
3071
3072        # Parallelise DNS lookups if AnyEvent::DNS is available
3073
144
176
        if ($HAS_ANYEVENT_DNS && scalar(keys %hostname_needed) > 1) {
3074
0
0
                $self->_parallel_resolve_hosts(\%hostname_needed, \%host_cache);
3075        }
3076
3077        # Process each URL: resolve hostname and WHOIS-enrich
3078
144
113
        for my $url (@urls) {
3079
614
780
                my ($host) = $url =~ m{https?://([^/:?\s#]+)}i;
3080
614
493
                next unless $host;
3081
3082                # Resolve and WHOIS once per unique hostname, then cache the result
3083
614
492
                unless (exists $host_cache{$host}) {
3084                        # Check the cross-message CHI cache first
3085
99
94
                        my $cached = $_cache ? $_cache->get("url:$host") : undef;
3086
99
81
                        if ($cached) {
3087
0
0
                                $host_cache{$host} = $cached;
3088                        } else {
3089
99
132
                                my $ip    = $self->_resolve_host($host) // '(unresolved)';
3090
99
298
                                my $whois = $ip ne '(unresolved)'
3091                                          ? $self->_whois_ip($ip)
3092                                          : {};
3093
3094                                # Fall back to domain WHOIS if IP lookup returned nothing
3095
99
191
                                if (!$whois->{abuse}) {
3096
56
84
                                        my $reg = _registrable($host) // $host;
3097
56
76
                                        my $dw  = $self->_parse_domain_whois_abuse($reg);
3098
56
142
                                        $whois  = $dw if $dw->{abuse};
3099                                }
3100
3101                                my $entry = {
3102                                        ip      => $ip,
3103                                        org     => $whois->{org}     // '(unknown)',
3104                                        abuse   => $whois->{abuse}   // '(unknown)',
3105                                        country => $whois->{country} // undef,
3106
99
337
                                };
3107
99
92
                                $host_cache{$host} = $entry;
3108
3109                                # Store in cross-message cache for reuse across messages
3110
99
139
                                $_cache->set("url:$host", $entry) if $_cache;
3111                        }
3112                }
3113
3114
614
614
372
1028
                push @results, { url => $url, host => $host, %{ $host_cache{$host} } };
3115        }
3116
144
377
        return \@results;
3117}
3118
3119# _parallel_resolve_hosts( \%hostnames, \%cache )
3120#
3121# Purpose:
3122#   Resolve multiple hostnames to IPs in parallel using AnyEvent::DNS.
3123#   Populates the cache with resolved IPs so the sequential loop in
3124#   _extract_and_resolve_urls() can skip the DNS step for pre-resolved hosts.
3125#
3126# Entry criteria:
3127#   $hostnames_ref -- hashref keyed by hostname (values ignored).
3128#   $cache_ref     -- hashref to populate with { ip => '...' } results.
3129#   AnyEvent::DNS must be installed ($HAS_ANYEVENT_DNS is true).
3130#
3131# Exit status:
3132#   Returns undef; all results written to %$cache_ref via side effects.
3133#
3134# Notes:
3135#   Errors (NXDOMAIN, timeout) are silently swallowed; the sequential
3136#   resolution loop will return '(unresolved)' for those hosts.
3137
3138sub _parallel_resolve_hosts {
3139
2
419
        my ($self, $hostnames_ref, $cache_ref) = @_;
3140
2
6
        return unless $HAS_ANYEVENT_DNS;
3141
3142        # Build an AnyEvent condvar to wait for all lookups to complete
3143
0
0
        my $cv      = AnyEvent->condvar;
3144
0
0
        my $pending = scalar keys %$hostnames_ref;
3145
3146
0
0
        for my $host (keys %$hostnames_ref) {
3147                # Fire an async A (and AAAA) query for each hostname
3148                AnyEvent::DNS::resolve(
3149                        $host, 'A',
3150                        sub {
3151
0
0
                                my @answers = @_;
3152
0
0
                                if (@answers) {
3153                                        # Cache the first A record result
3154
0
0
                                        $cache_ref->{$host} = { ip => $answers[0][4] };
3155                                }
3156                                # Decrement the pending counter; signal when all done
3157
0
0
                                $cv->send if --$pending <= 0;
3158                        },
3159
0
0
                );
3160        }
3161
3162        # Block until all DNS queries complete (subject to AnyEvent's own timeouts)
3163
0
0
        $cv->recv;
3164}
3165
3166# _extract_http_urls( $body ) -> list of url strings
3167#
3168# Purpose:
3169#   Extract all HTTP and HTTPS URLs from a body string, using both
3170#   structural HTML parsing (if HTML::LinkExtor is available) and a
3171#   plain-text regex pass.  Deduplicates and strips trailing punctuation.
3172#
3173# Entry criteria:
3174#   $body -- combined plain+HTML body string.
3175#
3176# Exit status:
3177#   Returns a list of URL strings (possibly empty), deduplicated.
3178
3179sub _extract_http_urls {
3180
147
693
        my ($self, $body) = @_;
3181
147
102
        my @urls;
3182
3183        # Structural HTML link extraction (handles quoted attributes correctly)
3184
147
148
        if ($HAS_HTML_LINKEXTOR) {
3185                my $p = HTML::LinkExtor->new(sub {
3186
16
251
                        my ($tag, %attrs) = @_;
3187
16
14
                        for my $attr (qw(href src action)) {
3188
48
61
                                my $val = $attrs{$attr} // '';
3189
48
120
                                if ($val =~ m{^https?://}i) {
3190
12
11
                                        push @urls, $val;
3191                                } elsif ($val =~ m{^//[\w.-]}) {
3192                                        # Protocol-relative -- assume https
3193
2
2
                                        push @urls, 'https:' . $val;
3194                                }
3195                        }
3196
147
476
                });
3197
147
5285
                $p->parse($body);
3198        }
3199
3200        # Plain-text regex pass for bare URLs not in HTML attributes
3201
147
1030
        while ($body =~ m{(https?://[^\s<>"'\)\]]+)}gi) {
3202
1116
1702
                push @urls, $1;
3203        }
3204
3205        # Protocol-relative URLs not caught above
3206
147
388
        while ($body =~ m{(?:^|[\s"'=])(//[\w.-][^\s<>"'\)\]]*)}gim) {
3207
2
4
                push @urls, 'https:' . $1;
3208        }
3209
3210        # Deduplicate and strip trailing punctuation
3211
147
104
        my %seen;
3212
147
1132
147
999
        my @all = grep { !$seen{$_}++ } @urls;
3213
147
335
        s/[.,;:!?\)>\]]+$// for @all;
3214
147
292
        return @all;
3215}
3216
3217# -----------------------------------------------------------------------
3218# Private: domain extraction and full analysis
3219# -----------------------------------------------------------------------
3220
3221# _extract_and_analyse_domains() -> arrayref of domain hashrefs
3222#
3223# Purpose:
3224#   Collect all non-infrastructure contact domains from headers and body,
3225#   run the full domain intelligence pipeline on each, and return an arrayref
3226#   suitable for storage in $self->{_mailto_domains}.
3227#
3228# Entry criteria:
3229#   _split_message() must have been called.
3230#
3231# Exit status:
3232#   Always returns an arrayref; never undef.
3233#
3234# Side effects:
3235#   Network I/O per domain via _analyse_domain().
3236#   Results stored in $self->{_domain_info} and CHI cache.
3237
3238sub _extract_and_analyse_domains {
3239
138
103
        my $self = $_[0];
3240
138
149
        my (%seen, @domains_with_source);
3241
3242        # Build a set of recipient domains to exclude (victims, not senders)
3243
138
0
        my %recipient_domains;
3244
138
119
        for my $hname (qw(to cc)) {
3245
276
272
                my $val = $self->_header_value($hname) // next;
3246
129
168
                for my $dom ($self->_domains_from_text($val)) {
3247
126
135
                        my $reg = _registrable($dom) // $dom;
3248
126
142
                        $recipient_domains{$dom}++;
3249
126
146
                        $recipient_domains{$reg}++;
3250                }
3251        }
3252
3253        # Also exclude domains from Received: "for" envelope recipients
3254
138
138
87
132
        for my $hop (@{ $self->{_rcvd_tracking} }) {
3255
135
179
                next unless $hop->{for} && $hop->{for} =~ /\@([\w.-]+)/;
3256
0
0
                my $dom = lc $1;
3257
0
0
                my $reg = _registrable($dom) // $dom;
3258
0
0
                $recipient_domains{$dom}++;
3259
0
0
                $recipient_domains{$reg}++;
3260        }
3261
3262        # Inner closure: record a domain if it passes all filters
3263        my $record = sub {
3264
624
511
                my ($dom, $source) = @_;
3265
624
466
                $dom = lc $dom;
3266
624
403
                $dom =~ s/\.$//;
3267
624
570
                next if $self->{trusted_domains}->{$dom};
3268
624
520
                return if $TRUSTED_DOMAINS{$dom};
3269
585
422
                return if $recipient_domains{$dom};
3270
583
435
                return if $recipient_domains{ _registrable($dom) // $dom };
3271                # Discard non-routable hostnames (single-label, pseudo-TLDs, etc.)
3272
583
713
                return unless $dom =~ /\.[a-zA-Z]{2,}$/;
3273
519
613
                return if $dom =~ /\.(?:local|internal|lan|localdomain|arpa)$/i;
3274
517
554
                return if $seen{$dom}++;
3275
383
518
                push @domains_with_source, { domain => $dom, source => $source };
3276
138
289
        };
3277
3278        # Collect from standard sender/reply headers
3279
138
274
        my %header_sources = (
3280                'from'        => 'From: header',
3281                'reply-to'    => 'Reply-To: header',
3282                'return-path' => 'Return-Path: header',
3283                'sender'      => 'Sender: header',
3284        );
3285
138
244
        for my $hname (sort keys %header_sources) {
3286
552
441
                my $val = $self->_header_value($hname) // next;
3287                $record->($_, $header_sources{$hname})
3288
262
258
                        for $self->_domains_from_text($val);
3289        }
3290
3291        # Message-ID domain often reveals the real bulk-sending platform
3292
138
161
        my $mid = $self->_header_value('message-id');
3293
138
282
        if ($mid && $mid =~ /\@([\w.-]+)/) {
3294
128
120
                my $mid_dom = lc $1;
3295
128
118
                my $mid_reg = _registrable($mid_dom) // $mid_dom;
3296                $record->($mid_dom, 'Message-ID: header')
3297
128
421
                        unless $TRUSTED_DOMAINS{$mid_dom} || $TRUSTED_DOMAINS{$mid_reg} || $self->{trusted_domains}->{$mid_dom} || $self->{trusted_domains}->{$mid_reg};
3298        }
3299
3300        # DKIM signing domain(s) -- the organisation that vouches for the message
3301
138
151
        my $auth = $self->_parse_auth_results_cached();
3302
138
138
101
229
        for my $dkim_d (@{ $auth->{dkim_domains} // [] }) {
3303
5
5
                $record->($dkim_d, 'DKIM-Signature: d= (signing domain)');
3304        }
3305
3306        # List-Unsubscribe identifies the ESP or bulk sender
3307
138
147
        my $unsub = $self->_header_value('list-unsubscribe');
3308
138
132
        if ($unsub) {
3309
0
0
                while ($unsub =~ m{https?://([^/:?\s>]+)}gi) {
3310
0
0
                        $record->(lc $1, 'List-Unsubscribe: header');
3311                }
3312
0
0
                while ($unsub =~ m{mailto:[^@\s>]+\@([\w.-]+)}gi) {
3313
0
0
                        $record->(lc $1, 'List-Unsubscribe: header');
3314                }
3315        }
3316
3317        # Body email addresses (mailto: and bare user@domain forms)
3318
138
144
        my $combined = $self->{_body_plain} . "\n" . $self->{_body_html};
3319        $record->($_, 'email address / mailto in body')
3320
138
124
                for $self->_domains_from_text($combined);
3321
3322        # Run the full intelligence pipeline on each collected domain
3323
138
100
        my @results;
3324
138
103
        for my $entry (@domains_with_source) {
3325
383
358
                my $info = $self->_analyse_domain($entry->{domain});
3326
383
655
                push @results, { %$entry, %$info };
3327        }
3328
138
803
        return \@results;
3329}
3330
3331# _domains_from_text( $text ) -> list of domain strings
3332#
3333# Purpose:
3334#   Extract unique domain names from mailto: links and bare user@domain
3335#   addresses in a block of text.
3336#
3337# Entry criteria:
3338#   $text -- a defined scalar of decoded body or header text.
3339#
3340# Exit status:
3341#   Returns a list of lower-cased domain strings (possibly empty).
3342
3343sub _domains_from_text {
3344
674
2009
        my ($self, $text) = @_;
3345
674
416
        my (%seen, @out);
3346
3347        # mailto: links (including HTML-entity-encoded @ signs from QP)
3348
674
738
        while ($text =~ /mailto:(?:[^@\s<>"]+)@([\w.-]+)/gi) {
3349
14
14
18
12
                my $dom = lc $1; $dom =~ s/\.$//;
3350
14
30
                push @out, $dom unless $seen{$dom}++;
3351        }
3352
3353        # Bare user@domain patterns
3354
674
1239
        while ($text =~ /\b[\w.+%-]+@([\w.-]+\.[a-zA-Z]{2,})\b/g) {
3355
662
662
563
468
                my $dom = lc $1; $dom =~ s/\.$//;
3356
662
1132
                push @out, $dom unless $seen{$dom}++;
3357        }
3358
674
960
        return @out;
3359}
3360
3361# _analyse_domain( $domain ) -> hashref
3362#
3363# Purpose:
3364#   Run the complete intelligence pipeline for a single domain: A record
3365#   (web hosting), MX record (mail hosting), NS record (DNS hosting),
3366#   and WHOIS (registrar, creation/expiry dates, abuse contact).
3367#   Each IP is enriched via RDAP/WHOIS.  Results are cached per domain
3368#   in $self->{_domain_info} and in the CHI cross-message cache.
3369#
3370# Entry criteria:
3371#   $domain -- lower-cased, no trailing dot, not in TRUSTED_DOMAINS.
3372#   $self->{timeout} used for all network operations.
3373#
3374# Exit status:
3375#   Always returns a hashref reference; never undef; may be empty ({}).
3376#   Possible keys: web_ip, web_org, web_abuse, mx_host, mx_ip, mx_org,
3377#   mx_abuse, ns_host, ns_ip, ns_org, ns_abuse, registrar,
3378#   registrar_abuse, registered, expires, recently_registered, whois_raw.
3379#
3380# Side effects:
3381#   Network I/O; writes result to $self->{_domain_info}{$domain} and CHI.
3382#
3383# Notes:
3384#   MX/NS lookups require Net::DNS; absent without it.
3385#   recently_registered is set to 1 (not 0) when the threshold is met.
3386#   whois_raw is truncated to WHOIS_RAW_MAX bytes.
3387
3388sub _analyse_domain {
3389
386
3654
        my ($self, $domain) = @_;
3390
3391        # Return the per-message cached result if already analysed
3392        return $self->{_domain_info}{$domain}
3393
386
355
                if $self->{_domain_info}{$domain};
3394
3395        # Check the cross-message CHI cache before hitting the network
3396
382
293
        if ($_cache) {
3397
0
0
                my $cached = $_cache->get("dom:$domain");
3398
0
0
                if ($cached) {
3399
0
0
                        $self->{_domain_info}{$domain} = $cached;
3400
0
0
                        return $cached;
3401                }
3402        }
3403
3404
382
452
        $self->_debug("Analysing domain: $domain");
3405
382
250
        my %info;
3406
3407        # --- A record -> web hosting IP ---
3408
382
367
        my $web_ip = $self->_resolve_host($domain);
3409
382
608
        if ($web_ip) {
3410
50
58
                $info{web_ip} = $web_ip;
3411
50
53
                my $w = $self->_whois_ip($web_ip);
3412
50
122
                $info{web_org}   = $w->{org}   if $w->{org};
3413
50
72
                $info{web_abuse} = $w->{abuse} if $w->{abuse};
3414        }
3415
3416        # MX and NS lookups require Net::DNS
3417
382
276
        if ($HAS_NET_DNS) {
3418                my $res = Net::DNS::Resolver->new(
3419                        tcp_timeout => $self->{timeout},
3420                        udp_timeout => $self->{timeout},
3421
0
0
                );
3422
3423                # --- MX record -> mail hosting ---
3424
0
0
                my $mxq = $res->search($domain, 'MX');
3425
0
0
                if ($mxq) {
3426
0
0
                        my ($best) = sort { $a->preference <=> $b->preference }
3427
0
0
0
0
                                     grep { $_->type eq 'MX' } $mxq->answer;
3428
0
0
                        if ($best) {
3429
0
0
                                (my $mx_host = lc $best->exchange) =~ s/\.$//;
3430
0
0
                                $info{mx_host} = $mx_host;
3431
0
0
                                my $mx_ip = $self->_resolve_host($mx_host);
3432
0
0
                                if ($mx_ip) {
3433
0
0
                                        $info{mx_ip} = $mx_ip;
3434
0
0
                                        my $mw = $self->_whois_ip($mx_ip);
3435
0
0
                                        $info{mx_org}   = $mw->{org}   if $mw->{org};
3436
0
0
                                        $info{mx_abuse} = $mw->{abuse} if $mw->{abuse};
3437                                }
3438                        }
3439                }
3440
3441                # --- NS record -> DNS hosting ---
3442
0
0
                my $nsq = $res->search($domain, 'NS');
3443
0
0
                if ($nsq) {
3444
0
0
0
0
                        my ($first) = grep { $_->type eq 'NS' } $nsq->answer;
3445
0
0
                        if ($first) {
3446
0
0
                                (my $ns_host = lc $first->nsdname) =~ s/\.$//;
3447
0
0
                                $info{ns_host} = $ns_host;
3448
0
0
                                my $ns_ip = $self->_resolve_host($ns_host);
3449
0
0
                                if ($ns_ip) {
3450
0
0
                                        $info{ns_ip} = $ns_ip;
3451
0
0
                                        my $nw = $self->_whois_ip($ns_ip);
3452
0
0
                                        $info{ns_org}   = $nw->{org}   if $nw->{org};
3453
0
0
                                        $info{ns_abuse} = $nw->{abuse} if $nw->{abuse};
3454                                }
3455                        }
3456                }
3457        }
3458
3459        # --- Domain WHOIS -> registrar + dates ---
3460
382
339
        my $domain_whois = $self->_domain_whois($domain);
3461
382
515
        if ($domain_whois) {
3462                # Truncate raw WHOIS for storage but parse structured fields from full text
3463
19
47
                $info{whois_raw} = substr($domain_whois, 0, $WHOIS_RAW_MAX);
3464
3465                # Registrar name
3466
19
42
                if ($domain_whois =~ /Registrar:\s*(.+)/i) {
3467
15
35
                        ($info{registrar} = $1) =~ s/\s+$//;
3468                }
3469
3470                # Registrar abuse contact email (try multiple field names)
3471
19
37
                for my $pat (
3472                        qr/Registrar Abuse Contact Email:\s*(\S+@\S+)/i,
3473                        qr/Abuse Contact Email:\s*(\S+@\S+)/i,
3474                        qr/abuse-contact:\s*(\S+@\S+)/i,
3475                ) {
3476
57
139
                        if (!$info{registrar_abuse} && $domain_whois =~ $pat) {
3477
12
25
                                ($info{registrar_abuse} = $1) =~ s/\s+$//;
3478                        }
3479                }
3480
3481                # Domain creation date (multiple registrar field name variations)
3482
19
49
                for my $pat (
3483                        qr/Creation Date:\s*(\S+)/i,
3484                        qr/Created(?:\s+On)?:\s*(\S+)/i,
3485                        qr/Registration Time:\s*(\S+)/i,
3486                        qr/^registered:\s*(\S+)/im,
3487                ) {
3488
76
127
                        if (!$info{registered} && $domain_whois =~ $pat) {
3489
15
23
                                ($info{registered} = $1) =~ s/[TZ].*//;
3490                        }
3491                }
3492
3493                # Domain expiry date
3494
19
34
                for my $pat (
3495                        qr/Registry Expiry Date:\s*(\S+)/i,
3496                        qr/Expir(?:y|ation)(?: Date)?:\s*(\S+)/i,
3497                        qr/paid-till:\s*(\S+)/i,
3498                ) {
3499
57
110
                        if (!$info{expires} && $domain_whois =~ $pat) {
3500
13
24
                                ($info{expires} = $1) =~ s/[TZ].*//;
3501                        }
3502                }
3503
3504                # Flag recently-registered domains (< RECENT_REG_DAYS old)
3505
19
35
                if ($info{registered}) {
3506
15
34
                        my $epoch = $self->_parse_date_to_epoch($info{registered});
3507
15
300
                        $info{recently_registered} = 1
3508                                if $epoch && (time() - $epoch) < $RECENT_REG_DAYS * $SECS_PER_DAY;
3509                }
3510        }
3511
3512        # Store in per-message and cross-message caches
3513
382
339
        $self->{_domain_info}{$domain} = \%info;
3514
382
298
        $_cache->set("dom:$domain", \%info) if $_cache;
3515
3516
382
303
        return \%info;
3517}
3518
3519# -----------------------------------------------------------------------
3520# Private: DNS helpers
3521# -----------------------------------------------------------------------
3522
3523# _resolve_host( $host ) -> ip_string | undef
3524#
3525# Purpose:
3526#   Resolve a hostname to an IPv4 (or IPv6) address.  Uses Net::DNS for
3527#   both A and AAAA queries when available; falls back to inet_aton for
3528#   pure IPv4 resolution.
3529#
3530# Entry criteria:
3531#   $host -- hostname string or already-numeric IP.
3532#
3533# Exit status:
3534#   Returns the first resolved IP string, or undef on failure.
3535#
3536# Notes:
3537#   When the input is already a dotted-quad IPv4 it is returned immediately.
3538#   AAAA records are tried if the A query fails and Net::DNS is available.
3539
3540sub _resolve_host {
3541
13
212
        my ($self, $host) = @_;
3542
13
23
        return $host if $host =~ /^\d{1,3}(?:\.\d{1,3}){3}$/;
3543
3544        # Check the CHI cache before hitting DNS
3545
12
16
        if ($_cache) {
3546
0
0
                my $cached_ip = $_cache->get("resolve:$host");
3547
0
0
                return $cached_ip if defined $cached_ip;
3548        }
3549
3550
12
6
        my $ip;
3551
3552
12
18
        if ($HAS_NET_DNS) {
3553                my $res = Net::DNS::Resolver->new(
3554                        tcp_timeout => $self->{timeout},
3555                        udp_timeout => $self->{timeout},
3556
0
0
                );
3557
3558                # Try A record first, then AAAA for IPv6
3559
0
0
                for my $type (qw(A AAAA)) {
3560
0
0
                        my $query = $res->search($host, $type);
3561
0
0
                        if ($query) {
3562
0
0
                                for my $rr ($query->answer) {
3563
0
0
                                        if ($rr->type eq 'A') {
3564
0
0
                                                $ip = $rr->address;
3565
0
0
                                                last;
3566                                        } elsif ($rr->type eq 'AAAA') {
3567
0
0
                                                $ip = $rr->address;
3568
0
0
                                                last;
3569                                        }
3570                                }
3571                        }
3572
0
0
                        last if defined $ip;
3573                }
3574        } else {
3575                # Fallback: gethostbyname (IPv4 only)
3576
12
12
9
213098
                my $packed = eval { inet_aton($host) };
3577
12
52
                $ip = $packed ? inet_ntoa($packed) : undef;
3578        }
3579
3580        # Cache the result (including undef as '' to avoid repeated failed lookups)
3581
12
19
        if ($_cache) {
3582
0
0
                $_cache->set("resolve:$host", $ip // '');
3583        }
3584
3585
12
29
        return $ip;
3586}
3587
3588# _reverse_dns( $ip ) -> hostname | undef
3589#
3590# Purpose:
3591#   Perform a PTR (reverse DNS) lookup for an IP address.  Supports both
3592#   IPv4 and IPv6 via Net::DNS when available; falls back to gethostbyaddr.
3593#
3594# Entry criteria:
3595#   $ip -- a defined IPv4 or IPv6 address string.
3596#
3597# Exit status:
3598#   Returns the PTR hostname string, or undef if no record exists.
3599
3600sub _reverse_dns {
3601
6
7
        my ($self, $ip) = @_;
3602
6
35
        return undef unless $ip;
3603
3604
6
13
        if ($HAS_NET_DNS) {
3605
0
0
                my $res   = Net::DNS::Resolver->new(tcp_timeout => $self->{timeout});
3606
0
0
                my $query = $res->search($ip, 'PTR');
3607
0
0
                if ($query) {
3608
0
0
                        for my $rr ($query->answer) {
3609
0
0
                                return $rr->ptrdname if $rr->type eq 'PTR';
3610                        }
3611                }
3612
0
0
                return undef;
3613        }
3614
3615        # Fallback for IPv4 only
3616
6
176465
        return scalar gethostbyaddr(inet_aton($ip), AF_INET);
3617}
3618
3619# -----------------------------------------------------------------------
3620# Private: WHOIS / RDAP
3621# -----------------------------------------------------------------------
3622
3623# _whois_ip( $ip ) -> hashref
3624#
3625# Purpose:
3626#   Enrich an IP address with organisation name, abuse contact, and country
3627#   code.  Tries RDAP first (if LWP is available), then falls back to raw
3628#   WHOIS via IANA referral.  Results are cached in CHI if available.
3629#
3630# Entry criteria:
3631#   $ip -- a defined IPv4 or IPv6 address string.
3632#
3633# Exit status:
3634#   Returns { org, abuse, country } hashref; keys absent when unknown.
3635
3636sub _whois_ip {
3637
9
1150
        my ($self, $ip) = @_;
3638
3639        # Check CHI cache before going to the network
3640
9
20
        if ($_cache) {
3641
0
0
                my $cached = $_cache->get("whois_ip:$ip");
3642
0
0
                return $cached if $cached;
3643        }
3644
3645
9
22
        my $result = $HAS_LWP ? $self->_rdap_lookup($ip) : {};
3646
3647        # Fall back to raw WHOIS if RDAP returned no organisation
3648
9
33
        unless ($result->{org}) {
3649
9
24
                my $raw = $self->_raw_whois($ip, 'whois.iana.org');
3650
9
61
                if ($raw) {
3651
8
61
                        my ($ref) = $raw =~ /whois:\s*([\w.-]+)/i;
3652
8
27
                        my $detail = $ref ? $self->_raw_whois($ip, $ref) : $raw;
3653
8
74
                        $result = $self->_parse_whois_text($detail) if $detail;
3654                }
3655        }
3656
3657        # Cache the enrichment result
3658
9
24
        $_cache->set("whois_ip:$ip", $result) if $_cache && $result;
3659
3660
9
17
        return $result;
3661}
3662
3663# _domain_whois( $domain ) -> raw_whois_string | undef
3664#
3665# Purpose:
3666#   Perform a two-step WHOIS lookup for a domain: first ask IANA for the
3667#   TLD's authoritative WHOIS server, then query that server.
3668#
3669# Entry criteria:
3670#   $domain -- a lower-cased domain name string.
3671#
3672# Exit status:
3673#   Returns the raw WHOIS response string, or undef on failure.
3674
3675sub _domain_whois {
3676
16
590
        my ($self, $domain) = @_;
3677
16
30
        my $iana = $self->_raw_whois($domain, 'whois.iana.org') // return undef;
3678
15
122
        my ($server) = $iana =~ /whois:\s*([\w.-]+)/i;
3679
15
40
        return undef unless $server;
3680
1
2
        return $self->_raw_whois($domain, $server);
3681}
3682
3683# _parse_domain_whois_abuse( $domain ) -> hashref
3684#
3685# Purpose:
3686#   Lightweight domain WHOIS lookup to extract only registrar name and
3687#   abuse contact.  Used as a fallback in _extract_and_resolve_urls() when
3688#   a URL host cannot be resolved to an IP.
3689#
3690# Entry criteria:
3691#   $domain -- a registrable domain name string.
3692#
3693# Exit status:
3694#   Returns { org, abuse } hashref; empty hashref on failure.
3695
3696sub _parse_domain_whois_abuse {
3697
56
56
        my ($self, $domain) = @_;
3698
56
66
        my $raw = $self->_domain_whois($domain) // return {};
3699
7
17
        my %info;
3700
7
13
        if ($raw =~ /Registrar:\s*(.+)/i) {
3701
7
13
                ($info{org} = $1) =~ s/\s+$//;
3702        }
3703        # Try multiple field name patterns for the abuse email
3704
7
10
        for my $pat (
3705                qr/Registrar Abuse Contact Email:\s*(\S+\@\S+)/i,
3706                qr/Abuse Contact Email:\s*(\S+\@\S+)/i,
3707                qr/abuse-contact:\s*(\S+\@\S+)/i,
3708        ) {
3709
21
37
                if (!$info{abuse} && $raw =~ $pat) {
3710
7
9
                        ($info{abuse} = $1) =~ s/\s+$//;
3711                }
3712        }
3713
7
8
        return \%info;
3714}
3715
3716# _rdap_lookup( $ip ) -> hashref
3717#
3718# Purpose:
3719#   Query the ARIN RDAP API for IP block ownership information.  RDAP is
3720#   preferred over raw WHOIS because it returns structured JSON.
3721#
3722# Entry criteria:
3723#   $ip     -- a defined IPv4 or IPv6 address string.
3724#   LWP::UserAgent must be installed.
3725#
3726# Exit status:
3727#   Returns { org, abuse, country } hashref; empty hashref on failure.
3728
3729sub _rdap_lookup {
3730
0
0
        my ($self, $ip) = @_;
3731
0
0
        return {} unless $HAS_LWP;
3732
3733
0
0
        my $ua = $self->{ua};
3734
0
0
        if(!defined($ua)) {
3735                $ua = LWP::UserAgent->new(
3736                        timeout => $self->{timeout},
3737
0
0
                        agent   => "Email-Abuse-Investigator/$VERSION",
3738                );
3739
3740
0
0
                if($HAS_CONN_CACHE) {
3741
0
0
                        my $conn_cache = LWP::ConnCache->new();
3742
0
0
                        $conn_cache->total_capacity(10);
3743
0
0
                        $ua->conn_cache($conn_cache);
3744                }
3745
3746
0
0
                $ua->env_proxy(1);
3747
0
0
                $self->{ua} = $ua;
3748        }
3749
3750        # Use the ARIN RDAP endpoint; it covers the ARIN region and redirects
3751        # for RIPE/APNIC/LACNIC/AfriNIC allocations.
3752
0
0
0
0
        my $res = eval { $ua->get("https://rdap.arin.net/registry/ip/$ip") };
3753
0
0
        return {} unless $res && $res->is_success();
3754
3755
0
0
        my $j = $res->decoded_content();
3756
0
0
        my %info;
3757
3758        # Extract organisation name from the JSON response
3759
0
0
        $info{org}    = $1 if $j =~ /"name"\s*:\s*"([^"]+)"/;
3760
0
0
        $info{handle} = $1 if $j =~ /"handle"\s*:\s*"([^"]+)"/;
3761
3762        # Extract abuse email from the vcardArray contact block
3763
0
0
        if ($j =~ /"abuse".*?"email"\s*:\s*"([^"]+)"/s) {
3764
0
0
                $info{abuse} = $1;
3765        } elsif ($j =~ /"email"\s*:\s*"([^@"]+@[^"]+)"/) {
3766
0
0
                $info{abuse} = $1;
3767        }
3768
3769        # Country code from the network's country field
3770
0
0
        $info{country} = $1 if $j =~ /"country"\s*:\s*"([A-Z]{2})"/;
3771
3772
0
0
        return \%info;
3773}
3774
3775# _raw_whois( $query, $server ) -> string | undef
3776#
3777# Purpose:
3778#   Open a TCP connection to a WHOIS server on port 43, send the query,
3779#   and return the full response as a string.  Uses IO::Select for read
3780#   timeouts so that alarm() is never needed (alarm() is unreliable on
3781#   Windows and in threaded Perl).  Supports IPv6 WHOIS servers via
3782#   IO::Socket::IP when that module is available.
3783#
3784# Entry criteria:
3785#   $query   -- the domain name or IP to query (defined, non-empty).
3786#   $server  -- the WHOIS server hostname (default: 'whois.iana.org').
3787#   $self->{timeout} -- seconds used for connect and per-read waits.
3788#
3789# Exit status:
3790#   Returns the raw WHOIS response string, or undef on connection/write failure.
3791#
3792# Notes:
3793#   Uses IO::Socket::IP (dual-stack) when available, falling back to
3794#   IO::Socket::INET (IPv4 only) otherwise.  The IO::Select loop reads
3795#   until the server closes the connection or the per-read timeout expires.
3796
3797sub _raw_whois {
3798
23
274
        my ($self, $query, $server) = @_;
3799
23
37
        $server //= 'whois.iana.org';
3800
23
71
        $self->_debug("WHOIS $server -> $query");
3801
3802        # Choose the socket class based on what is installed.
3803        # IO::Socket::IP supports both IPv4 and IPv6 WHOIS servers.
3804
23
46
        my $sock_class = $HAS_IO_SOCKET_IP ? 'IO::Socket::IP' : 'IO::Socket::INET';
3805
3806        # Attempt TCP connection to port 43 on the WHOIS server
3807
23
28
        my $sock = eval {
3808                $sock_class->new(
3809                        PeerAddr => $server,
3810                        PeerPort => $WHOIS_PORT,
3811                        Proto    => 'tcp',
3812                        Timeout  => $self->{timeout},
3813
23
154
                );
3814        };
3815
23
2419586
        return undef unless $sock;
3816
3817        # Send the WHOIS query in wire format (CRLF-terminated per RFC 3912)
3818
23
0
0
123
0
0
        $sock->print("$query\r\n") or do { $sock->close(); return undef };
3819
3820        # Use IO::Select to implement per-read timeouts without alarm()
3821
23
1138
        my $sel      = IO::Select->new($sock);
3822
23
1095
        my $response = '';
3823
23
95
        my $buf      = '';
3824
3825        # Read until EOF (server closes) or timeout
3826
23
87
        while ($sel->can_read($self->{timeout})) {
3827                # Wrap in eval to catch 'Connection reset by peer' thrown by Fatal/autodie
3828
75
75
1289566
180
                my $n = eval { sysread($sock, $buf, $WHOIS_READ_CHUNK) };
3829
3830
75
8443
                if ($@ || !defined $n || $n <= 0) {
3831
23
37
                        $self->_debug("WHOIS read failed: $@") if $@;
3832
23
28
                        last;
3833                }
3834
52
132
                last unless defined $n && $n > 0;
3835
52
243
                $response .= $buf;
3836        }
3837
3838
23
111
        $sock->close();
3839
23
1252
        return $response || undef;
3840}
3841
3842# _parse_whois_text( $text ) -> hashref
3843#
3844# Purpose:
3845#   Parse a raw WHOIS IP block response to extract organisation name,
3846#   abuse contact email, and country code.
3847#
3848# Entry criteria:
3849#   $text -- a defined WHOIS response string.
3850#
3851# Exit status:
3852#   Returns { org, abuse, country } hashref; keys absent when not found.
3853
3854sub _parse_whois_text {
3855
37
5885
        my ($self, $text) = @_;
3856
37
47
        return {} unless $text;
3857
35
31
        my %info;
3858
3859        # Try multiple field names for the organisation name
3860
35
110
        for my $pat (
3861                qr/^OrgName:\s*(.+)/mi,    qr/^org-name:\s*(.+)/mi,
3862                qr/^owner:\s*(.+)/mi,      qr/^descr:\s*(.+)/mi,
3863        ) {
3864
140
417
                if (!$info{org} && $text =~ $pat) {
3865
20
69
                        ($info{org} = $1) =~ s/\s+$//;
3866                }
3867        }
3868
3869        # Try multiple field names for the abuse email
3870
35
75
        for my $pat (
3871                qr/OrgAbuseEmail:\s*(\S+@\S+)/mi,
3872                qr/abuse-mailbox:\s*(\S+@\S+)/mi,
3873        ) {
3874
70
178
                if (!$info{abuse} && $text =~ $pat) {
3875
6
14
                        ($info{abuse} = $1) =~ s/\s+$//;
3876                }
3877        }
3878
3879        # Last-resort: any abuse@ address in the response
3880
35
108
        $info{abuse} //= $1 if $text =~ /(abuse\@[\w.-]+)/i;
3881
3882        # Country code (case-insensitive match, normalised to uppercase)
3883
35
66
        if ($text =~ /^country:\s*([A-Za-z]{2})\s*$/m) {
3884
12
35
                $info{country} = uc $1;
3885        }
3886
35
63
        return \%info;
3887}
3888
3889# -----------------------------------------------------------------------
3890# Private: authentication results parsing
3891# -----------------------------------------------------------------------
3892
3893# _parse_auth_results_cached() -> hashref
3894#
3895# Purpose:
3896#   Parse the Authentication-Results: header(s) from the message once,
3897#   cache the result, and return it.  Extracts SPF, DKIM, DMARC, ARC
3898#   results and the DKIM signing domain(s).
3899#
3900# Entry criteria:
3901#   $self->{_headers} populated by _split_message().
3902#
3903# Exit status:
3904#   Returns { spf, dkim, dmarc, arc, dkim_domain, dkim_domains } hashref.
3905#   Keys absent when the corresponding header or field is not present.
3906
3907sub _parse_auth_results_cached {
3908
609
1828
        my $self = $_[0];
3909
609
686
        return $self->{_auth_results} if $self->{_auth_results};
3910
3911
339
231
        my %auth;
3912
3913        # Concatenate all Authentication-Results: header values
3914        my $raw = join('; ',
3915
37
48
                map  { $_->{value} }
3916
2571
2064
                grep { $_->{name} eq 'authentication-results' }
3917
339
339
301
341
                @{ $self->{_headers} }
3918        );
3919
3920        # Extract individual authentication mechanism results
3921
339
401
        $auth{spf}   = $1 if $raw =~ /\bspf=(\S+)/i;
3922
339
329
        $auth{dkim}  = $1 if $raw =~ /\bdkim=(\S+)/i;
3923
339
328
        $auth{dmarc} = $1 if $raw =~ /\bdmarc=(\S+)/i;
3924
339
782
        $auth{arc}   = $1 if $raw =~ /\barc=(\S+)/i;
3925
3926        # Strip trailing punctuation captured by the greedy \S+
3927
339
295
        for my $k (qw(spf dkim dmarc arc)) {
3928
1356
1115
                $auth{$k} =~ s/[;,\s]+$// if defined $auth{$k};
3929        }
3930
3931        # Extract DKIM signing domains from all DKIM-Signature: d= tags.
3932        # Prefer the first domain that matches the provider table (identifies ESP).
3933
339
226
        my @dkim_domains;
3934
339
2571
339
243
1846
287
        for my $h (grep { $_->{name} eq 'dkim-signature' } @{ $self->{_headers} }) {
3935
14
25
                if ($h->{value} =~ /\bd=([^;,\s]+)/) {
3936
14
17
                        push @dkim_domains, lc $1;
3937                }
3938        }
3939
3940
339
298
        if (@dkim_domains) {
3941                # Check if any signing domain matches a known provider
3942
9
6
                my $preferred;
3943
9
8
                for my $d (@dkim_domains) {
3944
14
12
                        if ($self->_provider_abuse_for_host($d)) {
3945
4
2
                                $preferred = $d;
3946
4
5
                                last;
3947                        }
3948                }
3949
9
17
                $auth{dkim_domain}  = $preferred // $dkim_domains[0];
3950
9
11
                $auth{dkim_domains} = \@dkim_domains;
3951        }
3952
3953
339
295
        $self->{_auth_results} = \%auth;
3954
339
355
        return \%auth;
3955}
3956
3957# -----------------------------------------------------------------------
3958# Private: provider-table lookups
3959# -----------------------------------------------------------------------
3960
3961# _provider_abuse_for_host( $host ) -> hashref | undef
3962#
3963# Purpose:
3964#   Look up a hostname (and each of its parent domains, stripping one label
3965#   at a time from the left) in the %PROVIDER_ABUSE table.
3966#
3967# Entry criteria:
3968#   $host -- a defined hostname or domain string.
3969#
3970# Exit status:
3971#   Returns the %PROVIDER_ABUSE entry hashref on match, undef otherwise.
3972
3973sub _provider_abuse_for_host {
3974
1131
6535
        my ($self, $host) = @_;
3975
1131
845
        $host = lc $host;
3976        # Strip successive subdomains until we find a match or exhaust labels
3977
1131
1140
        while ($host =~ /\./) {
3978
1253
1055
                return $self->{provider_abuse}->{$host} if $self->{provider_abuse}->{$host};
3979
1253
1164
                return $PROVIDER_ABUSE{$host} if $PROVIDER_ABUSE{$host};
3980
1003
1513
                $host =~ s/^[^.]+\.//;
3981        }
3982
881
689
        return undef;
3983}
3984
3985# _provider_abuse_for_ip( $ip, $rdns ) -> hashref | undef
3986#
3987# Purpose:
3988#   Look up an IP's reverse-DNS hostname in the %PROVIDER_ABUSE table to
3989#   identify well-known provider networks by rDNS pattern.
3990#
3991# Entry criteria:
3992#   $ip   -- IPv4 or IPv6 address string (used as fallback if $rdns absent).
3993#   $rdns -- optional rDNS hostname string.
3994#
3995# Exit status:
3996#   Returns the %PROVIDER_ABUSE entry on match, undef otherwise.
3997
3998sub _provider_abuse_for_ip {
3999
125
870
        my ($self, $ip, $rdns) = @_;
4000
125
181
        return $self->_provider_abuse_for_host($rdns) if $rdns;
4001
2
5
        return undef;
4002}
4003
4004# -----------------------------------------------------------------------
4005# Private: eTLD+1 normalisation
4006# -----------------------------------------------------------------------
4007
4008# _registrable( $host ) -> string | undef
4009#
4010# Purpose:
4011#   Return the registrable eTLD+1 form of a hostname.  Uses
4012#   Domain::PublicSuffix when installed for accurate results; falls back
4013#   to a built-in heuristic for the common two-letter ccTLD+2 pattern.
4014#
4015# Entry criteria:
4016#   $host -- a hostname string (may include subdomains).
4017#
4018# Exit status:
4019#   Returns the registrable domain string, or undef for single-label
4020#   hostnames (e.g. 'localhost').
4021#
4022# Notes:
4023#   The heuristic handles co.uk, com.au, net.jp, org.nz etc. but not
4024#   uncommon second-level delegations like ltd.uk or plc.uk.
4025
4026sub _registrable {
4027
976
12405
        my ($host) = @_;
4028
976
1481
        return undef unless $host && $host =~ /\./;
4029
4030        # Use Domain::PublicSuffix for accurate PSL-based normalisation
4031
839
644
        if ($HAS_PUBLIC_SUFFIX) {
4032
0
0
                my $psl = Domain::PublicSuffix->new();
4033
0
0
                my $root = $psl->get_root_domain(lc $host);
4034
0
0
                return $root if $root;
4035        }
4036
4037        # Built-in heuristic fallback
4038
839
811
        my @labels = split /\./, lc $host;
4039
839
1172
        return $host if @labels <= 2;
4040
4041        # Detect common ccTLD second-level patterns (e.g. co.uk, com.au)
4042
95
231
        if ($labels[-1] =~ /^[a-z]{2}$/ &&
4043            $labels[-2] =~ /^(?:co|com|net|org|gov|edu|ac|me)$/) {
4044
47
106
                return join('.', @labels[-3..-1]);
4045        }
4046
48
151
        return join('.', @labels[-2..-1]);
4047}
4048
4049# -----------------------------------------------------------------------
4050# Private: utilities
4051# -----------------------------------------------------------------------
4052
4053# _enrich_ip( $ip, $confidence, $note ) -> origin hashref
4054#
4055# Purpose:
4056#   Perform rDNS and WHOIS/RDAP for a single IP and package the results
4057#   into the standard origin hashref returned by originating_ip().
4058#
4059# Entry criteria:
4060#   $ip         -- a defined, non-private IPv4 or IPv6 address string.
4061#   $confidence -- 'high', 'medium', or 'low'.
4062#   $note       -- human-readable explanation of why this IP was chosen.
4063#
4064# Exit status:
4065#   Returns { ip, rdns, org, abuse, country, confidence, note } hashref.
4066
4067sub _enrich_ip {
4068
89
2371
        my ($self, $ip, $confidence, $note) = @_;
4069
89
129
        my $rdns  = $self->_reverse_dns($ip);
4070
89
223
        my $whois = $self->_whois_ip($ip);
4071        return {
4072                ip         => $ip,
4073                rdns       => $rdns  // '(no reverse DNS)',
4074                org        => $whois->{org}     // '(unknown)',
4075                abuse      => $whois->{abuse}   // '(unknown)',
4076                country    => $whois->{country} // undef,
4077
89
714
                confidence => $confidence,
4078                note       => $note,
4079        };
4080}
4081
4082# _header_value( $name ) -> value_string | undef
4083#
4084# Purpose:
4085#   Return the value of the first header matching the given lower-cased
4086#   header name.
4087#
4088# Entry criteria:
4089#   $name -- a lower-cased header name string.
4090#   $self->{_headers} populated by _split_message().
4091#
4092# Exit status:
4093#   Returns the value string, or undef if the header is not present.
4094
4095sub _header_value {
4096
3892
3997
        my ($self, $name) = @_;
4097
3892
3892
2279
2969
        for my $h (@{ $self->{_headers} }) {
4098
22126
18296
                return $h->{value} if $h->{name} eq lc($name);
4099        }
4100
1883
1755
        return undef;
4101}
4102
4103# _ip_in_cidr( $ip, $cidr ) -> bool
4104#
4105# Purpose:
4106#   Test whether an IPv4 address falls within a CIDR block or is an exact
4107#   match (when $cidr contains no '/' separator).
4108#
4109# Entry criteria:
4110#   $ip   -- a defined dotted-quad IPv4 address string.
4111#   $cidr -- a CIDR string like '10.0.0.0/8' or an exact IP.
4112#
4113# Exit status:
4114#   Returns 1 (true) if the IP is within the CIDR block, 0 otherwise.
4115
4116sub _ip_in_cidr {
4117
51
838
        my ($self, $ip, $cidr) = @_;
4118
51
105
        return $ip eq $cidr unless $cidr =~ m{/};
4119
45
61
        my ($net_addr, $prefix) = split m{/}, $cidr;
4120
45
186
        return 0 unless defined $prefix && $prefix =~ /^\d+$/ && $prefix <= 32;
4121
4122        # Compute the network mask and compare masked network addresses
4123
44
43
        my $mask  = ~0 << (32 - $prefix);
4124
44
157
        my $net_n = unpack 'N', (inet_aton($net_addr) // return 0);
4125
44
79
        my $ip_n  = unpack 'N', (inet_aton($ip)       // return 0);
4126
44
97
        return ($ip_n & $mask) == ($net_n & $mask);
4127}
4128
4129# _decode_mime_words( $str ) -> decoded_string
4130#
4131# Purpose:
4132#   Decode MIME encoded-words (=?charset?B/Q?...?=) in a header value
4133#   string for human-readable display in reports.
4134#
4135# Entry criteria:
4136#   $str -- a defined header value string; may be undef.
4137#
4138# Exit status:
4139#   Returns the decoded string, or '' if $str is undef.
4140
4141sub _decode_mime_words {
4142
473
2628
        my ($self, $str) = @_;
4143
473
413
        return '' unless defined $str;
4144        # Replace each encoded-word with its decoded equivalent
4145
471
33
428
37
        $str =~ s/=\?([^?]+)\?([BbQq])\?([^?]*)\?=/_decode_ew($1,$2,$3)/ge;
4146
471
385
        return $str;
4147}
4148
4149# _decode_ew( $charset, $enc, $text ) -> decoded_bytes
4150#
4151# Purpose:
4152#   Decode a single MIME encoded-word component (base64 or quoted-printable).
4153#
4154# Notes:
4155#   Non-UTF-8 charsets return raw bytes; good enough for display-name spoof
4156#   detection which only needs ASCII matching.
4157
4158sub _decode_ew {
4159
33
53
        my ($charset, $enc, $text) = @_;
4160
33
25
        my $raw;
4161
33
42
        if (uc($enc) eq 'B') {
4162
31
55
                $raw = decode_base64($text);
4163        } else {
4164                # Quoted-printable encoded-word uses underscore for space
4165
2
3
                $text =~ s/_/ /g;
4166
2
4
                $raw  = decode_qp($text);
4167        }
4168
33
66
        return $raw;
4169}
4170
4171# _parse_date_to_epoch( $str ) -> epoch_int | undef
4172#
4173# Purpose:
4174#   Parse common WHOIS date strings to a Unix epoch integer.
4175#   Handles YYYY-MM-DD, YYYY-MM-DDThh:mm:ssZ, and DD-Mon-YYYY formats.
4176#
4177# Entry criteria:
4178#   $str -- a defined date string; may be undef.
4179#
4180# Exit status:
4181#   Returns epoch integer on success, undef if the string cannot be parsed.
4182
4183sub _parse_date_to_epoch {
4184
61
5112
        my ($self, $str) = @_;
4185
61
86
        return undef unless $str;
4186
4187        # Clean the string of trailing whitespace/newlines
4188
60
132
        $str =~ s/^\s+|\s+$//g;
4189
4190        # Guard Regex: Validates the strict YYYY-MM-DDThh:mm:ssZ format
4191
60
67
        if ($str =~ /^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(?:\.\d+)?Z$/) {
4192                # Parse the string
4193                # We use 'strptime' to create a Time::Piece object.
4194                # The 'Z' indicates UTC (Zulu time).
4195
2
2
                my $epoch = eval {
4196
2
11
                        my $t = Time::Piece->strptime($1, '%Y-%m-%dT%H:%M:%S');
4197
4198                        # Return seconds since the epoch
4199                        # Time::Piece handles the timezone offset internally when calling ->epoch
4200
4201                        # strptime returns a local time object.
4202                        # We must subtract the local timezone offset to get the true UTC epoch.
4203
2
217
                        return $t->epoch - $t->tzoffset->seconds;
4204                };
4205
2
99
                return $epoch if defined $epoch;
4206        }
4207
58
42
        my ($y, $m, $d);
4208
4209
58
39
124
71
        if    ($str =~ /^(\d{4})-(\d{2})-(\d{2})/)         { ($y,$m,$d)=($1,$2,$3) }
4210
17
42
        elsif ($str =~ /^(\d{2})-([A-Za-z]{3})-(\d{4})/)   { ($d,$m,$y)=($1,$Readonly::Values::Months::months{lc$2}//0,$3) }
4211
1
2
        elsif ($str =~ /^(\d{2})\/(\d{2})\/(\d{4})/)        { ($m,$d,$y)=($1,$2,$3) }
4212
4213
58
185
        return undef unless $y && $m && $d;
4214
4215
56
56
56
42
103
61
        if (eval { require Time::Local; 1 }) {
4216
56
56
32
105
                return eval { Time::Local::timegm(0,0,0,$d,$m-1,$y-1900) };
4217        }
4218        # Approximate fallback without Time::Local
4219
0
0
        return ($y-1970)*365.25*$SECS_PER_DAY + ($m-1)*30.5*$SECS_PER_DAY + ($d-1)*$SECS_PER_DAY;
4220}
4221
4222# _parse_rfc2822_date( $str ) -> epoch_int | undef
4223#
4224# Purpose:
4225#   Parse an RFC 2822 Date: header value to a Unix epoch integer.
4226#   Timezone offsets are intentionally ignored; the function returns a
4227#   UTC-equivalent value.  For the 7-day suspicious_date window the
4228#   maximum error is ~14 hours, well within the tolerance.
4229#
4230# Entry criteria:
4231#   $str -- a defined Date: header value string.
4232#
4233# Exit status:
4234#   Returns epoch integer on success, undef if the string cannot be parsed.
4235
4236sub _parse_rfc2822_date {
4237
206
172
        my ($str) = @_;
4238
206
168
        return undef unless $str;
4239
4240        # Match: DD Mon YYYY HH:MM:SS (timezone offset ignored)
4241
206
475
        if ($str =~ /(\d{1,2})\s+([A-Za-z]{3})\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})/) {
4242                my ($d, $m, $y, $H, $M, $S) =
4243
206
555
                        ($1, $Readonly::Values::Months::months{ lc $2 } // 0, $3, $4, $5, $6);
4244
206
1038
                return undef unless $m;
4245
206
206
206
184
485
216
                if (eval { require Time::Local; 1 }) {
4246
206
206
138
372
                        return eval { Time::Local::timegm($S, $M, $H, $d, $m - 1, $y - 1900) };
4247                }
4248        }
4249
0
0
        return undef;
4250}
4251
4252# _country_name( $cc ) -> country_name_string
4253#
4254# Purpose:
4255#   Return a human-readable country name for a two-letter ISO 3166-1
4256#   alpha-2 country code.  Only the small set of statistically high-volume
4257#   spam-originating countries is covered; other codes are returned as-is.
4258#
4259# Entry criteria:
4260#   $cc -- a two-letter uppercase country code string.
4261#
4262# Exit status:
4263#   Returns the country name string, or the code itself if not in the table.
4264
4265sub _country_name {
4266
35
4711
        my ($cc) = @_;
4267
35
74
        my %names = (
4268                CN => 'China',       RU => 'Russia',    NG => 'Nigeria',
4269                VN => 'Vietnam',     IN => 'India',      PK => 'Pakistan',
4270                BD => 'Bangladesh',
4271        );
4272
35
95
        return $names{$cc} // $cc;
4273}
4274
4275# _debug( $msg )
4276#
4277# Purpose:
4278#   Write a diagnostic message to STDERR when verbose mode is enabled.
4279#
4280# Entry criteria:
4281#   $msg -- a defined message string.
4282#
4283# Notes:
4284#   Messages are prefixed with the class name for easy grepping.
4285
4286sub _debug {
4287
888
709
        my ($self, $msg) = @_;
4288
4289
888
1007
        if($self->{verbose}) {
4290
2
4
                if(my $logger = $self->{logger}) {   # May have been set in Object::Configure
4291
0
0
                        $logger->debug("[Email::Abuse::Investigator] $msg");
4292                } else {
4293
2
4
                        print STDERR "[Email::Abuse::Investigator] $msg\n";
4294                }
4295        }
4296}
4297
42981;
4299