File Coverage

File:	blib/lib/Email/Abuse/Investigator.pm
Coverage:	78.3%

line	stmt	bran	cond	sub	time	code
1						package Email::Abuse::Investigator;
2
3	15 15 15			15	750166 14 204	use strict;
4	15 15 15			15	28 7 303	use warnings;
5	15 15 15			15	2697 87209 27	use autodie qw(:all);
6	15 15 15			15	104579 56364 33	use Time::Piece;
7
8 - 17						=head1 NAME Email::Abuse::Investigator - Analyse spam email to identify originating hosts, hosted URLs, and suspicious domains =head1 VERSION Version 0.08 =cut
18
19						our $VERSION = '0.08';
20
21 - 102						=head1 SYNOPSIS use Email::Abuse::Investigator; my $analyser = Email::Abuse::Investigator->new( verbose => 1 ); $analyser->parse_email($raw_email_text); # Originating IP and its network owner my $origin = $analyser->originating_ip(); # All HTTP/HTTPS URLs found in the body my @urls = $analyser->embedded_urls(); # All domains extracted from mailto: links and bare addresses in the body my @mdoms = $analyser->mailto_domains(); # All domains mentioned anywhere (union of the above) my @adoms = $analyser->all_domains(); # Full printable report print $analyser->report(); =head1 DESCRIPTION C<Email::Abuse::Investigator> examines the raw source of a spam/phishing e-mail and answers the questions manual abuse investigators ask: =over 4 =item 1. Where did the message really come from? Walks the C<Received:> chain, skips private/trusted IPs, and identifies the first external hop. Enriches with rDNS, WHOIS/RDAP org name and abuse contact. Both IPv4 and IPv6 addresses are supported. =item 2. Who hosts the advertised web sites? Extracts every C<http://> and C<https://> URL from both plain-text and HTML parts, resolves each hostname to an IP, and looks up the network owner. =item 3. Who owns the reply-to / contact domains? Extracts domains from C<mailto:> links, bare e-mail addresses in the body, the C<From:>/C<Reply-To:>/C<Sender:>/C<Return-Path:> headers, C<DKIM-Signature: d=> (the signing domain), C<List-Unsubscribe:> (the ESP or bulk-sender domain), and the C<Message-ID:> domain. For each unique domain it gathers: =over 8 =item * Domain registrar and registrant (WHOIS) =item * Web-hosting IP and network owner (A record -> RDAP) =item * Mail-hosting IP and network owner (MX record -> RDAP) =item * DNS nameserver operator (NS record -> RDAP) =item * Whether the domain was recently registered (potential flag) =back =back =head1 REQUIRED MODULES The following modules are mandatory: Readonly::Values::Months Socket (core since Perl 5) IO::Socket::INET (core since Perl 5) MIME::QuotedPrint (core since Perl 5.8) MIME::Base64 (core since Perl 5.8) The following are optional but strongly recommended: Net::DNS -- enables MX, NS, AAAA record lookups LWP::UserAgent -- enables RDAP (faster and richer than raw WHOIS) HTML::LinkExtor -- enables structural HTML link extraction CHI -- enables cross-message IP/domain result caching IO::Socket::IP -- enables IPv6 WHOIS connections =cut
103
104	15 15 15			15	769 16 318	use Carp qw(croak carp);
105	15 15 15			15	2894 9702 331	use IO::Select;
106	15 15 15			15	1674 71289 61	use IO::Socket::INET;
107	15 15 15			15	5274 2565 382	use MIME::QuotedPrint qw( decode_qp );
108	15 15 15			15	34 11 215	use MIME::Base64 qw( decode_base64 );
109	15 15 15			15	3101 637275 240	use Object::Configure;
110	15 15 15			15	48 30 223	use Params::Get;
111	15 15 15			15	27 18 167	use Params::Validate::Strict;
112	15 15 15			15	25 12 225	use Readonly;
113	15 15 15			15	2646 8585 1072	use Readonly::Values::Months;
114	15 15 15			15	48 15 1519	use Socket qw( inet_aton inet_ntoa AF_INET );
115
116						# -----------------------------------------------------------------------
117						# Optional modules -- gracefully degraded when absent
118						# -----------------------------------------------------------------------
119
120						# Net::DNS enables MX, NS, AAAA lookups; falls back to gethostbyname
121						my $HAS_NET_DNS;
122
123						# LWP::UserAgent enables RDAP queries; falls back to raw WHOIS
124						my $HAS_LWP;
125						my $HAS_CONN_CACHE;
126
127						# HTML::LinkExtor enables structural HTML link extraction
128						my $HAS_HTML_LINKEXTOR;
129
130						# CHI enables a persistent cross-message cache for IP/domain data
131						my $HAS_CHI;
132
133						# IO::Socket::IP provides dual-stack (IPv4+IPv6) socket support
134						my $HAS_IO_SOCKET_IP;
135
136						# Domain::PublicSuffix enables accurate eTLD+1 normalisation
137						my $HAS_PUBLIC_SUFFIX;
138
139						# AnyEvent::DNS enables parallel DNS queries
140						my $HAS_ANYEVENT_DNS;
141
142						BEGIN {
143	15 15 0			15	20 821 0	$HAS_NET_DNS = eval { require Net::DNS; 1 };
144	15 15 0				3712 766 0	$HAS_LWP = eval { require LWP::UserAgent; 1 };
145	15 15 0				2925 678 0	$HAS_CONN_CACHE = eval { require LWP::ConnCache; 1 };
146	15 15 15				2760 2911 52565	$HAS_HTML_LINKEXTOR= eval { require HTML::LinkExtor; 1 };
147	15 15 0				16 891 0	$HAS_CHI = eval { require CHI; 1 };
148	15 15 15				3201 2218 25591	$HAS_IO_SOCKET_IP = eval { require IO::Socket::IP; 1 };
149	15 15 0				15 786 0	$HAS_PUBLIC_SUFFIX = eval { require Domain::PublicSuffix; 1 };
150	15 15 0				2971 20369 0	$HAS_ANYEVENT_DNS = eval { require AnyEvent::DNS; 1 };
151						}
152
153						# -----------------------------------------------------------------------
154						# Constants -- all magic numbers and strings live here
155						# -----------------------------------------------------------------------
156
157						# WHOIS protocol port (IANA-assigned)
158						Readonly::Scalar my $WHOIS_PORT => 43;
159
160						# Bytes to read per sysread() call from a WHOIS socket
161						Readonly::Scalar my $WHOIS_READ_CHUNK => 4096;
162
163						# Maximum WHOIS response bytes stored in whois_raw (keep reports compact)
164						Readonly::Scalar my $WHOIS_RAW_MAX => 2048;
165
166						# Maximum multipart nesting depth (recursion guard -- RFC 2046 has no limit
167						# but we cap it to prevent stack exhaustion on crafted messages)
168						Readonly::Scalar my $MAX_MULTIPART_DEPTH => 20;
169
170						# Number of days before registration that triggers recently_registered flag
171						Readonly::Scalar my $RECENT_REG_DAYS => 180;
172
173						# Number of days ahead of expiry that triggers domain_expires_soon flag
174						Readonly::Scalar my $EXPIRY_WARN_DAYS => 30;
175
176						# Seconds in a day -- used in date arithmetic throughout
177						Readonly::Scalar my $SECS_PER_DAY => 86400;
178
179						# Suspicious date window: dates outside +/- 7 days raise a flag
180						Readonly::Scalar my $DATE_SKEW_DAYS => 7;
181
182						# Maximum positive timezone offset in minutes (+14:00 = Line Islands)
183						Readonly::Scalar my $TZ_MAX_POS_MINS => 840;
184
185						# Maximum negative timezone offset in minutes (-12:00 = Baker Island)
186						Readonly::Scalar my $TZ_MAX_NEG_MINS => 720;
187
188						# High-risk score threshold
189						Readonly::Scalar my $SCORE_HIGH => 9;
190
191						# Medium-risk score threshold
192						Readonly::Scalar my $SCORE_MEDIUM => 5;
193
194						# Low-risk score threshold
195						Readonly::Scalar my $SCORE_LOW => 2;
196
197						# Flag severity weights (contribute to the numeric risk score)
198						Readonly::Hash my %FLAG_WEIGHT => (
199						HIGH => 3,
200						MEDIUM => 2,
201						LOW => 1,
202						INFO => 0,
203						);
204
205						# Maximum merged-role display string length before summarisation kicks in
206						Readonly::Scalar my $ROLE_MAX_LEN => 80;
207
208						# CHI cache TTL in seconds (1 hour -- IP allocations change slowly)
209						Readonly::Scalar my $CACHE_TTL_SECS => 3600;
210
211						# Default constructor timeout for network operations (seconds)
212						Readonly::Scalar my $DEFAULT_TIMEOUT => 10;
213
214						# Maximum role string length before truncation
215						Readonly::Scalar my $ROLE_WRAP_LEN => 66;
216
217						# -----------------------------------------------------------------------
218						# Private ranges -- IPs that are never actionable abuse targets
219						# -----------------------------------------------------------------------
220
221						# Both IPv4 and IPv6 private/reserved ranges. Each entry is a compiled
222						# regex; _is_private() iterates over them and returns true on first match.
223						my @PRIVATE_RANGES = (
224						# IPv4 ranges
225						qr/^0\./, # 0.0.0.0/8 this-network (RFC 1122)
226						qr/^127\./, # 127.0.0.0/8 loopback
227						qr/^10\./, # 10.0.0.0/8 RFC 1918
228						qr/^192\.168\./, # 192.168.0.0/16 RFC 1918
229						qr/^172\.(?:1[6-9]\|2\d\|3[01])\./, # 172.16.0.0/12 RFC 1918
230						qr/^169\.254\./, # 169.254.0.0/16 link-local
231						qr/^100\.(?:6[4-9]\|[7-9]\d\|1(?:[01]\d\|2[0-7]))\./, # 100.64.0.0/10 CGN (RFC 6598)
232						qr/^192\.0\.0\./, # 192.0.0.0/24 IETF protocol (RFC 6890)
233						qr/^192\.0\.2\./, # 192.0.2.0/24 TEST-NET-1 (RFC 5737)
234						qr/^198\.51\.100\./, # 198.51.100.0/24 TEST-NET-2 (RFC 5737)
235						qr/^203\.0\.113\./, # 203.0.113.0/24 TEST-NET-3 (RFC 5737)
236						qr/^255\./, # 255.0.0.0/8 broadcast
237						# IPv6 ranges
238						qr/^::1$/, # IPv6 loopback
239						qr/^fe80:/i, # IPv6 link-local (fe80::/10)
240						qr/^fc/i, # IPv6 ULA fc00::/7
241						qr/^fd/i, # IPv6 ULA fd00::/8
242						qr/^2001:db8:/i, # IPv6 documentation range (RFC 3849)
243						qr/^64:ff9b:/i, # IPv6 NAT64 well-known prefix
244						);
245
246						# Priority-ordered patterns for extracting IPs from Received: headers.
247						# Covers bracketed IPv4, bracketed IPv6, parenthesised address, and bare dotted-quad.
248						my @RECEIVED_IP_RE = (
249						qr/\[\s([\d.]+)\s\]/, # [1.2.3.4]
250						qr/\[\s([0-9a-fA-F:]+)\s\]/, # [IPv6 address]
251						qr/$\s[\w.-]\s\[?\s([\d.]+)\s\]?\s$/, # (hostname [1.2.3.4])
252						qr/from\s+[\w.-]+\s+([\d.]+)/, # from hostname addr
253						qr/([\d]{1,3}\.[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3})/, # bare dotted-quad fallback
254						);
255
256						# -----------------------------------------------------------------------
257						# Default configuration -- overridable via Object::Configure
258						# -----------------------------------------------------------------------
259
260						# Object::Configure may overlay
261						# values from a file before new() uses them. Use Readonly for constants
262						# that should never be overridden at runtime.
263
264						# -----------------------------------------------------------------------
265						# Trusted domains (infrastructure -- never report these as abuse targets)
266						# Can be overrideen at runtime by Object::Configure
267						# -----------------------------------------------------------------------
268
269						my %TRUSTED_DOMAINS = map { $_ => 1 } qw(
270						gmail.com googlemail.com yahoo.com outlook.com hotmail.com
271						google.com microsoft.com apple.com amazon.com
272						googlegroups.com groups.google.com
273						w3.org
274						fedex.com ups.com dhl.com usps.com royalmail.com
275						);
276
277						# -----------------------------------------------------------------------
278						# URL shortener domains (real destination is hidden behind these)
279						# -----------------------------------------------------------------------
280
281						my %URL_SHORTENERS = map { $_ => 1 } qw(
282						bit.ly bitly.com tinyurl.com t.co ow.ly
283						goo.gl is.gd buff.ly ift.tt dlvr.it
284						short.link rebrand.ly tiny.cc cutt.ly rb.gy
285						shorturl.at bl.ink smarturl.it yourls.org clicky.me
286						snip.ly adf.ly bc.vc lnkd.in fb.me
287						youtu.be
288						);
289
290						# -----------------------------------------------------------------------
291						# Well-known provider abuse contacts
292						# Can be overrideen at runtime by Object::Configure
293						# -----------------------------------------------------------------------
294
295						# Curated table of provider abuse contacts. Entries with only a 'form'
296						# key (no 'email') require web-form submission; abuse_contacts() suppresses
297						# email addresses for those providers and form_contacts() surfaces them.
298						my %PROVIDER_ABUSE = (
299						# Google / Gmail
300						'google.com' => { email => 'abuse@google.com', note => 'Also report Gmail accounts via https://support.google.com/mail/contact/abuse' },
301						'gmail.com' => { email => 'abuse@google.com', note => 'Report Gmail spam via https://support.google.com/mail/contact/abuse' },
302						'googlemail.com' => { email => 'abuse@google.com', note => 'Report via https://support.google.com/mail/contact/abuse' },
303						'1e100.net' => { email => 'abuse@google.com', note => 'Google infrastructure' },
304						'blogspot.com' => { email => 'abuse@google.com', note => 'Blogger/Blogspot -- report via https://support.google.com/blogger/answer/76315' },
305						'blogger.com' => { email => 'abuse@google.com', note => 'Blogger platform abuse' },
306						'sites.google.com' => { email => 'abuse@google.com', note => 'Google Sites hosted content' },
307						# Microsoft
308						'microsoft.com' => { email => 'abuse@microsoft.com', note => 'Also report via https://www.microsoft.com/en-us/wdsi/support/report-unsafe-site' },
309						'outlook.com' => { email => 'abuse@microsoft.com', note => 'Report Outlook spam: https://support.microsoft.com/en-us/office/report-phishing' },
310						'hotmail.com' => { email => 'abuse@microsoft.com', note => 'Report via https://support.microsoft.com/en-us/office/report-phishing' },
311						'live.com' => { email => 'abuse@microsoft.com', note => 'Microsoft consumer mail' },
312						'office365.com' => { email => 'abuse@microsoft.com', note => 'Microsoft 365 infrastructure' },
313						'protection.outlook.com' => { email => 'abuse@microsoft.com', note => 'Microsoft EOP gateway' },
314						# Yahoo
315						'yahoo.com' => { email => 'abuse@yahoo-inc.com', note => 'Also use https://io.help.yahoo.com/contact/index' },
316						'yahoo.co.uk' => { email => 'abuse@yahoo-inc.com', note => 'Yahoo UK' },
317						# Apple
318						'apple.com' => { email => 'reportphishing@apple.com', note => 'iCloud / Apple Mail abuse' },
319						'icloud.com' => { email => 'reportphishing@apple.com', note => 'iCloud abuse' },
320						'me.com' => { email => 'reportphishing@apple.com', note => 'Apple legacy mail' },
321						# Amazon / AWS
322						'amazon.com' => { email => 'abuse@amazonaws.com', note => 'Also https://aws.amazon.com/forms/report-abuse' },
323						'amazonaws.com' => { email => 'abuse@amazonaws.com', note => 'AWS abuse form: https://aws.amazon.com/forms/report-abuse' },
324						'amazonses.com' => { email => 'abuse@amazonaws.com', note => 'Amazon SES sending infrastructure' },
325						# Cloudflare
326						'cloudflare.com' => { email => 'abuse@cloudflare.com', note => 'Report via https://www.cloudflare.com/abuse/' },
327						# Fastly / Akamai
328						'fastly.net' => { email => 'abuse@fastly.com', note => 'Fastly CDN' },
329						'akamai.com' => { email => 'abuse@akamai.com', note => 'Akamai CDN' },
330						'akamaitechnologies.com' => { email => 'abuse@akamai.com', note => 'Akamai CDN' },
331						# Namecheap
332						'namecheap.com' => { email => 'abuse@namecheap.com', note => 'Registrar abuse' },
333						# GoDaddy -- web form only; email bounces
334						'godaddy.com' => {
335						form => 'https://supportcenter.godaddy.com/AbuseReport',
336						form_paste => 'Select the abuse type (spam, phishing, malware etc). '
337						. 'Enter the domain name in the Domain field. '
338						. 'Paste the originating IP, risk flags, and the relevant '
339						. 'Received: headers from the report below.',
340						form_upload => 'Take a screenshot of the report as a .png or .jpg, '
341						. 'or export it as a .pdf.',
342						note => 'Registrar/host -- email reports not monitored, use web form',
343						},
344						# SendGrid / Twilio
345						'sendgrid.net' => { email => 'abuse@sendgrid.com', note => 'ESP -- include full headers' },
346						'sendgrid.com' => { email => 'abuse@sendgrid.com', note => 'ESP -- include full headers' },
347						# Mailchimp / Mandrill
348						'mailchimp.com' => { email => 'abuse@mailchimp.com', note => 'ESP abuse' },
349						'mandrillapp.com' => { email => 'abuse@mailchimp.com', note => 'Mandrill transactional ESP' },
350						# OVH
351						'ovh.net' => { email => 'abuse@ovh.net', note => 'OVH hosting' },
352						'ovh.com' => { email => 'abuse@ovh.com', note => 'OVH hosting' },
353						# Hetzner
354						'hetzner.com' => { email => 'abuse@hetzner.com', note => 'Hetzner hosting' },
355						# Digital Ocean
356						'digitalocean.com' => { email => 'abuse@digitalocean.com',note => 'DO abuse form: https://www.digitalocean.com/company/contact/#abuse' },
357						# Linode / Akamai
358						'linode.com' => { email => 'abuse@linode.com', note => 'Linode/Akamai Cloud' },
359						# Constant Contact
360						'constantcontact.com' => { email => 'abuse@constantcontact.com', note => 'ESP abuse' },
361						'r.constantcontact.com' => { email => 'abuse@constantcontact.com', note => 'Constant Contact sending infrastructure' },
362						# HubSpot
363						'hubspot.com' => { email => 'abuse@hubspot.com', note => 'ESP abuse' },
364						'hs-analytics.net' => { email => 'abuse@hubspot.com', note => 'HubSpot analytics infrastructure' },
365						# Campaign Monitor
366						'createsend.com' => { email => 'abuse@campaignmonitor.com', note => 'Campaign Monitor ESP' },
367						'cmail20.com' => { email => 'abuse@campaignmonitor.com', note => 'Campaign Monitor sending infrastructure' },
368						# Klaviyo
369						'klaviyo.com' => { email => 'abuse@klaviyo.com', note => 'ESP abuse' },
370						# Brevo (formerly Sendinblue)
371						'sendinblue.com' => { email => 'abuse@sendinblue.com', note => 'ESP abuse' },
372						'brevo.com' => { email => 'abuse@brevo.com', note => 'ESP abuse' },
373						# Mailgun
374						'mailgun.com' => { email => 'abuse@mailgun.com', note => 'ESP abuse' },
375						'mailgun.org' => { email => 'abuse@mailgun.com', note => 'Mailgun sending infrastructure' },
376						# Postmark
377						'postmarkapp.com' => { email => 'abuse@postmarkapp.com', note => 'ESP abuse' },
378						# WordPress.com
379						'wordpress.com' => { email => 'abuse@wordpress.com', note => 'WordPress.com hosted blog -- report via https://en.wordpress.com/abuse/' },
380						'wp.com' => { email => 'abuse@wordpress.com', note => 'WordPress.com short domain' },
381						# Substack
382						'substack.com' => { email => 'abuse@substack.com', note => 'Substack newsletter platform abuse' },
383						# ActiveCampaign
384						'activecampaign.com' => { email => 'abuse@activecampaign.com', note => 'ActiveCampaign ESP' },
385						'ac-tinker.com' => { email => 'abuse@activecampaign.com', note => 'ActiveCampaign tracking infrastructure' },
386						# Salesforce Marketing Cloud
387						'salesforce.com' => { email => 'abuse@salesforce.com', note => 'Salesforce Marketing Cloud / ExactTarget ESP' },
388						'mc.salesforce.com' => { email => 'abuse@salesforce.com', note => 'Salesforce Marketing Cloud sending infrastructure' },
389						'exacttarget.com' => { email => 'abuse@salesforce.com', note => 'ExactTarget / Salesforce Marketing Cloud ESP' },
390						'et.exacttarget.com' => { email => 'abuse@salesforce.com', note => 'ExactTarget sending infrastructure' },
391						# Vultr
392						'vultr.com' => { email => 'abuse@vultr.com', note => 'Vultr hosting' },
393						# Contabo
394						'contabo.com' => { email => 'abuse@contabo.com', note => 'Contabo hosting' },
395						# Leaseweb
396						'leaseweb.com' => { email => 'abuse@leaseweb.com', note => 'Leaseweb hosting' },
397						# M247
398						'm247.com' => { email => 'abuse@m247.com', note => 'M247 hosting' },
399						# MarkMonitor -- web form only
400						'markmonitor.com' => {
401						form => 'https://corp.markmonitor.com/domain/ui/abuse-report',
402						form_paste => 'Complete all fields including the domain name and your '
403						. 'description of the abuse. Paste the originating IP, '
404						. 'risk flags, and the relevant Received: headers from the '
405						. 'report below.',
406						form_upload => 'Take a screenshot of the report as a .png or .jpg, or export it as a .pdf. MarkMonitor does not accept .eml files.',
407						note => 'Brand-protection registrar -- email reports not processed',
408						},
409						# URL shortener operators
410						'is.gd' => { email => 'abuse@is.gd', note => 'URL shortener -- report via https://is.gd/contact.php' },
411						'bitly.com' => { email => 'abuse@bitly.com', note => 'URL shortener abuse' },
412						'bit.ly' => { email => 'abuse@bitly.com', note => 'URL shortener abuse' },
413						'tinyurl.com' => { email => 'abuse@tinyurl.com', note => 'URL shortener abuse' },
414						'ow.ly' => { email => 'abuse@hootsuite.com', note => 'Hootsuite URL shortener' },
415						'buff.ly' => { email => 'abuse@buffer.com', note => 'Buffer URL shortener' },
416						'rb.gy' => { email => 'abuse@rb.gy', note => 'URL shortener abuse' },
417						'cutt.ly' => { email => 'abuse@cutt.ly', note => 'URL shortener abuse' },
418						'shorturl.at' => { email => 'abuse@shorturl.at', note => 'URL shortener abuse' },
419						# Dynadot -- web form only
420						'dynadot.com' => {
421						form => 'https://www.dynadot.com/report-abuse',
422						form_paste => 'Complete all fields including the domain name and your '
423						. 'description of the abuse. Paste the originating IP, '
424						. 'risk flags, and the relevant Received: headers from the '
425						. 'report below.',
426						form_upload => 'Take a screenshot of the report as a .png or .jpg, '
427						. 'or export it as a .pdf.',
428						note => 'Registrar -- email reports not monitored, use web form',
429						},
430						# Global Domain Group -- web form only
431						'globaldomaingroup.com' => {
432						form => 'https://globaldomaingroup.com/report-abuse',
433						form_paste => 'Complete all fields including the domain name and your '
434						. 'description of the abuse. Paste the originating IP, '
435						. 'risk flags, and the relevant Received: headers from the '
436						. 'report below.',
437						form_upload => 'Attach the original spam message as an .eml file.',
438						note => 'Registrar -- email reports explicitly not accepted',
439						},
440						# TPG / Internode (Australia)
441						'tpgi.com.au' => { email => 'abuse@tpg.com.au', note => 'TPG Telecom Australia' },
442						'tpg.com.au' => { email => 'abuse@tpg.com.au', note => 'TPG Telecom Australia' },
443						'internode.on.net' => { email => 'abuse@internode.on.net',note => 'Internode Australia' },
444						);
445
446						# -----------------------------------------------------------------------
447						# Constructor
448						# -----------------------------------------------------------------------
449
450 - 585						=head1 METHODS =head2 new( %options ) Constructs and returns a new C<Email::Abuse::Investigator> analyser object. The object is stateless until C<parse_email()> is called; all analysis results are stored on the object and retrieved via the public accessor methods documented below. A single object may be reused for multiple emails by calling C<parse_email()> again: all per-message cached state from the previous message is discarded automatically. Cross-message IP and domain lookup results are retained in a shared CHI cache (if C<CHI> is installed) to avoid redundant network queries across messages processed in the same process. =head3 Usage # Minimal -- all options take safe defaults my $analyser = Email::Abuse::Investigator->new(); # With options my $analyser = Email::Abuse::Investigator->new( timeout => 15, trusted_relays => ['203.0.113.0/24', '10.0.0.0/8'], verbose => 0, ); $analyser->parse_email($raw_rfc2822_text); my $origin = $analyser->originating_ip(); my @urls = $analyser->embedded_urls(); my @domains = $analyser->mailto_domains(); my $risk = $analyser->risk_assessment(); my @contacts = $analyser->abuse_contacts(); print $analyser->report(); =head3 Arguments All arguments are optional named parameters passed as a flat key-value list. =over 4 =item C<timeout> (integer, default 10) Maximum seconds to wait for any single network operation. Set to 0 to disable timeouts (not recommended for production use). =item C<trusted_relays> (arrayref of strings, default []) IP addresses or CIDR blocks to skip during Received: chain analysis. Each element may be an exact IPv4 address (C<'192.0.2.1'>) or a CIDR block (C<'192.0.2.0/24'>). =item C<verbose> (boolean, default 0) When true, diagnostic messages are written to STDERR. =back =head3 Returns A blessed C<Email::Abuse::Investigator> object. No network I/O is performed during construction. =head3 Side Effects If C<CHI> is installed, a shared in-memory cache is initialised (or re-used if a cache was already created by a prior call to C<new()>). This cache persists for the lifetime of the process. =head3 Notes =over 4 =item * Unknown option keys are silently ignored. =item * The object is not thread-safe. Use a separate object per thread. =item * WHOIS read timeouts use C<IO::Select> rather than C<alarm()>, so they work correctly on Windows and in threaded Perl interpreters. =back =head3 API Specification =head4 Input { timeout => { type => 'integer', optional => 1, min => 0, default => 10, }, trusted_relays => { type => 'arrayref', element_type => 'string', optional => 1, default => [], }, verbose => { type => 'boolean', optional => 1, default => 0, }, } =head4 Output { type => 'Email::Abuse::Investigator', isa => 'Email::Abuse::Investigator', } =head3 FORMAL SPECIFICATION -- Z notation (simplified) new == [ timeout : N; trusted_relays : seq STRING; verbose : BOOL; _raw : STRING; _headers : seq (STRING x STRING); _origin? : IP_INFO \| undefined; _urls? : seq URL_INFO \| undefined; _risk? : RISK_INFO \| undefined ] pre: timeout >= 0 post: self.timeout = params.timeout /\ self._raw = '' =cut
586
587						# Class-level cross-message CHI cache (shared across all instances).
588						# Populated lazily on first call to new() when CHI is available.
589						my $_cache;
590
591						sub new {
592	2608			2608	747139	my $class = shift;
593
594						# Accept hash or hashref arguments uniformly
595	2608		100		4145	my $params = Params::Validate::Strict::validate_strict({
596						args => Params::Get::get_params(undef, \@_) \|\| {},
597						schema => {
598						timeout => {
599						'type' => 'integer',
600						'optional' => 1,
601						'min' => 0,
602						},
603						trusted_relays => {
604						'type' => 'arrayref',
605						'element_type' => 'string',
606						'optional' => 1,
607						},
608						verbose => {
609						'type' => 'boolean',
610						'optional' => 1,
611						},
612						},
613						});
614
615						# Merge in any file-based configuration via Object::Configure
616	1630				161484	$params = Object::Configure::configure($class, $params);
617
618						# Initialise the cross-message CHI cache on first construction
619	1630	50	33		2716946	if ($HAS_CHI && !$_cache) {
620	0				0	$_cache = CHI->new(
621						driver => 'Memory',
622						global => 1,
623						expires_in => $CACHE_TTL_SECS,
624						);
625						}
626
627						# Build and bless the object with default slot values
628						return bless {
629						timeout => $DEFAULT_TIMEOUT,
630						trusted_relays => [],
631						verbose => 0,
632						_raw => '',
633						_headers => [],
634						_body_plain => '',
635						_body_html => '',
636						_received => [],
637						_origin => undef,
638						_urls => undef, # lazy-computed by embedded_urls()
639						_mailto_domains=> undef, # lazy-computed by mailto_domains()
640						_domain_info => {}, # per-message domain analysis cache
641						_sending_sw => [], # X-Mailer / X-PHP-Originating-Script etc.
642						_rcvd_tracking => [], # per-hop tracking IDs from Received: headers
643	1630 1630				2114 8196	%{$params}, # Override the defaults with Object:Configure and the values passed in
644						}, $class;
645						}
646
647						# -----------------------------------------------------------------------
648						# Public: parse
649						# -----------------------------------------------------------------------
650
651 - 740						=head2 parse_email( $text ) Feeds a raw RFC 2822 email message to the analyser and prepares it for subsequent interrogation. This is the only method that must be called before any other public method. If the same object is used for a second message, calling C<parse_email()> again completely replaces all per-message state from the first message. The cross-message CHI cache is B<not> flushed; IP and domain lookups cached from prior messages are retained. =head3 Usage my $raw = do { local $/; <STDIN> }; $analyser->parse_email($raw); # Scalar reference (avoids copying large messages) $analyser->parse_email(\$raw); # Chained my $analyser = Email::Abuse::Investigator->new()->parse_email($raw); =head3 Arguments =over 4 =item C<$text> (scalar or scalar reference, required) Complete raw RFC 2822 email message, including all headers and the body. Both LF-only and CRLF line endings are accepted. =back =head3 Returns The object itself (C<$self>), enabling method chaining. =head3 Side Effects Parses headers, decodes the body (quoted-printable, base64, multipart), extracts sending-software fingerprints, and populates per-hop tracking data. All previously computed lazy results are discarded. =head3 Notes =over 4 =item * If C<$text> is empty or contains no header/body separator, all public methods will return empty/safe values. =item * Decoding errors in base64 or quoted-printable payloads are silenced; raw bytes are used in place of correct output to prevent exceptions. =back =head3 API Specification =head4 Input [ { type => 'scalar \| scalarref', }, ] =head4 Output { type => 'Email::Abuse::Investigator', isa => 'Email::Abuse::Investigator', } =head3 FORMAL SPECIFICATION -- Z notation parse_email == [ Delta Email::Abuse::Investigator; text? : STRING \| ref STRING ] pre: defined text? post: self._raw = deref(text?) /\ self._origin = undefined /\ self._urls = undefined /\ self._risk = undefined =cut
741
742						# TODO: Allow a Mail::Message object to be passed in
743						sub parse_email {
744	485			485	55921	my $self = shift;
745
746						# Accept both positional string and named 'text' argument
747	485				691	my $args = Params::Get::get_params('text', \@_);
748	485				4791	my $text = $args->{text};
749
750						# Dereference if a scalar reference was supplied
751	485	50			604	$text = $$text if ref $text eq 'SCALAR';
752
753						# Sanitise: strip control characters that could affect terminal output.
754						# Keep \t (tabs in headers), \n (line endings), \r (CRLF mail format).
755	485	50			2216	$text =~ s/[^\x09\x0A\x0D\x20-\x7E\x80-\xFF]//g if defined $text;
756
757						# Store the sanitised raw text for later reproduction in reports
758	485		50		1082	$self->{_raw} = $text // '';
759
760						# Invalidate all per-message lazy caches
761	485				421	$self->{_origin} = undef;
762	485				382	$self->{_urls} = undef;
763	485				349	$self->{_mailto_domains} = undef;
764	485				484	$self->{_domain_info} = {};
765	485				423	$self->{_risk} = undef;
766	485				376	$self->{_auth_results} = undef;
767	485				414	$self->{_sending_sw} = [];
768	485				385	$self->{_rcvd_tracking} = [];
769
770						# Perform synchronous header/body parsing (no network I/O)
771	485	100	67		1686	$self->_split_message($text) if defined $text && $text =~ /\S/;
772	485				799	return $self;
773						}
774
775						# -----------------------------------------------------------------------
776						# Public: originating host
777						# -----------------------------------------------------------------------
778
779 - 851						=head2 originating_ip() Identifies the IP address of the machine that originally injected the message into the mail system by walking the C<Received:> chain, skipping private/trusted hops, and enriching the first external hop with rDNS, WHOIS/RDAP organisation name, abuse contact, and country code. Both IPv4 and IPv6 addresses are extracted and evaluated. The result is cached; subsequent calls return the same hashref without repeating network I/O. =head3 Usage my $orig = $analyser->originating_ip(); if (defined $orig) { printf "Origin: %s (%s)\n", $orig->{ip}, $orig->{rdns}; printf "Owner: %s\n", $orig->{org}; } =head3 Arguments None. C<parse_email()> must have been called first. =head3 Returns A hashref with keys C<ip>, C<rdns>, C<org>, C<abuse>, C<confidence>, C<note>, and C<country> (may be undef). Returns C<undef> if no suitable originating IP can be determined. =head3 Side Effects On first call: one PTR lookup and one RDAP/WHOIS query. Results are cached in the object and in the cross-message CHI cache (if available). =head3 Notes Only the first (oldest) external IP in the chain is reported. See C<received_trail()> for the full chain. =head3 API Specification =head4 Input [] =head4 Output { type => 'hashref \| undef', keys => { ip => { type => 'scalar', regex => qr/[\d.:a-fA-F]/ }, rdns => { type => 'scalar' }, org => { type => 'scalar' }, abuse => { type => 'scalar' }, confidence => { type => 'scalar', regex => qr/^(?:high\|medium\|low)$/ }, note => { type => 'scalar' }, country => { type => 'scalar', optional => 1 }, }, } =head3 FORMAL SPECIFICATION -- Z notation originating_ip == [ Xi Email::Abuse::Investigator; result! : IP_INFO \| undefined ] pre: self._raw /= '' post: result! = self._origin /\ (result! /= undefined => result!.ip in EXTERNAL_IPS) =cut
852
853						sub originating_ip {
854	591			591	5397	my $self = $_[0];
855
856						# Return the cached result if we already have it
857	591		100		810	$self->{_origin} //= $self->_find_origin();
858	591				563	return $self->{_origin};
859						}
860
861						# -----------------------------------------------------------------------
862						# Public: HTTP/HTTPS URLs
863						# -----------------------------------------------------------------------
864
865 - 940						=head2 embedded_urls() Extracts every HTTP and HTTPS URL from the message body and enriches each one with the hosting IP address, network organisation name, abuse contact, and country code. Both IPv4 and IPv6 host addresses are supported. URL extraction runs across both plain-text and HTML body parts. DNS lookups for each unique hostname are optionally parallelised via C<AnyEvent::DNS> if that module is installed. The result is cached; subsequent calls return the same list without repeating network I/O. =head3 Usage my @urls = $analyser->embedded_urls(); for my $u (@urls) { printf "URL: %s host: %s org: %s\n", $u->{url}, $u->{host}, $u->{org}; } =head3 Arguments None. C<parse_email()> must have been called first. =head3 Returns A list of hashrefs, one per unique URL, in first-seen order. Returns an empty list if no HTTP/HTTPS URLs are present. Each hashref has keys C<url>, C<host>, C<ip>, C<org>, C<abuse>, C<country>. =head3 Side Effects Per unique hostname: one A/AAAA lookup and one RDAP/WHOIS query. Results are cached in the object and in the cross-message CHI cache. =head3 Notes Only C<http://> and C<https://> URLs are extracted. URL shortener hosts are included in the returned list (they are flagged by C<risk_assessment()>). =head3 API Specification =head4 Input [] =head4 Output ( { type => 'hashref', keys => { url => { type => 'scalar', regex => qr{^https?://}i }, host => { type => 'scalar' }, ip => { type => 'scalar' }, org => { type => 'scalar' }, abuse => { type => 'scalar' }, country => { type => 'scalar', optional => 1 }, }, }, ... ) =head3 FORMAL SPECIFICATION -- Z notation embedded_urls == [ Xi Email::Abuse::Investigator; result! : seq URL_INFO ] pre: self._raw /= '' post: result! = self._urls /\ forall u : result! @ u.url =~ m{^https?://}i =cut
941
942						sub embedded_urls {
943	604			604	7101	my $self = $_[0];
944
945	604		67		741	$self->{_urls} //= $self->_extract_and_resolve_urls();
946	604 604				404 708	return @{ $self->{_urls} };
947						}
948
949						# -----------------------------------------------------------------------
950						# Public: mailto / reply-to / from domains
951						# -----------------------------------------------------------------------
952
953 - 1022						=head2 mailto_domains() Identifies every domain associated with the message as a contact, reply, or delivery address, then runs a full intelligence pipeline on each one (A record, MX, NS, WHOIS) to determine hosting and registration details. The result is cached; subsequent calls return the same list without repeating network I/O. =head3 Usage my @domains = $analyser->mailto_domains(); for my $d (@domains) { printf "Domain: %s registrar: %s\n", $d->{domain}, $d->{registrar} // 'unknown'; } =head3 Arguments None. C<parse_email()> must have been called first. =head3 Returns A list of hashrefs, one per unique domain. See the main POD for the full set of possible keys. Returns an empty list if no qualifying domains are found. =head3 Side Effects Per unique domain: up to three A lookups, one MX lookup, one NS lookup, and two WHOIS queries. Results are cached in the object and in the cross-message CHI cache. =head3 Notes MX and NS lookups require C<Net::DNS>. Without it those keys are absent from every returned hashref. =head3 API Specification =head4 Input [] =head4 Output ( { type => 'hashref', keys => { domain => { type => 'scalar' }, source => { type => 'scalar' }, # All other keys optional -- see main POD }, }, ... ) =head3 FORMAL SPECIFICATION -- Z notation mailto_domains == [ Xi Email::Abuse::Investigator; result! : seq DOMAIN_INFO ] pre: self._raw /= '' post: result! = self._mailto_domains /\ forall d : result! @ d.domain =~ /\.[a-zA-Z]{2,}$/ =cut
1023
1024						sub mailto_domains {
1025	567			567	2864	my $self = $_[0];
1026
1027	567		67		717	$self->{_mailto_domains} //= $self->_extract_and_analyse_domains();
1028	567 567				381 593	return @{ $self->{_mailto_domains} };
1029						}
1030
1031 - 1086						=head2 all_domains() Returns the deduplicated union of every registrable domain seen anywhere in the message -- URL hosts from C<embedded_urls()> and contact domains from C<mailto_domains()> -- normalised to eTLD+1 form. Triggers C<embedded_urls()> and C<mailto_domains()> lazily. =head3 Usage my @domains = $analyser->all_domains(); print "$_\n" for @domains; =head3 Arguments None. =head3 Returns A list of plain strings (registrable domain names), lower-cased, no duplicates, in first-seen order. =head3 Side Effects Triggers C<embedded_urls()> and C<mailto_domains()> if not already cached. =head3 Notes Normalisation to eTLD+1 uses C<Domain::PublicSuffix> if installed, falling back to a built-in heuristic otherwise. =head3 API Specification =head4 Input [] =head4 Output ( { type => 'scalar', regex => qr/^[a-z0-9][a-z0-9.-]*\.[a-z]{2,}$/ }, ... ) =head3 FORMAL SPECIFICATION -- Z notation all_domains == [ Xi Email::Abuse::Investigator; result! : seq STRING ] post: result! = deduplicate( map(_registrable, url_hosts union mailto_domains) ) =cut
1087
1088						sub all_domains {
1089	11			11	3425	my $self = $_[0];
1090	11				9	my (%seen, @out);
1091
1092						# Collect registrable domains from URL hosts first
1093	11				18	for my $u ($self->embedded_urls()) {
1094	9				14	my $dom = _registrable($u->{host});
1095	9	50	33		28	push @out, $dom if $dom && !$seen{$dom}++;
1096						}
1097
1098						# Then from contact domains (normalise subdomains to registrable parent)
1099	11				15	for my $d ($self->mailto_domains()) {
1100	16		33		20	my $dom = _registrable($d->{domain}) // $d->{domain};
1101	16	100	67		31	push @out, $dom if $dom && !$seen{$dom}++;
1102						}
1103	11				19	return @out;
1104						}
1105
1106 - 1169						=head2 unresolved_contacts() Returns a list of domains and URL hosts found in the message for which no abuse contact could be determined. Useful for surfacing parties that may warrant manual investigation. =head3 Usage my @unresolved = $analyser->unresolved_contacts(); for my $u (@unresolved) { printf "Unresolved: %s (%s) via %s\n", $u->{domain}, $u->{type}, $u->{source}; } =head3 Arguments None. =head3 Returns A list of hashrefs, each with keys C<domain>, C<type> (C<'url_host'> or C<'domain'>), and C<source> (where the domain was found). =head3 Side Effects Triggers C<embedded_urls()>, C<mailto_domains()>, C<abuse_contacts()>, and C<form_contacts()> if not already cached. =head3 Notes Domains sourced only from spoofable sending headers (C<From:>, C<Return-Path:>, C<Sender:>) are excluded. =head3 API Specification =head4 Input [] =head4 Output ( { type => 'hashref', keys => { domain => { type => 'scalar' }, type => { type => 'scalar', regex => qr/^(?:url_host\|domain)$/ }, source => { type => 'scalar' }, }, }, ... ) =head3 FORMAL SPECIFICATION -- Z notation unresolved_contacts == [ Xi Email::Abuse::Investigator; result! : seq UNRESOLVED_INFO ] post: forall u : result! @ u.domain not_in covered_domains(abuse_contacts, form_contacts) =cut
1170
1171						sub unresolved_contacts {
1172	10			10	40	my $self = $_[0];
1173
1174						# Build a set of domains already covered by email or form contacts
1175	10				10	my %covered;
1176	10				16	for my $c ($self->abuse_contacts(), $self->form_contacts()) {
1177	3				4	my $dom = $c->{form_domain};
1178	3	50			6	unless ($dom) {
1179						# Extract domain from abuse email address
1180	3		50		8	($dom) = ($c->{address} // '') =~ /\@([\w.-]+)/;
1181						}
1182	3	50			7	$covered{lc $dom}++ if $dom;
1183						}
1184
1185						# Also mark URL hosts that already have a resolved abuse address
1186	10				16	for my $u ($self->embedded_urls()) {
1187	6				14	(my $bare = lc $u->{host}) =~ s/^www\.//;
1188	6	50	33		23	$covered{$bare}++ if $u->{abuse} && $u->{abuse} ne '(unknown)';
1189						}
1190
1191	10				11	my (@out, %seen);
1192
1193						# Check URL hosts first
1194	10				11	for my $u ($self->embedded_urls()) {
1195	6				12	(my $bare = lc $u->{host}) =~ s/^www\.//;
1196	6	50			12	next if $covered{$bare};
1197	6	50			29	next if $seen{"url:$bare"}++;
1198						push @out, {
1199						domain => $u->{host},
1200	6				23	type => 'url_host',
1201						source => 'URL in body',
1202						};
1203						}
1204
1205						# Then check contact domains, skipping spoofable-header-only sources
1206	10				15	for my $d ($self->mailto_domains()) {
1207	15				17	my $dom = $d->{domain};
1208	15		50		15	my $source = $d->{source} // '';
1209	15	100			32	next if $source =~ /^(?:From:\|Return-Path:\|Sender:) header$/;
1210	3	50			6	next if $covered{lc $dom};
1211	3	50			7	next if $seen{"dom:$dom"}++;
1212	3				7	push @out, {
1213						domain => $dom,
1214						type => 'domain',
1215						source => $source,
1216						};
1217						}
1218
1219	10				19	return @out;
1220						}
1221
1222						# -----------------------------------------------------------------------
1223						# Public: sending software fingerprint
1224						# -----------------------------------------------------------------------
1225
1226 - 1289						=head2 sending_software() Returns information extracted from headers that identify the software or server-side infrastructure used to compose or inject the message. Headers such as C<X-PHP-Originating-Script> reveal the exact PHP script and Unix account responsible on shared-hosting platforms. Data is extracted during C<parse_email()> with no network I/O. =head3 Usage my @sw = $analyser->sending_software(); for my $s (@sw) { printf "%-30s : %s\n", $s->{header}, $s->{value}; } =head3 Arguments None. C<parse_email()> must have been called first. =head3 Returns A list of hashrefs in alphabetical header-name order. Returns an empty list if none of the watched headers are present. Each hashref has keys C<header>, C<value>, and C<note>. =head3 Side Effects None. Data is pre-collected during C<parse_email()>. =head3 Notes Header names are lower-cased. Header values are stored verbatim. =head3 API Specification =head4 Input [] =head4 Output ( { type => 'hashref', keys => { header => { type => 'scalar' }, value => { type => 'scalar' }, note => { type => 'scalar' }, }, }, ... ) =head3 FORMAL SPECIFICATION -- Z notation sending_software == [ Xi Email::Abuse::Investigator; result! : seq SW_INFO ] post: result! = self._sending_sw =cut
1290
1291						sub sending_software {
1292	53			53	50	my $self = $_[0];
1293
1294	53 53				33 62	return @{ $self->{_sending_sw} };
1295						}
1296
1297						# -----------------------------------------------------------------------
1298						# Public: per-hop tracking IDs
1299						# -----------------------------------------------------------------------
1300
1301 - 1366						=head2 received_trail() Returns per-hop tracking data extracted from the C<Received:> header chain: the IP address, envelope recipient address, and server session ID for each relay. ISP postmasters use these identifiers to locate the SMTP session in their logs. =head3 Usage my @trail = $analyser->received_trail(); for my $hop (@trail) { printf "IP: %s ID: %s\n", $hop->{ip} // '?', $hop->{id} // '?'; } =head3 Arguments None. C<parse_email()> must have been called first. =head3 Returns A list of hashrefs in oldest-first order. Returns an empty list if no C<Received:> headers are present or none yielded extractable data. Each hashref has keys C<received>, C<ip> (may be undef), C<for> (may be undef), C<id> (may be undef). =head3 Side Effects None. Data is pre-collected during C<parse_email()>. =head3 Notes Private IPs are NOT filtered here; all IPs including RFC 1918 addresses are returned as found. Filtering is applied only by C<originating_ip()>. =head3 API Specification =head4 Input [] =head4 Output ( { type => 'hashref', keys => { received => { type => 'scalar' }, ip => { type => 'scalar', optional => 1 }, for => { type => 'scalar', optional => 1 }, id => { type => 'scalar', optional => 1 }, }, }, ... ) =head3 FORMAL SPECIFICATION -- Z notation received_trail == [ Xi Email::Abuse::Investigator; result! : seq HOP_INFO ] post: result! = self._rcvd_tracking =cut
1367
1368						sub received_trail {
1369	53			53	40	my $self = $_[0];
1370
1371	53 53				32 84	return @{ $self->{_rcvd_tracking} };
1372						}
1373
1374						# -----------------------------------------------------------------------
1375						# Public: risk assessment
1376						# -----------------------------------------------------------------------
1377
1378 - 1449						=head2 risk_assessment() Evaluates the message against heuristic checks and returns an overall risk level, a weighted numeric score, and a list of every specific red flag. The assessment covers five categories: originating IP, email authentication, Date: header validity, identity/header consistency, and URL/domain properties. The result is cached; subsequent calls return the same hashref without repeating any analysis. =head3 Usage my $risk = $analyser->risk_assessment(); printf "Risk: %s (score: %d)\n", $risk->{level}, $risk->{score}; for my $f (@{ $risk->{flags} }) { printf " [%s] %s\n", $f->{severity}, $f->{detail}; } =head3 Arguments None. C<parse_email()> must have been called first. =head3 Returns A hashref with keys C<level> (HIGH/MEDIUM/LOW/INFO), C<score> (integer), and C<flags> (arrayref of hashrefs with C<severity>, C<flag>, C<detail>). =head3 Side Effects Triggers C<originating_ip()>, C<embedded_urls()>, and C<mailto_domains()> if not already cached. =head3 Notes Scores: HIGH >= 9, MEDIUM >= 5, LOW >= 2, INFO < 2. Flag weights: HIGH=3, MEDIUM=2, LOW=1, INFO=0. =head3 API Specification =head4 Input [] =head4 Output { type => 'hashref', keys => { level => { type => 'scalar', regex => qr/^(?:HIGH\|MEDIUM\|LOW\|INFO)$/ }, score => { type => 'scalar', regex => qr/^\d+$/ }, flags => { type => 'arrayref' }, }, } =head3 FORMAL SPECIFICATION -- Z notation risk_assessment == [ Xi Email::Abuse::Investigator; result! : RISK_INFO ] post: result!.score = sum({ w(f.severity) \| f in result!.flags }) /\ result!.level = classify(result!.score) where: w(HIGH) = 3; w(MEDIUM) = 2; w(LOW) = 1; w(INFO) = 0 classify(s) = HIGH if s >= 9 \| MEDIUM if s >= 5 \| LOW if s >= 2 \| INFO otherwise =cut
1450
1451						sub risk_assessment {
1452	261			261	6828	my $self = $_[0];
1453
1454	261	100			301	return $self->{_risk} if $self->{_risk};
1455
1456	241				174	my (@flags, $score);
1457	241				162	$score = 0;
1458
1459						# Closure to record a flag and accumulate its weight
1460						my $flag = sub {
1461	256			256	273	my ($severity, $name, $detail) = @_;
1462	256		50		509	$score += $FLAG_WEIGHT{$severity} // 1;
1463	256				1100	push @flags, { severity => $severity, flag => $name, detail => $detail };
1464	241				458	};
1465
1466						# --- Group 1: Originating IP checks ---
1467	241				259	my $orig = $self->originating_ip();
1468	241	100			261	if ($orig) {
1469						# Residential / broadband rDNS patterns suggest a compromised host
1470	198	100	100		776	if ($orig->{rdns} && $orig->{rdns} =~ /
1471						\d+[-_.]\d+[-_.]\d+[-_.]\d+ # dotted-quad in rDNS
1472						\| (?:dsl\|adsl\|cable\|broad\|dial\|dynamic\|dhcp\|ppp\|
1473						residential\|cust\|home\|pool\|client\|user\|
1474						static\d\|host\d)
1475						/xi) {
1476	27				43	$flag->('HIGH', 'residential_sending_ip',
1477						"Sending IP $orig->{ip} rDNS '$orig->{rdns}' looks like a broadband/residential line, not a legitimate mail server");
1478						}
1479
1480						# Absence of rDNS is a strong spam indicator
1481	198	100	100		345	if (!$orig->{rdns} \|\| $orig->{rdns} eq '(no reverse DNS)') {
1482	8				35	$flag->('HIGH', 'no_reverse_dns',
1483						"Sending IP $orig->{ip} has no reverse DNS -- legitimate mail servers always have rDNS");
1484						}
1485
1486						# Low-confidence origin means the IP came from an unverifiable header
1487	198	100			206	if ($orig->{confidence} eq 'low') {
1488	3				7	$flag->('MEDIUM', 'low_confidence_origin',
1489						"Originating IP taken from unverified header ($orig->{note})");
1490						}
1491
1492						# Statistically high-volume spam countries (informational only)
1493	198	100	100		281	if ($orig->{country} && $orig->{country} =~ /^(?:CN\|RU\|NG\|VN\|IN\|PK\|BD)$/) {
1494						$flag->('INFO', 'high_spam_country',
1495	18				37	'Sending IP is in ' . _country_name($orig->{country}) .
1496						" ($orig->{country}) -- statistically high spam volume country");
1497						}
1498						}
1499
1500						# --- Group 2: Email authentication checks ---
1501	241				269	my $auth = $self->_parse_auth_results_cached();
1502	241	100			245	if (defined $auth->{spf}) {
1503	19	100 100 50			52	if ($auth->{spf} =~ /^fail/i) {
1504	6				10	$flag->('HIGH', 'spf_fail',
1505						"SPF result: $auth->{spf} -- sending IP not authorised by domain's SPF record");
1506						} elsif ($auth->{spf} =~ /^softfail/i) {
1507	1				2	$flag->('MEDIUM', 'spf_softfail',
1508						"SPF result: softfail (~all) -- sending IP not explicitly authorised");
1509						} elsif ($auth->{spf} !~ /^pass/i) {
1510	0				0	$flag->('HIGH', 'spf_fail',
1511						"SPF result: $auth->{spf} -- sending IP not authorised");
1512						}
1513						}
1514	241	100	100		298	if (defined $auth->{dkim} && $auth->{dkim} !~ /^pass/i) {
1515	7				15	$flag->('HIGH', 'dkim_fail',
1516						"DKIM result: $auth->{dkim} -- message signature invalid or absent");
1517						}
1518	241	100	100		301	if (defined $auth->{dmarc} && $auth->{dmarc} !~ /^pass/i) {
1519	5				8	$flag->('HIGH', 'dmarc_fail', "DMARC result: $auth->{dmarc}");
1520						}
1521
1522						# DKIM signing domain vs From: domain mismatch check
1523	241	100			210	if ($auth->{dkim_domain}) {
1524	4		50		6	my ($from_domain) = ($self->_header_value('from') // '') =~ /\@([\w.-]+)/;
1525	4	50			6	if ($from_domain) {
1526	4		33		6	my $reg_dkim = _registrable($auth->{dkim_domain}) // $auth->{dkim_domain};
1527	4		33		6	my $reg_from = _registrable(lc $from_domain) // lc $from_domain;
1528	4	100			5	if ($reg_dkim ne $reg_from) {
1529						# Passing DKIM with a different domain is normal for ESPs
1530	2	100	67		5	if ($auth->{dkim} && $auth->{dkim} =~ /^pass/i) {
1531	1				3	$flag->('INFO', 'dkim_domain_mismatch',
1532						"DKIM signed by '$auth->{dkim_domain}' but From: domain is '$from_domain'"
1533						. ' -- message sent via third-party sender (normal for bulk/ESP mail)');
1534						} else {
1535						# Failing DKIM plus mismatched domain is more suspicious
1536	1				2	$flag->('MEDIUM', 'dkim_domain_mismatch',
1537						"DKIM signed by '$auth->{dkim_domain}' but From: domain is '$from_domain'"
1538						. ' and DKIM did not pass -- possible impersonation');
1539						}
1540						}
1541						}
1542						}
1543
1544						# --- Group 3: Date: header checks ---
1545	241				246	my $date_raw = $self->_header_value('date');
1546	241	100	67		468	if (!$date_raw \|\| $date_raw !~ /\S/) {
1547	35				34	$flag->('MEDIUM', 'missing_date',
1548						'No Date: header -- violates RFC 5322; common in spam');
1549						} else {
1550						# Check for an implausible timezone offset (outside real-world bounds)
1551	206	50			442	if ($date_raw =~ /([+-])(\d{2})(\d{2})\s*$/) {
1552	206				309	my ($sign, $hh, $mm) = ($1, $2, $3);
1553	206				229	my $offset_mins = $hh * 60 + $mm;
1554	206		100		545	my $implausible = $mm >= 60
1555						\|\| ($sign eq '+' && $offset_mins > $TZ_MAX_POS_MINS)
1556						\|\| ($sign eq '-' && $offset_mins > $TZ_MAX_NEG_MINS);
1557	206	100			217	if ($implausible) {
1558	8				17	$flag->('MEDIUM', 'implausible_timezone',
1559						"Date: '$date_raw' contains an implausible timezone offset "
1560						. "($sign$hh$mm) -- header is likely forged");
1561						}
1562						}
1563
1564						# Check for dates more than DATE_SKEW_DAYS outside the analysis window
1565	206				202	my $date_epoch = _parse_rfc2822_date($date_raw);
1566	206	50			3220	if (defined $date_epoch) {
1567	206				186	my $delta = time() - $date_epoch;
1568	206	100 100			272	if ($delta > $DATE_SKEW_DAYS * $SECS_PER_DAY) {
1569	25				43	$flag->('LOW', 'suspicious_date',
1570						"Date: '$date_raw' is more than $DATE_SKEW_DAYS days in the past");
1571						} elsif ($delta < -($DATE_SKEW_DAYS * $SECS_PER_DAY)) {
1572	2				3	$flag->('LOW', 'suspicious_date',
1573						"Date: '$date_raw' is more than $DATE_SKEW_DAYS days in the future");
1574						}
1575						}
1576						}
1577
1578						# --- Group 4: Header identity checks ---
1579	241		100		230	my $from_raw = $self->_header_value('from') // '';
1580	241				273	my $from_decoded = $self->_decode_mime_words($from_raw);
1581
1582						# Display-name domain spoofing: "PayPal paypal.com" <phish@evil.example>
1583	241	100			522	if ($from_decoded =~ /^"?([^"<]+?)"?\s*<([^>]+)>/) {
1584	130				156	my ($display, $addr) = ($1, $2);
1585	130				179	while ($display =~ /\b([\w-]+\.(?:com\|net\|org\|io\|co\|uk\|au\|gov\|edu))\b/gi) {
1586	5				7	my $disp_domain = lc $1;
1587	5				9	my ($addr_domain) = $addr =~ /\@([\w.-]+)/;
1588	5		50		9	$addr_domain = lc($addr_domain // '');
1589	5				8	my $reg_disp = _registrable($disp_domain);
1590	5				8	my $reg_addr = _registrable($addr_domain);
1591	5	50	25		17	if ($reg_disp && $reg_addr && $reg_disp ne $reg_addr) {
1592	5				9	$flag->('HIGH', 'display_name_domain_spoof',
1593						"From: display name mentions '$disp_domain' but actual address is <$addr>");
1594						}
1595						}
1596						}
1597
1598						# Free webmail sender flag (no corporate infrastructure)
1599	241	100	100		515	if ($from_raw =~ /\@(gmail\|yahoo\|hotmail\|outlook\|live\|aol\|protonmail\|yandex)\./i
1600						\|\| $from_raw =~ /\@mail\.ru(?:[\s>]\|$)/i) {
1601	26				60	$flag->('MEDIUM', 'free_webmail_sender',
1602						"Message sent from free webmail address ($from_raw)");
1603						}
1604
1605						# Reply-To differs from From: -- replies harvested by different address
1606	241				213	my $reply_to = $self->_header_value('reply-to');
1607	241	100			212	if ($reply_to) {
1608	6				13	my ($from_addr) = $from_raw =~ /([\w.+%-]+\@[\w.-]+)/;
1609	6				11	my ($reply_addr) = $reply_to =~ /([\w.+%-]+\@[\w.-]+)/;
1610	6	100	50		27	if ($from_addr && $reply_addr && lc($from_addr) ne lc($reply_addr)) {
1611	4				6	$flag->('MEDIUM', 'reply_to_differs_from_from',
1612						"Reply-To ($reply_addr) differs from From: ($from_addr)");
1613						}
1614						}
1615
1616						# Undisclosed or absent To: header
1617	241		100		204	my $to = $self->_header_value('to') // '';
1618	241	100	100		531	if ($to =~ /undisclosed\|:;/ \|\| $to eq '') {
1619	37				45	$flag->('MEDIUM', 'undisclosed_recipients',
1620						"To: header is '$to' -- message was bulk-sent with hidden recipient list");
1621						}
1622
1623						# MIME-encoded Subject (potential filter evasion)
1624	241		100		216	my $subj_raw = $self->_header_value('subject') // '';
1625	241	100			275	if ($subj_raw =~ /=\?[^?]+\?[BQ]\?/i) {
1626	7				12	$flag->('LOW', 'encoded_subject',
1627						"Subject line is MIME-encoded: '$subj_raw' (decoded: '"
1628						. $self->_decode_mime_words($subj_raw) . "')");
1629						}
1630
1631						# --- Group 5: URL and domain checks ---
1632	241				170	my (%shortener_seen, %url_host_seen);
1633	241				261	for my $u ($self->embedded_urls()) {
1634						# Skip trusted infrastructure -- these are not spam indicators
1635	49				52	my $bare = lc $u->{host};
1636	49				47	$bare =~ s/^www\.//;
1637	49	50			58	next if $self->{trusted_domains}->{$bare};
1638	49	100			55	next if $TRUSTED_DOMAINS{$bare};
1639
1640						# URL shortener hides real destination
1641	47	100	40		120	if(($URL_SHORTENERS{$bare} \|\| $self->{url_shorteners}->{$bare}) && !$shortener_seen{$bare}++) {
1642	6				9	$flag->('MEDIUM', 'url_shortener',
1643						"$u->{host} is a URL shortener -- the real destination is hidden");
1644						}
1645						# Plain HTTP provides no encryption
1646	47	100	67		91	if ($u->{url} =~ m{^http://}i && !$url_host_seen{ $u->{host} }++) {
1647	5				11	$flag->('LOW', 'http_not_https',
1648						"$u->{host} linked over plain HTTP -- no encryption");
1649						}
1650						}
1651
1652						# Domain-level checks against contact/reply domains
1653	241				233	for my $d ($self->mailto_domains()) {
1654						# Recently registered domain is a common phishing indicator
1655	109	100			107	if ($d->{recently_registered}) {
1656	5				35	$flag->('HIGH', 'recently_registered_domain',
1657	5				12	"$d->{domain} was registered $d->{registered} (less than ${\$RECENT_REG_DAYS} days ago)");
1658						}
1659
1660						# Domain expiry checks
1661	109	100			100	if ($d->{expires}) {
1662	13	50			16	if(my $exp = $self->_parse_date_to_epoch($d->{expires})) {
1663	13				176	my $now = time();
1664	13				11	my $remaining = $exp - $now;
1665	13	100 100	100		38	if ($remaining > 0 && $remaining < $EXPIRY_WARN_DAYS * $SECS_PER_DAY) {
1666	3				6	$flag->('HIGH', 'domain_expires_soon',
1667						"$d->{domain} expires $d->{expires} -- may be a throwaway domain");
1668						} elsif ($remaining <= 0) {
1669	3				6	$flag->('HIGH', 'domain_expired',
1670						"$d->{domain} expired $d->{expires} -- domain has lapsed");
1671						}
1672						}
1673						}
1674
1675						# Lookalike domain check (brand name in a non-brand domain)
1676	109				109	for my $brand (qw(paypal apple google amazon microsoft netflix ebay
1677						instagram facebook twitter linkedin bankofamerica
1678						wellsfargo chase barclays hsbc lloyds santander)) {
1679	1833	100	100		4688	if ($d->{domain} =~ /\Q$brand\E/i &&
1680						$d->{domain} !~ /^\Q$brand\E\.(?:com\|co\.uk\|net\|org)$/) {
1681	8				11	$flag->('HIGH', 'lookalike_domain',
1682						"$d->{domain} contains brand name '$brand' but is not the real domain -- possible phishing");
1683	8				11	last;
1684						}
1685						}
1686						}
1687
1688						# Determine overall risk level from accumulated score
1689	241	100 100 100			331	my $level = $score >= $SCORE_HIGH ? 'HIGH'
1690						: $score >= $SCORE_MEDIUM ? 'MEDIUM'
1691						: $score >= $SCORE_LOW ? 'LOW'
1692						: 'INFO';
1693
1694	241				378	$self->{_risk} = { level => $level, score => $score, flags => \@flags };
1695	241				839	return $self->{_risk};
1696						}
1697
1698						# -----------------------------------------------------------------------
1699						# Public: abuse report text
1700						# -----------------------------------------------------------------------
1701
1702 - 1758						=head2 abuse_report_text() Produces a compact, plain-text string suitable for sending as the body of an abuse report email. It summarises risk level, red flags, originating IP, abuse contacts, and original message headers. The message body is omitted to keep the report concise. Use C<abuse_contacts()> to get the recipient addresses and this method for the body text. =head3 Usage my $text = $analyser->abuse_report_text(); my @contacts = $analyser->abuse_contacts(); for my $c (@contacts) { send_email(to => $c->{address}, body => $text); } =head3 Arguments None. C<parse_email()> must have been called first. =head3 Returns A plain scalar string, newline-terminated, Unix line endings. Never empty or undef. =head3 Side Effects Calls C<risk_assessment()>, C<originating_ip()>, and C<abuse_contacts()> if not already cached. =head3 Notes Output text is sanitised: control characters that could affect terminal or HTML rendering are stripped from all user-derived content before inclusion. =head3 API Specification =head4 Input [] =head4 Output { type => 'scalar' } =head3 FORMAL SPECIFICATION -- Z notation abuse_report_text == [ Xi Email::Abuse::Investigator; result! : STRING ] post: result! /= '' /\ result! ends_with '\n' =cut
1759
1760						sub abuse_report_text {
1761	14			14	3874	my $self = $_[0];
1762	14				12	my @out;
1763
1764	14				18	push @out, 'This is an automated abuse report generated by Email::Abuse::Investigator.',
1765						'Please investigate the following spam/phishing message.',
1766						'';
1767
1768	14				47	my $risk = $self->risk_assessment();
1769	14				29	push @out, "RISK LEVEL: $risk->{level} (score: $risk->{score})",
1770						'';
1771
1772						# List each red flag with its severity prefix
1773	14 14	100			8 21	if (@{ $risk->{flags} }) {
1774	9				10	push @out, 'RED FLAGS IDENTIFIED:';
1775	9 9				9 11	for my $f (@{ $risk->{flags} }) {
1776	17				24	push @out, " [$f->{severity}] " . _sanitise_output($f->{detail});
1777						}
1778	9				9	push @out, '';
1779						}
1780
1781						# Originating IP summary block
1782	14				16	my $orig = $self->originating_ip();
1783	14	100			17	if ($orig) {
1784						push @out, 'ORIGINATING IP: ' . _sanitise_output("$orig->{ip} ($orig->{rdns})"),
1785	11				16	'NETWORK OWNER: ' . _sanitise_output($orig->{org}),
1786						'';
1787						}
1788
1789						# Email abuse contacts
1790	14				27	my @contacts = $self->abuse_contacts();
1791	14	100			17	if (@contacts) {
1792	12				12	push @out, 'ABUSE CONTACTS:';
1793	12				25	push @out, ' ' . _sanitise_output("$_->{address} ($_->{role})") for @contacts;
1794	12				11	push @out, '';
1795						}
1796
1797						# Web-form contacts (providers that reject email)
1798	14	50			21	if(my @form_cs = $self->form_contacts()) {
1799	0				0	push @out, 'WEB-FORM REPORTS REQUIRED:',
1800						' The following parties do not accept email -- submit manually:';
1801	0				0	for my $c (@form_cs) {
1802						push @out, " [$c->{role}]",
1803	0				0	' Form : ' . _sanitise_output($c->{form});
1804	0	0			0	push @out, ' Domain : ' . _sanitise_output($c->{form_domain}) if $c->{form_domain};
1805	0	0			0	push @out, ' Paste : ' . _sanitise_output($c->{form_paste}) if $c->{form_paste};
1806	0	0			0	push @out, ' Upload : ' . _sanitise_output($c->{form_upload}) if $c->{form_upload};
1807						}
1808	0				0	push @out, '';
1809						}
1810
1811						# Separator and raw headers (body excluded for brevity)
1812	14				24	push @out, '-' x 72,
1813						'ORIGINAL MESSAGE HEADERS:',
1814						'-' x 72;
1815
1816	14 14				11 15	for my $h (@{ $self->{_headers} }) {
1817	114				105	push @out, _sanitise_output("$h->{name}: $h->{value}");
1818						}
1819	14				14	push @out, '';
1820
1821	14				67	return join("\n", @out);
1822						}
1823
1824						# -----------------------------------------------------------------------
1825						# Public: abuse contacts
1826						# -----------------------------------------------------------------------
1827
1828 - 1895						=head2 abuse_contacts() Collates the complete set of parties that should receive an abuse report: the sending ISP, URL host operators, contact domain web/mail/DNS/registrar contacts, account providers identified from key headers, the DKIM signer, and the ESP identified via List-Unsubscribe. Addresses are deduplicated globally; if the same address is found via multiple routes, a single entry is kept and role strings are merged. =head3 Usage my @contacts = $analyser->abuse_contacts(); my @addrs = map { $_->{address} } @contacts; =head3 Arguments None. C<parse_email()> must have been called first. =head3 Returns A list of hashrefs, one per unique abuse address, in discovery order. Each hashref has keys C<role>, C<roles> (arrayref), C<address>, C<note>, C<via>. Returns an empty list if no contacts can be determined. =head3 Side Effects Triggers C<originating_ip()>, C<embedded_urls()>, and C<mailto_domains()> if not already cached. =head3 Notes The result is not independently cached; each call recomputes the contact list from the cached results of the underlying methods. =head3 API Specification =head4 Input [] =head4 Output ( { type => 'hashref', keys => { role => { type => 'scalar' }, roles => { type => 'arrayref' }, address => { type => 'scalar', regex => qr/\@/ }, note => { type => 'scalar' }, via => { type => 'scalar', regex => qr/^(?:provider-table\|ip-whois\|domain-whois)$/ }, }, }, ... ) =head3 FORMAL SPECIFICATION -- Z notation abuse_contacts == [ Xi Email::Abuse::Investigator; result! : seq CONTACT_INFO ] post: forall c : result! @ c.address contains '@' /\ forall c1, c2 : result! @ c1 /= c2 => c1.address /= c2.address =cut
1896
1897						sub abuse_contacts {
1898	136			136	3008	my $self = $_[0];
1899
1900	136				104	my (@contacts, %seen_idx);
1901
1902						# Inner closure: add one contact entry, merging roles for duplicate addresses
1903						my $add = sub {
1904	296			296	473	my (%args) = @_;
1905	296		50		346	my $addr = lc($args{address} // '');
1906	296	100	67		442	return unless $addr && $addr =~ /\@/;
1907
1908						# Suppress addresses belonging to form-only providers (no email accepted)
1909	294	50			428	if ($addr =~ /\@([\w.-]+)$/) {
1910	294				248	my $dom = $1;
1911	294				258	my $pa = $self->_provider_abuse_for_host($dom);
1912	294	100	75		373	return if $pa && $pa->{form} && !$pa->{email};
1913						}
1914
1915	290	100			324	if (exists $seen_idx{$addr}) {
1916						# Merge the new role into the existing entry
1917	145				108	my $entry = $contacts[ $seen_idx{$addr} ];
1918	145 145				88 148	push @{ $entry->{roles} }, $args{role};
1919
1920						# Collapse repeated role labels to avoid unreadable strings
1921	145				100	my (%role_counts, @ordered_roles);
1922	145 145				86 114	for my $r (@{ $entry->{roles} }) {
1923	1219	100			1155	push @ordered_roles, $r unless $role_counts{$r}++;
1924						}
1925						my @display = map {
1926	145 1212	100			131 1020	$role_counts{$_} > 1 ? "$_ (x$role_counts{$_})" : $_
1927						} @ordered_roles;
1928	145				211	my $joined = join(' and ', @display);
1929
1930						# Summarise if the merged string is too long to read
1931	145	100			157	if (length($joined) > $ROLE_MAX_LEN) {
1932						my @short = map {
1933	73 1051				55 1103	(my $s = $_) =~ s/[:(\d].*//;
1934	1051				980	$s =~ s/\s+$//;
1935	1051				764	$s;
1936						} @display;
1937	73				142	$joined = scalar(@display) . ' routes: ' . join(', ', @short);
1938						}
1939	145				111	$entry->{role} = $joined;
1940	145				315	return;
1941						}
1942
1943						# First time seeing this address -- record and store
1944	145				163	$seen_idx{$addr} = scalar @contacts;
1945	145				173	$args{roles} = [ $args{role} ];
1946	145				185	push @contacts, \%args;
1947	136				369	};
1948
1949						# Route 1 -- Sending ISP (originating IP)
1950	136				148	my $orig = $self->originating_ip();
1951	136	100			161	if ($orig) {
1952	71				111	my $pa = $self->_provider_abuse_for_ip($orig->{ip}, $orig->{rdns});
1953	71	100			110	if ($pa) {
1954						$add->(
1955						role => 'Sending ISP',
1956						address => $pa->{email},
1957	13				29	note => "$orig->{ip} ($orig->{rdns}) -- $pa->{note}",
1958						via => 'provider-table',
1959						);
1960						}
1961	71	100	67		156	if ($orig->{abuse} && $orig->{abuse} ne '(unknown)') {
1962						$add->(
1963						role => 'Sending ISP',
1964						address => $orig->{abuse},
1965	64				124	note => "Network owner of originating IP $orig->{ip} ($orig->{org})",
1966						via => 'ip-whois',
1967						);
1968						}
1969						}
1970
1971						# Route 2 -- URL hosts
1972	136				108	my %url_host_seen;
1973	136				150	for my $u ($self->embedded_urls()) {
1974	78	100			120	next if $url_host_seen{ $u->{host} }++;
1975	66				69	my $bare_host = lc $u->{host};
1976	66				71	$bare_host =~ s/^www\.//;
1977						# Skip trusted infrastructure (Google, W3C, etc.)
1978	66	50			79	next if $self->{trusted_domains}->{$bare_host};
1979	66	100			86	next if $TRUSTED_DOMAINS{$bare_host};
1980	63				73	my $pa = $self->_provider_abuse_for_host($u->{host});
1981	63	100			64	if ($pa) {
1982						$add->(
1983						role => "URL host: $u->{host}",
1984						address => $pa->{email},
1985	15				71	note => "$u->{host} -- $pa->{note}",
1986						via => 'provider-table',
1987						);
1988						}
1989	63	100	67		117	if ($u->{abuse} && $u->{abuse} ne '(unknown)') {
1990						$add->(
1991						role => "URL host: $u->{host}",
1992						address => $u->{abuse},
1993	39				84	note => "Hosting $u->{host} ($u->{ip}, $u->{org})",
1994						via => 'ip-whois',
1995						);
1996						}
1997						}
1998
1999						# Route 3 -- Contact domain hosting and registration
2000	136				147	for my $d ($self->mailto_domains()) {
2001	152				144	my $dom = $d->{domain};
2002
2003						# Web host contact
2004	152	100			187	if ($d->{web_abuse}) {
2005	57				58	my $pa = $self->_provider_abuse_for_host($dom);
2006	57	50			45	if ($pa) {
2007						$add->(role => "Web host of $dom", address => $pa->{email},
2008	0				0	note => $pa->{note}, via => 'provider-table');
2009						}
2010						$add->(
2011						role => "Web host of $dom",
2012						address => $d->{web_abuse},
2013						note => sprintf('Hosting %s (%s, %s)',
2014						$dom // '(unknown domain)',
2015						$d->{web_ip} // '(unknown IP)',
2016	57		100 50 100		192	$d->{web_org} // '(unknown org)'),
2017						via => 'ip-whois',
2018						);
2019						}
2020
2021						# MX (mail host) contact
2022	152	100			150	if ($d->{mx_abuse}) {
2023						$add->(
2024						role => "Mail host (MX) for $dom",
2025						address => $d->{mx_abuse},
2026						note => sprintf('MX %s (%s, %s)',
2027						$d->{mx_host} // '(unknown host)',
2028						$d->{mx_ip} // '(unknown IP)',
2029	23		100 100 100		93	$d->{mx_org} // '(unknown org)'),
2030						via => 'ip-whois',
2031						);
2032						}
2033
2034						# NS (DNS host) contact
2035	152	100			150	if ($d->{ns_abuse}) {
2036						$add->(
2037						role => "DNS host (NS) for $dom",
2038						address => $d->{ns_abuse},
2039						note => sprintf('NS %s (%s, %s)',
2040						$d->{ns_host} // '(unknown host)',
2041						$d->{ns_ip} // '(unknown IP)',
2042	15		100 100 100		50	$d->{ns_org} // '(unknown org)'),
2043						via => 'ip-whois',
2044						);
2045						}
2046
2047						# Domain registrar (skip if domain only seen in spoofable headers)
2048	152	100			151	if ($d->{registrar_abuse}) {
2049						my $spoofable_only =
2050						$d->{source} =~ /^(?:From:\|Return-Path:\|Sender:) header$/ &&
2051						!scalar(grep {
2052	36		100		67	$_->{host} &&
2053						_registrable($_->{host}) eq (_registrable($dom) // $dom)
2054						} $self->embedded_urls());
2055	36	100			38	unless ($spoofable_only) {
2056						$add->(
2057						role => "Domain registrar for $dom",
2058						address => $d->{registrar_abuse},
2059	28		100		66	note => 'Registrar: ' . ($d->{registrar} // '(unknown)'),
2060						via => 'domain-whois',
2061						);
2062						}
2063						}
2064						}
2065
2066						# Route 4 -- From:/Reply-To:/Return-Path:/Sender: account provider
2067	136				132	for my $hname (qw(from reply-to return-path sender)) {
2068	544		100		452	my $val = $self->_header_value($hname) // next;
2069
2070						# Extract addr-spec from angle-bracket form to avoid display-name @-signs
2071	245	100			501	my $addr_spec = ($val =~ /<([^>])>\s$/) ? $1 : $val;
2072	245				379	my ($addr_domain) = $addr_spec =~ /\@([\w.-]+)/;
2073	245	50			230	next unless $addr_domain;
2074
2075						# Skip SRS-rewritten forwarder addresses (not the real sender)
2076	245	100			232	next if $addr_spec =~ /\+SRS[0-9]?=/i;
2077
2078	243				201	my $pa = $self->_provider_abuse_for_host($addr_domain);
2079	243	100			253	if ($pa) {
2080	37	50			49	my $role_addr = $addr_spec =~ /\@/ ? $addr_spec : $val;
2081	37				103	$role_addr =~ s/^\s+\|\s+$//g;
2082						$add->(
2083						role => "Account provider ($hname: $role_addr)",
2084						address => $pa->{email},
2085						note => $pa->{note},
2086	37				73	via => 'provider-table',
2087						);
2088						}
2089						}
2090
2091						# Route 5 -- DKIM signing organisation
2092	136				139	my $auth = $self->_parse_auth_results_cached();
2093	136	100			138	if ($auth->{dkim_domain}) {
2094	3				4	my $pa = $self->_provider_abuse_for_host($auth->{dkim_domain});
2095	3	100			3	if ($pa) {
2096						$add->(
2097						role => "DKIM signer: $auth->{dkim_domain}",
2098						address => $pa->{email},
2099						note => $pa->{note},
2100	1				3	via => 'provider-table',
2101						);
2102						}
2103						}
2104
2105						# Route 6 -- List-Unsubscribe ESP domain
2106	136				119	my $unsub = $self->_header_value('list-unsubscribe');
2107	136	50			129	if ($unsub) {
2108	0				0	my @unsub_domains;
2109	0				0	while ($unsub =~ m{https?://([^/:?\s>]+)}gi) {
2110	0				0	push @unsub_domains, lc $1;
2111						}
2112	0				0	while ($unsub =~ m{mailto:[^@\s>]+\@([\w.-]+)}gi) {
2113	0				0	push @unsub_domains, lc $1;
2114						}
2115	0				0	my %unsub_seen;
2116	0 0				0 0	for my $dom (grep { !$unsub_seen{$_}++ } @unsub_domains) {
2117	0				0	my $pa = $self->_provider_abuse_for_host($dom);
2118	0	0			0	if ($pa) {
2119						$add->(
2120						role => "ESP / bulk sender (List-Unsubscribe: $dom)",
2121						address => $pa->{email},
2122	0				0	note => "$pa->{note} -- responsible for this bulk delivery",
2123						via => 'provider-table',
2124						);
2125						}
2126						}
2127						}
2128
2129						# Route 7 -- Reply addresses embedded in the message body
2130	136				100	my %body_addr_seen;
2131	136				142	my $combined_body = $self->{_body_plain} . "\n" . $self->{_body_html};
2132	136				154	for my $addr_dom ($self->_domains_from_text($combined_body)) {
2133	20	50			22	next if $body_addr_seen{$addr_dom}++;
2134	20				26	my $pa = $self->_provider_abuse_for_host($addr_dom);
2135	20	100	67		33	next unless $pa && $pa->{email};
2136	4				55	my ($example_addr) = $combined_body =~ /(\S+\@\Q$addr_dom\E)/i;
2137	4		33		4	$example_addr //= "\@$addr_dom";
2138						$add->(
2139						role => "Reply address in body ($example_addr)",
2140						address => $pa->{email},
2141						note => $pa->{note},
2142	4				6	via => 'provider-table',
2143						);
2144						}
2145
2146	136				991	return @contacts;
2147						}
2148
2149						# -----------------------------------------------------------------------
2150						# Public: form contacts (providers that require web-form submission)
2151						# -----------------------------------------------------------------------
2152
2153 - 2219						=head2 form_contacts() Returns the list of parties that require abuse reports via a web form rather than email. These are providers whose C<%PROVIDER_ABUSE> entry has a C<form> key. Each hashref includes the form URL, paste instructions, upload instructions, and the discovery role. =head3 Usage my @forms = $analyser->form_contacts(); for my $c (@forms) { printf "Open: %s\n", $c->{form}; } =head3 Arguments None. C<parse_email()> must have been called first. =head3 Returns A list of hashrefs, one per unique form contact. Each hashref has keys C<form>, C<role>, C<note>, C<form_paste> (optional), C<form_upload> (optional), and C<via>. Returns an empty list if no form contacts are found. =head3 Side Effects Triggers C<originating_ip()>, C<embedded_urls()>, and C<mailto_domains()> if not already cached. =head3 Notes Deduplication is by form URL. =head3 API Specification =head4 Input [] =head4 Output ( { type => 'hashref', keys => { form => { type => 'scalar', regex => qr{^https?://} }, role => { type => 'scalar' }, note => { type => 'scalar' }, form_paste => { type => 'scalar', optional => 1 }, form_upload => { type => 'scalar', optional => 1 }, via => { type => 'scalar' }, }, }, ... ) =head3 FORMAL SPECIFICATION -- Z notation form_contacts == [ Xi Email::Abuse::Investigator; result! : seq FORM_CONTACT_INFO ] post: forall c : result! @ c.form =~ m{^https?://} /\ forall c1, c2 : result! @ c1 /= c2 => c1.form /= c2.form =cut
2220
2221						sub form_contacts {
2222	81			81	938	my $self = $_[0];
2223
2224	81				64	my (@contacts, %seen);
2225
2226						# Inner closure: add one form-contact entry, deduplicating by form URL
2227						my $add = sub {
2228	11			11	45	my (%args) = @_;
2229	11		50		15	my $form = $args{form} // '';
2230	11	50			12	return unless $form;
2231	11	100			15	return if $seen{$form}++;
2232	9				14	push @contacts, \%args;
2233	81				133	};
2234
2235						# Route 1 -- Sending ISP
2236	81				85	my $orig = $self->originating_ip();
2237	81	100			80	if ($orig) {
2238	49				65	my $pa = $self->_provider_abuse_for_ip($orig->{ip}, $orig->{rdns});
2239	49	50	67		59	if ($pa && $pa->{form}) {
2240						$add->(
2241						role => 'Sending ISP',
2242						form => $pa->{form},
2243						note => $pa->{note} // '',
2244						form_paste => $pa->{form_paste} // '',
2245	0		0 0 0		0	form_upload => $pa->{form_upload} // '',
2246						via => 'provider-table',
2247						);
2248						}
2249						}
2250
2251						# Route 2 -- URL hosts
2252	81				53	my %url_host_seen;
2253	81				76	for my $u ($self->embedded_urls()) {
2254	48	100			65	next if $url_host_seen{ $u->{host} }++;
2255	39				45	my $pa = $self->_provider_abuse_for_host($u->{host});
2256	39	50	67		65	if ($pa && $pa->{form}) {
2257						$add->(
2258						role => "URL host: $u->{host}",
2259						form => $pa->{form},
2260						form_domain => $u->{host},
2261						note => $pa->{note} // '',
2262						form_paste => $pa->{form_paste} // '',
2263	0		0 0 0		0	form_upload => $pa->{form_upload} // '',
2264						via => 'provider-table',
2265						);
2266						}
2267						}
2268
2269						# Route 3 -- Contact domains (web host + registrar)
2270	81				81	for my $d ($self->mailto_domains()) {
2271	88				83	my $dom = $d->{domain};
2272	88				72	my $pa = $self->_provider_abuse_for_host($dom);
2273	88	0	33		92	if ($pa && $pa->{form}) {
2274						$add->(
2275						role => "Web host of $dom",
2276						form => $pa->{form},
2277						form_domain => $dom,
2278						note => $pa->{note} // '',
2279						form_paste => $pa->{form_paste} // '',
2280	0		0 0 0		0	form_upload => $pa->{form_upload} // '',
2281						via => 'provider-table',
2282						);
2283						}
2284
2285						# Registrar identified via WHOIS -- check for form-only registrar
2286	88	100	67		132	if ($d->{registrar_abuse} && $d->{registrar_abuse} =~ /\@([\w.-]+)/) {
2287	18				21	my $reg_domain = lc $1;
2288	18				14	my $rpa = $self->_provider_abuse_for_host($reg_domain);
2289	18	100	100		32	if ($rpa && $rpa->{form}) {
2290						$add->(
2291						role => "Domain registrar for $dom (web form only)",
2292						form => $rpa->{form},
2293						form_domain => $dom,
2294						note => $rpa->{note} // '',
2295						form_paste => $rpa->{form_paste} // '',
2296	11		50 50 50		34	form_upload => $rpa->{form_upload} // '',
2297						via => 'provider-table',
2298						);
2299						}
2300						}
2301						}
2302
2303						# Route 4 -- Account provider headers
2304	81				79	for my $hname (qw(from reply-to return-path sender)) {
2305	324		100		240	my $val = $self->_header_value($hname) // next;
2306	142	100			226	my $addr_spec = ($val =~ /<([^>])>\s$/) ? $1 : $val;
2307	142				214	my ($addr_domain) = $addr_spec =~ /\@([\w.-]+)/;
2308	142	50			121	next unless $addr_domain;
2309						# Skip SRS forwarder rewrite addresses
2310	142	50			141	next if $addr_spec =~ /\+SRS[0-9]?=/i;
2311	142				125	my $pa = $self->_provider_abuse_for_host($addr_domain);
2312	142	50	67		150	if ($pa && $pa->{form}) {
2313	0	0			0	my $role_addr = $addr_spec =~ /@/ ? $addr_spec : $val;
2314	0				0	$role_addr =~ s/^\s+\|\s+$//g;
2315						$add->(
2316						role => "Account provider ($hname: $role_addr)",
2317						form => $pa->{form},
2318						note => $pa->{note} // '',
2319						form_paste => $pa->{form_paste} // '',
2320	0		0 0 0		0	form_upload => $pa->{form_upload} // '',
2321						via => 'provider-table',
2322						);
2323						}
2324						}
2325
2326						# Route 5 -- DKIM signer
2327	81				74	my $auth = $self->_parse_auth_results_cached();
2328	81	100			85	if ($auth->{dkim_domain}) {
2329	1				2	my $pa = $self->_provider_abuse_for_host($auth->{dkim_domain});
2330	1	0	33		2	if ($pa && $pa->{form}) {
2331						$add->(
2332						role => "DKIM signer: $auth->{dkim_domain}",
2333						form => $pa->{form},
2334						note => $pa->{note} // '',
2335						form_paste => $pa->{form_paste} // '',
2336	0		0 0 0		0	form_upload => $pa->{form_upload} // '',
2337						via => 'provider-table',
2338						);
2339						}
2340						}
2341
2342						# Route 6 -- List-Unsubscribe ESP domains
2343	81				71	my $unsub = $self->_header_value('list-unsubscribe');
2344	81	50			79	if ($unsub) {
2345	0				0	my @unsub_domains;
2346	0 0				0 0	while ($unsub =~ m{https?://([^/:?\s>]+)}gi) { push @unsub_domains, lc $1 }
2347	0 0				0 0	while ($unsub =~ m{mailto:[^@\s>]+\@([\w.-]+)}gi) { push @unsub_domains, lc $1 }
2348	0				0	my %useen;
2349	0 0				0 0	for my $dom (grep { !$useen{$_}++ } @unsub_domains) {
2350	0				0	my $pa = $self->_provider_abuse_for_host($dom);
2351	0	0	0		0	if ($pa && $pa->{form}) {
2352						$add->(
2353						role => "ESP / bulk sender (List-Unsubscribe: $dom)",
2354						form => $pa->{form},
2355						note => $pa->{note} // '',
2356						form_paste => $pa->{form_paste} // '',
2357	0		0 0 0		0	form_upload => $pa->{form_upload} // '',
2358						via => 'provider-table',
2359						);
2360						}
2361						}
2362						}
2363
2364	81				233	return @contacts;
2365						}
2366
2367						# -----------------------------------------------------------------------
2368						# Public: full analyst report
2369						# -----------------------------------------------------------------------
2370
2371 - 2427						=head2 report() Produces a comprehensive, analyst-facing plain-text report covering all findings: envelope fields, risk assessment, originating host, sending software, received chain tracking IDs, embedded URLs, contact domain intelligence, and recommended abuse contacts. Use C<report()> for human review or ticketing systems. Use C<abuse_report_text()> for sending to ISP abuse desks. =head3 Usage print $analyser->report(); open my $fh, '>', 'report.txt' or croak "Cannot open: $!"; print $fh $analyser->report(); close $fh; =head3 Arguments None. C<parse_email()> must have been called first. =head3 Returns A plain scalar string, newline-terminated, Unix line endings. Never empty or undef. =head3 Side Effects Triggers all analysis methods if not already cached. =head3 Notes The report is idempotent: calling it multiple times on the same object always returns an identical string. All user-derived content is sanitised before output. =head3 API Specification =head4 Input [] =head4 Output { type => 'scalar' } =head3 FORMAL SPECIFICATION -- Z notation report == [ Xi Email::Abuse::Investigator; result! : STRING ] post: result! /= '' /\ result! ends_with '\n' =cut
2428
2429						sub report {
2430	48			48	6246	my $self = $_[0];
2431
2432	48				37	my @out;
2433
2434						# Banner header
2435	48				52	push @out, '=' x 72;
2436	48				61	push @out, " Email::Abuse::Investigator Report (v$VERSION)";
2437	48				38	push @out, '=' x 72;
2438	48				33	push @out, '';
2439
2440						# Envelope summary -- decode MIME encoded-words for readability
2441	48				51	for my $f (qw(from reply-to return-path subject date message-id)) {
2442	288				266	my $v = $self->_header_value($f);
2443	288	100			257	next unless defined $v;
2444	207				174	my $decoded = $self->_decode_mime_words($v);
2445	207				204	my $label = ucfirst($f);
2446	207	100			212	push @out, sprintf(' %-14s : %s', $label,
2447						_sanitise_output($decoded ne $v ? "$decoded [encoded: $v]" : $v));
2448						}
2449	48				39	push @out, '';
2450
2451						# Risk assessment section
2452	48				59	my $risk = $self->risk_assessment();
2453	48				74	push @out, "[ RISK ASSESSMENT: $risk->{level} (score: $risk->{score}) ]";
2454	48 48	100			35 59	if (@{ $risk->{flags} }) {
2455	25 25				18 28	for my $f (@{ $risk->{flags} }) {
2456	50				68	push @out, " [$f->{severity}] " . _sanitise_output($f->{detail});
2457						}
2458						} else {
2459	23				19	push @out, ' (no specific red flags detected)';
2460						}
2461	48				44	push @out, '';
2462
2463						# Originating host section
2464	48				36	push @out, '[ ORIGINATING HOST ]';
2465	48				74	my $orig = $self->originating_ip();
2466	48	100			75	if ($orig) {
2467	34				40	push @out, ' IP : ' . _sanitise_output($orig->{ip});
2468	34	50			58	push @out, ' Reverse DNS : ' . _sanitise_output($orig->{rdns}) if $orig->{rdns};
2469	34	100			44	push @out, ' Country : ' . _sanitise_output($orig->{country}) if $orig->{country};
2470	34	50			51	push @out, ' Organisation : ' . _sanitise_output($orig->{org}) if $orig->{org};
2471	34	50			50	push @out, ' Abuse addr : ' . _sanitise_output($orig->{abuse}) if $orig->{abuse};
2472	34				35	push @out, " Confidence : $orig->{confidence}";
2473	34	100			46	push @out, ' Note : ' . _sanitise_output($orig->{note}) if $orig->{note};
2474						} else {
2475	14				13	push @out, ' (could not determine originating IP)';
2476						}
2477	48				38	push @out, '';
2478
2479						# Sending software section (omitted if none found)
2480	48				61	my @sw = $self->sending_software();
2481	48	50			54	if (@sw) {
2482	0				0	push @out, '[ SENDING SOFTWARE / INFRASTRUCTURE CLUES ]';
2483	0				0	for my $s (@sw) {
2484	0				0	push @out, sprintf(' %-14s : %s', $s->{header}, _sanitise_output($s->{value}));
2485	0				0	push @out, " Note : $s->{note}";
2486	0				0	push @out, '';
2487						}
2488						}
2489
2490						# Received chain tracking IDs (only hops with id or for are shown)
2491	48 41	50			62 109	my @trail = grep { defined $_->{id} \|\| defined $_->{for} }
2492						$self->received_trail();
2493	48	50			55	if (@trail) {
2494	0				0	push @out, '[ RECEIVED CHAIN TRACKING IDs ]';
2495	0				0	push @out, ' (Supply these to the relevant ISP abuse team to trace the session)';
2496	0				0	push @out, '';
2497	0				0	for my $hop (@trail) {
2498	0		0		0	push @out, ' IP : ' . (_sanitise_output($hop->{ip}) // '(unknown)');
2499	0	0			0	push @out, ' Envelope for : ' . _sanitise_output($hop->{for}) if $hop->{for};
2500	0	0			0	push @out, ' Server ID : ' . _sanitise_output($hop->{id}) if $hop->{id};
2501	0				0	push @out, '';
2502						}
2503						}
2504
2505						# Embedded URLs section -- grouped by hostname
2506	48				39	push @out, '[ EMBEDDED HTTP/HTTPS URLs ]';
2507	48				49	my @urls = $self->embedded_urls();
2508	48	100			49	if (@urls) {
2509	19				14	my (%host_order, %host_meta, %host_paths);
2510	19				16	my $seq = 0;
2511	19				19	for my $u (@urls) {
2512	30				23	my $h = $u->{host};
2513	30	100			33	unless (exists $host_order{$h}) {
2514	21				21	$host_order{$h} = $seq++;
2515						$host_meta{$h} = {
2516						ip => $u->{ip},
2517						org => $u->{org},
2518						abuse => $u->{abuse},
2519						country => $u->{country},
2520	21				44	};
2521						}
2522	30 30				20 42	push @{ $host_paths{$h} }, $u->{url};
2523						}
2524
2525						# Output each host group in first-seen order
2526	19 2				26 4	for my $h (sort { $host_order{$a} <=> $host_order{$b} } keys %host_order) {
2527	21				19	my $m = $host_meta{$h};
2528	21 21				24 22	my $bare = lc $h; $bare =~ s/^www\.//;
2529						push @out, ' Host : ' . _sanitise_output($h) .
2530	21	100	67		24	(($URL_SHORTENERS{$bare} \|\| $self->{url_shorteners}->{$bare})
2531						? ' * URL SHORTENER -- real destination hidden *' : '');
2532	21	50			35	push @out, ' IP : ' . _sanitise_output($m->{ip}) if $m->{ip};
2533	21	100			29	push @out, ' Country : ' . _sanitise_output($m->{country}) if $m->{country};
2534	21	50			32	push @out, ' Organisation : ' . _sanitise_output($m->{org}) if $m->{org};
2535	21	50			34	push @out, ' Abuse addr : ' . _sanitise_output($m->{abuse}) if $m->{abuse};
2536	21 21				19 22	my @paths = @{ $host_paths{$h} };
2537	21	100			24	if (@paths == 1) {
2538	15				17	push @out, ' URL : ' . _sanitise_output($paths[0]);
2539						} else {
2540	6				6	push @out, ' URLs (' . scalar(@paths) . ') :';
2541	6				11	push @out, ' ' . _sanitise_output($_) for @paths;
2542						}
2543	21				48	push @out, '';
2544						}
2545						} else {
2546	29				25	push @out, ' (none found)';
2547	29				23	push @out, '';
2548						}
2549
2550						# Contact / reply-to domains section
2551	48				39	push @out, '[ CONTACT / REPLY-TO DOMAINS ]';
2552	48				71	my @mdoms = $self->mailto_domains();
2553	48	100			68	if (@mdoms) {
2554	36				33	for my $d (@mdoms) {
2555	49				55	push @out, ' Domain : ' . _sanitise_output($d->{domain});
2556	49				74	push @out, ' Found in : ' . _sanitise_output($d->{source});
2557	49	100			50	if ($d->{recently_registered}) {
2558	3				3	push @out, ' * WARNING: RECENTLY REGISTERED - possible phishing domain *';
2559						}
2560	49	100			53	push @out, ' Registered : ' . $d->{registered} if $d->{registered};
2561	49	100			46	push @out, ' Expires : ' . $d->{expires} if $d->{expires};
2562	49	100			44	push @out, ' Registrar : ' . _sanitise_output($d->{registrar}) if $d->{registrar};
2563	49	100			51	push @out, ' Reg. abuse : ' . _sanitise_output($d->{registrar_abuse}) if $d->{registrar_abuse};
2564	49	100			49	if ($d->{web_ip}) {
2565	22				24	push @out, ' Web host IP : ' . _sanitise_output($d->{web_ip});
2566	22	100			34	push @out, ' Web host org : ' . _sanitise_output($d->{web_org}) if $d->{web_org};
2567	22	100			25	push @out, ' Web abuse : ' . _sanitise_output($d->{web_abuse}) if $d->{web_abuse};
2568						} else {
2569	27				21	push @out, ' Web host : (no A record / unreachable)';
2570						}
2571	49	100			40	if ($d->{mx_host}) {
2572	3				4	push @out, ' MX host : ' . _sanitise_output($d->{mx_host});
2573	3	50			7	push @out, ' MX IP : ' . _sanitise_output($d->{mx_ip}) if $d->{mx_ip};
2574	3	50			6	push @out, ' MX org : ' . _sanitise_output($d->{mx_org}) if $d->{mx_org};
2575	3	50			5	push @out, ' MX abuse : ' . _sanitise_output($d->{mx_abuse}) if $d->{mx_abuse};
2576						} else {
2577	46				35	push @out, ' MX host : (none found)';
2578						}
2579	49	100			45	if ($d->{ns_host}) {
2580	2				3	push @out, ' NS host : ' . _sanitise_output($d->{ns_host});
2581	2	50			4	push @out, ' NS IP : ' . _sanitise_output($d->{ns_ip}) if $d->{ns_ip};
2582	2	50			4	push @out, ' NS org : ' . _sanitise_output($d->{ns_org}) if $d->{ns_org};
2583	2	50			4	push @out, ' NS abuse : ' . _sanitise_output($d->{ns_abuse}) if $d->{ns_abuse};
2584						}
2585	49				42	push @out, '';
2586						}
2587						} else {
2588	12				11	push @out, ' (none found)';
2589	12				12	push @out, '';
2590						}
2591
2592						# Abuse contacts summary
2593	48				38	push @out, '[ WHERE TO SEND ABUSE REPORTS ]';
2594	48				54	my @contacts = $self->abuse_contacts();
2595	48	100			51	if (@contacts) {
2596	39				38	for my $c (@contacts) {
2597	58				57	push @out, ' Role : ' . _sanitise_output($c->{role});
2598	58				90	push @out, ' Send to : ' . _sanitise_output($c->{address});
2599	58	50			70	push @out, ' Note : ' . _sanitise_output($c->{note}) if $c->{note};
2600	58				86	push @out, " Discovered : $c->{via}";
2601	58				52	push @out, '';
2602						}
2603						} else {
2604	9				8	push @out, ' (no abuse contacts could be determined)';
2605	9				7	push @out, '';
2606						}
2607
2608						# Web-form contacts (providers that require manual form submission)
2609	48				51	my @form_cs = $self->form_contacts();
2610	48	100			51	if (@form_cs) {
2611	2				2	push @out, '[ WHERE TO FILE WEB-FORM REPORTS ]';
2612	2				3	push @out, ' The following parties require manual submission via a web form.';
2613	2				1	push @out, ' Open each URL in a browser, then follow the instructions below it.';
2614	2				2	push @out, '';
2615	2				2	for my $c (@form_cs) {
2616	2				3	push @out, ' Role : ' . _sanitise_output($c->{role});
2617	2				2	push @out, ' Form URL : ' . _sanitise_output($c->{form});
2618	2	50			5	push @out, ' Domain/URL : ' . _sanitise_output($c->{form_domain}) if $c->{form_domain};
2619	2	50			4	push @out, ' Note : ' . _sanitise_output($c->{note}) if $c->{note};
2620	2	50			3	if ($c->{form_paste}) {
2621						# Word-wrap the paste hint at ROLE_WRAP_LEN characters
2622	2				2	my $hint = $c->{form_paste};
2623	2				6	my @words = split /\s+/, $hint;
2624	2				3	my (@lines, $line);
2625	2				2	for my $w (@words) {
2626	59	100	100		57	if (defined $line && length("$line $w") > $ROLE_WRAP_LEN) {
2627	4				3	push @lines, $line;
2628	4				4	$line = $w;
2629						} else {
2630	55	100			49	$line = defined $line ? "$line $w" : $w;
2631						}
2632						}
2633	2	50			3	push @lines, $line if defined $line;
2634	2	50			4	push @out, ' Paste : ' . shift @lines if @lines;
2635	2				5	push @out, ' ' . $_ for @lines;
2636						}
2637	2	50			5	push @out, ' Upload : ' . _sanitise_output($c->{form_upload}) if $c->{form_upload};
2638	2				2	push @out, '';
2639						}
2640						}
2641
2642	48				42	push @out, '=' x 72;
2643	48				353	return join("\n", @out) . "\n";
2644						}
2645
2646						# -----------------------------------------------------------------------
2647						# Private: output sanitisation
2648						# -----------------------------------------------------------------------
2649
2650						# _sanitise_output( $str ) -> $str
2651						#
2652						# Purpose:
2653						# Strip control characters that could affect terminal rendering or HTML
2654						# injection from any string that will appear in a report or abuse email.
2655						# Preserves printable ASCII, high-bytes (for UTF-8 content), tabs, and
2656						# line endings.
2657						#
2658						# Entry criteria:
2659						# $str -- a defined or undef scalar.
2660						#
2661						# Exit status:
2662						# Returns the sanitised string, or the empty string if $str is undef.
2663						#
2664						# Notes:
2665						# Only strips C0 control characters below 0x20 (except \t) and the DEL
2666						# character (0x7F). High bytes (0x80-0xFF) are preserved because they
2667						# form valid UTF-8 multi-byte sequences in headers and body text.
2668
2669						sub _sanitise_output {
2670	1153			1153	20873	my ($str) = @_;
2671	1153	100			835	return '' unless defined $str;
2672						# Remove C0 controls (except tab) and DEL
2673	1149				893	$str =~ s/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]//g;
2674	1149				1273	return $str;
2675						}
2676
2677						# -----------------------------------------------------------------------
2678						# Private: message parsing
2679						# -----------------------------------------------------------------------
2680
2681						# _split_message( $text )
2682						#
2683						# Purpose:
2684						# Split a raw RFC 2822 email into headers and body, parse all headers,
2685						# decode the body (including multipart), extract sending-software
2686						# fingerprints, and populate per-hop tracking data.
2687						#
2688						# Entry criteria:
2689						# $text -- defined scalar, already dereferenced by parse_email().
2690						# $self->{_sending_sw} and $self->{_rcvd_tracking} reset to [] by caller.
2691						#
2692						# Exit status:
2693						# Returns undef silently if the header block is empty/whitespace-only.
2694						# Otherwise all results are communicated via side effects on $self.
2695						#
2696						# Side effects:
2697						# Populates _headers, _received, _body_plain, _body_html, _sending_sw,
2698						# and _rcvd_tracking.
2699						#
2700						# Notes:
2701						# Delegates to _decode_multipart() for multipart/* content types.
2702						# Lines not matching the header pattern are silently discarded.
2703						# Boundary extraction uses a simple regex; missing boundary causes the
2704						# body to be skipped silently.
2705
2706						sub _split_message {
2707	481			481	470	my ($self, $text) = @_;
2708
2709						# Split at the first blank line (RFC 2822 header/body separator)
2710	481				2324	my ($header_block, $body_raw) = split /\r?\n\r?\n/, $text, 2;
2711
2712	481	50	33		1008	return unless defined $header_block && $header_block =~ /\S/;
2713	481		100		464	$body_raw //= '';
2714
2715						# Unfold RFC 2822 continuation lines (s2.2.3)
2716	481				1382	$header_block =~ s/\r?\n([ \t]+)/ $1/g;
2717
2718						# Parse each header line into a { name, value } pair
2719	481				326	my @headers;
2720	481				1774	for my $line (split /\r?\n/, $header_block) {
2721	4207	100			5647	if ($line =~ /^([\w-]+)\s:\s(.*)/) {
2722	4204				6018	push @headers, { name => lc($1), value => $2 };
2723						}
2724						}
2725	481				615	$self->{_headers} = \@headers;
2726
2727						# Collect all Received: header values (most-recent first, as in message)
2728						$self->{_received} = [
2729	1416				1301	map { $_->{value} }
2730	481 4204				517 3342	grep { $_->{name} eq 'received' } @headers
2731						];
2732
2733						# Determine content type and transfer encoding from top-level headers
2734	481 4204				451 3050	my ($ct_h) = grep { $_->{name} eq 'content-type' } @headers;
2735	481 4204				361 2892	my ($cte_h) = grep { $_->{name} eq 'content-transfer-encoding' } @headers;
2736	481	100			554	my $ct = defined $ct_h ? $ct_h->{value} : '';
2737	481	100			421	my $cte = defined $cte_h ? $cte_h->{value} : '';
2738
2739						# Decode multipart or single-part body as appropriate
2740	481	100			521	if ($ct =~ /multipart/i) {
2741	17				38	my ($boundary) = $ct =~ /boundary="?([^";]+)"?/i;
2742						# Pass depth=0 to enforce the MAX_MULTIPART_DEPTH recursion guard
2743	17	50			36	$self->_decode_multipart($body_raw, $boundary, 0) if $boundary;
2744						} else {
2745	464				515	my $decoded = $self->_decode_body($body_raw, $cte);
2746	464 8	100			495 9	if ($ct =~ /html/i) { $self->{_body_html} = $decoded }
2747	456				430	else { $self->{_body_plain} = $decoded }
2748						}
2749
2750						$self->_debug(sprintf 'Parsed %d headers, %d Received lines',
2751	481 481				369 1169	scalar @headers, scalar @{ $self->{_received} });
2752
2753						# --- Sending software fingerprints ---
2754						# These headers identify the mailer or shared-hosting script that sent
2755						# the message; invaluable for shared-hosting abuse reports.
2756	481				1513	my %sw_notes = (
2757						'x-php-originating-script' => 'PHP script on shared hosting -- report to hosting abuse team',
2758						'x-source' => 'Source file on shared hosting -- report to hosting abuse team',
2759						'x-source-host' => 'Sending hostname injected by shared hosting provider',
2760						'x-source-args' => 'Command-line args injected by shared hosting provider',
2761						'x-mailer' => 'Email client or bulk-mailer identifier',
2762						'user-agent' => 'Email client identifier',
2763						);
2764	481				1061	for my $sw_hdr (sort keys %sw_notes) {
2765	2886 25224				1847 17097	my ($h) = grep { $_->{name} eq $sw_hdr } @headers;
2766	2886	100			2327	next unless $h;
2767	7				15	push @{ $self->{_sending_sw} }, {
2768						header => $sw_hdr,
2769						value => $h->{value},
2770	7				3	note => $sw_notes{$sw_hdr},
2771						};
2772						}
2773
2774						# --- Per-hop tracking IDs from Received: chain ---
2775						# Walk oldest-first (reverse) so _rcvd_tracking is oldest-first
2776	481 481				403 521	for my $rcvd (reverse @{ $self->{_received} }) {
2777	1416				1158	my $ip = $self->_extract_ip_from_received($rcvd);
2778	1416				1117	my ($for_addr) = $rcvd =~ /\bfor\s+<?([^\s>]+\@[\w.-]+\.[\w]+)>?/i;
2779	1416				2222	my ($srv_id) = $rcvd =~ /\bid\s+([\w.-]+)/i;
2780						# Skip hops with no actionable tracking data
2781	1416	50	50		1190	next unless defined $ip \|\| defined $for_addr \|\| defined $srv_id;
2782	1415 1415				815 2678	push @{ $self->{_rcvd_tracking} }, {
2783						received => $rcvd,
2784						ip => $ip,
2785						for => $for_addr,
2786						id => $srv_id,
2787						};
2788						}
2789						}
2790
2791						# _decode_multipart( $body, $boundary, $depth )
2792						#
2793						# Purpose:
2794						# Recursively split a MIME multipart body on its boundary and decode each
2795						# text/plain and text/html part. Nested multipart/* containers are
2796						# recursed into up to MAX_MULTIPART_DEPTH levels deep.
2797						#
2798						# Entry criteria:
2799						# $body -- the raw body text of the multipart container.
2800						# $boundary -- the boundary string from the Content-Type header.
2801						# $depth -- current recursion depth (starts at 0 from _split_message).
2802						#
2803						# Exit status:
2804						# Returns undef if $depth >= MAX_MULTIPART_DEPTH (recursion guard).
2805						# Otherwise all results via side effects.
2806						#
2807						# Side effects:
2808						# Appends decoded text to $self->{_body_plain} and $self->{_body_html}.
2809						#
2810						# Notes:
2811						# Whitespace-only MIME segments between boundaries are silently skipped.
2812						# Decoding errors are silenced; raw bytes are used as fallback.
2813
2814						sub _decode_multipart {
2815	49			49	111	my ($self, $body, $boundary, $depth) = @_;
2816	49		50		47	$depth //= 0;
2817
2818						# Enforce the recursion depth limit to prevent stack exhaustion on
2819						# pathological crafted messages with deeply nested multipart structures.
2820	49	100			49	if ($depth >= $MAX_MULTIPART_DEPTH) {
2821	7				15	Carp::carp 'Email::Abuse::Investigator: multipart nesting depth limit',
2822						"($MAX_MULTIPART_DEPTH) exceeded; stopping recursion";
2823	7				18	return;
2824						}
2825
2826						# Split on the boundary marker; the (?:--)? suffix handles closing boundary
2827	42				332	my @parts = split /--\Q$boundary\E(?:--)?/, $body;
2828
2829	42				53	for my $part (@parts) {
2830						# Skip whitespace-only segments between boundaries
2831	135	100			160	next unless $part =~ /\S/;
2832
2833	52				80	$part =~ s/^\r?\n//;
2834
2835						# Each MIME part has its own headers separated from body by a blank line
2836	52				104	my ($phdr_block, $pbody) = split /\r?\n\r?\n/, $part, 2;
2837	52	100			48	next unless defined $pbody;
2838
2839						# Unfold continuation header lines within this part
2840	51				41	$phdr_block =~ s/\r?\n([ \t]+)/ $1/g;
2841
2842						# Parse this part's headers into a simple hash
2843	51				38	my %phdr;
2844	51				56	for my $line (split /\r?\n/, $phdr_block) {
2845	54	50			124	$phdr{ lc($1) } = $2 if $line =~ /^([\w-]+)\s:\s(.*)/;
2846						}
2847
2848	51		100		58	my $pct = $phdr{'content-type'} // '';
2849	51		100		72	my $pcte = $phdr{'content-transfer-encoding'} // '';
2850
2851						# Nested multipart/* must be recursed into; without this URLs in
2852						# multipart/alternative inside multipart/mixed would be missed.
2853	51	100			59	if ($pct =~ /multipart/i) {
2854	24				35	my ($inner_boundary) = $pct =~ /boundary\s=\s"?([^";]+)"?/i;
2855	24	50			24	if ($inner_boundary) {
2856	24				19	$inner_boundary =~ s/\s+$//;
2857						# Increment depth counter for the recursion guard
2858	24				43	$self->_decode_multipart($pbody, $inner_boundary, $depth + 1);
2859						}
2860	24				22	next;
2861						}
2862
2863						# Decode transfer encoding and accumulate by content type
2864	27				30	my $decoded = $self->_decode_body($pbody, $pcte);
2865	27 10	100 100	100		69 17	if ($pct =~ /text\/html/i) { $self->{_body_html} .= $decoded }
2866	16				26	elsif ($pct =~ /text/i \|\| !$pct) { $self->{_body_plain} .= $decoded }
2867						}
2868						}
2869
2870						# _decode_body( $body, $cte ) -> string
2871						#
2872						# Purpose:
2873						# Decode a MIME body part according to its Content-Transfer-Encoding.
2874						#
2875						# Entry criteria:
2876						# $body -- raw body string (may be undef).
2877						# $cte -- Content-Transfer-Encoding value string (may be undef).
2878						#
2879						# Exit status:
2880						# Returns the decoded string, or the original string if the encoding is
2881						# 7bit/8bit/binary or unrecognised.
2882						#
2883						# Notes:
2884						# decode_qp and decode_base64 are imported from MIME:: modules; errors
2885						# from malformed content are silenced by the eval wrappers they provide.
2886
2887						sub _decode_body {
2888	497			497	1124	my ($self, $body, $cte) = @_;
2889	497		100		451	$cte //= '';
2890	497	100			453	return decode_qp($body) if $cte =~ /quoted-printable/i;
2891	491	100			408	return decode_base64($body) if $cte =~ /base64/i;
2892	488		100		587	return $body // '';
2893						}
2894
2895						# -----------------------------------------------------------------------
2896						# Private: Received-chain -> originating IP
2897						# -----------------------------------------------------------------------
2898
2899						# _find_origin()
2900						#
2901						# Purpose:
2902						# Walk the Received: chain (oldest-first) to find the first external IP,
2903						# or fall back to X-Originating-IP. Enrich with rDNS and WHOIS.
2904						#
2905						# Entry criteria:
2906						# $self->{_received} populated by _split_message().
2907						# $self->{trusted_relays} set by new().
2908						#
2909						# Exit status:
2910						# Returns { ip, rdns, org, abuse, country, confidence, note } on success.
2911						# Returns undef if no usable IP can be identified.
2912						#
2913						# Side effects:
2914						# Network I/O via _enrich_ip(): one PTR lookup, one RDAP/WHOIS query.
2915						# Results are also stored in the CHI cross-message cache if available.
2916						#
2917						# Notes:
2918						# confidence 'high' = 2+ distinct external IPs;
2919						# 'medium' = exactly one external IP;
2920						# 'low' = taken from X-Originating-IP.
2921
2922						sub _find_origin {
2923	267			267	191	my $self = $_[0];
2924
2925	267				203	my @candidates;
2926
2927						# Walk oldest-first (reverse) to collect external IPs
2928	267 267				219 247	for my $hdr (reverse @{ $self->{_received} }) {
2929	1222		100		960	my $ip = $self->_extract_ip_from_received($hdr) // next;
2930	1221	100			974	next if $self->_is_private($ip);
2931	89	100			1078	next if $self->_is_trusted($ip);
2932	84				86	push @candidates, $ip;
2933						}
2934
2935						# Fall back to X-Originating-IP if no external IPs in Received: chain
2936	267	100			262	unless (@candidates) {
2937	187				196	my $xoip = $self->_header_value('x-originating-ip');
2938	187	100			178	if ($xoip) {
2939	6				8	$xoip =~ s/[\[\]\s]//g;
2940	6	100			7	return $self->_enrich_ip($xoip, 'low',
2941						'Taken from X-Originating-IP (webmail, unverified)')
2942						unless $self->_is_private($xoip);
2943						}
2944	182				267	return undef;
2945						}
2946
2947						# Report the oldest (first) external IP; confidence depends on count
2948	80	100			149	return $self->_enrich_ip(
2949						$candidates[0],
2950						@candidates > 1 ? 'high' : 'medium',
2951						'First external hop in Received: chain',
2952						);
2953						}
2954
2955						# _extract_ip_from_received( $hdr ) -> ipv4_or_ipv6_string \| undef
2956						#
2957						# Purpose:
2958						# Extract the most-significant IP address from a raw Received: header
2959						# value, trying patterns in priority order. Supports both IPv4 dotted-
2960						# quad and IPv6 bracket notation.
2961						#
2962						# Entry criteria:
2963						# $hdr -- a defined Received: header value string.
2964						#
2965						# Exit status:
2966						# Returns the IP string on success, undef if no IP can be extracted.
2967						#
2968						# Notes:
2969						# IPv4 addresses are validated (all octets <= 255).
2970						# IPv6 addresses are returned as-is if they contain colons.
2971
2972						sub _extract_ip_from_received {
2973	2653			2653	3281	my ($self, $hdr) = @_;
2974	2653				1749	for my $re (@RECEIVED_IP_RE) {
2975	2719	100			4730	if ($hdr =~ $re) {
2976	2659				2086	my $ip = $1;
2977
2978						# Accept IPv6 addresses (contain colons) without further validation
2979	2659	100			2076	return $ip if $ip =~ /:/;
2980
2981						# Validate IPv4 format and octet range
2982	2656	100			3089	next unless $ip =~ /^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$/;
2983	2650 10600	100			2457 8556	next if grep { $_ > 255 } split /\./, $ip;
2984	2642				2942	return $ip;
2985						}
2986						}
2987	8				15	return undef;
2988						}
2989
2990						# _is_private( $ip ) -> bool
2991						#
2992						# Purpose:
2993						# Test whether an IP address falls in any private, reserved, or special-
2994						# use range (IPv4 or IPv6) that should never be reported as a spam origin.
2995						#
2996						# Entry criteria:
2997						# $ip -- a scalar IP string (IPv4 or IPv6); may be undef.
2998						#
2999						# Exit status:
3000						# Returns 1 (true) if the IP is private/reserved, 0 (false) otherwise.
3001						# Returns 1 for undef or empty strings.
3002						#
3003						# Notes:
3004						# Uses the module-level @PRIVATE_RANGES array of pre-compiled regexes.
3005						# Covers all ranges listed in RFC 1122, 1918, 5737, 6598, and RFC 4193.
3006
3007						sub _is_private {
3008	1263			1263	1089	my ($self, $ip) = @_;
3009	1263	100	100		1404	return 1 unless defined $ip && $ip ne '';
3010	1261 6320	100			829 7363	for my $re (@PRIVATE_RANGES) { return 1 if $ip =~ $re }
3011	101				119	return 0;
3012						}
3013
3014						# _is_trusted( $ip ) -> bool
3015						#
3016						# Purpose:
3017						# Test whether an IP address matches any entry in the caller-supplied
3018						# trusted_relays list (exact IP or CIDR block).
3019						#
3020						# Entry criteria:
3021						# $ip -- a defined IPv4 address string.
3022						# $self->{trusted_relays} -- arrayref of exact IPs or CIDR strings.
3023						#
3024						# Exit status:
3025						# Returns 1 (true) if the IP matches any trusted relay, 0 otherwise.
3026
3027						sub _is_trusted {
3028	95			95	83	my ($self, $ip) = @_;
3029	95 95				69 99	for my $cidr (@{ $self->{trusted_relays} }) {
3030	19	100			17	return 1 if $self->_ip_in_cidr($ip, $cidr);
3031						}
3032	87				98	return 0;
3033						}
3034
3035						# -----------------------------------------------------------------------
3036						# Private: HTTP/HTTPS URL extraction and resolution
3037						# -----------------------------------------------------------------------
3038
3039						# _extract_and_resolve_urls() -> arrayref of url hashrefs
3040						#
3041						# Purpose:
3042						# Extract all HTTP/HTTPS URLs from the decoded body, resolve each unique
3043						# hostname to an IP, and enrich with WHOIS/RDAP data. Optionally uses
3044						# AnyEvent::DNS to parallelise the DNS resolution step.
3045						#
3046						# Entry criteria:
3047						# $self->{_body_plain} and $self->{_body_html} populated by _split_message().
3048						#
3049						# Exit status:
3050						# Returns an arrayref of url hashrefs (possibly empty).
3051						#
3052						# Side effects:
3053						# Network I/O per unique hostname: one A/AAAA lookup, one RDAP/WHOIS.
3054						# Results stored in the CHI cross-message cache if available.
3055
3056						sub _extract_and_resolve_urls {
3057	144			144	105	my $self = $_[0];
3058	144				182	my (%url_seen, %host_cache);
3059	144				0	my @results;
3060	144				165	my $combined = $self->{_body_plain} . "\n" . $self->{_body_html};
3061
3062						# Collect unique URLs from body
3063	144 614				168 560	my @urls = grep { !$url_seen{$_}++ } $self->_extract_http_urls($combined);
3064
3065						# Extract unique hostnames for parallel DNS resolution
3066	144				105	my %hostname_needed;
3067	144				122	for my $url (@urls) {
3068	614				805	my ($host) = $url =~ m{https?://([^/:?\s#]+)}i;
3069	614	50			586	$hostname_needed{$host}++ if $host;
3070						}
3071
3072						# Parallelise DNS lookups if AnyEvent::DNS is available
3073	144	50	33		176	if ($HAS_ANYEVENT_DNS && scalar(keys %hostname_needed) > 1) {
3074	0				0	$self->_parallel_resolve_hosts(\%hostname_needed, \%host_cache);
3075						}
3076
3077						# Process each URL: resolve hostname and WHOIS-enrich
3078	144				113	for my $url (@urls) {
3079	614				780	my ($host) = $url =~ m{https?://([^/:?\s#]+)}i;
3080	614	50			493	next unless $host;
3081
3082						# Resolve and WHOIS once per unique hostname, then cache the result
3083	614	100			492	unless (exists $host_cache{$host}) {
3084						# Check the cross-message CHI cache first
3085	99	50			94	my $cached = $_cache ? $_cache->get("url:$host") : undef;
3086	99	50			81	if ($cached) {
3087	0				0	$host_cache{$host} = $cached;
3088						} else {
3089	99		100		132	my $ip = $self->_resolve_host($host) // '(unresolved)';
3090	99	100			298	my $whois = $ip ne '(unresolved)'
3091						? $self->_whois_ip($ip)
3092						: {};
3093
3094						# Fall back to domain WHOIS if IP lookup returned nothing
3095	99	100			191	if (!$whois->{abuse}) {
3096	56		33		84	my $reg = _registrable($host) // $host;
3097	56				76	my $dw = $self->_parse_domain_whois_abuse($reg);
3098	56	100			142	$whois = $dw if $dw->{abuse};
3099						}
3100
3101						my $entry = {
3102						ip => $ip,
3103						org => $whois->{org} // '(unknown)',
3104						abuse => $whois->{abuse} // '(unknown)',
3105						country => $whois->{country} // undef,
3106	99		100 100 100		337	};
3107	99				92	$host_cache{$host} = $entry;
3108
3109						# Store in cross-message cache for reuse across messages
3110	99	50			139	$_cache->set("url:$host", $entry) if $_cache;
3111						}
3112						}
3113
3114	614 614				372 1028	push @results, { url => $url, host => $host, %{ $host_cache{$host} } };
3115						}
3116	144				377	return \@results;
3117						}
3118
3119						# _parallel_resolve_hosts( \%hostnames, \%cache )
3120						#
3121						# Purpose:
3122						# Resolve multiple hostnames to IPs in parallel using AnyEvent::DNS.
3123						# Populates the cache with resolved IPs so the sequential loop in
3124						# _extract_and_resolve_urls() can skip the DNS step for pre-resolved hosts.
3125						#
3126						# Entry criteria:
3127						# $hostnames_ref -- hashref keyed by hostname (values ignored).
3128						# $cache_ref -- hashref to populate with { ip => '...' } results.
3129						# AnyEvent::DNS must be installed ($HAS_ANYEVENT_DNS is true).
3130						#
3131						# Exit status:
3132						# Returns undef; all results written to %$cache_ref via side effects.
3133						#
3134						# Notes:
3135						# Errors (NXDOMAIN, timeout) are silently swallowed; the sequential
3136						# resolution loop will return '(unresolved)' for those hosts.
3137
3138						sub _parallel_resolve_hosts {
3139	2			2	419	my ($self, $hostnames_ref, $cache_ref) = @_;
3140	2	50			6	return unless $HAS_ANYEVENT_DNS;
3141
3142						# Build an AnyEvent condvar to wait for all lookups to complete
3143	0				0	my $cv = AnyEvent->condvar;
3144	0				0	my $pending = scalar keys %$hostnames_ref;
3145
3146	0				0	for my $host (keys %$hostnames_ref) {
3147						# Fire an async A (and AAAA) query for each hostname
3148						AnyEvent::DNS::resolve(
3149						$host, 'A',
3150						sub {
3151	0			0	0	my @answers = @_;
3152	0	0			0	if (@answers) {
3153						# Cache the first A record result
3154	0				0	$cache_ref->{$host} = { ip => $answers[0][4] };
3155						}
3156						# Decrement the pending counter; signal when all done
3157	0	0			0	$cv->send if --$pending <= 0;
3158						},
3159	0				0	);
3160						}
3161
3162						# Block until all DNS queries complete (subject to AnyEvent's own timeouts)
3163	0				0	$cv->recv;
3164						}
3165
3166						# _extract_http_urls( $body ) -> list of url strings
3167						#
3168						# Purpose:
3169						# Extract all HTTP and HTTPS URLs from a body string, using both
3170						# structural HTML parsing (if HTML::LinkExtor is available) and a
3171						# plain-text regex pass. Deduplicates and strips trailing punctuation.
3172						#
3173						# Entry criteria:
3174						# $body -- combined plain+HTML body string.
3175						#
3176						# Exit status:
3177						# Returns a list of URL strings (possibly empty), deduplicated.
3178
3179						sub _extract_http_urls {
3180	147			147	693	my ($self, $body) = @_;
3181	147				102	my @urls;
3182
3183						# Structural HTML link extraction (handles quoted attributes correctly)
3184	147	50			148	if ($HAS_HTML_LINKEXTOR) {
3185						my $p = HTML::LinkExtor->new(sub {
3186	16			16	251	my ($tag, %attrs) = @_;
3187	16				14	for my $attr (qw(href src action)) {
3188	48		100		61	my $val = $attrs{$attr} // '';
3189	48	100 100			120	if ($val =~ m{^https?://}i) {
3190	12				11	push @urls, $val;
3191						} elsif ($val =~ m{^//[\w.-]}) {
3192						# Protocol-relative -- assume https
3193	2				2	push @urls, 'https:' . $val;
3194						}
3195						}
3196	147				476	});
3197	147				5285	$p->parse($body);
3198						}
3199
3200						# Plain-text regex pass for bare URLs not in HTML attributes
3201	147				1030	while ($body =~ m{(https?://[^\s<>"'\)\]]+)}gi) {
3202	1116				1702	push @urls, $1;
3203						}
3204
3205						# Protocol-relative URLs not caught above
3206	147				388	while ($body =~ m{(?:^\|[\s"'=])(//[\w.-][^\s<>"'\)\]]*)}gim) {
3207	2				4	push @urls, 'https:' . $1;
3208						}
3209
3210						# Deduplicate and strip trailing punctuation
3211	147				104	my %seen;
3212	147 1132				147 999	my @all = grep { !$seen{$_}++ } @urls;
3213	147				335	s/[.,;:!?\)>\]]+$// for @all;
3214	147				292	return @all;
3215						}
3216
3217						# -----------------------------------------------------------------------
3218						# Private: domain extraction and full analysis
3219						# -----------------------------------------------------------------------
3220
3221						# _extract_and_analyse_domains() -> arrayref of domain hashrefs
3222						#
3223						# Purpose:
3224						# Collect all non-infrastructure contact domains from headers and body,
3225						# run the full domain intelligence pipeline on each, and return an arrayref
3226						# suitable for storage in $self->{_mailto_domains}.
3227						#
3228						# Entry criteria:
3229						# _split_message() must have been called.
3230						#
3231						# Exit status:
3232						# Always returns an arrayref; never undef.
3233						#
3234						# Side effects:
3235						# Network I/O per domain via _analyse_domain().
3236						# Results stored in $self->{_domain_info} and CHI cache.
3237
3238						sub _extract_and_analyse_domains {
3239	138			138	103	my $self = $_[0];
3240	138				149	my (%seen, @domains_with_source);
3241
3242						# Build a set of recipient domains to exclude (victims, not senders)
3243	138				0	my %recipient_domains;
3244	138				119	for my $hname (qw(to cc)) {
3245	276		100		272	my $val = $self->_header_value($hname) // next;
3246	129				168	for my $dom ($self->_domains_from_text($val)) {
3247	126		33		135	my $reg = _registrable($dom) // $dom;
3248	126				142	$recipient_domains{$dom}++;
3249	126				146	$recipient_domains{$reg}++;
3250						}
3251						}
3252
3253						# Also exclude domains from Received: "for" envelope recipients
3254	138 138				87 132	for my $hop (@{ $self->{_rcvd_tracking} }) {
3255	135	50	33		179	next unless $hop->{for} && $hop->{for} =~ /\@([\w.-]+)/;
3256	0				0	my $dom = lc $1;
3257	0		0		0	my $reg = _registrable($dom) // $dom;
3258	0				0	$recipient_domains{$dom}++;
3259	0				0	$recipient_domains{$reg}++;
3260						}
3261
3262						# Inner closure: record a domain if it passes all filters
3263						my $record = sub {
3264	624			624	511	my ($dom, $source) = @_;
3265	624				466	$dom = lc $dom;
3266	624				403	$dom =~ s/\.$//;
3267	624	50			570	next if $self->{trusted_domains}->{$dom};
3268	624	100			520	return if $TRUSTED_DOMAINS{$dom};
3269	585	100			422	return if $recipient_domains{$dom};
3270	583	50	67		435	return if $recipient_domains{ _registrable($dom) // $dom };
3271						# Discard non-routable hostnames (single-label, pseudo-TLDs, etc.)
3272	583	100			713	return unless $dom =~ /\.[a-zA-Z]{2,}$/;
3273	519	100			613	return if $dom =~ /\.(?:local\|internal\|lan\|localdomain\|arpa)$/i;
3274	517	100			554	return if $seen{$dom}++;
3275	383				518	push @domains_with_source, { domain => $dom, source => $source };
3276	138				289	};
3277
3278						# Collect from standard sender/reply headers
3279	138				274	my %header_sources = (
3280						'from' => 'From: header',
3281						'reply-to' => 'Reply-To: header',
3282						'return-path' => 'Return-Path: header',
3283						'sender' => 'Sender: header',
3284						);
3285	138				244	for my $hname (sort keys %header_sources) {
3286	552		100		441	my $val = $self->_header_value($hname) // next;
3287						$record->($_, $header_sources{$hname})
3288	262				258	for $self->_domains_from_text($val);
3289						}
3290
3291						# Message-ID domain often reveals the real bulk-sending platform
3292	138				161	my $mid = $self->_header_value('message-id');
3293	138	100	67		282	if ($mid && $mid =~ /\@([\w.-]+)/) {
3294	128				120	my $mid_dom = lc $1;
3295	128		67		118	my $mid_reg = _registrable($mid_dom) // $mid_dom;
3296						$record->($mid_dom, 'Message-ID: header')
3297	128	50	40		421	unless $TRUSTED_DOMAINS{$mid_dom} \|\| $TRUSTED_DOMAINS{$mid_reg} \|\| $self->{trusted_domains}->{$mid_dom} \|\| $self->{trusted_domains}->{$mid_reg};
3298						}
3299
3300						# DKIM signing domain(s) -- the organisation that vouches for the message
3301	138				151	my $auth = $self->_parse_auth_results_cached();
3302	138 138		100		101 229	for my $dkim_d (@{ $auth->{dkim_domains} // [] }) {
3303	5				5	$record->($dkim_d, 'DKIM-Signature: d= (signing domain)');
3304						}
3305
3306						# List-Unsubscribe identifies the ESP or bulk sender
3307	138				147	my $unsub = $self->_header_value('list-unsubscribe');
3308	138	50			132	if ($unsub) {
3309	0				0	while ($unsub =~ m{https?://([^/:?\s>]+)}gi) {
3310	0				0	$record->(lc $1, 'List-Unsubscribe: header');
3311						}
3312	0				0	while ($unsub =~ m{mailto:[^@\s>]+\@([\w.-]+)}gi) {
3313	0				0	$record->(lc $1, 'List-Unsubscribe: header');
3314						}
3315						}
3316
3317						# Body email addresses (mailto: and bare user@domain forms)
3318	138				144	my $combined = $self->{_body_plain} . "\n" . $self->{_body_html};
3319						$record->($_, 'email address / mailto in body')
3320	138				124	for $self->_domains_from_text($combined);
3321
3322						# Run the full intelligence pipeline on each collected domain
3323	138				100	my @results;
3324	138				103	for my $entry (@domains_with_source) {
3325	383				358	my $info = $self->_analyse_domain($entry->{domain});
3326	383				655	push @results, { %$entry, %$info };
3327						}
3328	138				803	return \@results;
3329						}
3330
3331						# _domains_from_text( $text ) -> list of domain strings
3332						#
3333						# Purpose:
3334						# Extract unique domain names from mailto: links and bare user@domain
3335						# addresses in a block of text.
3336						#
3337						# Entry criteria:
3338						# $text -- a defined scalar of decoded body or header text.
3339						#
3340						# Exit status:
3341						# Returns a list of lower-cased domain strings (possibly empty).
3342
3343						sub _domains_from_text {
3344	674			674	2009	my ($self, $text) = @_;
3345	674				416	my (%seen, @out);
3346
3347						# mailto: links (including HTML-entity-encoded @ signs from QP)
3348	674				738	while ($text =~ /mailto:(?:[^@\s<>"]+)@([\w.-]+)/gi) {
3349	14 14				18 12	my $dom = lc $1; $dom =~ s/\.$//;
3350	14	100			30	push @out, $dom unless $seen{$dom}++;
3351						}
3352
3353						# Bare user@domain patterns
3354	674				1239	while ($text =~ /\b[\w.+%-]+@([\w.-]+\.[a-zA-Z]{2,})\b/g) {
3355	662 662				563 468	my $dom = lc $1; $dom =~ s/\.$//;
3356	662	100			1132	push @out, $dom unless $seen{$dom}++;
3357						}
3358	674				960	return @out;
3359						}
3360
3361						# _analyse_domain( $domain ) -> hashref
3362						#
3363						# Purpose:
3364						# Run the complete intelligence pipeline for a single domain: A record
3365						# (web hosting), MX record (mail hosting), NS record (DNS hosting),
3366						# and WHOIS (registrar, creation/expiry dates, abuse contact).
3367						# Each IP is enriched via RDAP/WHOIS. Results are cached per domain
3368						# in $self->{_domain_info} and in the CHI cross-message cache.
3369						#
3370						# Entry criteria:
3371						# $domain -- lower-cased, no trailing dot, not in TRUSTED_DOMAINS.
3372						# $self->{timeout} used for all network operations.
3373						#
3374						# Exit status:
3375						# Always returns a hashref reference; never undef; may be empty ({}).
3376						# Possible keys: web_ip, web_org, web_abuse, mx_host, mx_ip, mx_org,
3377						# mx_abuse, ns_host, ns_ip, ns_org, ns_abuse, registrar,
3378						# registrar_abuse, registered, expires, recently_registered, whois_raw.
3379						#
3380						# Side effects:
3381						# Network I/O; writes result to $self->{_domain_info}{$domain} and CHI.
3382						#
3383						# Notes:
3384						# MX/NS lookups require Net::DNS; absent without it.
3385						# recently_registered is set to 1 (not 0) when the threshold is met.
3386						# whois_raw is truncated to WHOIS_RAW_MAX bytes.
3387
3388						sub _analyse_domain {
3389	386			386	3654	my ($self, $domain) = @_;
3390
3391						# Return the per-message cached result if already analysed
3392						return $self->{_domain_info}{$domain}
3393	386	100			355	if $self->{_domain_info}{$domain};
3394
3395						# Check the cross-message CHI cache before hitting the network
3396	382	50			293	if ($_cache) {
3397	0				0	my $cached = $_cache->get("dom:$domain");
3398	0	0			0	if ($cached) {
3399	0				0	$self->{_domain_info}{$domain} = $cached;
3400	0				0	return $cached;
3401						}
3402						}
3403
3404	382				452	$self->_debug("Analysing domain: $domain");
3405	382				250	my %info;
3406
3407						# --- A record -> web hosting IP ---
3408	382				367	my $web_ip = $self->_resolve_host($domain);
3409	382	100			608	if ($web_ip) {
3410	50				58	$info{web_ip} = $web_ip;
3411	50				53	my $w = $self->_whois_ip($web_ip);
3412	50	100			122	$info{web_org} = $w->{org} if $w->{org};
3413	50	100			72	$info{web_abuse} = $w->{abuse} if $w->{abuse};
3414						}
3415
3416						# MX and NS lookups require Net::DNS
3417	382	50			276	if ($HAS_NET_DNS) {
3418						my $res = Net::DNS::Resolver->new(
3419						tcp_timeout => $self->{timeout},
3420						udp_timeout => $self->{timeout},
3421	0				0	);
3422
3423						# --- MX record -> mail hosting ---
3424	0				0	my $mxq = $res->search($domain, 'MX');
3425	0	0			0	if ($mxq) {
3426	0				0	my ($best) = sort { $a->preference <=> $b->preference }
3427	0 0				0 0	grep { $_->type eq 'MX' } $mxq->answer;
3428	0	0			0	if ($best) {
3429	0				0	(my $mx_host = lc $best->exchange) =~ s/\.$//;
3430	0				0	$info{mx_host} = $mx_host;
3431	0				0	my $mx_ip = $self->_resolve_host($mx_host);
3432	0	0			0	if ($mx_ip) {
3433	0				0	$info{mx_ip} = $mx_ip;
3434	0				0	my $mw = $self->_whois_ip($mx_ip);
3435	0	0			0	$info{mx_org} = $mw->{org} if $mw->{org};
3436	0	0			0	$info{mx_abuse} = $mw->{abuse} if $mw->{abuse};
3437						}
3438						}
3439						}
3440
3441						# --- NS record -> DNS hosting ---
3442	0				0	my $nsq = $res->search($domain, 'NS');
3443	0	0			0	if ($nsq) {
3444	0 0				0 0	my ($first) = grep { $_->type eq 'NS' } $nsq->answer;
3445	0	0			0	if ($first) {
3446	0				0	(my $ns_host = lc $first->nsdname) =~ s/\.$//;
3447	0				0	$info{ns_host} = $ns_host;
3448	0				0	my $ns_ip = $self->_resolve_host($ns_host);
3449	0	0			0	if ($ns_ip) {
3450	0				0	$info{ns_ip} = $ns_ip;
3451	0				0	my $nw = $self->_whois_ip($ns_ip);
3452	0	0			0	$info{ns_org} = $nw->{org} if $nw->{org};
3453	0	0			0	$info{ns_abuse} = $nw->{abuse} if $nw->{abuse};
3454						}
3455						}
3456						}
3457						}
3458
3459						# --- Domain WHOIS -> registrar + dates ---
3460	382				339	my $domain_whois = $self->_domain_whois($domain);
3461	382	100			515	if ($domain_whois) {
3462						# Truncate raw WHOIS for storage but parse structured fields from full text
3463	19				47	$info{whois_raw} = substr($domain_whois, 0, $WHOIS_RAW_MAX);
3464
3465						# Registrar name
3466	19	100			42	if ($domain_whois =~ /Registrar:\s*(.+)/i) {
3467	15				35	($info{registrar} = $1) =~ s/\s+$//;
3468						}
3469
3470						# Registrar abuse contact email (try multiple field names)
3471	19				37	for my $pat (
3472						qr/Registrar Abuse Contact Email:\s*(\S+@\S+)/i,
3473						qr/Abuse Contact Email:\s*(\S+@\S+)/i,
3474						qr/abuse-contact:\s*(\S+@\S+)/i,
3475						) {
3476	57	100	100		139	if (!$info{registrar_abuse} && $domain_whois =~ $pat) {
3477	12				25	($info{registrar_abuse} = $1) =~ s/\s+$//;
3478						}
3479						}
3480
3481						# Domain creation date (multiple registrar field name variations)
3482	19				49	for my $pat (
3483						qr/Creation Date:\s*(\S+)/i,
3484						qr/Created(?:\s+On)?:\s*(\S+)/i,
3485						qr/Registration Time:\s*(\S+)/i,
3486						qr/^registered:\s*(\S+)/im,
3487						) {
3488	76	100	100		127	if (!$info{registered} && $domain_whois =~ $pat) {
3489	15				23	($info{registered} = $1) =~ s/[TZ].*//;
3490						}
3491						}
3492
3493						# Domain expiry date
3494	19				34	for my $pat (
3495						qr/Registry Expiry Date:\s*(\S+)/i,
3496						qr/Expir(?:y\|ation)(?: Date)?:\s*(\S+)/i,
3497						qr/paid-till:\s*(\S+)/i,
3498						) {
3499	57	100	100		110	if (!$info{expires} && $domain_whois =~ $pat) {
3500	13				24	($info{expires} = $1) =~ s/[TZ].*//;
3501						}
3502						}
3503
3504						# Flag recently-registered domains (< RECENT_REG_DAYS old)
3505	19	100			35	if ($info{registered}) {
3506	15				34	my $epoch = $self->_parse_date_to_epoch($info{registered});
3507	15	100	67		300	$info{recently_registered} = 1
3508						if $epoch && (time() - $epoch) < $RECENT_REG_DAYS * $SECS_PER_DAY;
3509						}
3510						}
3511
3512						# Store in per-message and cross-message caches
3513	382				339	$self->{_domain_info}{$domain} = \%info;
3514	382	50			298	$_cache->set("dom:$domain", \%info) if $_cache;
3515
3516	382				303	return \%info;
3517						}
3518
3519						# -----------------------------------------------------------------------
3520						# Private: DNS helpers
3521						# -----------------------------------------------------------------------
3522
3523						# _resolve_host( $host ) -> ip_string \| undef
3524						#
3525						# Purpose:
3526						# Resolve a hostname to an IPv4 (or IPv6) address. Uses Net::DNS for
3527						# both A and AAAA queries when available; falls back to inet_aton for
3528						# pure IPv4 resolution.
3529						#
3530						# Entry criteria:
3531						# $host -- hostname string or already-numeric IP.
3532						#
3533						# Exit status:
3534						# Returns the first resolved IP string, or undef on failure.
3535						#
3536						# Notes:
3537						# When the input is already a dotted-quad IPv4 it is returned immediately.
3538						# AAAA records are tried if the A query fails and Net::DNS is available.
3539
3540						sub _resolve_host {
3541	13			13	212	my ($self, $host) = @_;
3542	13	100			23	return $host if $host =~ /^\d{1,3}(?:\.\d{1,3}){3}$/;
3543
3544						# Check the CHI cache before hitting DNS
3545	12	50			16	if ($_cache) {
3546	0				0	my $cached_ip = $_cache->get("resolve:$host");
3547	0	0			0	return $cached_ip if defined $cached_ip;
3548						}
3549
3550	12				6	my $ip;
3551
3552	12	50			18	if ($HAS_NET_DNS) {
3553						my $res = Net::DNS::Resolver->new(
3554						tcp_timeout => $self->{timeout},
3555						udp_timeout => $self->{timeout},
3556	0				0	);
3557
3558						# Try A record first, then AAAA for IPv6
3559	0				0	for my $type (qw(A AAAA)) {
3560	0				0	my $query = $res->search($host, $type);
3561	0	0			0	if ($query) {
3562	0				0	for my $rr ($query->answer) {
3563	0	0 0			0	if ($rr->type eq 'A') {
3564	0				0	$ip = $rr->address;
3565	0				0	last;
3566						} elsif ($rr->type eq 'AAAA') {
3567	0				0	$ip = $rr->address;
3568	0				0	last;
3569						}
3570						}
3571						}
3572	0	0			0	last if defined $ip;
3573						}
3574						} else {
3575						# Fallback: gethostbyname (IPv4 only)
3576	12 12				9 213098	my $packed = eval { inet_aton($host) };
3577	12	50			52	$ip = $packed ? inet_ntoa($packed) : undef;
3578						}
3579
3580						# Cache the result (including undef as '' to avoid repeated failed lookups)
3581	12	50			19	if ($_cache) {
3582	0		0		0	$_cache->set("resolve:$host", $ip // '');
3583						}
3584
3585	12				29	return $ip;
3586						}
3587
3588						# _reverse_dns( $ip ) -> hostname \| undef
3589						#
3590						# Purpose:
3591						# Perform a PTR (reverse DNS) lookup for an IP address. Supports both
3592						# IPv4 and IPv6 via Net::DNS when available; falls back to gethostbyaddr.
3593						#
3594						# Entry criteria:
3595						# $ip -- a defined IPv4 or IPv6 address string.
3596						#
3597						# Exit status:
3598						# Returns the PTR hostname string, or undef if no record exists.
3599
3600						sub _reverse_dns {
3601	6			6	7	my ($self, $ip) = @_;
3602	6	50			35	return undef unless $ip;
3603
3604	6	50			13	if ($HAS_NET_DNS) {
3605	0				0	my $res = Net::DNS::Resolver->new(tcp_timeout => $self->{timeout});
3606	0				0	my $query = $res->search($ip, 'PTR');
3607	0	0			0	if ($query) {
3608	0				0	for my $rr ($query->answer) {
3609	0	0			0	return $rr->ptrdname if $rr->type eq 'PTR';
3610						}
3611						}
3612	0				0	return undef;
3613						}
3614
3615						# Fallback for IPv4 only
3616	6				176465	return scalar gethostbyaddr(inet_aton($ip), AF_INET);
3617						}
3618
3619						# -----------------------------------------------------------------------
3620						# Private: WHOIS / RDAP
3621						# -----------------------------------------------------------------------
3622
3623						# _whois_ip( $ip ) -> hashref
3624						#
3625						# Purpose:
3626						# Enrich an IP address with organisation name, abuse contact, and country
3627						# code. Tries RDAP first (if LWP is available), then falls back to raw
3628						# WHOIS via IANA referral. Results are cached in CHI if available.
3629						#
3630						# Entry criteria:
3631						# $ip -- a defined IPv4 or IPv6 address string.
3632						#
3633						# Exit status:
3634						# Returns { org, abuse, country } hashref; keys absent when unknown.
3635
3636						sub _whois_ip {
3637	9			9	1150	my ($self, $ip) = @_;
3638
3639						# Check CHI cache before going to the network
3640	9	50			20	if ($_cache) {
3641	0				0	my $cached = $_cache->get("whois_ip:$ip");
3642	0	0			0	return $cached if $cached;
3643						}
3644
3645	9	50			22	my $result = $HAS_LWP ? $self->_rdap_lookup($ip) : {};
3646
3647						# Fall back to raw WHOIS if RDAP returned no organisation
3648	9	50			33	unless ($result->{org}) {
3649	9				24	my $raw = $self->_raw_whois($ip, 'whois.iana.org');
3650	9	100			61	if ($raw) {
3651	8				61	my ($ref) = $raw =~ /whois:\s*([\w.-]+)/i;
3652	8	100			27	my $detail = $ref ? $self->_raw_whois($ip, $ref) : $raw;
3653	8	50			74	$result = $self->_parse_whois_text($detail) if $detail;
3654						}
3655						}
3656
3657						# Cache the enrichment result
3658	9	50	33		24	$_cache->set("whois_ip:$ip", $result) if $_cache && $result;
3659
3660	9				17	return $result;
3661						}
3662
3663						# _domain_whois( $domain ) -> raw_whois_string \| undef
3664						#
3665						# Purpose:
3666						# Perform a two-step WHOIS lookup for a domain: first ask IANA for the
3667						# TLD's authoritative WHOIS server, then query that server.
3668						#
3669						# Entry criteria:
3670						# $domain -- a lower-cased domain name string.
3671						#
3672						# Exit status:
3673						# Returns the raw WHOIS response string, or undef on failure.
3674
3675						sub _domain_whois {
3676	16			16	590	my ($self, $domain) = @_;
3677	16		100		30	my $iana = $self->_raw_whois($domain, 'whois.iana.org') // return undef;
3678	15				122	my ($server) = $iana =~ /whois:\s*([\w.-]+)/i;
3679	15	100			40	return undef unless $server;
3680	1				2	return $self->_raw_whois($domain, $server);
3681						}
3682
3683						# _parse_domain_whois_abuse( $domain ) -> hashref
3684						#
3685						# Purpose:
3686						# Lightweight domain WHOIS lookup to extract only registrar name and
3687						# abuse contact. Used as a fallback in _extract_and_resolve_urls() when
3688						# a URL host cannot be resolved to an IP.
3689						#
3690						# Entry criteria:
3691						# $domain -- a registrable domain name string.
3692						#
3693						# Exit status:
3694						# Returns { org, abuse } hashref; empty hashref on failure.
3695
3696						sub _parse_domain_whois_abuse {
3697	56			56	56	my ($self, $domain) = @_;
3698	56		100		66	my $raw = $self->_domain_whois($domain) // return {};
3699	7				17	my %info;
3700	7	50			13	if ($raw =~ /Registrar:\s*(.+)/i) {
3701	7				13	($info{org} = $1) =~ s/\s+$//;
3702						}
3703						# Try multiple field name patterns for the abuse email
3704	7				10	for my $pat (
3705						qr/Registrar Abuse Contact Email:\s*(\S+\@\S+)/i,
3706						qr/Abuse Contact Email:\s*(\S+\@\S+)/i,
3707						qr/abuse-contact:\s*(\S+\@\S+)/i,
3708						) {
3709	21	100	67		37	if (!$info{abuse} && $raw =~ $pat) {
3710	7				9	($info{abuse} = $1) =~ s/\s+$//;
3711						}
3712						}
3713	7				8	return \%info;
3714						}
3715
3716						# _rdap_lookup( $ip ) -> hashref
3717						#
3718						# Purpose:
3719						# Query the ARIN RDAP API for IP block ownership information. RDAP is
3720						# preferred over raw WHOIS because it returns structured JSON.
3721						#
3722						# Entry criteria:
3723						# $ip -- a defined IPv4 or IPv6 address string.
3724						# LWP::UserAgent must be installed.
3725						#
3726						# Exit status:
3727						# Returns { org, abuse, country } hashref; empty hashref on failure.
3728
3729						sub _rdap_lookup {
3730	0			0	0	my ($self, $ip) = @_;
3731	0	0			0	return {} unless $HAS_LWP;
3732
3733	0				0	my $ua = $self->{ua};
3734	0	0			0	if(!defined($ua)) {
3735						$ua = LWP::UserAgent->new(
3736						timeout => $self->{timeout},
3737	0				0	agent => "Email-Abuse-Investigator/$VERSION",
3738						);
3739
3740	0	0			0	if($HAS_CONN_CACHE) {
3741	0				0	my $conn_cache = LWP::ConnCache->new();
3742	0				0	$conn_cache->total_capacity(10);
3743	0				0	$ua->conn_cache($conn_cache);
3744						}
3745
3746	0				0	$ua->env_proxy(1);
3747	0				0	$self->{ua} = $ua;
3748						}
3749
3750						# Use the ARIN RDAP endpoint; it covers the ARIN region and redirects
3751						# for RIPE/APNIC/LACNIC/AfriNIC allocations.
3752	0 0				0 0	my $res = eval { $ua->get("https://rdap.arin.net/registry/ip/$ip") };
3753	0	0	0		0	return {} unless $res && $res->is_success();
3754
3755	0				0	my $j = $res->decoded_content();
3756	0				0	my %info;
3757
3758						# Extract organisation name from the JSON response
3759	0	0			0	$info{org} = $1 if $j =~ /"name"\s:\s"([^"]+)"/;
3760	0	0			0	$info{handle} = $1 if $j =~ /"handle"\s:\s"([^"]+)"/;
3761
3762						# Extract abuse email from the vcardArray contact block
3763	0	0 0			0	if ($j =~ /"abuse".?"email"\s:\s*"([^"]+)"/s) {
3764	0				0	$info{abuse} = $1;
3765						} elsif ($j =~ /"email"\s:\s"([^@"]+@[^"]+)"/) {
3766	0				0	$info{abuse} = $1;
3767						}
3768
3769						# Country code from the network's country field
3770	0	0			0	$info{country} = $1 if $j =~ /"country"\s:\s"([A-Z]{2})"/;
3771
3772	0				0	return \%info;
3773						}
3774
3775						# _raw_whois( $query, $server ) -> string \| undef
3776						#
3777						# Purpose:
3778						# Open a TCP connection to a WHOIS server on port 43, send the query,
3779						# and return the full response as a string. Uses IO::Select for read
3780						# timeouts so that alarm() is never needed (alarm() is unreliable on
3781						# Windows and in threaded Perl). Supports IPv6 WHOIS servers via
3782						# IO::Socket::IP when that module is available.
3783						#
3784						# Entry criteria:
3785						# $query -- the domain name or IP to query (defined, non-empty).
3786						# $server -- the WHOIS server hostname (default: 'whois.iana.org').
3787						# $self->{timeout} -- seconds used for connect and per-read waits.
3788						#
3789						# Exit status:
3790						# Returns the raw WHOIS response string, or undef on connection/write failure.
3791						#
3792						# Notes:
3793						# Uses IO::Socket::IP (dual-stack) when available, falling back to
3794						# IO::Socket::INET (IPv4 only) otherwise. The IO::Select loop reads
3795						# until the server closes the connection or the per-read timeout expires.
3796
3797						sub _raw_whois {
3798	23			23	274	my ($self, $query, $server) = @_;
3799	23		50		37	$server //= 'whois.iana.org';
3800	23				71	$self->_debug("WHOIS $server -> $query");
3801
3802						# Choose the socket class based on what is installed.
3803						# IO::Socket::IP supports both IPv4 and IPv6 WHOIS servers.
3804	23	50			46	my $sock_class = $HAS_IO_SOCKET_IP ? 'IO::Socket::IP' : 'IO::Socket::INET';
3805
3806						# Attempt TCP connection to port 43 on the WHOIS server
3807	23				28	my $sock = eval {
3808						$sock_class->new(
3809						PeerAddr => $server,
3810						PeerPort => $WHOIS_PORT,
3811						Proto => 'tcp',
3812						Timeout => $self->{timeout},
3813	23				154	);
3814						};
3815	23	50			2419586	return undef unless $sock;
3816
3817						# Send the WHOIS query in wire format (CRLF-terminated per RFC 3912)
3818	23 0 0	50			123 0 0	$sock->print("$query\r\n") or do { $sock->close(); return undef };
3819
3820						# Use IO::Select to implement per-read timeouts without alarm()
3821	23				1138	my $sel = IO::Select->new($sock);
3822	23				1095	my $response = '';
3823	23				95	my $buf = '';
3824
3825						# Read until EOF (server closes) or timeout
3826	23				87	while ($sel->can_read($self->{timeout})) {
3827						# Wrap in eval to catch 'Connection reset by peer' thrown by Fatal/autodie
3828	75 75				1289566 180	my $n = eval { sysread($sock, $buf, $WHOIS_READ_CHUNK) };
3829
3830	75	100	50		8443	if ($@ \|\| !defined $n \|\| $n <= 0) {
3831	23	50			37	$self->_debug("WHOIS read failed: $@") if $@;
3832	23				28	last;
3833						}
3834	52	50	33		132	last unless defined $n && $n > 0;
3835	52				243	$response .= $buf;
3836						}
3837
3838	23				111	$sock->close();
3839	23		50		1252	return $response \|\| undef;
3840						}
3841
3842						# _parse_whois_text( $text ) -> hashref
3843						#
3844						# Purpose:
3845						# Parse a raw WHOIS IP block response to extract organisation name,
3846						# abuse contact email, and country code.
3847						#
3848						# Entry criteria:
3849						# $text -- a defined WHOIS response string.
3850						#
3851						# Exit status:
3852						# Returns { org, abuse, country } hashref; keys absent when not found.
3853
3854						sub _parse_whois_text {
3855	37			37	5885	my ($self, $text) = @_;
3856	37	100			47	return {} unless $text;
3857	35				31	my %info;
3858
3859						# Try multiple field names for the organisation name
3860	35				110	for my $pat (
3861						qr/^OrgName:\s(.+)/mi, qr/^org-name:\s(.+)/mi,
3862						qr/^owner:\s(.+)/mi, qr/^descr:\s(.+)/mi,
3863						) {
3864	140	100	100		417	if (!$info{org} && $text =~ $pat) {
3865	20				69	($info{org} = $1) =~ s/\s+$//;
3866						}
3867						}
3868
3869						# Try multiple field names for the abuse email
3870	35				75	for my $pat (
3871						qr/OrgAbuseEmail:\s*(\S+@\S+)/mi,
3872						qr/abuse-mailbox:\s*(\S+@\S+)/mi,
3873						) {
3874	70	100	100		178	if (!$info{abuse} && $text =~ $pat) {
3875	6				14	($info{abuse} = $1) =~ s/\s+$//;
3876						}
3877						}
3878
3879						# Last-resort: any abuse@ address in the response
3880	35	100	67		108	$info{abuse} //= $1 if $text =~ /(abuse\@[\w.-]+)/i;
3881
3882						# Country code (case-insensitive match, normalised to uppercase)
3883	35	100			66	if ($text =~ /^country:\s([A-Za-z]{2})\s$/m) {
3884	12				35	$info{country} = uc $1;
3885						}
3886	35				63	return \%info;
3887						}
3888
3889						# -----------------------------------------------------------------------
3890						# Private: authentication results parsing
3891						# -----------------------------------------------------------------------
3892
3893						# _parse_auth_results_cached() -> hashref
3894						#
3895						# Purpose:
3896						# Parse the Authentication-Results: header(s) from the message once,
3897						# cache the result, and return it. Extracts SPF, DKIM, DMARC, ARC
3898						# results and the DKIM signing domain(s).
3899						#
3900						# Entry criteria:
3901						# $self->{_headers} populated by _split_message().
3902						#
3903						# Exit status:
3904						# Returns { spf, dkim, dmarc, arc, dkim_domain, dkim_domains } hashref.
3905						# Keys absent when the corresponding header or field is not present.
3906
3907						sub _parse_auth_results_cached {
3908	609			609	1828	my $self = $_[0];
3909	609	100			686	return $self->{_auth_results} if $self->{_auth_results};
3910
3911	339				231	my %auth;
3912
3913						# Concatenate all Authentication-Results: header values
3914						my $raw = join('; ',
3915	37				48	map { $_->{value} }
3916	2571				2064	grep { $_->{name} eq 'authentication-results' }
3917	339 339				301 341	@{ $self->{_headers} }
3918						);
3919
3920						# Extract individual authentication mechanism results
3921	339	100			401	$auth{spf} = $1 if $raw =~ /\bspf=(\S+)/i;
3922	339	100			329	$auth{dkim} = $1 if $raw =~ /\bdkim=(\S+)/i;
3923	339	100			328	$auth{dmarc} = $1 if $raw =~ /\bdmarc=(\S+)/i;
3924	339	100			782	$auth{arc} = $1 if $raw =~ /\barc=(\S+)/i;
3925
3926						# Strip trailing punctuation captured by the greedy \S+
3927	339				295	for my $k (qw(spf dkim dmarc arc)) {
3928	1356	100			1115	$auth{$k} =~ s/[;,\s]+$// if defined $auth{$k};
3929						}
3930
3931						# Extract DKIM signing domains from all DKIM-Signature: d= tags.
3932						# Prefer the first domain that matches the provider table (identifies ESP).
3933	339				226	my @dkim_domains;
3934	339 2571 339				243 1846 287	for my $h (grep { $_->{name} eq 'dkim-signature' } @{ $self->{_headers} }) {
3935	14	50			25	if ($h->{value} =~ /\bd=([^;,\s]+)/) {
3936	14				17	push @dkim_domains, lc $1;
3937						}
3938						}
3939
3940	339	100			298	if (@dkim_domains) {
3941						# Check if any signing domain matches a known provider
3942	9				6	my $preferred;
3943	9				8	for my $d (@dkim_domains) {
3944	14	100			12	if ($self->_provider_abuse_for_host($d)) {
3945	4				2	$preferred = $d;
3946	4				5	last;
3947						}
3948						}
3949	9		67		17	$auth{dkim_domain} = $preferred // $dkim_domains[0];
3950	9				11	$auth{dkim_domains} = \@dkim_domains;
3951						}
3952
3953	339				295	$self->{_auth_results} = \%auth;
3954	339				355	return \%auth;
3955						}
3956
3957						# -----------------------------------------------------------------------
3958						# Private: provider-table lookups
3959						# -----------------------------------------------------------------------
3960
3961						# _provider_abuse_for_host( $host ) -> hashref \| undef
3962						#
3963						# Purpose:
3964						# Look up a hostname (and each of its parent domains, stripping one label
3965						# at a time from the left) in the %PROVIDER_ABUSE table.
3966						#
3967						# Entry criteria:
3968						# $host -- a defined hostname or domain string.
3969						#
3970						# Exit status:
3971						# Returns the %PROVIDER_ABUSE entry hashref on match, undef otherwise.
3972
3973						sub _provider_abuse_for_host {
3974	1131			1131	6535	my ($self, $host) = @_;
3975	1131				845	$host = lc $host;
3976						# Strip successive subdomains until we find a match or exhaust labels
3977	1131				1140	while ($host =~ /\./) {
3978	1253	50			1055	return $self->{provider_abuse}->{$host} if $self->{provider_abuse}->{$host};
3979	1253	100			1164	return $PROVIDER_ABUSE{$host} if $PROVIDER_ABUSE{$host};
3980	1003				1513	$host =~ s/^[^.]+\.//;
3981						}
3982	881				689	return undef;
3983						}
3984
3985						# _provider_abuse_for_ip( $ip, $rdns ) -> hashref \| undef
3986						#
3987						# Purpose:
3988						# Look up an IP's reverse-DNS hostname in the %PROVIDER_ABUSE table to
3989						# identify well-known provider networks by rDNS pattern.
3990						#
3991						# Entry criteria:
3992						# $ip -- IPv4 or IPv6 address string (used as fallback if $rdns absent).
3993						# $rdns -- optional rDNS hostname string.
3994						#
3995						# Exit status:
3996						# Returns the %PROVIDER_ABUSE entry on match, undef otherwise.
3997
3998						sub _provider_abuse_for_ip {
3999	125			125	870	my ($self, $ip, $rdns) = @_;
4000	125	100			181	return $self->_provider_abuse_for_host($rdns) if $rdns;
4001	2				5	return undef;
4002						}
4003
4004						# -----------------------------------------------------------------------
4005						# Private: eTLD+1 normalisation
4006						# -----------------------------------------------------------------------
4007
4008						# _registrable( $host ) -> string \| undef
4009						#
4010						# Purpose:
4011						# Return the registrable eTLD+1 form of a hostname. Uses
4012						# Domain::PublicSuffix when installed for accurate results; falls back
4013						# to a built-in heuristic for the common two-letter ccTLD+2 pattern.
4014						#
4015						# Entry criteria:
4016						# $host -- a hostname string (may include subdomains).
4017						#
4018						# Exit status:
4019						# Returns the registrable domain string, or undef for single-label
4020						# hostnames (e.g. 'localhost').
4021						#
4022						# Notes:
4023						# The heuristic handles co.uk, com.au, net.jp, org.nz etc. but not
4024						# uncommon second-level delegations like ltd.uk or plc.uk.
4025
4026						sub _registrable {
4027	976			976	12405	my ($host) = @_;
4028	976	100	100		1481	return undef unless $host && $host =~ /\./;
4029
4030						# Use Domain::PublicSuffix for accurate PSL-based normalisation
4031	839	50			644	if ($HAS_PUBLIC_SUFFIX) {
4032	0				0	my $psl = Domain::PublicSuffix->new();
4033	0				0	my $root = $psl->get_root_domain(lc $host);
4034	0	0			0	return $root if $root;
4035						}
4036
4037						# Built-in heuristic fallback
4038	839				811	my @labels = split /\./, lc $host;
4039	839	100			1172	return $host if @labels <= 2;
4040
4041						# Detect common ccTLD second-level patterns (e.g. co.uk, com.au)
4042	95	100	100		231	if ($labels[-1] =~ /^[a-z]{2}$/ &&
4043						$labels[-2] =~ /^(?:co\|com\|net\|org\|gov\|edu\|ac\|me)$/) {
4044	47				106	return join('.', @labels[-3..-1]);
4045						}
4046	48				151	return join('.', @labels[-2..-1]);
4047						}
4048
4049						# -----------------------------------------------------------------------
4050						# Private: utilities
4051						# -----------------------------------------------------------------------
4052
4053						# _enrich_ip( $ip, $confidence, $note ) -> origin hashref
4054						#
4055						# Purpose:
4056						# Perform rDNS and WHOIS/RDAP for a single IP and package the results
4057						# into the standard origin hashref returned by originating_ip().
4058						#
4059						# Entry criteria:
4060						# $ip -- a defined, non-private IPv4 or IPv6 address string.
4061						# $confidence -- 'high', 'medium', or 'low'.
4062						# $note -- human-readable explanation of why this IP was chosen.
4063						#
4064						# Exit status:
4065						# Returns { ip, rdns, org, abuse, country, confidence, note } hashref.
4066
4067						sub _enrich_ip {
4068	89			89	2371	my ($self, $ip, $confidence, $note) = @_;
4069	89				129	my $rdns = $self->_reverse_dns($ip);
4070	89				223	my $whois = $self->_whois_ip($ip);
4071						return {
4072						ip => $ip,
4073						rdns => $rdns // '(no reverse DNS)',
4074						org => $whois->{org} // '(unknown)',
4075						abuse => $whois->{abuse} // '(unknown)',
4076						country => $whois->{country} // undef,
4077	89		100 100 100 100		714	confidence => $confidence,
4078						note => $note,
4079						};
4080						}
4081
4082						# _header_value( $name ) -> value_string \| undef
4083						#
4084						# Purpose:
4085						# Return the value of the first header matching the given lower-cased
4086						# header name.
4087						#
4088						# Entry criteria:
4089						# $name -- a lower-cased header name string.
4090						# $self->{_headers} populated by _split_message().
4091						#
4092						# Exit status:
4093						# Returns the value string, or undef if the header is not present.
4094
4095						sub _header_value {
4096	3892			3892	3997	my ($self, $name) = @_;
4097	3892 3892				2279 2969	for my $h (@{ $self->{_headers} }) {
4098	22126	100			18296	return $h->{value} if $h->{name} eq lc($name);
4099						}
4100	1883				1755	return undef;
4101						}
4102
4103						# _ip_in_cidr( $ip, $cidr ) -> bool
4104						#
4105						# Purpose:
4106						# Test whether an IPv4 address falls within a CIDR block or is an exact
4107						# match (when $cidr contains no '/' separator).
4108						#
4109						# Entry criteria:
4110						# $ip -- a defined dotted-quad IPv4 address string.
4111						# $cidr -- a CIDR string like '10.0.0.0/8' or an exact IP.
4112						#
4113						# Exit status:
4114						# Returns 1 (true) if the IP is within the CIDR block, 0 otherwise.
4115
4116						sub _ip_in_cidr {
4117	51			51	838	my ($self, $ip, $cidr) = @_;
4118	51	100			105	return $ip eq $cidr unless $cidr =~ m{/};
4119	45				61	my ($net_addr, $prefix) = split m{/}, $cidr;
4120	45	100	50		186	return 0 unless defined $prefix && $prefix =~ /^\d+$/ && $prefix <= 32;
4121
4122						# Compute the network mask and compare masked network addresses
4123	44				43	my $mask = ~0 << (32 - $prefix);
4124	44		50		157	my $net_n = unpack 'N', (inet_aton($net_addr) // return 0);
4125	44		50		79	my $ip_n = unpack 'N', (inet_aton($ip) // return 0);
4126	44				97	return ($ip_n & $mask) == ($net_n & $mask);
4127						}
4128
4129						# _decode_mime_words( $str ) -> decoded_string
4130						#
4131						# Purpose:
4132						# Decode MIME encoded-words (=?charset?B/Q?...?=) in a header value
4133						# string for human-readable display in reports.
4134						#
4135						# Entry criteria:
4136						# $str -- a defined header value string; may be undef.
4137						#
4138						# Exit status:
4139						# Returns the decoded string, or '' if $str is undef.
4140
4141						sub _decode_mime_words {
4142	473			473	2628	my ($self, $str) = @_;
4143	473	100			413	return '' unless defined $str;
4144						# Replace each encoded-word with its decoded equivalent
4145	471 33				428 37	$str =~ s/=\?([^?]+)\?([BbQq])\?([^?]*)\?=/_decode_ew($1,$2,$3)/ge;
4146	471				385	return $str;
4147						}
4148
4149						# _decode_ew( $charset, $enc, $text ) -> decoded_bytes
4150						#
4151						# Purpose:
4152						# Decode a single MIME encoded-word component (base64 or quoted-printable).
4153						#
4154						# Notes:
4155						# Non-UTF-8 charsets return raw bytes; good enough for display-name spoof
4156						# detection which only needs ASCII matching.
4157
4158						sub _decode_ew {
4159	33			33	53	my ($charset, $enc, $text) = @_;
4160	33				25	my $raw;
4161	33	100			42	if (uc($enc) eq 'B') {
4162	31				55	$raw = decode_base64($text);
4163						} else {
4164						# Quoted-printable encoded-word uses underscore for space
4165	2				3	$text =~ s/_/ /g;
4166	2				4	$raw = decode_qp($text);
4167						}
4168	33				66	return $raw;
4169						}
4170
4171						# _parse_date_to_epoch( $str ) -> epoch_int \| undef
4172						#
4173						# Purpose:
4174						# Parse common WHOIS date strings to a Unix epoch integer.
4175						# Handles YYYY-MM-DD, YYYY-MM-DDThh:mm:ssZ, and DD-Mon-YYYY formats.
4176						#
4177						# Entry criteria:
4178						# $str -- a defined date string; may be undef.
4179						#
4180						# Exit status:
4181						# Returns epoch integer on success, undef if the string cannot be parsed.
4182
4183						sub _parse_date_to_epoch {
4184	61			61	5112	my ($self, $str) = @_;
4185	61	100			86	return undef unless $str;
4186
4187						# Clean the string of trailing whitespace/newlines
4188	60				132	$str =~ s/^\s+\|\s+$//g;
4189
4190						# Guard Regex: Validates the strict YYYY-MM-DDThh:mm:ssZ format
4191	60	100			67	if ($str =~ /^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(?:\.\d+)?Z$/) {
4192						# Parse the string
4193						# We use 'strptime' to create a Time::Piece object.
4194						# The 'Z' indicates UTC (Zulu time).
4195	2				2	my $epoch = eval {
4196	2				11	my $t = Time::Piece->strptime($1, '%Y-%m-%dT%H:%M:%S');
4197
4198						# Return seconds since the epoch
4199						# Time::Piece handles the timezone offset internally when calling ->epoch
4200
4201						# strptime returns a local time object.
4202						# We must subtract the local timezone offset to get the true UTC epoch.
4203	2				217	return $t->epoch - $t->tzoffset->seconds;
4204						};
4205	2	50			99	return $epoch if defined $epoch;
4206						}
4207	58				42	my ($y, $m, $d);
4208
4209	58 39	100 100 100			124 71	if ($str =~ /^(\d{4})-(\d{2})-(\d{2})/) { ($y,$m,$d)=($1,$2,$3) }
4210	17		100		42	elsif ($str =~ /^(\d{2})-([A-Za-z]{3})-(\d{4})/) { ($d,$m,$y)=($1,$Readonly::Values::Months::months{lc$2}//0,$3) }
4211	1				2	elsif ($str =~ /^(\d{2})\/(\d{2})\/(\d{4})/) { ($m,$d,$y)=($1,$2,$3) }
4212
4213	58	100	75		185	return undef unless $y && $m && $d;
4214
4215	56 56 56	50			42 103 61	if (eval { require Time::Local; 1 }) {
4216	56 56				32 105	return eval { Time::Local::timegm(0,0,0,$d,$m-1,$y-1900) };
4217						}
4218						# Approximate fallback without Time::Local
4219	0				0	return ($y-1970)365.25$SECS_PER_DAY + ($m-1)30.5$SECS_PER_DAY + ($d-1)*$SECS_PER_DAY;
4220						}
4221
4222						# _parse_rfc2822_date( $str ) -> epoch_int \| undef
4223						#
4224						# Purpose:
4225						# Parse an RFC 2822 Date: header value to a Unix epoch integer.
4226						# Timezone offsets are intentionally ignored; the function returns a
4227						# UTC-equivalent value. For the 7-day suspicious_date window the
4228						# maximum error is ~14 hours, well within the tolerance.
4229						#
4230						# Entry criteria:
4231						# $str -- a defined Date: header value string.
4232						#
4233						# Exit status:
4234						# Returns epoch integer on success, undef if the string cannot be parsed.
4235
4236						sub _parse_rfc2822_date {
4237	206			206	172	my ($str) = @_;
4238	206	50			168	return undef unless $str;
4239
4240						# Match: DD Mon YYYY HH:MM:SS (timezone offset ignored)
4241	206	50			475	if ($str =~ /(\d{1,2})\s+([A-Za-z]{3})\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})/) {
4242						my ($d, $m, $y, $H, $M, $S) =
4243	206		50		555	($1, $Readonly::Values::Months::months{ lc $2 } // 0, $3, $4, $5, $6);
4244	206	50			1038	return undef unless $m;
4245	206 206 206	50			184 485 216	if (eval { require Time::Local; 1 }) {
4246	206 206				138 372	return eval { Time::Local::timegm($S, $M, $H, $d, $m - 1, $y - 1900) };
4247						}
4248						}
4249	0				0	return undef;
4250						}
4251
4252						# _country_name( $cc ) -> country_name_string
4253						#
4254						# Purpose:
4255						# Return a human-readable country name for a two-letter ISO 3166-1
4256						# alpha-2 country code. Only the small set of statistically high-volume
4257						# spam-originating countries is covered; other codes are returned as-is.
4258						#
4259						# Entry criteria:
4260						# $cc -- a two-letter uppercase country code string.
4261						#
4262						# Exit status:
4263						# Returns the country name string, or the code itself if not in the table.
4264
4265						sub _country_name {
4266	35			35	4711	my ($cc) = @_;
4267	35				74	my %names = (
4268						CN => 'China', RU => 'Russia', NG => 'Nigeria',
4269						VN => 'Vietnam', IN => 'India', PK => 'Pakistan',
4270						BD => 'Bangladesh',
4271						);
4272	35		67		95	return $names{$cc} // $cc;
4273						}
4274
4275						# _debug( $msg )
4276						#
4277						# Purpose:
4278						# Write a diagnostic message to STDERR when verbose mode is enabled.
4279						#
4280						# Entry criteria:
4281						# $msg -- a defined message string.
4282						#
4283						# Notes:
4284						# Messages are prefixed with the class name for easy grepping.
4285
4286						sub _debug {
4287	888			888	709	my ($self, $msg) = @_;
4288
4289	888	100			1007	if($self->{verbose}) {
4290	2	50			4	if(my $logger = $self->{logger}) { # May have been set in Object::Configure
4291	0				0	$logger->debug("[Email::Abuse::Investigator] $msg");
4292						} else {
4293	2				4	print STDERR "[Email::Abuse::Investigator] $msg\n";
4294						}
4295						}
4296						}
4297
4298						1;
4299