| File: | blib/lib/Email/Abuse/Investigator.pm |
| Coverage: | 78.3% |
| line | stmt | bran | cond | sub | time | code |
|---|---|---|---|---|---|---|
| 1 | package Email::Abuse::Investigator; | |||||
| 2 | ||||||
| 3 | 15 15 15 | 750166 14 204 | use strict; | |||
| 4 | 15 15 15 | 28 7 303 | use warnings; | |||
| 5 | 15 15 15 | 2697 87209 27 | use autodie qw(:all); | |||
| 6 | 15 15 15 | 104579 56364 33 | use Time::Piece; | |||
| 7 | ||||||
| 8 - 17 | =head1 NAME Email::Abuse::Investigator - Analyse spam email to identify originating hosts, hosted URLs, and suspicious domains =head1 VERSION Version 0.08 =cut | |||||
| 18 | ||||||
| 19 | our $VERSION = '0.08'; | |||||
| 20 | ||||||
| 21 - 102 | =head1 SYNOPSIS use Email::Abuse::Investigator; my $analyser = Email::Abuse::Investigator->new( verbose => 1 ); $analyser->parse_email($raw_email_text); # Originating IP and its network owner my $origin = $analyser->originating_ip(); # All HTTP/HTTPS URLs found in the body my @urls = $analyser->embedded_urls(); # All domains extracted from mailto: links and bare addresses in the body my @mdoms = $analyser->mailto_domains(); # All domains mentioned anywhere (union of the above) my @adoms = $analyser->all_domains(); # Full printable report print $analyser->report(); =head1 DESCRIPTION C<Email::Abuse::Investigator> examines the raw source of a spam/phishing e-mail and answers the questions manual abuse investigators ask: =over 4 =item 1. Where did the message really come from? Walks the C<Received:> chain, skips private/trusted IPs, and identifies the first external hop. Enriches with rDNS, WHOIS/RDAP org name and abuse contact. Both IPv4 and IPv6 addresses are supported. =item 2. Who hosts the advertised web sites? Extracts every C<http://> and C<https://> URL from both plain-text and HTML parts, resolves each hostname to an IP, and looks up the network owner. =item 3. Who owns the reply-to / contact domains? Extracts domains from C<mailto:> links, bare e-mail addresses in the body, the C<From:>/C<Reply-To:>/C<Sender:>/C<Return-Path:> headers, C<DKIM-Signature: d=> (the signing domain), C<List-Unsubscribe:> (the ESP or bulk-sender domain), and the C<Message-ID:> domain. For each unique domain it gathers: =over 8 =item * Domain registrar and registrant (WHOIS) =item * Web-hosting IP and network owner (A record -> RDAP) =item * Mail-hosting IP and network owner (MX record -> RDAP) =item * DNS nameserver operator (NS record -> RDAP) =item * Whether the domain was recently registered (potential flag) =back =back =head1 REQUIRED MODULES The following modules are mandatory: Readonly::Values::Months Socket (core since Perl 5) IO::Socket::INET (core since Perl 5) MIME::QuotedPrint (core since Perl 5.8) MIME::Base64 (core since Perl 5.8) The following are optional but strongly recommended: Net::DNS -- enables MX, NS, AAAA record lookups LWP::UserAgent -- enables RDAP (faster and richer than raw WHOIS) HTML::LinkExtor -- enables structural HTML link extraction CHI -- enables cross-message IP/domain result caching IO::Socket::IP -- enables IPv6 WHOIS connections =cut | |||||
| 103 | ||||||
| 104 | 15 15 15 | 769 16 318 | use Carp qw(croak carp); | |||
| 105 | 15 15 15 | 2894 9702 331 | use IO::Select; | |||
| 106 | 15 15 15 | 1674 71289 61 | use IO::Socket::INET; | |||
| 107 | 15 15 15 | 5274 2565 382 | use MIME::QuotedPrint qw( decode_qp ); | |||
| 108 | 15 15 15 | 34 11 215 | use MIME::Base64 qw( decode_base64 ); | |||
| 109 | 15 15 15 | 3101 637275 240 | use Object::Configure; | |||
| 110 | 15 15 15 | 48 30 223 | use Params::Get; | |||
| 111 | 15 15 15 | 27 18 167 | use Params::Validate::Strict; | |||
| 112 | 15 15 15 | 25 12 225 | use Readonly; | |||
| 113 | 15 15 15 | 2646 8585 1072 | use Readonly::Values::Months; | |||
| 114 | 15 15 15 | 48 15 1519 | use Socket qw( inet_aton inet_ntoa AF_INET ); | |||
| 115 | ||||||
| 116 | # ----------------------------------------------------------------------- | |||||
| 117 | # Optional modules -- gracefully degraded when absent | |||||
| 118 | # ----------------------------------------------------------------------- | |||||
| 119 | ||||||
| 120 | # Net::DNS enables MX, NS, AAAA lookups; falls back to gethostbyname | |||||
| 121 | my $HAS_NET_DNS; | |||||
| 122 | ||||||
| 123 | # LWP::UserAgent enables RDAP queries; falls back to raw WHOIS | |||||
| 124 | my $HAS_LWP; | |||||
| 125 | my $HAS_CONN_CACHE; | |||||
| 126 | ||||||
| 127 | # HTML::LinkExtor enables structural HTML link extraction | |||||
| 128 | my $HAS_HTML_LINKEXTOR; | |||||
| 129 | ||||||
| 130 | # CHI enables a persistent cross-message cache for IP/domain data | |||||
| 131 | my $HAS_CHI; | |||||
| 132 | ||||||
| 133 | # IO::Socket::IP provides dual-stack (IPv4+IPv6) socket support | |||||
| 134 | my $HAS_IO_SOCKET_IP; | |||||
| 135 | ||||||
| 136 | # Domain::PublicSuffix enables accurate eTLD+1 normalisation | |||||
| 137 | my $HAS_PUBLIC_SUFFIX; | |||||
| 138 | ||||||
| 139 | # AnyEvent::DNS enables parallel DNS queries | |||||
| 140 | my $HAS_ANYEVENT_DNS; | |||||
| 141 | ||||||
| 142 | BEGIN { | |||||
| 143 | 15 15 0 | 20 821 0 | $HAS_NET_DNS = eval { require Net::DNS; 1 }; | |||
| 144 | 15 15 0 | 3712 766 0 | $HAS_LWP = eval { require LWP::UserAgent; 1 }; | |||
| 145 | 15 15 0 | 2925 678 0 | $HAS_CONN_CACHE = eval { require LWP::ConnCache; 1 }; | |||
| 146 | 15 15 15 | 2760 2911 52565 | $HAS_HTML_LINKEXTOR= eval { require HTML::LinkExtor; 1 }; | |||
| 147 | 15 15 0 | 16 891 0 | $HAS_CHI = eval { require CHI; 1 }; | |||
| 148 | 15 15 15 | 3201 2218 25591 | $HAS_IO_SOCKET_IP = eval { require IO::Socket::IP; 1 }; | |||
| 149 | 15 15 0 | 15 786 0 | $HAS_PUBLIC_SUFFIX = eval { require Domain::PublicSuffix; 1 }; | |||
| 150 | 15 15 0 | 2971 20369 0 | $HAS_ANYEVENT_DNS = eval { require AnyEvent::DNS; 1 }; | |||
| 151 | } | |||||
| 152 | ||||||
| 153 | # ----------------------------------------------------------------------- | |||||
| 154 | # Constants -- all magic numbers and strings live here | |||||
| 155 | # ----------------------------------------------------------------------- | |||||
| 156 | ||||||
| 157 | # WHOIS protocol port (IANA-assigned) | |||||
| 158 | Readonly::Scalar my $WHOIS_PORT => 43; | |||||
| 159 | ||||||
| 160 | # Bytes to read per sysread() call from a WHOIS socket | |||||
| 161 | Readonly::Scalar my $WHOIS_READ_CHUNK => 4096; | |||||
| 162 | ||||||
| 163 | # Maximum WHOIS response bytes stored in whois_raw (keep reports compact) | |||||
| 164 | Readonly::Scalar my $WHOIS_RAW_MAX => 2048; | |||||
| 165 | ||||||
| 166 | # Maximum multipart nesting depth (recursion guard -- RFC 2046 has no limit | |||||
| 167 | # but we cap it to prevent stack exhaustion on crafted messages) | |||||
| 168 | Readonly::Scalar my $MAX_MULTIPART_DEPTH => 20; | |||||
| 169 | ||||||
| 170 | # Number of days before registration that triggers recently_registered flag | |||||
| 171 | Readonly::Scalar my $RECENT_REG_DAYS => 180; | |||||
| 172 | ||||||
| 173 | # Number of days ahead of expiry that triggers domain_expires_soon flag | |||||
| 174 | Readonly::Scalar my $EXPIRY_WARN_DAYS => 30; | |||||
| 175 | ||||||
| 176 | # Seconds in a day -- used in date arithmetic throughout | |||||
| 177 | Readonly::Scalar my $SECS_PER_DAY => 86400; | |||||
| 178 | ||||||
| 179 | # Suspicious date window: dates outside +/- 7 days raise a flag | |||||
| 180 | Readonly::Scalar my $DATE_SKEW_DAYS => 7; | |||||
| 181 | ||||||
| 182 | # Maximum positive timezone offset in minutes (+14:00 = Line Islands) | |||||
| 183 | Readonly::Scalar my $TZ_MAX_POS_MINS => 840; | |||||
| 184 | ||||||
| 185 | # Maximum negative timezone offset in minutes (-12:00 = Baker Island) | |||||
| 186 | Readonly::Scalar my $TZ_MAX_NEG_MINS => 720; | |||||
| 187 | ||||||
| 188 | # High-risk score threshold | |||||
| 189 | Readonly::Scalar my $SCORE_HIGH => 9; | |||||
| 190 | ||||||
| 191 | # Medium-risk score threshold | |||||
| 192 | Readonly::Scalar my $SCORE_MEDIUM => 5; | |||||
| 193 | ||||||
| 194 | # Low-risk score threshold | |||||
| 195 | Readonly::Scalar my $SCORE_LOW => 2; | |||||
| 196 | ||||||
| 197 | # Flag severity weights (contribute to the numeric risk score) | |||||
| 198 | Readonly::Hash my %FLAG_WEIGHT => ( | |||||
| 199 | HIGH => 3, | |||||
| 200 | MEDIUM => 2, | |||||
| 201 | LOW => 1, | |||||
| 202 | INFO => 0, | |||||
| 203 | ); | |||||
| 204 | ||||||
| 205 | # Maximum merged-role display string length before summarisation kicks in | |||||
| 206 | Readonly::Scalar my $ROLE_MAX_LEN => 80; | |||||
| 207 | ||||||
| 208 | # CHI cache TTL in seconds (1 hour -- IP allocations change slowly) | |||||
| 209 | Readonly::Scalar my $CACHE_TTL_SECS => 3600; | |||||
| 210 | ||||||
| 211 | # Default constructor timeout for network operations (seconds) | |||||
| 212 | Readonly::Scalar my $DEFAULT_TIMEOUT => 10; | |||||
| 213 | ||||||
| 214 | # Maximum role string length before truncation | |||||
| 215 | Readonly::Scalar my $ROLE_WRAP_LEN => 66; | |||||
| 216 | ||||||
| 217 | # ----------------------------------------------------------------------- | |||||
| 218 | # Private ranges -- IPs that are never actionable abuse targets | |||||
| 219 | # ----------------------------------------------------------------------- | |||||
| 220 | ||||||
| 221 | # Both IPv4 and IPv6 private/reserved ranges. Each entry is a compiled | |||||
| 222 | # regex; _is_private() iterates over them and returns true on first match. | |||||
| 223 | my @PRIVATE_RANGES = ( | |||||
| 224 | # IPv4 ranges | |||||
| 225 | qr/^0\./, # 0.0.0.0/8 this-network (RFC 1122) | |||||
| 226 | qr/^127\./, # 127.0.0.0/8 loopback | |||||
| 227 | qr/^10\./, # 10.0.0.0/8 RFC 1918 | |||||
| 228 | qr/^192\.168\./, # 192.168.0.0/16 RFC 1918 | |||||
| 229 | qr/^172\.(?:1[6-9]|2\d|3[01])\./, # 172.16.0.0/12 RFC 1918 | |||||
| 230 | qr/^169\.254\./, # 169.254.0.0/16 link-local | |||||
| 231 | qr/^100\.(?:6[4-9]|[7-9]\d|1(?:[01]\d|2[0-7]))\./, # 100.64.0.0/10 CGN (RFC 6598) | |||||
| 232 | qr/^192\.0\.0\./, # 192.0.0.0/24 IETF protocol (RFC 6890) | |||||
| 233 | qr/^192\.0\.2\./, # 192.0.2.0/24 TEST-NET-1 (RFC 5737) | |||||
| 234 | qr/^198\.51\.100\./, # 198.51.100.0/24 TEST-NET-2 (RFC 5737) | |||||
| 235 | qr/^203\.0\.113\./, # 203.0.113.0/24 TEST-NET-3 (RFC 5737) | |||||
| 236 | qr/^255\./, # 255.0.0.0/8 broadcast | |||||
| 237 | # IPv6 ranges | |||||
| 238 | qr/^::1$/, # IPv6 loopback | |||||
| 239 | qr/^fe80:/i, # IPv6 link-local (fe80::/10) | |||||
| 240 | qr/^fc/i, # IPv6 ULA fc00::/7 | |||||
| 241 | qr/^fd/i, # IPv6 ULA fd00::/8 | |||||
| 242 | qr/^2001:db8:/i, # IPv6 documentation range (RFC 3849) | |||||
| 243 | qr/^64:ff9b:/i, # IPv6 NAT64 well-known prefix | |||||
| 244 | ); | |||||
| 245 | ||||||
| 246 | # Priority-ordered patterns for extracting IPs from Received: headers. | |||||
| 247 | # Covers bracketed IPv4, bracketed IPv6, parenthesised address, and bare dotted-quad. | |||||
| 248 | my @RECEIVED_IP_RE = ( | |||||
| 249 | qr/\[\s*([\d.]+)\s*\]/, # [1.2.3.4] | |||||
| 250 | qr/\[\s*([0-9a-fA-F:]+)\s*\]/, # [IPv6 address] | |||||
| 251 | qr/\(\s*[\w.-]*\s*\[?\s*([\d.]+)\s*\]?\s*\)/, # (hostname [1.2.3.4]) | |||||
| 252 | qr/from\s+[\w.-]+\s+([\d.]+)/, # from hostname addr | |||||
| 253 | qr/([\d]{1,3}\.[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3})/, # bare dotted-quad fallback | |||||
| 254 | ); | |||||
| 255 | ||||||
| 256 | # ----------------------------------------------------------------------- | |||||
| 257 | # Default configuration -- overridable via Object::Configure | |||||
| 258 | # ----------------------------------------------------------------------- | |||||
| 259 | ||||||
| 260 | # Object::Configure may overlay | |||||
| 261 | # values from a file before new() uses them. Use Readonly for constants | |||||
| 262 | # that should never be overridden at runtime. | |||||
| 263 | ||||||
| 264 | # ----------------------------------------------------------------------- | |||||
| 265 | # Trusted domains (infrastructure -- never report these as abuse targets) | |||||
| 266 | # Can be overrideen at runtime by Object::Configure | |||||
| 267 | # ----------------------------------------------------------------------- | |||||
| 268 | ||||||
| 269 | my %TRUSTED_DOMAINS = map { $_ => 1 } qw( | |||||
| 270 | gmail.com googlemail.com yahoo.com outlook.com hotmail.com | |||||
| 271 | google.com microsoft.com apple.com amazon.com | |||||
| 272 | googlegroups.com groups.google.com | |||||
| 273 | w3.org | |||||
| 274 | fedex.com ups.com dhl.com usps.com royalmail.com | |||||
| 275 | ); | |||||
| 276 | ||||||
| 277 | # ----------------------------------------------------------------------- | |||||
| 278 | # URL shortener domains (real destination is hidden behind these) | |||||
| 279 | # ----------------------------------------------------------------------- | |||||
| 280 | ||||||
| 281 | my %URL_SHORTENERS = map { $_ => 1 } qw( | |||||
| 282 | bit.ly bitly.com tinyurl.com t.co ow.ly | |||||
| 283 | goo.gl is.gd buff.ly ift.tt dlvr.it | |||||
| 284 | short.link rebrand.ly tiny.cc cutt.ly rb.gy | |||||
| 285 | shorturl.at bl.ink smarturl.it yourls.org clicky.me | |||||
| 286 | snip.ly adf.ly bc.vc lnkd.in fb.me | |||||
| 287 | youtu.be | |||||
| 288 | ); | |||||
| 289 | ||||||
| 290 | # ----------------------------------------------------------------------- | |||||
| 291 | # Well-known provider abuse contacts | |||||
| 292 | # Can be overrideen at runtime by Object::Configure | |||||
| 293 | # ----------------------------------------------------------------------- | |||||
| 294 | ||||||
| 295 | # Curated table of provider abuse contacts. Entries with only a 'form' | |||||
| 296 | # key (no 'email') require web-form submission; abuse_contacts() suppresses | |||||
| 297 | # email addresses for those providers and form_contacts() surfaces them. | |||||
| 298 | my %PROVIDER_ABUSE = ( | |||||
| 299 | # Google / Gmail | |||||
| 300 | 'google.com' => { email => 'abuse@google.com', note => 'Also report Gmail accounts via https://support.google.com/mail/contact/abuse' }, | |||||
| 301 | 'gmail.com' => { email => 'abuse@google.com', note => 'Report Gmail spam via https://support.google.com/mail/contact/abuse' }, | |||||
| 302 | 'googlemail.com' => { email => 'abuse@google.com', note => 'Report via https://support.google.com/mail/contact/abuse' }, | |||||
| 303 | '1e100.net' => { email => 'abuse@google.com', note => 'Google infrastructure' }, | |||||
| 304 | 'blogspot.com' => { email => 'abuse@google.com', note => 'Blogger/Blogspot -- report via https://support.google.com/blogger/answer/76315' }, | |||||
| 305 | 'blogger.com' => { email => 'abuse@google.com', note => 'Blogger platform abuse' }, | |||||
| 306 | 'sites.google.com' => { email => 'abuse@google.com', note => 'Google Sites hosted content' }, | |||||
| 307 | # Microsoft | |||||
| 308 | 'microsoft.com' => { email => 'abuse@microsoft.com', note => 'Also report via https://www.microsoft.com/en-us/wdsi/support/report-unsafe-site' }, | |||||
| 309 | 'outlook.com' => { email => 'abuse@microsoft.com', note => 'Report Outlook spam: https://support.microsoft.com/en-us/office/report-phishing' }, | |||||
| 310 | 'hotmail.com' => { email => 'abuse@microsoft.com', note => 'Report via https://support.microsoft.com/en-us/office/report-phishing' }, | |||||
| 311 | 'live.com' => { email => 'abuse@microsoft.com', note => 'Microsoft consumer mail' }, | |||||
| 312 | 'office365.com' => { email => 'abuse@microsoft.com', note => 'Microsoft 365 infrastructure' }, | |||||
| 313 | 'protection.outlook.com' => { email => 'abuse@microsoft.com', note => 'Microsoft EOP gateway' }, | |||||
| 314 | # Yahoo | |||||
| 315 | 'yahoo.com' => { email => 'abuse@yahoo-inc.com', note => 'Also use https://io.help.yahoo.com/contact/index' }, | |||||
| 316 | 'yahoo.co.uk' => { email => 'abuse@yahoo-inc.com', note => 'Yahoo UK' }, | |||||
| 317 | # Apple | |||||
| 318 | 'apple.com' => { email => 'reportphishing@apple.com', note => 'iCloud / Apple Mail abuse' }, | |||||
| 319 | 'icloud.com' => { email => 'reportphishing@apple.com', note => 'iCloud abuse' }, | |||||
| 320 | 'me.com' => { email => 'reportphishing@apple.com', note => 'Apple legacy mail' }, | |||||
| 321 | # Amazon / AWS | |||||
| 322 | 'amazon.com' => { email => 'abuse@amazonaws.com', note => 'Also https://aws.amazon.com/forms/report-abuse' }, | |||||
| 323 | 'amazonaws.com' => { email => 'abuse@amazonaws.com', note => 'AWS abuse form: https://aws.amazon.com/forms/report-abuse' }, | |||||
| 324 | 'amazonses.com' => { email => 'abuse@amazonaws.com', note => 'Amazon SES sending infrastructure' }, | |||||
| 325 | # Cloudflare | |||||
| 326 | 'cloudflare.com' => { email => 'abuse@cloudflare.com', note => 'Report via https://www.cloudflare.com/abuse/' }, | |||||
| 327 | # Fastly / Akamai | |||||
| 328 | 'fastly.net' => { email => 'abuse@fastly.com', note => 'Fastly CDN' }, | |||||
| 329 | 'akamai.com' => { email => 'abuse@akamai.com', note => 'Akamai CDN' }, | |||||
| 330 | 'akamaitechnologies.com' => { email => 'abuse@akamai.com', note => 'Akamai CDN' }, | |||||
| 331 | # Namecheap | |||||
| 332 | 'namecheap.com' => { email => 'abuse@namecheap.com', note => 'Registrar abuse' }, | |||||
| 333 | # GoDaddy -- web form only; email bounces | |||||
| 334 | 'godaddy.com' => { | |||||
| 335 | form => 'https://supportcenter.godaddy.com/AbuseReport', | |||||
| 336 | form_paste => 'Select the abuse type (spam, phishing, malware etc). ' | |||||
| 337 | . 'Enter the domain name in the Domain field. ' | |||||
| 338 | . 'Paste the originating IP, risk flags, and the relevant ' | |||||
| 339 | . 'Received: headers from the report below.', | |||||
| 340 | form_upload => 'Take a screenshot of the report as a .png or .jpg, ' | |||||
| 341 | . 'or export it as a .pdf.', | |||||
| 342 | note => 'Registrar/host -- email reports not monitored, use web form', | |||||
| 343 | }, | |||||
| 344 | # SendGrid / Twilio | |||||
| 345 | 'sendgrid.net' => { email => 'abuse@sendgrid.com', note => 'ESP -- include full headers' }, | |||||
| 346 | 'sendgrid.com' => { email => 'abuse@sendgrid.com', note => 'ESP -- include full headers' }, | |||||
| 347 | # Mailchimp / Mandrill | |||||
| 348 | 'mailchimp.com' => { email => 'abuse@mailchimp.com', note => 'ESP abuse' }, | |||||
| 349 | 'mandrillapp.com' => { email => 'abuse@mailchimp.com', note => 'Mandrill transactional ESP' }, | |||||
| 350 | # OVH | |||||
| 351 | 'ovh.net' => { email => 'abuse@ovh.net', note => 'OVH hosting' }, | |||||
| 352 | 'ovh.com' => { email => 'abuse@ovh.com', note => 'OVH hosting' }, | |||||
| 353 | # Hetzner | |||||
| 354 | 'hetzner.com' => { email => 'abuse@hetzner.com', note => 'Hetzner hosting' }, | |||||
| 355 | # Digital Ocean | |||||
| 356 | 'digitalocean.com' => { email => 'abuse@digitalocean.com',note => 'DO abuse form: https://www.digitalocean.com/company/contact/#abuse' }, | |||||
| 357 | # Linode / Akamai | |||||
| 358 | 'linode.com' => { email => 'abuse@linode.com', note => 'Linode/Akamai Cloud' }, | |||||
| 359 | # Constant Contact | |||||
| 360 | 'constantcontact.com' => { email => 'abuse@constantcontact.com', note => 'ESP abuse' }, | |||||
| 361 | 'r.constantcontact.com' => { email => 'abuse@constantcontact.com', note => 'Constant Contact sending infrastructure' }, | |||||
| 362 | # HubSpot | |||||
| 363 | 'hubspot.com' => { email => 'abuse@hubspot.com', note => 'ESP abuse' }, | |||||
| 364 | 'hs-analytics.net' => { email => 'abuse@hubspot.com', note => 'HubSpot analytics infrastructure' }, | |||||
| 365 | # Campaign Monitor | |||||
| 366 | 'createsend.com' => { email => 'abuse@campaignmonitor.com', note => 'Campaign Monitor ESP' }, | |||||
| 367 | 'cmail20.com' => { email => 'abuse@campaignmonitor.com', note => 'Campaign Monitor sending infrastructure' }, | |||||
| 368 | # Klaviyo | |||||
| 369 | 'klaviyo.com' => { email => 'abuse@klaviyo.com', note => 'ESP abuse' }, | |||||
| 370 | # Brevo (formerly Sendinblue) | |||||
| 371 | 'sendinblue.com' => { email => 'abuse@sendinblue.com', note => 'ESP abuse' }, | |||||
| 372 | 'brevo.com' => { email => 'abuse@brevo.com', note => 'ESP abuse' }, | |||||
| 373 | # Mailgun | |||||
| 374 | 'mailgun.com' => { email => 'abuse@mailgun.com', note => 'ESP abuse' }, | |||||
| 375 | 'mailgun.org' => { email => 'abuse@mailgun.com', note => 'Mailgun sending infrastructure' }, | |||||
| 376 | # Postmark | |||||
| 377 | 'postmarkapp.com' => { email => 'abuse@postmarkapp.com', note => 'ESP abuse' }, | |||||
| 378 | # WordPress.com | |||||
| 379 | 'wordpress.com' => { email => 'abuse@wordpress.com', note => 'WordPress.com hosted blog -- report via https://en.wordpress.com/abuse/' }, | |||||
| 380 | 'wp.com' => { email => 'abuse@wordpress.com', note => 'WordPress.com short domain' }, | |||||
| 381 | # Substack | |||||
| 382 | 'substack.com' => { email => 'abuse@substack.com', note => 'Substack newsletter platform abuse' }, | |||||
| 383 | # ActiveCampaign | |||||
| 384 | 'activecampaign.com' => { email => 'abuse@activecampaign.com', note => 'ActiveCampaign ESP' }, | |||||
| 385 | 'ac-tinker.com' => { email => 'abuse@activecampaign.com', note => 'ActiveCampaign tracking infrastructure' }, | |||||
| 386 | # Salesforce Marketing Cloud | |||||
| 387 | 'salesforce.com' => { email => 'abuse@salesforce.com', note => 'Salesforce Marketing Cloud / ExactTarget ESP' }, | |||||
| 388 | 'mc.salesforce.com' => { email => 'abuse@salesforce.com', note => 'Salesforce Marketing Cloud sending infrastructure' }, | |||||
| 389 | 'exacttarget.com' => { email => 'abuse@salesforce.com', note => 'ExactTarget / Salesforce Marketing Cloud ESP' }, | |||||
| 390 | 'et.exacttarget.com' => { email => 'abuse@salesforce.com', note => 'ExactTarget sending infrastructure' }, | |||||
| 391 | # Vultr | |||||
| 392 | 'vultr.com' => { email => 'abuse@vultr.com', note => 'Vultr hosting' }, | |||||
| 393 | # Contabo | |||||
| 394 | 'contabo.com' => { email => 'abuse@contabo.com', note => 'Contabo hosting' }, | |||||
| 395 | # Leaseweb | |||||
| 396 | 'leaseweb.com' => { email => 'abuse@leaseweb.com', note => 'Leaseweb hosting' }, | |||||
| 397 | # M247 | |||||
| 398 | 'm247.com' => { email => 'abuse@m247.com', note => 'M247 hosting' }, | |||||
| 399 | # MarkMonitor -- web form only | |||||
| 400 | 'markmonitor.com' => { | |||||
| 401 | form => 'https://corp.markmonitor.com/domain/ui/abuse-report', | |||||
| 402 | form_paste => 'Complete all fields including the domain name and your ' | |||||
| 403 | . 'description of the abuse. Paste the originating IP, ' | |||||
| 404 | . 'risk flags, and the relevant Received: headers from the ' | |||||
| 405 | . 'report below.', | |||||
| 406 | form_upload => 'Take a screenshot of the report as a .png or .jpg, or export it as a .pdf. MarkMonitor does not accept .eml files.', | |||||
| 407 | note => 'Brand-protection registrar -- email reports not processed', | |||||
| 408 | }, | |||||
| 409 | # URL shortener operators | |||||
| 410 | 'is.gd' => { email => 'abuse@is.gd', note => 'URL shortener -- report via https://is.gd/contact.php' }, | |||||
| 411 | 'bitly.com' => { email => 'abuse@bitly.com', note => 'URL shortener abuse' }, | |||||
| 412 | 'bit.ly' => { email => 'abuse@bitly.com', note => 'URL shortener abuse' }, | |||||
| 413 | 'tinyurl.com' => { email => 'abuse@tinyurl.com', note => 'URL shortener abuse' }, | |||||
| 414 | 'ow.ly' => { email => 'abuse@hootsuite.com', note => 'Hootsuite URL shortener' }, | |||||
| 415 | 'buff.ly' => { email => 'abuse@buffer.com', note => 'Buffer URL shortener' }, | |||||
| 416 | 'rb.gy' => { email => 'abuse@rb.gy', note => 'URL shortener abuse' }, | |||||
| 417 | 'cutt.ly' => { email => 'abuse@cutt.ly', note => 'URL shortener abuse' }, | |||||
| 418 | 'shorturl.at' => { email => 'abuse@shorturl.at', note => 'URL shortener abuse' }, | |||||
| 419 | # Dynadot -- web form only | |||||
| 420 | 'dynadot.com' => { | |||||
| 421 | form => 'https://www.dynadot.com/report-abuse', | |||||
| 422 | form_paste => 'Complete all fields including the domain name and your ' | |||||
| 423 | . 'description of the abuse. Paste the originating IP, ' | |||||
| 424 | . 'risk flags, and the relevant Received: headers from the ' | |||||
| 425 | . 'report below.', | |||||
| 426 | form_upload => 'Take a screenshot of the report as a .png or .jpg, ' | |||||
| 427 | . 'or export it as a .pdf.', | |||||
| 428 | note => 'Registrar -- email reports not monitored, use web form', | |||||
| 429 | }, | |||||
| 430 | # Global Domain Group -- web form only | |||||
| 431 | 'globaldomaingroup.com' => { | |||||
| 432 | form => 'https://globaldomaingroup.com/report-abuse', | |||||
| 433 | form_paste => 'Complete all fields including the domain name and your ' | |||||
| 434 | . 'description of the abuse. Paste the originating IP, ' | |||||
| 435 | . 'risk flags, and the relevant Received: headers from the ' | |||||
| 436 | . 'report below.', | |||||
| 437 | form_upload => 'Attach the original spam message as an .eml file.', | |||||
| 438 | note => 'Registrar -- email reports explicitly not accepted', | |||||
| 439 | }, | |||||
| 440 | # TPG / Internode (Australia) | |||||
| 441 | 'tpgi.com.au' => { email => 'abuse@tpg.com.au', note => 'TPG Telecom Australia' }, | |||||
| 442 | 'tpg.com.au' => { email => 'abuse@tpg.com.au', note => 'TPG Telecom Australia' }, | |||||
| 443 | 'internode.on.net' => { email => 'abuse@internode.on.net',note => 'Internode Australia' }, | |||||
| 444 | ); | |||||
| 445 | ||||||
| 446 | # ----------------------------------------------------------------------- | |||||
| 447 | # Constructor | |||||
| 448 | # ----------------------------------------------------------------------- | |||||
| 449 | ||||||
| 450 - 585 | =head1 METHODS
=head2 new( %options )
Constructs and returns a new C<Email::Abuse::Investigator> analyser object. The
object is stateless until C<parse_email()> is called; all analysis results
are stored on the object and retrieved via the public accessor methods
documented below.
A single object may be reused for multiple emails by calling C<parse_email()>
again: all per-message cached state from the previous message is discarded
automatically. Cross-message IP and domain lookup results are retained
in a shared CHI cache (if C<CHI> is installed) to avoid redundant network
queries across messages processed in the same process.
=head3 Usage
# Minimal -- all options take safe defaults
my $analyser = Email::Abuse::Investigator->new();
# With options
my $analyser = Email::Abuse::Investigator->new(
timeout => 15,
trusted_relays => ['203.0.113.0/24', '10.0.0.0/8'],
verbose => 0,
);
$analyser->parse_email($raw_rfc2822_text);
my $origin = $analyser->originating_ip();
my @urls = $analyser->embedded_urls();
my @domains = $analyser->mailto_domains();
my $risk = $analyser->risk_assessment();
my @contacts = $analyser->abuse_contacts();
print $analyser->report();
=head3 Arguments
All arguments are optional named parameters passed as a flat key-value list.
=over 4
=item C<timeout> (integer, default 10)
Maximum seconds to wait for any single network operation. Set to 0 to
disable timeouts (not recommended for production use).
=item C<trusted_relays> (arrayref of strings, default [])
IP addresses or CIDR blocks to skip during Received: chain analysis.
Each element may be an exact IPv4 address (C<'192.0.2.1'>) or a CIDR
block (C<'192.0.2.0/24'>).
=item C<verbose> (boolean, default 0)
When true, diagnostic messages are written to STDERR.
=back
=head3 Returns
A blessed C<Email::Abuse::Investigator> object. No network I/O is performed
during construction.
=head3 Side Effects
If C<CHI> is installed, a shared in-memory cache is initialised (or
re-used if a cache was already created by a prior call to C<new()>).
This cache persists for the lifetime of the process.
=head3 Notes
=over 4
=item *
Unknown option keys are silently ignored.
=item *
The object is not thread-safe. Use a separate object per thread.
=item *
WHOIS read timeouts use C<IO::Select> rather than C<alarm()>, so they
work correctly on Windows and in threaded Perl interpreters.
=back
=head3 API Specification
=head4 Input
{
timeout => {
type => 'integer',
optional => 1,
min => 0,
default => 10,
},
trusted_relays => {
type => 'arrayref',
element_type => 'string',
optional => 1,
default => [],
},
verbose => {
type => 'boolean',
optional => 1,
default => 0,
},
}
=head4 Output
{
type => 'Email::Abuse::Investigator',
isa => 'Email::Abuse::Investigator',
}
=head3 FORMAL SPECIFICATION
-- Z notation (simplified)
new == [
timeout : N;
trusted_relays : seq STRING;
verbose : BOOL;
_raw : STRING;
_headers : seq (STRING x STRING);
_origin? : IP_INFO | undefined;
_urls? : seq URL_INFO | undefined;
_risk? : RISK_INFO | undefined
]
pre: timeout >= 0
post: self.timeout = params.timeout /\ self._raw = ''
=cut | |||||
| 586 | ||||||
| 587 | # Class-level cross-message CHI cache (shared across all instances). | |||||
| 588 | # Populated lazily on first call to new() when CHI is available. | |||||
| 589 | my $_cache; | |||||
| 590 | ||||||
| 591 | sub new { | |||||
| 592 | 2608 | 747139 | my $class = shift; | |||
| 593 | ||||||
| 594 | # Accept hash or hashref arguments uniformly | |||||
| 595 | 2608 | 4145 | my $params = Params::Validate::Strict::validate_strict({ | |||
| 596 | args => Params::Get::get_params(undef, \@_) || {}, | |||||
| 597 | schema => { | |||||
| 598 | timeout => { | |||||
| 599 | 'type' => 'integer', | |||||
| 600 | 'optional' => 1, | |||||
| 601 | 'min' => 0, | |||||
| 602 | }, | |||||
| 603 | trusted_relays => { | |||||
| 604 | 'type' => 'arrayref', | |||||
| 605 | 'element_type' => 'string', | |||||
| 606 | 'optional' => 1, | |||||
| 607 | }, | |||||
| 608 | verbose => { | |||||
| 609 | 'type' => 'boolean', | |||||
| 610 | 'optional' => 1, | |||||
| 611 | }, | |||||
| 612 | }, | |||||
| 613 | }); | |||||
| 614 | ||||||
| 615 | # Merge in any file-based configuration via Object::Configure | |||||
| 616 | 1630 | 161484 | $params = Object::Configure::configure($class, $params); | |||
| 617 | ||||||
| 618 | # Initialise the cross-message CHI cache on first construction | |||||
| 619 | 1630 | 2716946 | if ($HAS_CHI && !$_cache) { | |||
| 620 | 0 | 0 | $_cache = CHI->new( | |||
| 621 | driver => 'Memory', | |||||
| 622 | global => 1, | |||||
| 623 | expires_in => $CACHE_TTL_SECS, | |||||
| 624 | ); | |||||
| 625 | } | |||||
| 626 | ||||||
| 627 | # Build and bless the object with default slot values | |||||
| 628 | return bless { | |||||
| 629 | timeout => $DEFAULT_TIMEOUT, | |||||
| 630 | trusted_relays => [], | |||||
| 631 | verbose => 0, | |||||
| 632 | _raw => '', | |||||
| 633 | _headers => [], | |||||
| 634 | _body_plain => '', | |||||
| 635 | _body_html => '', | |||||
| 636 | _received => [], | |||||
| 637 | _origin => undef, | |||||
| 638 | _urls => undef, # lazy-computed by embedded_urls() | |||||
| 639 | _mailto_domains=> undef, # lazy-computed by mailto_domains() | |||||
| 640 | _domain_info => {}, # per-message domain analysis cache | |||||
| 641 | _sending_sw => [], # X-Mailer / X-PHP-Originating-Script etc. | |||||
| 642 | _rcvd_tracking => [], # per-hop tracking IDs from Received: headers | |||||
| 643 | 1630 1630 | 2114 8196 | %{$params}, # Override the defaults with Object:Configure and the values passed in | |||
| 644 | }, $class; | |||||
| 645 | } | |||||
| 646 | ||||||
| 647 | # ----------------------------------------------------------------------- | |||||
| 648 | # Public: parse | |||||
| 649 | # ----------------------------------------------------------------------- | |||||
| 650 | ||||||
| 651 - 740 | =head2 parse_email( $text )
Feeds a raw RFC 2822 email message to the analyser and prepares it for
subsequent interrogation. This is the only method that must be called
before any other public method.
If the same object is used for a second message, calling C<parse_email()>
again completely replaces all per-message state from the first message.
The cross-message CHI cache is B<not> flushed; IP and domain lookups
cached from prior messages are retained.
=head3 Usage
my $raw = do { local $/; <STDIN> };
$analyser->parse_email($raw);
# Scalar reference (avoids copying large messages)
$analyser->parse_email(\$raw);
# Chained
my $analyser = Email::Abuse::Investigator->new()->parse_email($raw);
=head3 Arguments
=over 4
=item C<$text> (scalar or scalar reference, required)
Complete raw RFC 2822 email message, including all headers and the body.
Both LF-only and CRLF line endings are accepted.
=back
=head3 Returns
The object itself (C<$self>), enabling method chaining.
=head3 Side Effects
Parses headers, decodes the body (quoted-printable, base64, multipart),
extracts sending-software fingerprints, and populates per-hop tracking
data. All previously computed lazy results are discarded.
=head3 Notes
=over 4
=item *
If C<$text> is empty or contains no header/body separator, all public
methods will return empty/safe values.
=item *
Decoding errors in base64 or quoted-printable payloads are silenced; raw
bytes are used in place of correct output to prevent exceptions.
=back
=head3 API Specification
=head4 Input
[
{
type => 'scalar | scalarref',
},
]
=head4 Output
{
type => 'Email::Abuse::Investigator',
isa => 'Email::Abuse::Investigator',
}
=head3 FORMAL SPECIFICATION
-- Z notation
parse_email == [
Delta Email::Abuse::Investigator;
text? : STRING | ref STRING
]
pre: defined text?
post: self._raw = deref(text?) /\
self._origin = undefined /\
self._urls = undefined /\
self._risk = undefined
=cut | |||||
| 741 | ||||||
| 742 | # TODO: Allow a Mail::Message object to be passed in | |||||
| 743 | sub parse_email { | |||||
| 744 | 485 | 55921 | my $self = shift; | |||
| 745 | ||||||
| 746 | # Accept both positional string and named 'text' argument | |||||
| 747 | 485 | 691 | my $args = Params::Get::get_params('text', \@_); | |||
| 748 | 485 | 4791 | my $text = $args->{text}; | |||
| 749 | ||||||
| 750 | # Dereference if a scalar reference was supplied | |||||
| 751 | 485 | 604 | $text = $$text if ref $text eq 'SCALAR'; | |||
| 752 | ||||||
| 753 | # Sanitise: strip control characters that could affect terminal output. | |||||
| 754 | # Keep \t (tabs in headers), \n (line endings), \r (CRLF mail format). | |||||
| 755 | 485 | 2216 | $text =~ s/[^\x09\x0A\x0D\x20-\x7E\x80-\xFF]//g if defined $text; | |||
| 756 | ||||||
| 757 | # Store the sanitised raw text for later reproduction in reports | |||||
| 758 | 485 | 1082 | $self->{_raw} = $text // ''; | |||
| 759 | ||||||
| 760 | # Invalidate all per-message lazy caches | |||||
| 761 | 485 | 421 | $self->{_origin} = undef; | |||
| 762 | 485 | 382 | $self->{_urls} = undef; | |||
| 763 | 485 | 349 | $self->{_mailto_domains} = undef; | |||
| 764 | 485 | 484 | $self->{_domain_info} = {}; | |||
| 765 | 485 | 423 | $self->{_risk} = undef; | |||
| 766 | 485 | 376 | $self->{_auth_results} = undef; | |||
| 767 | 485 | 414 | $self->{_sending_sw} = []; | |||
| 768 | 485 | 385 | $self->{_rcvd_tracking} = []; | |||
| 769 | ||||||
| 770 | # Perform synchronous header/body parsing (no network I/O) | |||||
| 771 | 485 | 1686 | $self->_split_message($text) if defined $text && $text =~ /\S/; | |||
| 772 | 485 | 799 | return $self; | |||
| 773 | } | |||||
| 774 | ||||||
| 775 | # ----------------------------------------------------------------------- | |||||
| 776 | # Public: originating host | |||||
| 777 | # ----------------------------------------------------------------------- | |||||
| 778 | ||||||
| 779 - 851 | =head2 originating_ip()
Identifies the IP address of the machine that originally injected the
message into the mail system by walking the C<Received:> chain, skipping
private/trusted hops, and enriching the first external hop with rDNS,
WHOIS/RDAP organisation name, abuse contact, and country code.
Both IPv4 and IPv6 addresses are extracted and evaluated.
The result is cached; subsequent calls return the same hashref without
repeating network I/O.
=head3 Usage
my $orig = $analyser->originating_ip();
if (defined $orig) {
printf "Origin: %s (%s)\n", $orig->{ip}, $orig->{rdns};
printf "Owner: %s\n", $orig->{org};
}
=head3 Arguments
None. C<parse_email()> must have been called first.
=head3 Returns
A hashref with keys C<ip>, C<rdns>, C<org>, C<abuse>, C<confidence>,
C<note>, and C<country> (may be undef). Returns C<undef> if no suitable
originating IP can be determined.
=head3 Side Effects
On first call: one PTR lookup and one RDAP/WHOIS query. Results are cached
in the object and in the cross-message CHI cache (if available).
=head3 Notes
Only the first (oldest) external IP in the chain is reported. See
C<received_trail()> for the full chain.
=head3 API Specification
=head4 Input
[]
=head4 Output
{
type => 'hashref | undef',
keys => {
ip => { type => 'scalar', regex => qr/[\d.:a-fA-F]/ },
rdns => { type => 'scalar' },
org => { type => 'scalar' },
abuse => { type => 'scalar' },
confidence => { type => 'scalar', regex => qr/^(?:high|medium|low)$/ },
note => { type => 'scalar' },
country => { type => 'scalar', optional => 1 },
},
}
=head3 FORMAL SPECIFICATION
-- Z notation
originating_ip == [
Xi Email::Abuse::Investigator;
result! : IP_INFO | undefined
]
pre: self._raw /= ''
post: result! = self._origin /\
(result! /= undefined => result!.ip in EXTERNAL_IPS)
=cut | |||||
| 852 | ||||||
| 853 | sub originating_ip { | |||||
| 854 | 591 | 5397 | my $self = $_[0]; | |||
| 855 | ||||||
| 856 | # Return the cached result if we already have it | |||||
| 857 | 591 | 810 | $self->{_origin} //= $self->_find_origin(); | |||
| 858 | 591 | 563 | return $self->{_origin}; | |||
| 859 | } | |||||
| 860 | ||||||
| 861 | # ----------------------------------------------------------------------- | |||||
| 862 | # Public: HTTP/HTTPS URLs | |||||
| 863 | # ----------------------------------------------------------------------- | |||||
| 864 | ||||||
| 865 - 940 | =head2 embedded_urls()
Extracts every HTTP and HTTPS URL from the message body and enriches each
one with the hosting IP address, network organisation name, abuse contact,
and country code. Both IPv4 and IPv6 host addresses are supported.
URL extraction runs across both plain-text and HTML body parts. DNS
lookups for each unique hostname are optionally parallelised via
C<AnyEvent::DNS> if that module is installed.
The result is cached; subsequent calls return the same list without
repeating network I/O.
=head3 Usage
my @urls = $analyser->embedded_urls();
for my $u (@urls) {
printf "URL: %s host: %s org: %s\n",
$u->{url}, $u->{host}, $u->{org};
}
=head3 Arguments
None. C<parse_email()> must have been called first.
=head3 Returns
A list of hashrefs, one per unique URL, in first-seen order. Returns an
empty list if no HTTP/HTTPS URLs are present. Each hashref has keys
C<url>, C<host>, C<ip>, C<org>, C<abuse>, C<country>.
=head3 Side Effects
Per unique hostname: one A/AAAA lookup and one RDAP/WHOIS query. Results
are cached in the object and in the cross-message CHI cache.
=head3 Notes
Only C<http://> and C<https://> URLs are extracted. URL shortener hosts
are included in the returned list (they are flagged by C<risk_assessment()>).
=head3 API Specification
=head4 Input
[]
=head4 Output
(
{
type => 'hashref',
keys => {
url => { type => 'scalar', regex => qr{^https?://}i },
host => { type => 'scalar' },
ip => { type => 'scalar' },
org => { type => 'scalar' },
abuse => { type => 'scalar' },
country => { type => 'scalar', optional => 1 },
},
},
...
)
=head3 FORMAL SPECIFICATION
-- Z notation
embedded_urls == [
Xi Email::Abuse::Investigator;
result! : seq URL_INFO
]
pre: self._raw /= ''
post: result! = self._urls /\
forall u : result! @ u.url =~ m{^https?://}i
=cut | |||||
| 941 | ||||||
| 942 | sub embedded_urls { | |||||
| 943 | 604 | 7101 | my $self = $_[0]; | |||
| 944 | ||||||
| 945 | 604 | 741 | $self->{_urls} //= $self->_extract_and_resolve_urls(); | |||
| 946 | 604 604 | 404 708 | return @{ $self->{_urls} }; | |||
| 947 | } | |||||
| 948 | ||||||
| 949 | # ----------------------------------------------------------------------- | |||||
| 950 | # Public: mailto / reply-to / from domains | |||||
| 951 | # ----------------------------------------------------------------------- | |||||
| 952 | ||||||
| 953 - 1022 | =head2 mailto_domains()
Identifies every domain associated with the message as a contact, reply,
or delivery address, then runs a full intelligence pipeline on each one
(A record, MX, NS, WHOIS) to determine hosting and registration details.
The result is cached; subsequent calls return the same list without
repeating network I/O.
=head3 Usage
my @domains = $analyser->mailto_domains();
for my $d (@domains) {
printf "Domain: %s registrar: %s\n",
$d->{domain}, $d->{registrar} // 'unknown';
}
=head3 Arguments
None. C<parse_email()> must have been called first.
=head3 Returns
A list of hashrefs, one per unique domain. See the main POD for the full
set of possible keys. Returns an empty list if no qualifying domains are
found.
=head3 Side Effects
Per unique domain: up to three A lookups, one MX lookup, one NS lookup,
and two WHOIS queries. Results are cached in the object and in the
cross-message CHI cache.
=head3 Notes
MX and NS lookups require C<Net::DNS>. Without it those keys are absent
from every returned hashref.
=head3 API Specification
=head4 Input
[]
=head4 Output
(
{
type => 'hashref',
keys => {
domain => { type => 'scalar' },
source => { type => 'scalar' },
# All other keys optional -- see main POD
},
},
...
)
=head3 FORMAL SPECIFICATION
-- Z notation
mailto_domains == [
Xi Email::Abuse::Investigator;
result! : seq DOMAIN_INFO
]
pre: self._raw /= ''
post: result! = self._mailto_domains /\
forall d : result! @ d.domain =~ /\.[a-zA-Z]{2,}$/
=cut | |||||
| 1023 | ||||||
| 1024 | sub mailto_domains { | |||||
| 1025 | 567 | 2864 | my $self = $_[0]; | |||
| 1026 | ||||||
| 1027 | 567 | 717 | $self->{_mailto_domains} //= $self->_extract_and_analyse_domains(); | |||
| 1028 | 567 567 | 381 593 | return @{ $self->{_mailto_domains} }; | |||
| 1029 | } | |||||
| 1030 | ||||||
| 1031 - 1086 | =head2 all_domains()
Returns the deduplicated union of every registrable domain seen anywhere
in the message -- URL hosts from C<embedded_urls()> and contact domains
from C<mailto_domains()> -- normalised to eTLD+1 form.
Triggers C<embedded_urls()> and C<mailto_domains()> lazily.
=head3 Usage
my @domains = $analyser->all_domains();
print "$_\n" for @domains;
=head3 Arguments
None.
=head3 Returns
A list of plain strings (registrable domain names), lower-cased, no
duplicates, in first-seen order.
=head3 Side Effects
Triggers C<embedded_urls()> and C<mailto_domains()> if not already cached.
=head3 Notes
Normalisation to eTLD+1 uses C<Domain::PublicSuffix> if installed, falling
back to a built-in heuristic otherwise.
=head3 API Specification
=head4 Input
[]
=head4 Output
(
{ type => 'scalar', regex => qr/^[a-z0-9][a-z0-9.-]*\.[a-z]{2,}$/ },
...
)
=head3 FORMAL SPECIFICATION
-- Z notation
all_domains == [
Xi Email::Abuse::Investigator;
result! : seq STRING
]
post: result! = deduplicate(
map(_registrable, url_hosts union mailto_domains)
)
=cut | |||||
| 1087 | ||||||
| 1088 | sub all_domains { | |||||
| 1089 | 11 | 3425 | my $self = $_[0]; | |||
| 1090 | 11 | 9 | my (%seen, @out); | |||
| 1091 | ||||||
| 1092 | # Collect registrable domains from URL hosts first | |||||
| 1093 | 11 | 18 | for my $u ($self->embedded_urls()) { | |||
| 1094 | 9 | 14 | my $dom = _registrable($u->{host}); | |||
| 1095 | 9 | 28 | push @out, $dom if $dom && !$seen{$dom}++; | |||
| 1096 | } | |||||
| 1097 | ||||||
| 1098 | # Then from contact domains (normalise subdomains to registrable parent) | |||||
| 1099 | 11 | 15 | for my $d ($self->mailto_domains()) { | |||
| 1100 | 16 | 20 | my $dom = _registrable($d->{domain}) // $d->{domain}; | |||
| 1101 | 16 | 31 | push @out, $dom if $dom && !$seen{$dom}++; | |||
| 1102 | } | |||||
| 1103 | 11 | 19 | return @out; | |||
| 1104 | } | |||||
| 1105 | ||||||
| 1106 - 1169 | =head2 unresolved_contacts()
Returns a list of domains and URL hosts found in the message for which no
abuse contact could be determined. Useful for surfacing parties that may
warrant manual investigation.
=head3 Usage
my @unresolved = $analyser->unresolved_contacts();
for my $u (@unresolved) {
printf "Unresolved: %s (%s) via %s\n",
$u->{domain}, $u->{type}, $u->{source};
}
=head3 Arguments
None.
=head3 Returns
A list of hashrefs, each with keys C<domain>, C<type> (C<'url_host'> or
C<'domain'>), and C<source> (where the domain was found).
=head3 Side Effects
Triggers C<embedded_urls()>, C<mailto_domains()>, C<abuse_contacts()>,
and C<form_contacts()> if not already cached.
=head3 Notes
Domains sourced only from spoofable sending headers (C<From:>,
C<Return-Path:>, C<Sender:>) are excluded.
=head3 API Specification
=head4 Input
[]
=head4 Output
(
{
type => 'hashref',
keys => {
domain => { type => 'scalar' },
type => { type => 'scalar', regex => qr/^(?:url_host|domain)$/ },
source => { type => 'scalar' },
},
},
...
)
=head3 FORMAL SPECIFICATION
-- Z notation
unresolved_contacts == [
Xi Email::Abuse::Investigator;
result! : seq UNRESOLVED_INFO
]
post: forall u : result! @
u.domain not_in covered_domains(abuse_contacts, form_contacts)
=cut | |||||
| 1170 | ||||||
| 1171 | sub unresolved_contacts { | |||||
| 1172 | 10 | 40 | my $self = $_[0]; | |||
| 1173 | ||||||
| 1174 | # Build a set of domains already covered by email or form contacts | |||||
| 1175 | 10 | 10 | my %covered; | |||
| 1176 | 10 | 16 | for my $c ($self->abuse_contacts(), $self->form_contacts()) { | |||
| 1177 | 3 | 4 | my $dom = $c->{form_domain}; | |||
| 1178 | 3 | 6 | unless ($dom) { | |||
| 1179 | # Extract domain from abuse email address | |||||
| 1180 | 3 | 8 | ($dom) = ($c->{address} // '') =~ /\@([\w.-]+)/; | |||
| 1181 | } | |||||
| 1182 | 3 | 7 | $covered{lc $dom}++ if $dom; | |||
| 1183 | } | |||||
| 1184 | ||||||
| 1185 | # Also mark URL hosts that already have a resolved abuse address | |||||
| 1186 | 10 | 16 | for my $u ($self->embedded_urls()) { | |||
| 1187 | 6 | 14 | (my $bare = lc $u->{host}) =~ s/^www\.//; | |||
| 1188 | 6 | 23 | $covered{$bare}++ if $u->{abuse} && $u->{abuse} ne '(unknown)'; | |||
| 1189 | } | |||||
| 1190 | ||||||
| 1191 | 10 | 11 | my (@out, %seen); | |||
| 1192 | ||||||
| 1193 | # Check URL hosts first | |||||
| 1194 | 10 | 11 | for my $u ($self->embedded_urls()) { | |||
| 1195 | 6 | 12 | (my $bare = lc $u->{host}) =~ s/^www\.//; | |||
| 1196 | 6 | 12 | next if $covered{$bare}; | |||
| 1197 | 6 | 29 | next if $seen{"url:$bare"}++; | |||
| 1198 | push @out, { | |||||
| 1199 | domain => $u->{host}, | |||||
| 1200 | 6 | 23 | type => 'url_host', | |||
| 1201 | source => 'URL in body', | |||||
| 1202 | }; | |||||
| 1203 | } | |||||
| 1204 | ||||||
| 1205 | # Then check contact domains, skipping spoofable-header-only sources | |||||
| 1206 | 10 | 15 | for my $d ($self->mailto_domains()) { | |||
| 1207 | 15 | 17 | my $dom = $d->{domain}; | |||
| 1208 | 15 | 15 | my $source = $d->{source} // ''; | |||
| 1209 | 15 | 32 | next if $source =~ /^(?:From:|Return-Path:|Sender:) header$/; | |||
| 1210 | 3 | 6 | next if $covered{lc $dom}; | |||
| 1211 | 3 | 7 | next if $seen{"dom:$dom"}++; | |||
| 1212 | 3 | 7 | push @out, { | |||
| 1213 | domain => $dom, | |||||
| 1214 | type => 'domain', | |||||
| 1215 | source => $source, | |||||
| 1216 | }; | |||||
| 1217 | } | |||||
| 1218 | ||||||
| 1219 | 10 | 19 | return @out; | |||
| 1220 | } | |||||
| 1221 | ||||||
| 1222 | # ----------------------------------------------------------------------- | |||||
| 1223 | # Public: sending software fingerprint | |||||
| 1224 | # ----------------------------------------------------------------------- | |||||
| 1225 | ||||||
| 1226 - 1289 | =head2 sending_software()
Returns information extracted from headers that identify the software or
server-side infrastructure used to compose or inject the message. Headers
such as C<X-PHP-Originating-Script> reveal the exact PHP script and Unix
account responsible on shared-hosting platforms.
Data is extracted during C<parse_email()> with no network I/O.
=head3 Usage
my @sw = $analyser->sending_software();
for my $s (@sw) {
printf "%-30s : %s\n", $s->{header}, $s->{value};
}
=head3 Arguments
None. C<parse_email()> must have been called first.
=head3 Returns
A list of hashrefs in alphabetical header-name order. Returns an empty
list if none of the watched headers are present. Each hashref has keys
C<header>, C<value>, and C<note>.
=head3 Side Effects
None. Data is pre-collected during C<parse_email()>.
=head3 Notes
Header names are lower-cased. Header values are stored verbatim.
=head3 API Specification
=head4 Input
[]
=head4 Output
(
{
type => 'hashref',
keys => {
header => { type => 'scalar' },
value => { type => 'scalar' },
note => { type => 'scalar' },
},
},
...
)
=head3 FORMAL SPECIFICATION
-- Z notation
sending_software == [
Xi Email::Abuse::Investigator;
result! : seq SW_INFO
]
post: result! = self._sending_sw
=cut | |||||
| 1290 | ||||||
| 1291 | sub sending_software { | |||||
| 1292 | 53 | 50 | my $self = $_[0]; | |||
| 1293 | ||||||
| 1294 | 53 53 | 33 62 | return @{ $self->{_sending_sw} }; | |||
| 1295 | } | |||||
| 1296 | ||||||
| 1297 | # ----------------------------------------------------------------------- | |||||
| 1298 | # Public: per-hop tracking IDs | |||||
| 1299 | # ----------------------------------------------------------------------- | |||||
| 1300 | ||||||
| 1301 - 1366 | =head2 received_trail()
Returns per-hop tracking data extracted from the C<Received:> header chain:
the IP address, envelope recipient address, and server session ID for each
relay. ISP postmasters use these identifiers to locate the SMTP session in
their logs.
=head3 Usage
my @trail = $analyser->received_trail();
for my $hop (@trail) {
printf "IP: %s ID: %s\n",
$hop->{ip} // '?', $hop->{id} // '?';
}
=head3 Arguments
None. C<parse_email()> must have been called first.
=head3 Returns
A list of hashrefs in oldest-first order. Returns an empty list if no
C<Received:> headers are present or none yielded extractable data. Each
hashref has keys C<received>, C<ip> (may be undef), C<for> (may be undef),
C<id> (may be undef).
=head3 Side Effects
None. Data is pre-collected during C<parse_email()>.
=head3 Notes
Private IPs are NOT filtered here; all IPs including RFC 1918 addresses
are returned as found. Filtering is applied only by C<originating_ip()>.
=head3 API Specification
=head4 Input
[]
=head4 Output
(
{
type => 'hashref',
keys => {
received => { type => 'scalar' },
ip => { type => 'scalar', optional => 1 },
for => { type => 'scalar', optional => 1 },
id => { type => 'scalar', optional => 1 },
},
},
...
)
=head3 FORMAL SPECIFICATION
-- Z notation
received_trail == [
Xi Email::Abuse::Investigator;
result! : seq HOP_INFO
]
post: result! = self._rcvd_tracking
=cut | |||||
| 1367 | ||||||
| 1368 | sub received_trail { | |||||
| 1369 | 53 | 40 | my $self = $_[0]; | |||
| 1370 | ||||||
| 1371 | 53 53 | 32 84 | return @{ $self->{_rcvd_tracking} }; | |||
| 1372 | } | |||||
| 1373 | ||||||
| 1374 | # ----------------------------------------------------------------------- | |||||
| 1375 | # Public: risk assessment | |||||
| 1376 | # ----------------------------------------------------------------------- | |||||
| 1377 | ||||||
| 1378 - 1449 | =head2 risk_assessment()
Evaluates the message against heuristic checks and returns an overall risk
level, a weighted numeric score, and a list of every specific red flag.
The assessment covers five categories: originating IP, email authentication,
Date: header validity, identity/header consistency, and URL/domain properties.
The result is cached; subsequent calls return the same hashref without
repeating any analysis.
=head3 Usage
my $risk = $analyser->risk_assessment();
printf "Risk: %s (score: %d)\n", $risk->{level}, $risk->{score};
for my $f (@{ $risk->{flags} }) {
printf " [%s] %s\n", $f->{severity}, $f->{detail};
}
=head3 Arguments
None. C<parse_email()> must have been called first.
=head3 Returns
A hashref with keys C<level> (HIGH/MEDIUM/LOW/INFO), C<score> (integer),
and C<flags> (arrayref of hashrefs with C<severity>, C<flag>, C<detail>).
=head3 Side Effects
Triggers C<originating_ip()>, C<embedded_urls()>, and C<mailto_domains()>
if not already cached.
=head3 Notes
Scores: HIGH >= 9, MEDIUM >= 5, LOW >= 2, INFO < 2.
Flag weights: HIGH=3, MEDIUM=2, LOW=1, INFO=0.
=head3 API Specification
=head4 Input
[]
=head4 Output
{
type => 'hashref',
keys => {
level => { type => 'scalar', regex => qr/^(?:HIGH|MEDIUM|LOW|INFO)$/ },
score => { type => 'scalar', regex => qr/^\d+$/ },
flags => { type => 'arrayref' },
},
}
=head3 FORMAL SPECIFICATION
-- Z notation
risk_assessment == [
Xi Email::Abuse::Investigator;
result! : RISK_INFO
]
post: result!.score = sum({ w(f.severity) | f in result!.flags }) /\
result!.level = classify(result!.score)
where:
w(HIGH) = 3; w(MEDIUM) = 2; w(LOW) = 1; w(INFO) = 0
classify(s) = HIGH if s >= 9
| MEDIUM if s >= 5
| LOW if s >= 2
| INFO otherwise
=cut | |||||
| 1450 | ||||||
| 1451 | sub risk_assessment { | |||||
| 1452 | 261 | 6828 | my $self = $_[0]; | |||
| 1453 | ||||||
| 1454 | 261 | 301 | return $self->{_risk} if $self->{_risk}; | |||
| 1455 | ||||||
| 1456 | 241 | 174 | my (@flags, $score); | |||
| 1457 | 241 | 162 | $score = 0; | |||
| 1458 | ||||||
| 1459 | # Closure to record a flag and accumulate its weight | |||||
| 1460 | my $flag = sub { | |||||
| 1461 | 256 | 273 | my ($severity, $name, $detail) = @_; | |||
| 1462 | 256 | 509 | $score += $FLAG_WEIGHT{$severity} // 1; | |||
| 1463 | 256 | 1100 | push @flags, { severity => $severity, flag => $name, detail => $detail }; | |||
| 1464 | 241 | 458 | }; | |||
| 1465 | ||||||
| 1466 | # --- Group 1: Originating IP checks --- | |||||
| 1467 | 241 | 259 | my $orig = $self->originating_ip(); | |||
| 1468 | 241 | 261 | if ($orig) { | |||
| 1469 | # Residential / broadband rDNS patterns suggest a compromised host | |||||
| 1470 | 198 | 776 | if ($orig->{rdns} && $orig->{rdns} =~ / | |||
| 1471 | \d+[-_.]\d+[-_.]\d+[-_.]\d+ # dotted-quad in rDNS | |||||
| 1472 | | (?:dsl|adsl|cable|broad|dial|dynamic|dhcp|ppp| | |||||
| 1473 | residential|cust|home|pool|client|user| | |||||
| 1474 | static\d|host\d) | |||||
| 1475 | /xi) { | |||||
| 1476 | 27 | 43 | $flag->('HIGH', 'residential_sending_ip', | |||
| 1477 | "Sending IP $orig->{ip} rDNS '$orig->{rdns}' looks like a broadband/residential line, not a legitimate mail server"); | |||||
| 1478 | } | |||||
| 1479 | ||||||
| 1480 | # Absence of rDNS is a strong spam indicator | |||||
| 1481 | 198 | 345 | if (!$orig->{rdns} || $orig->{rdns} eq '(no reverse DNS)') { | |||
| 1482 | 8 | 35 | $flag->('HIGH', 'no_reverse_dns', | |||
| 1483 | "Sending IP $orig->{ip} has no reverse DNS -- legitimate mail servers always have rDNS"); | |||||
| 1484 | } | |||||
| 1485 | ||||||
| 1486 | # Low-confidence origin means the IP came from an unverifiable header | |||||
| 1487 | 198 | 206 | if ($orig->{confidence} eq 'low') { | |||
| 1488 | 3 | 7 | $flag->('MEDIUM', 'low_confidence_origin', | |||
| 1489 | "Originating IP taken from unverified header ($orig->{note})"); | |||||
| 1490 | } | |||||
| 1491 | ||||||
| 1492 | # Statistically high-volume spam countries (informational only) | |||||
| 1493 | 198 | 281 | if ($orig->{country} && $orig->{country} =~ /^(?:CN|RU|NG|VN|IN|PK|BD)$/) { | |||
| 1494 | $flag->('INFO', 'high_spam_country', | |||||
| 1495 | 18 | 37 | 'Sending IP is in ' . _country_name($orig->{country}) . | |||
| 1496 | " ($orig->{country}) -- statistically high spam volume country"); | |||||
| 1497 | } | |||||
| 1498 | } | |||||
| 1499 | ||||||
| 1500 | # --- Group 2: Email authentication checks --- | |||||
| 1501 | 241 | 269 | my $auth = $self->_parse_auth_results_cached(); | |||
| 1502 | 241 | 245 | if (defined $auth->{spf}) { | |||
| 1503 | 19 | 52 | if ($auth->{spf} =~ /^fail/i) { | |||
| 1504 | 6 | 10 | $flag->('HIGH', 'spf_fail', | |||
| 1505 | "SPF result: $auth->{spf} -- sending IP not authorised by domain's SPF record"); | |||||
| 1506 | } elsif ($auth->{spf} =~ /^softfail/i) { | |||||
| 1507 | 1 | 2 | $flag->('MEDIUM', 'spf_softfail', | |||
| 1508 | "SPF result: softfail (~all) -- sending IP not explicitly authorised"); | |||||
| 1509 | } elsif ($auth->{spf} !~ /^pass/i) { | |||||
| 1510 | 0 | 0 | $flag->('HIGH', 'spf_fail', | |||
| 1511 | "SPF result: $auth->{spf} -- sending IP not authorised"); | |||||
| 1512 | } | |||||
| 1513 | } | |||||
| 1514 | 241 | 298 | if (defined $auth->{dkim} && $auth->{dkim} !~ /^pass/i) { | |||
| 1515 | 7 | 15 | $flag->('HIGH', 'dkim_fail', | |||
| 1516 | "DKIM result: $auth->{dkim} -- message signature invalid or absent"); | |||||
| 1517 | } | |||||
| 1518 | 241 | 301 | if (defined $auth->{dmarc} && $auth->{dmarc} !~ /^pass/i) { | |||
| 1519 | 5 | 8 | $flag->('HIGH', 'dmarc_fail', "DMARC result: $auth->{dmarc}"); | |||
| 1520 | } | |||||
| 1521 | ||||||
| 1522 | # DKIM signing domain vs From: domain mismatch check | |||||
| 1523 | 241 | 210 | if ($auth->{dkim_domain}) { | |||
| 1524 | 4 | 6 | my ($from_domain) = ($self->_header_value('from') // '') =~ /\@([\w.-]+)/; | |||
| 1525 | 4 | 6 | if ($from_domain) { | |||
| 1526 | 4 | 6 | my $reg_dkim = _registrable($auth->{dkim_domain}) // $auth->{dkim_domain}; | |||
| 1527 | 4 | 6 | my $reg_from = _registrable(lc $from_domain) // lc $from_domain; | |||
| 1528 | 4 | 5 | if ($reg_dkim ne $reg_from) { | |||
| 1529 | # Passing DKIM with a different domain is normal for ESPs | |||||
| 1530 | 2 | 5 | if ($auth->{dkim} && $auth->{dkim} =~ /^pass/i) { | |||
| 1531 | 1 | 3 | $flag->('INFO', 'dkim_domain_mismatch', | |||
| 1532 | "DKIM signed by '$auth->{dkim_domain}' but From: domain is '$from_domain'" | |||||
| 1533 | . ' -- message sent via third-party sender (normal for bulk/ESP mail)'); | |||||
| 1534 | } else { | |||||
| 1535 | # Failing DKIM plus mismatched domain is more suspicious | |||||
| 1536 | 1 | 2 | $flag->('MEDIUM', 'dkim_domain_mismatch', | |||
| 1537 | "DKIM signed by '$auth->{dkim_domain}' but From: domain is '$from_domain'" | |||||
| 1538 | . ' and DKIM did not pass -- possible impersonation'); | |||||
| 1539 | } | |||||
| 1540 | } | |||||
| 1541 | } | |||||
| 1542 | } | |||||
| 1543 | ||||||
| 1544 | # --- Group 3: Date: header checks --- | |||||
| 1545 | 241 | 246 | my $date_raw = $self->_header_value('date'); | |||
| 1546 | 241 | 468 | if (!$date_raw || $date_raw !~ /\S/) { | |||
| 1547 | 35 | 34 | $flag->('MEDIUM', 'missing_date', | |||
| 1548 | 'No Date: header -- violates RFC 5322; common in spam'); | |||||
| 1549 | } else { | |||||
| 1550 | # Check for an implausible timezone offset (outside real-world bounds) | |||||
| 1551 | 206 | 442 | if ($date_raw =~ /([+-])(\d{2})(\d{2})\s*$/) { | |||
| 1552 | 206 | 309 | my ($sign, $hh, $mm) = ($1, $2, $3); | |||
| 1553 | 206 | 229 | my $offset_mins = $hh * 60 + $mm; | |||
| 1554 | 206 | 545 | my $implausible = $mm >= 60 | |||
| 1555 | || ($sign eq '+' && $offset_mins > $TZ_MAX_POS_MINS) | |||||
| 1556 | || ($sign eq '-' && $offset_mins > $TZ_MAX_NEG_MINS); | |||||
| 1557 | 206 | 217 | if ($implausible) { | |||
| 1558 | 8 | 17 | $flag->('MEDIUM', 'implausible_timezone', | |||
| 1559 | "Date: '$date_raw' contains an implausible timezone offset " | |||||
| 1560 | . "($sign$hh$mm) -- header is likely forged"); | |||||
| 1561 | } | |||||
| 1562 | } | |||||
| 1563 | ||||||
| 1564 | # Check for dates more than DATE_SKEW_DAYS outside the analysis window | |||||
| 1565 | 206 | 202 | my $date_epoch = _parse_rfc2822_date($date_raw); | |||
| 1566 | 206 | 3220 | if (defined $date_epoch) { | |||
| 1567 | 206 | 186 | my $delta = time() - $date_epoch; | |||
| 1568 | 206 | 272 | if ($delta > $DATE_SKEW_DAYS * $SECS_PER_DAY) { | |||
| 1569 | 25 | 43 | $flag->('LOW', 'suspicious_date', | |||
| 1570 | "Date: '$date_raw' is more than $DATE_SKEW_DAYS days in the past"); | |||||
| 1571 | } elsif ($delta < -($DATE_SKEW_DAYS * $SECS_PER_DAY)) { | |||||
| 1572 | 2 | 3 | $flag->('LOW', 'suspicious_date', | |||
| 1573 | "Date: '$date_raw' is more than $DATE_SKEW_DAYS days in the future"); | |||||
| 1574 | } | |||||
| 1575 | } | |||||
| 1576 | } | |||||
| 1577 | ||||||
| 1578 | # --- Group 4: Header identity checks --- | |||||
| 1579 | 241 | 230 | my $from_raw = $self->_header_value('from') // ''; | |||
| 1580 | 241 | 273 | my $from_decoded = $self->_decode_mime_words($from_raw); | |||
| 1581 | ||||||
| 1582 | # Display-name domain spoofing: "PayPal paypal.com" <phish@evil.example> | |||||
| 1583 | 241 | 522 | if ($from_decoded =~ /^"?([^"<]+?)"?\s*<([^>]+)>/) { | |||
| 1584 | 130 | 156 | my ($display, $addr) = ($1, $2); | |||
| 1585 | 130 | 179 | while ($display =~ /\b([\w-]+\.(?:com|net|org|io|co|uk|au|gov|edu))\b/gi) { | |||
| 1586 | 5 | 7 | my $disp_domain = lc $1; | |||
| 1587 | 5 | 9 | my ($addr_domain) = $addr =~ /\@([\w.-]+)/; | |||
| 1588 | 5 | 9 | $addr_domain = lc($addr_domain // ''); | |||
| 1589 | 5 | 8 | my $reg_disp = _registrable($disp_domain); | |||
| 1590 | 5 | 8 | my $reg_addr = _registrable($addr_domain); | |||
| 1591 | 5 | 17 | if ($reg_disp && $reg_addr && $reg_disp ne $reg_addr) { | |||
| 1592 | 5 | 9 | $flag->('HIGH', 'display_name_domain_spoof', | |||
| 1593 | "From: display name mentions '$disp_domain' but actual address is <$addr>"); | |||||
| 1594 | } | |||||
| 1595 | } | |||||
| 1596 | } | |||||
| 1597 | ||||||
| 1598 | # Free webmail sender flag (no corporate infrastructure) | |||||
| 1599 | 241 | 515 | if ($from_raw =~ /\@(gmail|yahoo|hotmail|outlook|live|aol|protonmail|yandex)\./i | |||
| 1600 | || $from_raw =~ /\@mail\.ru(?:[\s>]|$)/i) { | |||||
| 1601 | 26 | 60 | $flag->('MEDIUM', 'free_webmail_sender', | |||
| 1602 | "Message sent from free webmail address ($from_raw)"); | |||||
| 1603 | } | |||||
| 1604 | ||||||
| 1605 | # Reply-To differs from From: -- replies harvested by different address | |||||
| 1606 | 241 | 213 | my $reply_to = $self->_header_value('reply-to'); | |||
| 1607 | 241 | 212 | if ($reply_to) { | |||
| 1608 | 6 | 13 | my ($from_addr) = $from_raw =~ /([\w.+%-]+\@[\w.-]+)/; | |||
| 1609 | 6 | 11 | my ($reply_addr) = $reply_to =~ /([\w.+%-]+\@[\w.-]+)/; | |||
| 1610 | 6 | 27 | if ($from_addr && $reply_addr && lc($from_addr) ne lc($reply_addr)) { | |||
| 1611 | 4 | 6 | $flag->('MEDIUM', 'reply_to_differs_from_from', | |||
| 1612 | "Reply-To ($reply_addr) differs from From: ($from_addr)"); | |||||
| 1613 | } | |||||
| 1614 | } | |||||
| 1615 | ||||||
| 1616 | # Undisclosed or absent To: header | |||||
| 1617 | 241 | 204 | my $to = $self->_header_value('to') // ''; | |||
| 1618 | 241 | 531 | if ($to =~ /undisclosed|:;/ || $to eq '') { | |||
| 1619 | 37 | 45 | $flag->('MEDIUM', 'undisclosed_recipients', | |||
| 1620 | "To: header is '$to' -- message was bulk-sent with hidden recipient list"); | |||||
| 1621 | } | |||||
| 1622 | ||||||
| 1623 | # MIME-encoded Subject (potential filter evasion) | |||||
| 1624 | 241 | 216 | my $subj_raw = $self->_header_value('subject') // ''; | |||
| 1625 | 241 | 275 | if ($subj_raw =~ /=\?[^?]+\?[BQ]\?/i) { | |||
| 1626 | 7 | 12 | $flag->('LOW', 'encoded_subject', | |||
| 1627 | "Subject line is MIME-encoded: '$subj_raw' (decoded: '" | |||||
| 1628 | . $self->_decode_mime_words($subj_raw) . "')"); | |||||
| 1629 | } | |||||
| 1630 | ||||||
| 1631 | # --- Group 5: URL and domain checks --- | |||||
| 1632 | 241 | 170 | my (%shortener_seen, %url_host_seen); | |||
| 1633 | 241 | 261 | for my $u ($self->embedded_urls()) { | |||
| 1634 | # Skip trusted infrastructure -- these are not spam indicators | |||||
| 1635 | 49 | 52 | my $bare = lc $u->{host}; | |||
| 1636 | 49 | 47 | $bare =~ s/^www\.//; | |||
| 1637 | 49 | 58 | next if $self->{trusted_domains}->{$bare}; | |||
| 1638 | 49 | 55 | next if $TRUSTED_DOMAINS{$bare}; | |||
| 1639 | ||||||
| 1640 | # URL shortener hides real destination | |||||
| 1641 | 47 | 120 | if(($URL_SHORTENERS{$bare} || $self->{url_shorteners}->{$bare}) && !$shortener_seen{$bare}++) { | |||
| 1642 | 6 | 9 | $flag->('MEDIUM', 'url_shortener', | |||
| 1643 | "$u->{host} is a URL shortener -- the real destination is hidden"); | |||||
| 1644 | } | |||||
| 1645 | # Plain HTTP provides no encryption | |||||
| 1646 | 47 | 91 | if ($u->{url} =~ m{^http://}i && !$url_host_seen{ $u->{host} }++) { | |||
| 1647 | 5 | 11 | $flag->('LOW', 'http_not_https', | |||
| 1648 | "$u->{host} linked over plain HTTP -- no encryption"); | |||||
| 1649 | } | |||||
| 1650 | } | |||||
| 1651 | ||||||
| 1652 | # Domain-level checks against contact/reply domains | |||||
| 1653 | 241 | 233 | for my $d ($self->mailto_domains()) { | |||
| 1654 | # Recently registered domain is a common phishing indicator | |||||
| 1655 | 109 | 107 | if ($d->{recently_registered}) { | |||
| 1656 | 5 | 35 | $flag->('HIGH', 'recently_registered_domain', | |||
| 1657 | 5 | 12 | "$d->{domain} was registered $d->{registered} (less than ${\$RECENT_REG_DAYS} days ago)"); | |||
| 1658 | } | |||||
| 1659 | ||||||
| 1660 | # Domain expiry checks | |||||
| 1661 | 109 | 100 | if ($d->{expires}) { | |||
| 1662 | 13 | 16 | if(my $exp = $self->_parse_date_to_epoch($d->{expires})) { | |||
| 1663 | 13 | 176 | my $now = time(); | |||
| 1664 | 13 | 11 | my $remaining = $exp - $now; | |||
| 1665 | 13 | 38 | if ($remaining > 0 && $remaining < $EXPIRY_WARN_DAYS * $SECS_PER_DAY) { | |||
| 1666 | 3 | 6 | $flag->('HIGH', 'domain_expires_soon', | |||
| 1667 | "$d->{domain} expires $d->{expires} -- may be a throwaway domain"); | |||||
| 1668 | } elsif ($remaining <= 0) { | |||||
| 1669 | 3 | 6 | $flag->('HIGH', 'domain_expired', | |||
| 1670 | "$d->{domain} expired $d->{expires} -- domain has lapsed"); | |||||
| 1671 | } | |||||
| 1672 | } | |||||
| 1673 | } | |||||
| 1674 | ||||||
| 1675 | # Lookalike domain check (brand name in a non-brand domain) | |||||
| 1676 | 109 | 109 | for my $brand (qw(paypal apple google amazon microsoft netflix ebay | |||
| 1677 | instagram facebook twitter linkedin bankofamerica | |||||
| 1678 | wellsfargo chase barclays hsbc lloyds santander)) { | |||||
| 1679 | 1833 | 4688 | if ($d->{domain} =~ /\Q$brand\E/i && | |||
| 1680 | $d->{domain} !~ /^\Q$brand\E\.(?:com|co\.uk|net|org)$/) { | |||||
| 1681 | 8 | 11 | $flag->('HIGH', 'lookalike_domain', | |||
| 1682 | "$d->{domain} contains brand name '$brand' but is not the real domain -- possible phishing"); | |||||
| 1683 | 8 | 11 | last; | |||
| 1684 | } | |||||
| 1685 | } | |||||
| 1686 | } | |||||
| 1687 | ||||||
| 1688 | # Determine overall risk level from accumulated score | |||||
| 1689 | 241 | 331 | my $level = $score >= $SCORE_HIGH ? 'HIGH' | |||
| 1690 | : $score >= $SCORE_MEDIUM ? 'MEDIUM' | |||||
| 1691 | : $score >= $SCORE_LOW ? 'LOW' | |||||
| 1692 | : 'INFO'; | |||||
| 1693 | ||||||
| 1694 | 241 | 378 | $self->{_risk} = { level => $level, score => $score, flags => \@flags }; | |||
| 1695 | 241 | 839 | return $self->{_risk}; | |||
| 1696 | } | |||||
| 1697 | ||||||
| 1698 | # ----------------------------------------------------------------------- | |||||
| 1699 | # Public: abuse report text | |||||
| 1700 | # ----------------------------------------------------------------------- | |||||
| 1701 | ||||||
| 1702 - 1758 | =head2 abuse_report_text()
Produces a compact, plain-text string suitable for sending as the body of
an abuse report email. It summarises risk level, red flags, originating IP,
abuse contacts, and original message headers. The message body is omitted
to keep the report concise.
Use C<abuse_contacts()> to get the recipient addresses and this method for
the body text.
=head3 Usage
my $text = $analyser->abuse_report_text();
my @contacts = $analyser->abuse_contacts();
for my $c (@contacts) {
send_email(to => $c->{address}, body => $text);
}
=head3 Arguments
None. C<parse_email()> must have been called first.
=head3 Returns
A plain scalar string, newline-terminated, Unix line endings. Never empty
or undef.
=head3 Side Effects
Calls C<risk_assessment()>, C<originating_ip()>, and C<abuse_contacts()>
if not already cached.
=head3 Notes
Output text is sanitised: control characters that could affect terminal or
HTML rendering are stripped from all user-derived content before inclusion.
=head3 API Specification
=head4 Input
[]
=head4 Output
{ type => 'scalar' }
=head3 FORMAL SPECIFICATION
-- Z notation
abuse_report_text == [
Xi Email::Abuse::Investigator;
result! : STRING
]
post: result! /= '' /\ result! ends_with '\n'
=cut | |||||
| 1759 | ||||||
| 1760 | sub abuse_report_text { | |||||
| 1761 | 14 | 3874 | my $self = $_[0]; | |||
| 1762 | 14 | 12 | my @out; | |||
| 1763 | ||||||
| 1764 | 14 | 18 | push @out, 'This is an automated abuse report generated by Email::Abuse::Investigator.', | |||
| 1765 | 'Please investigate the following spam/phishing message.', | |||||
| 1766 | ''; | |||||
| 1767 | ||||||
| 1768 | 14 | 47 | my $risk = $self->risk_assessment(); | |||
| 1769 | 14 | 29 | push @out, "RISK LEVEL: $risk->{level} (score: $risk->{score})", | |||
| 1770 | ''; | |||||
| 1771 | ||||||
| 1772 | # List each red flag with its severity prefix | |||||
| 1773 | 14 14 | 8 21 | if (@{ $risk->{flags} }) { | |||
| 1774 | 9 | 10 | push @out, 'RED FLAGS IDENTIFIED:'; | |||
| 1775 | 9 9 | 9 11 | for my $f (@{ $risk->{flags} }) { | |||
| 1776 | 17 | 24 | push @out, " [$f->{severity}] " . _sanitise_output($f->{detail}); | |||
| 1777 | } | |||||
| 1778 | 9 | 9 | push @out, ''; | |||
| 1779 | } | |||||
| 1780 | ||||||
| 1781 | # Originating IP summary block | |||||
| 1782 | 14 | 16 | my $orig = $self->originating_ip(); | |||
| 1783 | 14 | 17 | if ($orig) { | |||
| 1784 | push @out, 'ORIGINATING IP: ' . _sanitise_output("$orig->{ip} ($orig->{rdns})"), | |||||
| 1785 | 11 | 16 | 'NETWORK OWNER: ' . _sanitise_output($orig->{org}), | |||
| 1786 | ''; | |||||
| 1787 | } | |||||
| 1788 | ||||||
| 1789 | # Email abuse contacts | |||||
| 1790 | 14 | 27 | my @contacts = $self->abuse_contacts(); | |||
| 1791 | 14 | 17 | if (@contacts) { | |||
| 1792 | 12 | 12 | push @out, 'ABUSE CONTACTS:'; | |||
| 1793 | 12 | 25 | push @out, ' ' . _sanitise_output("$_->{address} ($_->{role})") for @contacts; | |||
| 1794 | 12 | 11 | push @out, ''; | |||
| 1795 | } | |||||
| 1796 | ||||||
| 1797 | # Web-form contacts (providers that reject email) | |||||
| 1798 | 14 | 21 | if(my @form_cs = $self->form_contacts()) { | |||
| 1799 | 0 | 0 | push @out, 'WEB-FORM REPORTS REQUIRED:', | |||
| 1800 | ' The following parties do not accept email -- submit manually:'; | |||||
| 1801 | 0 | 0 | for my $c (@form_cs) { | |||
| 1802 | push @out, " [$c->{role}]", | |||||
| 1803 | 0 | 0 | ' Form : ' . _sanitise_output($c->{form}); | |||
| 1804 | 0 | 0 | push @out, ' Domain : ' . _sanitise_output($c->{form_domain}) if $c->{form_domain}; | |||
| 1805 | 0 | 0 | push @out, ' Paste : ' . _sanitise_output($c->{form_paste}) if $c->{form_paste}; | |||
| 1806 | 0 | 0 | push @out, ' Upload : ' . _sanitise_output($c->{form_upload}) if $c->{form_upload}; | |||
| 1807 | } | |||||
| 1808 | 0 | 0 | push @out, ''; | |||
| 1809 | } | |||||
| 1810 | ||||||
| 1811 | # Separator and raw headers (body excluded for brevity) | |||||
| 1812 | 14 | 24 | push @out, '-' x 72, | |||
| 1813 | 'ORIGINAL MESSAGE HEADERS:', | |||||
| 1814 | '-' x 72; | |||||
| 1815 | ||||||
| 1816 | 14 14 | 11 15 | for my $h (@{ $self->{_headers} }) { | |||
| 1817 | 114 | 105 | push @out, _sanitise_output("$h->{name}: $h->{value}"); | |||
| 1818 | } | |||||
| 1819 | 14 | 14 | push @out, ''; | |||
| 1820 | ||||||
| 1821 | 14 | 67 | return join("\n", @out); | |||
| 1822 | } | |||||
| 1823 | ||||||
| 1824 | # ----------------------------------------------------------------------- | |||||
| 1825 | # Public: abuse contacts | |||||
| 1826 | # ----------------------------------------------------------------------- | |||||
| 1827 | ||||||
| 1828 - 1895 | =head2 abuse_contacts()
Collates the complete set of parties that should receive an abuse report:
the sending ISP, URL host operators, contact domain web/mail/DNS/registrar
contacts, account providers identified from key headers, the DKIM signer,
and the ESP identified via List-Unsubscribe.
Addresses are deduplicated globally; if the same address is found via
multiple routes, a single entry is kept and role strings are merged.
=head3 Usage
my @contacts = $analyser->abuse_contacts();
my @addrs = map { $_->{address} } @contacts;
=head3 Arguments
None. C<parse_email()> must have been called first.
=head3 Returns
A list of hashrefs, one per unique abuse address, in discovery order.
Each hashref has keys C<role>, C<roles> (arrayref), C<address>, C<note>,
C<via>. Returns an empty list if no contacts can be determined.
=head3 Side Effects
Triggers C<originating_ip()>, C<embedded_urls()>, and C<mailto_domains()>
if not already cached.
=head3 Notes
The result is not independently cached; each call recomputes the contact
list from the cached results of the underlying methods.
=head3 API Specification
=head4 Input
[]
=head4 Output
(
{
type => 'hashref',
keys => {
role => { type => 'scalar' },
roles => { type => 'arrayref' },
address => { type => 'scalar', regex => qr/\@/ },
note => { type => 'scalar' },
via => { type => 'scalar', regex => qr/^(?:provider-table|ip-whois|domain-whois)$/ },
},
},
...
)
=head3 FORMAL SPECIFICATION
-- Z notation
abuse_contacts == [
Xi Email::Abuse::Investigator;
result! : seq CONTACT_INFO
]
post: forall c : result! @ c.address contains '@' /\
forall c1, c2 : result! @ c1 /= c2 => c1.address /= c2.address
=cut | |||||
| 1896 | ||||||
| 1897 | sub abuse_contacts { | |||||
| 1898 | 136 | 3008 | my $self = $_[0]; | |||
| 1899 | ||||||
| 1900 | 136 | 104 | my (@contacts, %seen_idx); | |||
| 1901 | ||||||
| 1902 | # Inner closure: add one contact entry, merging roles for duplicate addresses | |||||
| 1903 | my $add = sub { | |||||
| 1904 | 296 | 473 | my (%args) = @_; | |||
| 1905 | 296 | 346 | my $addr = lc($args{address} // ''); | |||
| 1906 | 296 | 442 | return unless $addr && $addr =~ /\@/; | |||
| 1907 | ||||||
| 1908 | # Suppress addresses belonging to form-only providers (no email accepted) | |||||
| 1909 | 294 | 428 | if ($addr =~ /\@([\w.-]+)$/) { | |||
| 1910 | 294 | 248 | my $dom = $1; | |||
| 1911 | 294 | 258 | my $pa = $self->_provider_abuse_for_host($dom); | |||
| 1912 | 294 | 373 | return if $pa && $pa->{form} && !$pa->{email}; | |||
| 1913 | } | |||||
| 1914 | ||||||
| 1915 | 290 | 324 | if (exists $seen_idx{$addr}) { | |||
| 1916 | # Merge the new role into the existing entry | |||||
| 1917 | 145 | 108 | my $entry = $contacts[ $seen_idx{$addr} ]; | |||
| 1918 | 145 145 | 88 148 | push @{ $entry->{roles} }, $args{role}; | |||
| 1919 | ||||||
| 1920 | # Collapse repeated role labels to avoid unreadable strings | |||||
| 1921 | 145 | 100 | my (%role_counts, @ordered_roles); | |||
| 1922 | 145 145 | 86 114 | for my $r (@{ $entry->{roles} }) { | |||
| 1923 | 1219 | 1155 | push @ordered_roles, $r unless $role_counts{$r}++; | |||
| 1924 | } | |||||
| 1925 | my @display = map { | |||||
| 1926 | 145 1212 | 131 1020 | $role_counts{$_} > 1 ? "$_ (x$role_counts{$_})" : $_ | |||
| 1927 | } @ordered_roles; | |||||
| 1928 | 145 | 211 | my $joined = join(' and ', @display); | |||
| 1929 | ||||||
| 1930 | # Summarise if the merged string is too long to read | |||||
| 1931 | 145 | 157 | if (length($joined) > $ROLE_MAX_LEN) { | |||
| 1932 | my @short = map { | |||||
| 1933 | 73 1051 | 55 1103 | (my $s = $_) =~ s/[:(\d].*//; | |||
| 1934 | 1051 | 980 | $s =~ s/\s+$//; | |||
| 1935 | 1051 | 764 | $s; | |||
| 1936 | } @display; | |||||
| 1937 | 73 | 142 | $joined = scalar(@display) . ' routes: ' . join(', ', @short); | |||
| 1938 | } | |||||
| 1939 | 145 | 111 | $entry->{role} = $joined; | |||
| 1940 | 145 | 315 | return; | |||
| 1941 | } | |||||
| 1942 | ||||||
| 1943 | # First time seeing this address -- record and store | |||||
| 1944 | 145 | 163 | $seen_idx{$addr} = scalar @contacts; | |||
| 1945 | 145 | 173 | $args{roles} = [ $args{role} ]; | |||
| 1946 | 145 | 185 | push @contacts, \%args; | |||
| 1947 | 136 | 369 | }; | |||
| 1948 | ||||||
| 1949 | # Route 1 -- Sending ISP (originating IP) | |||||
| 1950 | 136 | 148 | my $orig = $self->originating_ip(); | |||
| 1951 | 136 | 161 | if ($orig) { | |||
| 1952 | 71 | 111 | my $pa = $self->_provider_abuse_for_ip($orig->{ip}, $orig->{rdns}); | |||
| 1953 | 71 | 110 | if ($pa) { | |||
| 1954 | $add->( | |||||
| 1955 | role => 'Sending ISP', | |||||
| 1956 | address => $pa->{email}, | |||||
| 1957 | 13 | 29 | note => "$orig->{ip} ($orig->{rdns}) -- $pa->{note}", | |||
| 1958 | via => 'provider-table', | |||||
| 1959 | ); | |||||
| 1960 | } | |||||
| 1961 | 71 | 156 | if ($orig->{abuse} && $orig->{abuse} ne '(unknown)') { | |||
| 1962 | $add->( | |||||
| 1963 | role => 'Sending ISP', | |||||
| 1964 | address => $orig->{abuse}, | |||||
| 1965 | 64 | 124 | note => "Network owner of originating IP $orig->{ip} ($orig->{org})", | |||
| 1966 | via => 'ip-whois', | |||||
| 1967 | ); | |||||
| 1968 | } | |||||
| 1969 | } | |||||
| 1970 | ||||||
| 1971 | # Route 2 -- URL hosts | |||||
| 1972 | 136 | 108 | my %url_host_seen; | |||
| 1973 | 136 | 150 | for my $u ($self->embedded_urls()) { | |||
| 1974 | 78 | 120 | next if $url_host_seen{ $u->{host} }++; | |||
| 1975 | 66 | 69 | my $bare_host = lc $u->{host}; | |||
| 1976 | 66 | 71 | $bare_host =~ s/^www\.//; | |||
| 1977 | # Skip trusted infrastructure (Google, W3C, etc.) | |||||
| 1978 | 66 | 79 | next if $self->{trusted_domains}->{$bare_host}; | |||
| 1979 | 66 | 86 | next if $TRUSTED_DOMAINS{$bare_host}; | |||
| 1980 | 63 | 73 | my $pa = $self->_provider_abuse_for_host($u->{host}); | |||
| 1981 | 63 | 64 | if ($pa) { | |||
| 1982 | $add->( | |||||
| 1983 | role => "URL host: $u->{host}", | |||||
| 1984 | address => $pa->{email}, | |||||
| 1985 | 15 | 71 | note => "$u->{host} -- $pa->{note}", | |||
| 1986 | via => 'provider-table', | |||||
| 1987 | ); | |||||
| 1988 | } | |||||
| 1989 | 63 | 117 | if ($u->{abuse} && $u->{abuse} ne '(unknown)') { | |||
| 1990 | $add->( | |||||
| 1991 | role => "URL host: $u->{host}", | |||||
| 1992 | address => $u->{abuse}, | |||||
| 1993 | 39 | 84 | note => "Hosting $u->{host} ($u->{ip}, $u->{org})", | |||
| 1994 | via => 'ip-whois', | |||||
| 1995 | ); | |||||
| 1996 | } | |||||
| 1997 | } | |||||
| 1998 | ||||||
| 1999 | # Route 3 -- Contact domain hosting and registration | |||||
| 2000 | 136 | 147 | for my $d ($self->mailto_domains()) { | |||
| 2001 | 152 | 144 | my $dom = $d->{domain}; | |||
| 2002 | ||||||
| 2003 | # Web host contact | |||||
| 2004 | 152 | 187 | if ($d->{web_abuse}) { | |||
| 2005 | 57 | 58 | my $pa = $self->_provider_abuse_for_host($dom); | |||
| 2006 | 57 | 45 | if ($pa) { | |||
| 2007 | $add->(role => "Web host of $dom", address => $pa->{email}, | |||||
| 2008 | 0 | 0 | note => $pa->{note}, via => 'provider-table'); | |||
| 2009 | } | |||||
| 2010 | $add->( | |||||
| 2011 | role => "Web host of $dom", | |||||
| 2012 | address => $d->{web_abuse}, | |||||
| 2013 | note => sprintf('Hosting %s (%s, %s)', | |||||
| 2014 | $dom // '(unknown domain)', | |||||
| 2015 | $d->{web_ip} // '(unknown IP)', | |||||
| 2016 | 57 | 192 | $d->{web_org} // '(unknown org)'), | |||
| 2017 | via => 'ip-whois', | |||||
| 2018 | ); | |||||
| 2019 | } | |||||
| 2020 | ||||||
| 2021 | # MX (mail host) contact | |||||
| 2022 | 152 | 150 | if ($d->{mx_abuse}) { | |||
| 2023 | $add->( | |||||
| 2024 | role => "Mail host (MX) for $dom", | |||||
| 2025 | address => $d->{mx_abuse}, | |||||
| 2026 | note => sprintf('MX %s (%s, %s)', | |||||
| 2027 | $d->{mx_host} // '(unknown host)', | |||||
| 2028 | $d->{mx_ip} // '(unknown IP)', | |||||
| 2029 | 23 | 93 | $d->{mx_org} // '(unknown org)'), | |||
| 2030 | via => 'ip-whois', | |||||
| 2031 | ); | |||||
| 2032 | } | |||||
| 2033 | ||||||
| 2034 | # NS (DNS host) contact | |||||
| 2035 | 152 | 150 | if ($d->{ns_abuse}) { | |||
| 2036 | $add->( | |||||
| 2037 | role => "DNS host (NS) for $dom", | |||||
| 2038 | address => $d->{ns_abuse}, | |||||
| 2039 | note => sprintf('NS %s (%s, %s)', | |||||
| 2040 | $d->{ns_host} // '(unknown host)', | |||||
| 2041 | $d->{ns_ip} // '(unknown IP)', | |||||
| 2042 | 15 | 50 | $d->{ns_org} // '(unknown org)'), | |||
| 2043 | via => 'ip-whois', | |||||
| 2044 | ); | |||||
| 2045 | } | |||||
| 2046 | ||||||
| 2047 | # Domain registrar (skip if domain only seen in spoofable headers) | |||||
| 2048 | 152 | 151 | if ($d->{registrar_abuse}) { | |||
| 2049 | my $spoofable_only = | |||||
| 2050 | $d->{source} =~ /^(?:From:|Return-Path:|Sender:) header$/ && | |||||
| 2051 | !scalar(grep { | |||||
| 2052 | 36 | 67 | $_->{host} && | |||
| 2053 | _registrable($_->{host}) eq (_registrable($dom) // $dom) | |||||
| 2054 | } $self->embedded_urls()); | |||||
| 2055 | 36 | 38 | unless ($spoofable_only) { | |||
| 2056 | $add->( | |||||
| 2057 | role => "Domain registrar for $dom", | |||||
| 2058 | address => $d->{registrar_abuse}, | |||||
| 2059 | 28 | 66 | note => 'Registrar: ' . ($d->{registrar} // '(unknown)'), | |||
| 2060 | via => 'domain-whois', | |||||
| 2061 | ); | |||||
| 2062 | } | |||||
| 2063 | } | |||||
| 2064 | } | |||||
| 2065 | ||||||
| 2066 | # Route 4 -- From:/Reply-To:/Return-Path:/Sender: account provider | |||||
| 2067 | 136 | 132 | for my $hname (qw(from reply-to return-path sender)) { | |||
| 2068 | 544 | 452 | my $val = $self->_header_value($hname) // next; | |||
| 2069 | ||||||
| 2070 | # Extract addr-spec from angle-bracket form to avoid display-name @-signs | |||||
| 2071 | 245 | 501 | my $addr_spec = ($val =~ /<([^>]*)>\s*$/) ? $1 : $val; | |||
| 2072 | 245 | 379 | my ($addr_domain) = $addr_spec =~ /\@([\w.-]+)/; | |||
| 2073 | 245 | 230 | next unless $addr_domain; | |||
| 2074 | ||||||
| 2075 | # Skip SRS-rewritten forwarder addresses (not the real sender) | |||||
| 2076 | 245 | 232 | next if $addr_spec =~ /\+SRS[0-9]?=/i; | |||
| 2077 | ||||||
| 2078 | 243 | 201 | my $pa = $self->_provider_abuse_for_host($addr_domain); | |||
| 2079 | 243 | 253 | if ($pa) { | |||
| 2080 | 37 | 49 | my $role_addr = $addr_spec =~ /\@/ ? $addr_spec : $val; | |||
| 2081 | 37 | 103 | $role_addr =~ s/^\s+|\s+$//g; | |||
| 2082 | $add->( | |||||
| 2083 | role => "Account provider ($hname: $role_addr)", | |||||
| 2084 | address => $pa->{email}, | |||||
| 2085 | note => $pa->{note}, | |||||
| 2086 | 37 | 73 | via => 'provider-table', | |||
| 2087 | ); | |||||
| 2088 | } | |||||
| 2089 | } | |||||
| 2090 | ||||||
| 2091 | # Route 5 -- DKIM signing organisation | |||||
| 2092 | 136 | 139 | my $auth = $self->_parse_auth_results_cached(); | |||
| 2093 | 136 | 138 | if ($auth->{dkim_domain}) { | |||
| 2094 | 3 | 4 | my $pa = $self->_provider_abuse_for_host($auth->{dkim_domain}); | |||
| 2095 | 3 | 3 | if ($pa) { | |||
| 2096 | $add->( | |||||
| 2097 | role => "DKIM signer: $auth->{dkim_domain}", | |||||
| 2098 | address => $pa->{email}, | |||||
| 2099 | note => $pa->{note}, | |||||
| 2100 | 1 | 3 | via => 'provider-table', | |||
| 2101 | ); | |||||
| 2102 | } | |||||
| 2103 | } | |||||
| 2104 | ||||||
| 2105 | # Route 6 -- List-Unsubscribe ESP domain | |||||
| 2106 | 136 | 119 | my $unsub = $self->_header_value('list-unsubscribe'); | |||
| 2107 | 136 | 129 | if ($unsub) { | |||
| 2108 | 0 | 0 | my @unsub_domains; | |||
| 2109 | 0 | 0 | while ($unsub =~ m{https?://([^/:?\s>]+)}gi) { | |||
| 2110 | 0 | 0 | push @unsub_domains, lc $1; | |||
| 2111 | } | |||||
| 2112 | 0 | 0 | while ($unsub =~ m{mailto:[^@\s>]+\@([\w.-]+)}gi) { | |||
| 2113 | 0 | 0 | push @unsub_domains, lc $1; | |||
| 2114 | } | |||||
| 2115 | 0 | 0 | my %unsub_seen; | |||
| 2116 | 0 0 | 0 0 | for my $dom (grep { !$unsub_seen{$_}++ } @unsub_domains) { | |||
| 2117 | 0 | 0 | my $pa = $self->_provider_abuse_for_host($dom); | |||
| 2118 | 0 | 0 | if ($pa) { | |||
| 2119 | $add->( | |||||
| 2120 | role => "ESP / bulk sender (List-Unsubscribe: $dom)", | |||||
| 2121 | address => $pa->{email}, | |||||
| 2122 | 0 | 0 | note => "$pa->{note} -- responsible for this bulk delivery", | |||
| 2123 | via => 'provider-table', | |||||
| 2124 | ); | |||||
| 2125 | } | |||||
| 2126 | } | |||||
| 2127 | } | |||||
| 2128 | ||||||
| 2129 | # Route 7 -- Reply addresses embedded in the message body | |||||
| 2130 | 136 | 100 | my %body_addr_seen; | |||
| 2131 | 136 | 142 | my $combined_body = $self->{_body_plain} . "\n" . $self->{_body_html}; | |||
| 2132 | 136 | 154 | for my $addr_dom ($self->_domains_from_text($combined_body)) { | |||
| 2133 | 20 | 22 | next if $body_addr_seen{$addr_dom}++; | |||
| 2134 | 20 | 26 | my $pa = $self->_provider_abuse_for_host($addr_dom); | |||
| 2135 | 20 | 33 | next unless $pa && $pa->{email}; | |||
| 2136 | 4 | 55 | my ($example_addr) = $combined_body =~ /(\S+\@\Q$addr_dom\E)/i; | |||
| 2137 | 4 | 4 | $example_addr //= "\@$addr_dom"; | |||
| 2138 | $add->( | |||||
| 2139 | role => "Reply address in body ($example_addr)", | |||||
| 2140 | address => $pa->{email}, | |||||
| 2141 | note => $pa->{note}, | |||||
| 2142 | 4 | 6 | via => 'provider-table', | |||
| 2143 | ); | |||||
| 2144 | } | |||||
| 2145 | ||||||
| 2146 | 136 | 991 | return @contacts; | |||
| 2147 | } | |||||
| 2148 | ||||||
| 2149 | # ----------------------------------------------------------------------- | |||||
| 2150 | # Public: form contacts (providers that require web-form submission) | |||||
| 2151 | # ----------------------------------------------------------------------- | |||||
| 2152 | ||||||
| 2153 - 2219 | =head2 form_contacts()
Returns the list of parties that require abuse reports via a web form
rather than email. These are providers whose C<%PROVIDER_ABUSE> entry
has a C<form> key. Each hashref includes the form URL, paste
instructions, upload instructions, and the discovery role.
=head3 Usage
my @forms = $analyser->form_contacts();
for my $c (@forms) {
printf "Open: %s\n", $c->{form};
}
=head3 Arguments
None. C<parse_email()> must have been called first.
=head3 Returns
A list of hashrefs, one per unique form contact. Each hashref has keys
C<form>, C<role>, C<note>, C<form_paste> (optional), C<form_upload>
(optional), and C<via>. Returns an empty list if no form contacts are found.
=head3 Side Effects
Triggers C<originating_ip()>, C<embedded_urls()>, and C<mailto_domains()>
if not already cached.
=head3 Notes
Deduplication is by form URL.
=head3 API Specification
=head4 Input
[]
=head4 Output
(
{
type => 'hashref',
keys => {
form => { type => 'scalar', regex => qr{^https?://} },
role => { type => 'scalar' },
note => { type => 'scalar' },
form_paste => { type => 'scalar', optional => 1 },
form_upload => { type => 'scalar', optional => 1 },
via => { type => 'scalar' },
},
},
...
)
=head3 FORMAL SPECIFICATION
-- Z notation
form_contacts == [
Xi Email::Abuse::Investigator;
result! : seq FORM_CONTACT_INFO
]
post: forall c : result! @ c.form =~ m{^https?://} /\
forall c1, c2 : result! @ c1 /= c2 => c1.form /= c2.form
=cut | |||||
| 2220 | ||||||
| 2221 | sub form_contacts { | |||||
| 2222 | 81 | 938 | my $self = $_[0]; | |||
| 2223 | ||||||
| 2224 | 81 | 64 | my (@contacts, %seen); | |||
| 2225 | ||||||
| 2226 | # Inner closure: add one form-contact entry, deduplicating by form URL | |||||
| 2227 | my $add = sub { | |||||
| 2228 | 11 | 45 | my (%args) = @_; | |||
| 2229 | 11 | 15 | my $form = $args{form} // ''; | |||
| 2230 | 11 | 12 | return unless $form; | |||
| 2231 | 11 | 15 | return if $seen{$form}++; | |||
| 2232 | 9 | 14 | push @contacts, \%args; | |||
| 2233 | 81 | 133 | }; | |||
| 2234 | ||||||
| 2235 | # Route 1 -- Sending ISP | |||||
| 2236 | 81 | 85 | my $orig = $self->originating_ip(); | |||
| 2237 | 81 | 80 | if ($orig) { | |||
| 2238 | 49 | 65 | my $pa = $self->_provider_abuse_for_ip($orig->{ip}, $orig->{rdns}); | |||
| 2239 | 49 | 59 | if ($pa && $pa->{form}) { | |||
| 2240 | $add->( | |||||
| 2241 | role => 'Sending ISP', | |||||
| 2242 | form => $pa->{form}, | |||||
| 2243 | note => $pa->{note} // '', | |||||
| 2244 | form_paste => $pa->{form_paste} // '', | |||||
| 2245 | 0 | 0 | form_upload => $pa->{form_upload} // '', | |||
| 2246 | via => 'provider-table', | |||||
| 2247 | ); | |||||
| 2248 | } | |||||
| 2249 | } | |||||
| 2250 | ||||||
| 2251 | # Route 2 -- URL hosts | |||||
| 2252 | 81 | 53 | my %url_host_seen; | |||
| 2253 | 81 | 76 | for my $u ($self->embedded_urls()) { | |||
| 2254 | 48 | 65 | next if $url_host_seen{ $u->{host} }++; | |||
| 2255 | 39 | 45 | my $pa = $self->_provider_abuse_for_host($u->{host}); | |||
| 2256 | 39 | 65 | if ($pa && $pa->{form}) { | |||
| 2257 | $add->( | |||||
| 2258 | role => "URL host: $u->{host}", | |||||
| 2259 | form => $pa->{form}, | |||||
| 2260 | form_domain => $u->{host}, | |||||
| 2261 | note => $pa->{note} // '', | |||||
| 2262 | form_paste => $pa->{form_paste} // '', | |||||
| 2263 | 0 | 0 | form_upload => $pa->{form_upload} // '', | |||
| 2264 | via => 'provider-table', | |||||
| 2265 | ); | |||||
| 2266 | } | |||||
| 2267 | } | |||||
| 2268 | ||||||
| 2269 | # Route 3 -- Contact domains (web host + registrar) | |||||
| 2270 | 81 | 81 | for my $d ($self->mailto_domains()) { | |||
| 2271 | 88 | 83 | my $dom = $d->{domain}; | |||
| 2272 | 88 | 72 | my $pa = $self->_provider_abuse_for_host($dom); | |||
| 2273 | 88 | 92 | if ($pa && $pa->{form}) { | |||
| 2274 | $add->( | |||||
| 2275 | role => "Web host of $dom", | |||||
| 2276 | form => $pa->{form}, | |||||
| 2277 | form_domain => $dom, | |||||
| 2278 | note => $pa->{note} // '', | |||||
| 2279 | form_paste => $pa->{form_paste} // '', | |||||
| 2280 | 0 | 0 | form_upload => $pa->{form_upload} // '', | |||
| 2281 | via => 'provider-table', | |||||
| 2282 | ); | |||||
| 2283 | } | |||||
| 2284 | ||||||
| 2285 | # Registrar identified via WHOIS -- check for form-only registrar | |||||
| 2286 | 88 | 132 | if ($d->{registrar_abuse} && $d->{registrar_abuse} =~ /\@([\w.-]+)/) { | |||
| 2287 | 18 | 21 | my $reg_domain = lc $1; | |||
| 2288 | 18 | 14 | my $rpa = $self->_provider_abuse_for_host($reg_domain); | |||
| 2289 | 18 | 32 | if ($rpa && $rpa->{form}) { | |||
| 2290 | $add->( | |||||
| 2291 | role => "Domain registrar for $dom (web form only)", | |||||
| 2292 | form => $rpa->{form}, | |||||
| 2293 | form_domain => $dom, | |||||
| 2294 | note => $rpa->{note} // '', | |||||
| 2295 | form_paste => $rpa->{form_paste} // '', | |||||
| 2296 | 11 | 34 | form_upload => $rpa->{form_upload} // '', | |||
| 2297 | via => 'provider-table', | |||||
| 2298 | ); | |||||
| 2299 | } | |||||
| 2300 | } | |||||
| 2301 | } | |||||
| 2302 | ||||||
| 2303 | # Route 4 -- Account provider headers | |||||
| 2304 | 81 | 79 | for my $hname (qw(from reply-to return-path sender)) { | |||
| 2305 | 324 | 240 | my $val = $self->_header_value($hname) // next; | |||
| 2306 | 142 | 226 | my $addr_spec = ($val =~ /<([^>]*)>\s*$/) ? $1 : $val; | |||
| 2307 | 142 | 214 | my ($addr_domain) = $addr_spec =~ /\@([\w.-]+)/; | |||
| 2308 | 142 | 121 | next unless $addr_domain; | |||
| 2309 | # Skip SRS forwarder rewrite addresses | |||||
| 2310 | 142 | 141 | next if $addr_spec =~ /\+SRS[0-9]?=/i; | |||
| 2311 | 142 | 125 | my $pa = $self->_provider_abuse_for_host($addr_domain); | |||
| 2312 | 142 | 150 | if ($pa && $pa->{form}) { | |||
| 2313 | 0 | 0 | my $role_addr = $addr_spec =~ /@/ ? $addr_spec : $val; | |||
| 2314 | 0 | 0 | $role_addr =~ s/^\s+|\s+$//g; | |||
| 2315 | $add->( | |||||
| 2316 | role => "Account provider ($hname: $role_addr)", | |||||
| 2317 | form => $pa->{form}, | |||||
| 2318 | note => $pa->{note} // '', | |||||
| 2319 | form_paste => $pa->{form_paste} // '', | |||||
| 2320 | 0 | 0 | form_upload => $pa->{form_upload} // '', | |||
| 2321 | via => 'provider-table', | |||||
| 2322 | ); | |||||
| 2323 | } | |||||
| 2324 | } | |||||
| 2325 | ||||||
| 2326 | # Route 5 -- DKIM signer | |||||
| 2327 | 81 | 74 | my $auth = $self->_parse_auth_results_cached(); | |||
| 2328 | 81 | 85 | if ($auth->{dkim_domain}) { | |||
| 2329 | 1 | 2 | my $pa = $self->_provider_abuse_for_host($auth->{dkim_domain}); | |||
| 2330 | 1 | 2 | if ($pa && $pa->{form}) { | |||
| 2331 | $add->( | |||||
| 2332 | role => "DKIM signer: $auth->{dkim_domain}", | |||||
| 2333 | form => $pa->{form}, | |||||
| 2334 | note => $pa->{note} // '', | |||||
| 2335 | form_paste => $pa->{form_paste} // '', | |||||
| 2336 | 0 | 0 | form_upload => $pa->{form_upload} // '', | |||
| 2337 | via => 'provider-table', | |||||
| 2338 | ); | |||||
| 2339 | } | |||||
| 2340 | } | |||||
| 2341 | ||||||
| 2342 | # Route 6 -- List-Unsubscribe ESP domains | |||||
| 2343 | 81 | 71 | my $unsub = $self->_header_value('list-unsubscribe'); | |||
| 2344 | 81 | 79 | if ($unsub) { | |||
| 2345 | 0 | 0 | my @unsub_domains; | |||
| 2346 | 0 0 | 0 0 | while ($unsub =~ m{https?://([^/:?\s>]+)}gi) { push @unsub_domains, lc $1 } | |||
| 2347 | 0 0 | 0 0 | while ($unsub =~ m{mailto:[^@\s>]+\@([\w.-]+)}gi) { push @unsub_domains, lc $1 } | |||
| 2348 | 0 | 0 | my %useen; | |||
| 2349 | 0 0 | 0 0 | for my $dom (grep { !$useen{$_}++ } @unsub_domains) { | |||
| 2350 | 0 | 0 | my $pa = $self->_provider_abuse_for_host($dom); | |||
| 2351 | 0 | 0 | if ($pa && $pa->{form}) { | |||
| 2352 | $add->( | |||||
| 2353 | role => "ESP / bulk sender (List-Unsubscribe: $dom)", | |||||
| 2354 | form => $pa->{form}, | |||||
| 2355 | note => $pa->{note} // '', | |||||
| 2356 | form_paste => $pa->{form_paste} // '', | |||||
| 2357 | 0 | 0 | form_upload => $pa->{form_upload} // '', | |||
| 2358 | via => 'provider-table', | |||||
| 2359 | ); | |||||
| 2360 | } | |||||
| 2361 | } | |||||
| 2362 | } | |||||
| 2363 | ||||||
| 2364 | 81 | 233 | return @contacts; | |||
| 2365 | } | |||||
| 2366 | ||||||
| 2367 | # ----------------------------------------------------------------------- | |||||
| 2368 | # Public: full analyst report | |||||
| 2369 | # ----------------------------------------------------------------------- | |||||
| 2370 | ||||||
| 2371 - 2427 | =head2 report()
Produces a comprehensive, analyst-facing plain-text report covering all
findings: envelope fields, risk assessment, originating host, sending
software, received chain tracking IDs, embedded URLs, contact domain
intelligence, and recommended abuse contacts.
Use C<report()> for human review or ticketing systems. Use
C<abuse_report_text()> for sending to ISP abuse desks.
=head3 Usage
print $analyser->report();
open my $fh, '>', 'report.txt' or croak "Cannot open: $!";
print $fh $analyser->report();
close $fh;
=head3 Arguments
None. C<parse_email()> must have been called first.
=head3 Returns
A plain scalar string, newline-terminated, Unix line endings. Never empty
or undef.
=head3 Side Effects
Triggers all analysis methods if not already cached.
=head3 Notes
The report is idempotent: calling it multiple times on the same object
always returns an identical string. All user-derived content is sanitised
before output.
=head3 API Specification
=head4 Input
[]
=head4 Output
{ type => 'scalar' }
=head3 FORMAL SPECIFICATION
-- Z notation
report == [
Xi Email::Abuse::Investigator;
result! : STRING
]
post: result! /= '' /\ result! ends_with '\n'
=cut | |||||
| 2428 | ||||||
| 2429 | sub report { | |||||
| 2430 | 48 | 6246 | my $self = $_[0]; | |||
| 2431 | ||||||
| 2432 | 48 | 37 | my @out; | |||
| 2433 | ||||||
| 2434 | # Banner header | |||||
| 2435 | 48 | 52 | push @out, '=' x 72; | |||
| 2436 | 48 | 61 | push @out, " Email::Abuse::Investigator Report (v$VERSION)"; | |||
| 2437 | 48 | 38 | push @out, '=' x 72; | |||
| 2438 | 48 | 33 | push @out, ''; | |||
| 2439 | ||||||
| 2440 | # Envelope summary -- decode MIME encoded-words for readability | |||||
| 2441 | 48 | 51 | for my $f (qw(from reply-to return-path subject date message-id)) { | |||
| 2442 | 288 | 266 | my $v = $self->_header_value($f); | |||
| 2443 | 288 | 257 | next unless defined $v; | |||
| 2444 | 207 | 174 | my $decoded = $self->_decode_mime_words($v); | |||
| 2445 | 207 | 204 | my $label = ucfirst($f); | |||
| 2446 | 207 | 212 | push @out, sprintf(' %-14s : %s', $label, | |||
| 2447 | _sanitise_output($decoded ne $v ? "$decoded [encoded: $v]" : $v)); | |||||
| 2448 | } | |||||
| 2449 | 48 | 39 | push @out, ''; | |||
| 2450 | ||||||
| 2451 | # Risk assessment section | |||||
| 2452 | 48 | 59 | my $risk = $self->risk_assessment(); | |||
| 2453 | 48 | 74 | push @out, "[ RISK ASSESSMENT: $risk->{level} (score: $risk->{score}) ]"; | |||
| 2454 | 48 48 | 35 59 | if (@{ $risk->{flags} }) { | |||
| 2455 | 25 25 | 18 28 | for my $f (@{ $risk->{flags} }) { | |||
| 2456 | 50 | 68 | push @out, " [$f->{severity}] " . _sanitise_output($f->{detail}); | |||
| 2457 | } | |||||
| 2458 | } else { | |||||
| 2459 | 23 | 19 | push @out, ' (no specific red flags detected)'; | |||
| 2460 | } | |||||
| 2461 | 48 | 44 | push @out, ''; | |||
| 2462 | ||||||
| 2463 | # Originating host section | |||||
| 2464 | 48 | 36 | push @out, '[ ORIGINATING HOST ]'; | |||
| 2465 | 48 | 74 | my $orig = $self->originating_ip(); | |||
| 2466 | 48 | 75 | if ($orig) { | |||
| 2467 | 34 | 40 | push @out, ' IP : ' . _sanitise_output($orig->{ip}); | |||
| 2468 | 34 | 58 | push @out, ' Reverse DNS : ' . _sanitise_output($orig->{rdns}) if $orig->{rdns}; | |||
| 2469 | 34 | 44 | push @out, ' Country : ' . _sanitise_output($orig->{country}) if $orig->{country}; | |||
| 2470 | 34 | 51 | push @out, ' Organisation : ' . _sanitise_output($orig->{org}) if $orig->{org}; | |||
| 2471 | 34 | 50 | push @out, ' Abuse addr : ' . _sanitise_output($orig->{abuse}) if $orig->{abuse}; | |||
| 2472 | 34 | 35 | push @out, " Confidence : $orig->{confidence}"; | |||
| 2473 | 34 | 46 | push @out, ' Note : ' . _sanitise_output($orig->{note}) if $orig->{note}; | |||
| 2474 | } else { | |||||
| 2475 | 14 | 13 | push @out, ' (could not determine originating IP)'; | |||
| 2476 | } | |||||
| 2477 | 48 | 38 | push @out, ''; | |||
| 2478 | ||||||
| 2479 | # Sending software section (omitted if none found) | |||||
| 2480 | 48 | 61 | my @sw = $self->sending_software(); | |||
| 2481 | 48 | 54 | if (@sw) { | |||
| 2482 | 0 | 0 | push @out, '[ SENDING SOFTWARE / INFRASTRUCTURE CLUES ]'; | |||
| 2483 | 0 | 0 | for my $s (@sw) { | |||
| 2484 | 0 | 0 | push @out, sprintf(' %-14s : %s', $s->{header}, _sanitise_output($s->{value})); | |||
| 2485 | 0 | 0 | push @out, " Note : $s->{note}"; | |||
| 2486 | 0 | 0 | push @out, ''; | |||
| 2487 | } | |||||
| 2488 | } | |||||
| 2489 | ||||||
| 2490 | # Received chain tracking IDs (only hops with id or for are shown) | |||||
| 2491 | 48 41 | 62 109 | my @trail = grep { defined $_->{id} || defined $_->{for} } | |||
| 2492 | $self->received_trail(); | |||||
| 2493 | 48 | 55 | if (@trail) { | |||
| 2494 | 0 | 0 | push @out, '[ RECEIVED CHAIN TRACKING IDs ]'; | |||
| 2495 | 0 | 0 | push @out, ' (Supply these to the relevant ISP abuse team to trace the session)'; | |||
| 2496 | 0 | 0 | push @out, ''; | |||
| 2497 | 0 | 0 | for my $hop (@trail) { | |||
| 2498 | 0 | 0 | push @out, ' IP : ' . (_sanitise_output($hop->{ip}) // '(unknown)'); | |||
| 2499 | 0 | 0 | push @out, ' Envelope for : ' . _sanitise_output($hop->{for}) if $hop->{for}; | |||
| 2500 | 0 | 0 | push @out, ' Server ID : ' . _sanitise_output($hop->{id}) if $hop->{id}; | |||
| 2501 | 0 | 0 | push @out, ''; | |||
| 2502 | } | |||||
| 2503 | } | |||||
| 2504 | ||||||
| 2505 | # Embedded URLs section -- grouped by hostname | |||||
| 2506 | 48 | 39 | push @out, '[ EMBEDDED HTTP/HTTPS URLs ]'; | |||
| 2507 | 48 | 49 | my @urls = $self->embedded_urls(); | |||
| 2508 | 48 | 49 | if (@urls) { | |||
| 2509 | 19 | 14 | my (%host_order, %host_meta, %host_paths); | |||
| 2510 | 19 | 16 | my $seq = 0; | |||
| 2511 | 19 | 19 | for my $u (@urls) { | |||
| 2512 | 30 | 23 | my $h = $u->{host}; | |||
| 2513 | 30 | 33 | unless (exists $host_order{$h}) { | |||
| 2514 | 21 | 21 | $host_order{$h} = $seq++; | |||
| 2515 | $host_meta{$h} = { | |||||
| 2516 | ip => $u->{ip}, | |||||
| 2517 | org => $u->{org}, | |||||
| 2518 | abuse => $u->{abuse}, | |||||
| 2519 | country => $u->{country}, | |||||
| 2520 | 21 | 44 | }; | |||
| 2521 | } | |||||
| 2522 | 30 30 | 20 42 | push @{ $host_paths{$h} }, $u->{url}; | |||
| 2523 | } | |||||
| 2524 | ||||||
| 2525 | # Output each host group in first-seen order | |||||
| 2526 | 19 2 | 26 4 | for my $h (sort { $host_order{$a} <=> $host_order{$b} } keys %host_order) { | |||
| 2527 | 21 | 19 | my $m = $host_meta{$h}; | |||
| 2528 | 21 21 | 24 22 | my $bare = lc $h; $bare =~ s/^www\.//; | |||
| 2529 | push @out, ' Host : ' . _sanitise_output($h) . | |||||
| 2530 | 21 | 24 | (($URL_SHORTENERS{$bare} || $self->{url_shorteners}->{$bare}) | |||
| 2531 | ? ' *** URL SHORTENER -- real destination hidden ***' : ''); | |||||
| 2532 | 21 | 35 | push @out, ' IP : ' . _sanitise_output($m->{ip}) if $m->{ip}; | |||
| 2533 | 21 | 29 | push @out, ' Country : ' . _sanitise_output($m->{country}) if $m->{country}; | |||
| 2534 | 21 | 32 | push @out, ' Organisation : ' . _sanitise_output($m->{org}) if $m->{org}; | |||
| 2535 | 21 | 34 | push @out, ' Abuse addr : ' . _sanitise_output($m->{abuse}) if $m->{abuse}; | |||
| 2536 | 21 21 | 19 22 | my @paths = @{ $host_paths{$h} }; | |||
| 2537 | 21 | 24 | if (@paths == 1) { | |||
| 2538 | 15 | 17 | push @out, ' URL : ' . _sanitise_output($paths[0]); | |||
| 2539 | } else { | |||||
| 2540 | 6 | 6 | push @out, ' URLs (' . scalar(@paths) . ') :'; | |||
| 2541 | 6 | 11 | push @out, ' ' . _sanitise_output($_) for @paths; | |||
| 2542 | } | |||||
| 2543 | 21 | 48 | push @out, ''; | |||
| 2544 | } | |||||
| 2545 | } else { | |||||
| 2546 | 29 | 25 | push @out, ' (none found)'; | |||
| 2547 | 29 | 23 | push @out, ''; | |||
| 2548 | } | |||||
| 2549 | ||||||
| 2550 | # Contact / reply-to domains section | |||||
| 2551 | 48 | 39 | push @out, '[ CONTACT / REPLY-TO DOMAINS ]'; | |||
| 2552 | 48 | 71 | my @mdoms = $self->mailto_domains(); | |||
| 2553 | 48 | 68 | if (@mdoms) { | |||
| 2554 | 36 | 33 | for my $d (@mdoms) { | |||
| 2555 | 49 | 55 | push @out, ' Domain : ' . _sanitise_output($d->{domain}); | |||
| 2556 | 49 | 74 | push @out, ' Found in : ' . _sanitise_output($d->{source}); | |||
| 2557 | 49 | 50 | if ($d->{recently_registered}) { | |||
| 2558 | 3 | 3 | push @out, ' *** WARNING: RECENTLY REGISTERED - possible phishing domain ***'; | |||
| 2559 | } | |||||
| 2560 | 49 | 53 | push @out, ' Registered : ' . $d->{registered} if $d->{registered}; | |||
| 2561 | 49 | 46 | push @out, ' Expires : ' . $d->{expires} if $d->{expires}; | |||
| 2562 | 49 | 44 | push @out, ' Registrar : ' . _sanitise_output($d->{registrar}) if $d->{registrar}; | |||
| 2563 | 49 | 51 | push @out, ' Reg. abuse : ' . _sanitise_output($d->{registrar_abuse}) if $d->{registrar_abuse}; | |||
| 2564 | 49 | 49 | if ($d->{web_ip}) { | |||
| 2565 | 22 | 24 | push @out, ' Web host IP : ' . _sanitise_output($d->{web_ip}); | |||
| 2566 | 22 | 34 | push @out, ' Web host org : ' . _sanitise_output($d->{web_org}) if $d->{web_org}; | |||
| 2567 | 22 | 25 | push @out, ' Web abuse : ' . _sanitise_output($d->{web_abuse}) if $d->{web_abuse}; | |||
| 2568 | } else { | |||||
| 2569 | 27 | 21 | push @out, ' Web host : (no A record / unreachable)'; | |||
| 2570 | } | |||||
| 2571 | 49 | 40 | if ($d->{mx_host}) { | |||
| 2572 | 3 | 4 | push @out, ' MX host : ' . _sanitise_output($d->{mx_host}); | |||
| 2573 | 3 | 7 | push @out, ' MX IP : ' . _sanitise_output($d->{mx_ip}) if $d->{mx_ip}; | |||
| 2574 | 3 | 6 | push @out, ' MX org : ' . _sanitise_output($d->{mx_org}) if $d->{mx_org}; | |||
| 2575 | 3 | 5 | push @out, ' MX abuse : ' . _sanitise_output($d->{mx_abuse}) if $d->{mx_abuse}; | |||
| 2576 | } else { | |||||
| 2577 | 46 | 35 | push @out, ' MX host : (none found)'; | |||
| 2578 | } | |||||
| 2579 | 49 | 45 | if ($d->{ns_host}) { | |||
| 2580 | 2 | 3 | push @out, ' NS host : ' . _sanitise_output($d->{ns_host}); | |||
| 2581 | 2 | 4 | push @out, ' NS IP : ' . _sanitise_output($d->{ns_ip}) if $d->{ns_ip}; | |||
| 2582 | 2 | 4 | push @out, ' NS org : ' . _sanitise_output($d->{ns_org}) if $d->{ns_org}; | |||
| 2583 | 2 | 4 | push @out, ' NS abuse : ' . _sanitise_output($d->{ns_abuse}) if $d->{ns_abuse}; | |||
| 2584 | } | |||||
| 2585 | 49 | 42 | push @out, ''; | |||
| 2586 | } | |||||
| 2587 | } else { | |||||
| 2588 | 12 | 11 | push @out, ' (none found)'; | |||
| 2589 | 12 | 12 | push @out, ''; | |||
| 2590 | } | |||||
| 2591 | ||||||
| 2592 | # Abuse contacts summary | |||||
| 2593 | 48 | 38 | push @out, '[ WHERE TO SEND ABUSE REPORTS ]'; | |||
| 2594 | 48 | 54 | my @contacts = $self->abuse_contacts(); | |||
| 2595 | 48 | 51 | if (@contacts) { | |||
| 2596 | 39 | 38 | for my $c (@contacts) { | |||
| 2597 | 58 | 57 | push @out, ' Role : ' . _sanitise_output($c->{role}); | |||
| 2598 | 58 | 90 | push @out, ' Send to : ' . _sanitise_output($c->{address}); | |||
| 2599 | 58 | 70 | push @out, ' Note : ' . _sanitise_output($c->{note}) if $c->{note}; | |||
| 2600 | 58 | 86 | push @out, " Discovered : $c->{via}"; | |||
| 2601 | 58 | 52 | push @out, ''; | |||
| 2602 | } | |||||
| 2603 | } else { | |||||
| 2604 | 9 | 8 | push @out, ' (no abuse contacts could be determined)'; | |||
| 2605 | 9 | 7 | push @out, ''; | |||
| 2606 | } | |||||
| 2607 | ||||||
| 2608 | # Web-form contacts (providers that require manual form submission) | |||||
| 2609 | 48 | 51 | my @form_cs = $self->form_contacts(); | |||
| 2610 | 48 | 51 | if (@form_cs) { | |||
| 2611 | 2 | 2 | push @out, '[ WHERE TO FILE WEB-FORM REPORTS ]'; | |||
| 2612 | 2 | 3 | push @out, ' The following parties require manual submission via a web form.'; | |||
| 2613 | 2 | 1 | push @out, ' Open each URL in a browser, then follow the instructions below it.'; | |||
| 2614 | 2 | 2 | push @out, ''; | |||
| 2615 | 2 | 2 | for my $c (@form_cs) { | |||
| 2616 | 2 | 3 | push @out, ' Role : ' . _sanitise_output($c->{role}); | |||
| 2617 | 2 | 2 | push @out, ' Form URL : ' . _sanitise_output($c->{form}); | |||
| 2618 | 2 | 5 | push @out, ' Domain/URL : ' . _sanitise_output($c->{form_domain}) if $c->{form_domain}; | |||
| 2619 | 2 | 4 | push @out, ' Note : ' . _sanitise_output($c->{note}) if $c->{note}; | |||
| 2620 | 2 | 3 | if ($c->{form_paste}) { | |||
| 2621 | # Word-wrap the paste hint at ROLE_WRAP_LEN characters | |||||
| 2622 | 2 | 2 | my $hint = $c->{form_paste}; | |||
| 2623 | 2 | 6 | my @words = split /\s+/, $hint; | |||
| 2624 | 2 | 3 | my (@lines, $line); | |||
| 2625 | 2 | 2 | for my $w (@words) { | |||
| 2626 | 59 | 57 | if (defined $line && length("$line $w") > $ROLE_WRAP_LEN) { | |||
| 2627 | 4 | 3 | push @lines, $line; | |||
| 2628 | 4 | 4 | $line = $w; | |||
| 2629 | } else { | |||||
| 2630 | 55 | 49 | $line = defined $line ? "$line $w" : $w; | |||
| 2631 | } | |||||
| 2632 | } | |||||
| 2633 | 2 | 3 | push @lines, $line if defined $line; | |||
| 2634 | 2 | 4 | push @out, ' Paste : ' . shift @lines if @lines; | |||
| 2635 | 2 | 5 | push @out, ' ' . $_ for @lines; | |||
| 2636 | } | |||||
| 2637 | 2 | 5 | push @out, ' Upload : ' . _sanitise_output($c->{form_upload}) if $c->{form_upload}; | |||
| 2638 | 2 | 2 | push @out, ''; | |||
| 2639 | } | |||||
| 2640 | } | |||||
| 2641 | ||||||
| 2642 | 48 | 42 | push @out, '=' x 72; | |||
| 2643 | 48 | 353 | return join("\n", @out) . "\n"; | |||
| 2644 | } | |||||
| 2645 | ||||||
| 2646 | # ----------------------------------------------------------------------- | |||||
| 2647 | # Private: output sanitisation | |||||
| 2648 | # ----------------------------------------------------------------------- | |||||
| 2649 | ||||||
| 2650 | # _sanitise_output( $str ) -> $str | |||||
| 2651 | # | |||||
| 2652 | # Purpose: | |||||
| 2653 | # Strip control characters that could affect terminal rendering or HTML | |||||
| 2654 | # injection from any string that will appear in a report or abuse email. | |||||
| 2655 | # Preserves printable ASCII, high-bytes (for UTF-8 content), tabs, and | |||||
| 2656 | # line endings. | |||||
| 2657 | # | |||||
| 2658 | # Entry criteria: | |||||
| 2659 | # $str -- a defined or undef scalar. | |||||
| 2660 | # | |||||
| 2661 | # Exit status: | |||||
| 2662 | # Returns the sanitised string, or the empty string if $str is undef. | |||||
| 2663 | # | |||||
| 2664 | # Notes: | |||||
| 2665 | # Only strips C0 control characters below 0x20 (except \t) and the DEL | |||||
| 2666 | # character (0x7F). High bytes (0x80-0xFF) are preserved because they | |||||
| 2667 | # form valid UTF-8 multi-byte sequences in headers and body text. | |||||
| 2668 | ||||||
| 2669 | sub _sanitise_output { | |||||
| 2670 | 1153 | 20873 | my ($str) = @_; | |||
| 2671 | 1153 | 835 | return '' unless defined $str; | |||
| 2672 | # Remove C0 controls (except tab) and DEL | |||||
| 2673 | 1149 | 893 | $str =~ s/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]//g; | |||
| 2674 | 1149 | 1273 | return $str; | |||
| 2675 | } | |||||
| 2676 | ||||||
| 2677 | # ----------------------------------------------------------------------- | |||||
| 2678 | # Private: message parsing | |||||
| 2679 | # ----------------------------------------------------------------------- | |||||
| 2680 | ||||||
| 2681 | # _split_message( $text ) | |||||
| 2682 | # | |||||
| 2683 | # Purpose: | |||||
| 2684 | # Split a raw RFC 2822 email into headers and body, parse all headers, | |||||
| 2685 | # decode the body (including multipart), extract sending-software | |||||
| 2686 | # fingerprints, and populate per-hop tracking data. | |||||
| 2687 | # | |||||
| 2688 | # Entry criteria: | |||||
| 2689 | # $text -- defined scalar, already dereferenced by parse_email(). | |||||
| 2690 | # $self->{_sending_sw} and $self->{_rcvd_tracking} reset to [] by caller. | |||||
| 2691 | # | |||||
| 2692 | # Exit status: | |||||
| 2693 | # Returns undef silently if the header block is empty/whitespace-only. | |||||
| 2694 | # Otherwise all results are communicated via side effects on $self. | |||||
| 2695 | # | |||||
| 2696 | # Side effects: | |||||
| 2697 | # Populates _headers, _received, _body_plain, _body_html, _sending_sw, | |||||
| 2698 | # and _rcvd_tracking. | |||||
| 2699 | # | |||||
| 2700 | # Notes: | |||||
| 2701 | # Delegates to _decode_multipart() for multipart/* content types. | |||||
| 2702 | # Lines not matching the header pattern are silently discarded. | |||||
| 2703 | # Boundary extraction uses a simple regex; missing boundary causes the | |||||
| 2704 | # body to be skipped silently. | |||||
| 2705 | ||||||
| 2706 | sub _split_message { | |||||
| 2707 | 481 | 470 | my ($self, $text) = @_; | |||
| 2708 | ||||||
| 2709 | # Split at the first blank line (RFC 2822 header/body separator) | |||||
| 2710 | 481 | 2324 | my ($header_block, $body_raw) = split /\r?\n\r?\n/, $text, 2; | |||
| 2711 | ||||||
| 2712 | 481 | 1008 | return unless defined $header_block && $header_block =~ /\S/; | |||
| 2713 | 481 | 464 | $body_raw //= ''; | |||
| 2714 | ||||||
| 2715 | # Unfold RFC 2822 continuation lines (s2.2.3) | |||||
| 2716 | 481 | 1382 | $header_block =~ s/\r?\n([ \t]+)/ $1/g; | |||
| 2717 | ||||||
| 2718 | # Parse each header line into a { name, value } pair | |||||
| 2719 | 481 | 326 | my @headers; | |||
| 2720 | 481 | 1774 | for my $line (split /\r?\n/, $header_block) { | |||
| 2721 | 4207 | 5647 | if ($line =~ /^([\w-]+)\s*:\s*(.*)/) { | |||
| 2722 | 4204 | 6018 | push @headers, { name => lc($1), value => $2 }; | |||
| 2723 | } | |||||
| 2724 | } | |||||
| 2725 | 481 | 615 | $self->{_headers} = \@headers; | |||
| 2726 | ||||||
| 2727 | # Collect all Received: header values (most-recent first, as in message) | |||||
| 2728 | $self->{_received} = [ | |||||
| 2729 | 1416 | 1301 | map { $_->{value} } | |||
| 2730 | 481 4204 | 517 3342 | grep { $_->{name} eq 'received' } @headers | |||
| 2731 | ]; | |||||
| 2732 | ||||||
| 2733 | # Determine content type and transfer encoding from top-level headers | |||||
| 2734 | 481 4204 | 451 3050 | my ($ct_h) = grep { $_->{name} eq 'content-type' } @headers; | |||
| 2735 | 481 4204 | 361 2892 | my ($cte_h) = grep { $_->{name} eq 'content-transfer-encoding' } @headers; | |||
| 2736 | 481 | 554 | my $ct = defined $ct_h ? $ct_h->{value} : ''; | |||
| 2737 | 481 | 421 | my $cte = defined $cte_h ? $cte_h->{value} : ''; | |||
| 2738 | ||||||
| 2739 | # Decode multipart or single-part body as appropriate | |||||
| 2740 | 481 | 521 | if ($ct =~ /multipart/i) { | |||
| 2741 | 17 | 38 | my ($boundary) = $ct =~ /boundary="?([^";]+)"?/i; | |||
| 2742 | # Pass depth=0 to enforce the MAX_MULTIPART_DEPTH recursion guard | |||||
| 2743 | 17 | 36 | $self->_decode_multipart($body_raw, $boundary, 0) if $boundary; | |||
| 2744 | } else { | |||||
| 2745 | 464 | 515 | my $decoded = $self->_decode_body($body_raw, $cte); | |||
| 2746 | 464 8 | 495 9 | if ($ct =~ /html/i) { $self->{_body_html} = $decoded } | |||
| 2747 | 456 | 430 | else { $self->{_body_plain} = $decoded } | |||
| 2748 | } | |||||
| 2749 | ||||||
| 2750 | $self->_debug(sprintf 'Parsed %d headers, %d Received lines', | |||||
| 2751 | 481 481 | 369 1169 | scalar @headers, scalar @{ $self->{_received} }); | |||
| 2752 | ||||||
| 2753 | # --- Sending software fingerprints --- | |||||
| 2754 | # These headers identify the mailer or shared-hosting script that sent | |||||
| 2755 | # the message; invaluable for shared-hosting abuse reports. | |||||
| 2756 | 481 | 1513 | my %sw_notes = ( | |||
| 2757 | 'x-php-originating-script' => 'PHP script on shared hosting -- report to hosting abuse team', | |||||
| 2758 | 'x-source' => 'Source file on shared hosting -- report to hosting abuse team', | |||||
| 2759 | 'x-source-host' => 'Sending hostname injected by shared hosting provider', | |||||
| 2760 | 'x-source-args' => 'Command-line args injected by shared hosting provider', | |||||
| 2761 | 'x-mailer' => 'Email client or bulk-mailer identifier', | |||||
| 2762 | 'user-agent' => 'Email client identifier', | |||||
| 2763 | ); | |||||
| 2764 | 481 | 1061 | for my $sw_hdr (sort keys %sw_notes) { | |||
| 2765 | 2886 25224 | 1847 17097 | my ($h) = grep { $_->{name} eq $sw_hdr } @headers; | |||
| 2766 | 2886 | 2327 | next unless $h; | |||
| 2767 | 7 | 15 | push @{ $self->{_sending_sw} }, { | |||
| 2768 | header => $sw_hdr, | |||||
| 2769 | value => $h->{value}, | |||||
| 2770 | 7 | 3 | note => $sw_notes{$sw_hdr}, | |||
| 2771 | }; | |||||
| 2772 | } | |||||
| 2773 | ||||||
| 2774 | # --- Per-hop tracking IDs from Received: chain --- | |||||
| 2775 | # Walk oldest-first (reverse) so _rcvd_tracking is oldest-first | |||||
| 2776 | 481 481 | 403 521 | for my $rcvd (reverse @{ $self->{_received} }) { | |||
| 2777 | 1416 | 1158 | my $ip = $self->_extract_ip_from_received($rcvd); | |||
| 2778 | 1416 | 1117 | my ($for_addr) = $rcvd =~ /\bfor\s+<?([^\s>]+\@[\w.-]+\.[\w]+)>?/i; | |||
| 2779 | 1416 | 2222 | my ($srv_id) = $rcvd =~ /\bid\s+([\w.-]+)/i; | |||
| 2780 | # Skip hops with no actionable tracking data | |||||
| 2781 | 1416 | 1190 | next unless defined $ip || defined $for_addr || defined $srv_id; | |||
| 2782 | 1415 1415 | 815 2678 | push @{ $self->{_rcvd_tracking} }, { | |||
| 2783 | received => $rcvd, | |||||
| 2784 | ip => $ip, | |||||
| 2785 | for => $for_addr, | |||||
| 2786 | id => $srv_id, | |||||
| 2787 | }; | |||||
| 2788 | } | |||||
| 2789 | } | |||||
| 2790 | ||||||
| 2791 | # _decode_multipart( $body, $boundary, $depth ) | |||||
| 2792 | # | |||||
| 2793 | # Purpose: | |||||
| 2794 | # Recursively split a MIME multipart body on its boundary and decode each | |||||
| 2795 | # text/plain and text/html part. Nested multipart/* containers are | |||||
| 2796 | # recursed into up to MAX_MULTIPART_DEPTH levels deep. | |||||
| 2797 | # | |||||
| 2798 | # Entry criteria: | |||||
| 2799 | # $body -- the raw body text of the multipart container. | |||||
| 2800 | # $boundary -- the boundary string from the Content-Type header. | |||||
| 2801 | # $depth -- current recursion depth (starts at 0 from _split_message). | |||||
| 2802 | # | |||||
| 2803 | # Exit status: | |||||
| 2804 | # Returns undef if $depth >= MAX_MULTIPART_DEPTH (recursion guard). | |||||
| 2805 | # Otherwise all results via side effects. | |||||
| 2806 | # | |||||
| 2807 | # Side effects: | |||||
| 2808 | # Appends decoded text to $self->{_body_plain} and $self->{_body_html}. | |||||
| 2809 | # | |||||
| 2810 | # Notes: | |||||
| 2811 | # Whitespace-only MIME segments between boundaries are silently skipped. | |||||
| 2812 | # Decoding errors are silenced; raw bytes are used as fallback. | |||||
| 2813 | ||||||
| 2814 | sub _decode_multipart { | |||||
| 2815 | 49 | 111 | my ($self, $body, $boundary, $depth) = @_; | |||
| 2816 | 49 | 47 | $depth //= 0; | |||
| 2817 | ||||||
| 2818 | # Enforce the recursion depth limit to prevent stack exhaustion on | |||||
| 2819 | # pathological crafted messages with deeply nested multipart structures. | |||||
| 2820 | 49 | 49 | if ($depth >= $MAX_MULTIPART_DEPTH) { | |||
| 2821 | 7 | 15 | Carp::carp 'Email::Abuse::Investigator: multipart nesting depth limit', | |||
| 2822 | "($MAX_MULTIPART_DEPTH) exceeded; stopping recursion"; | |||||
| 2823 | 7 | 18 | return; | |||
| 2824 | } | |||||
| 2825 | ||||||
| 2826 | # Split on the boundary marker; the (?:--)? suffix handles closing boundary | |||||
| 2827 | 42 | 332 | my @parts = split /--\Q$boundary\E(?:--)?/, $body; | |||
| 2828 | ||||||
| 2829 | 42 | 53 | for my $part (@parts) { | |||
| 2830 | # Skip whitespace-only segments between boundaries | |||||
| 2831 | 135 | 160 | next unless $part =~ /\S/; | |||
| 2832 | ||||||
| 2833 | 52 | 80 | $part =~ s/^\r?\n//; | |||
| 2834 | ||||||
| 2835 | # Each MIME part has its own headers separated from body by a blank line | |||||
| 2836 | 52 | 104 | my ($phdr_block, $pbody) = split /\r?\n\r?\n/, $part, 2; | |||
| 2837 | 52 | 48 | next unless defined $pbody; | |||
| 2838 | ||||||
| 2839 | # Unfold continuation header lines within this part | |||||
| 2840 | 51 | 41 | $phdr_block =~ s/\r?\n([ \t]+)/ $1/g; | |||
| 2841 | ||||||
| 2842 | # Parse this part's headers into a simple hash | |||||
| 2843 | 51 | 38 | my %phdr; | |||
| 2844 | 51 | 56 | for my $line (split /\r?\n/, $phdr_block) { | |||
| 2845 | 54 | 124 | $phdr{ lc($1) } = $2 if $line =~ /^([\w-]+)\s*:\s*(.*)/; | |||
| 2846 | } | |||||
| 2847 | ||||||
| 2848 | 51 | 58 | my $pct = $phdr{'content-type'} // ''; | |||
| 2849 | 51 | 72 | my $pcte = $phdr{'content-transfer-encoding'} // ''; | |||
| 2850 | ||||||
| 2851 | # Nested multipart/* must be recursed into; without this URLs in | |||||
| 2852 | # multipart/alternative inside multipart/mixed would be missed. | |||||
| 2853 | 51 | 59 | if ($pct =~ /multipart/i) { | |||
| 2854 | 24 | 35 | my ($inner_boundary) = $pct =~ /boundary\s*=\s*"?([^";]+)"?/i; | |||
| 2855 | 24 | 24 | if ($inner_boundary) { | |||
| 2856 | 24 | 19 | $inner_boundary =~ s/\s+$//; | |||
| 2857 | # Increment depth counter for the recursion guard | |||||
| 2858 | 24 | 43 | $self->_decode_multipart($pbody, $inner_boundary, $depth + 1); | |||
| 2859 | } | |||||
| 2860 | 24 | 22 | next; | |||
| 2861 | } | |||||
| 2862 | ||||||
| 2863 | # Decode transfer encoding and accumulate by content type | |||||
| 2864 | 27 | 30 | my $decoded = $self->_decode_body($pbody, $pcte); | |||
| 2865 | 27 10 | 69 17 | if ($pct =~ /text\/html/i) { $self->{_body_html} .= $decoded } | |||
| 2866 | 16 | 26 | elsif ($pct =~ /text/i || !$pct) { $self->{_body_plain} .= $decoded } | |||
| 2867 | } | |||||
| 2868 | } | |||||
| 2869 | ||||||
| 2870 | # _decode_body( $body, $cte ) -> string | |||||
| 2871 | # | |||||
| 2872 | # Purpose: | |||||
| 2873 | # Decode a MIME body part according to its Content-Transfer-Encoding. | |||||
| 2874 | # | |||||
| 2875 | # Entry criteria: | |||||
| 2876 | # $body -- raw body string (may be undef). | |||||
| 2877 | # $cte -- Content-Transfer-Encoding value string (may be undef). | |||||
| 2878 | # | |||||
| 2879 | # Exit status: | |||||
| 2880 | # Returns the decoded string, or the original string if the encoding is | |||||
| 2881 | # 7bit/8bit/binary or unrecognised. | |||||
| 2882 | # | |||||
| 2883 | # Notes: | |||||
| 2884 | # decode_qp and decode_base64 are imported from MIME:: modules; errors | |||||
| 2885 | # from malformed content are silenced by the eval wrappers they provide. | |||||
| 2886 | ||||||
| 2887 | sub _decode_body { | |||||
| 2888 | 497 | 1124 | my ($self, $body, $cte) = @_; | |||
| 2889 | 497 | 451 | $cte //= ''; | |||
| 2890 | 497 | 453 | return decode_qp($body) if $cte =~ /quoted-printable/i; | |||
| 2891 | 491 | 408 | return decode_base64($body) if $cte =~ /base64/i; | |||
| 2892 | 488 | 587 | return $body // ''; | |||
| 2893 | } | |||||
| 2894 | ||||||
| 2895 | # ----------------------------------------------------------------------- | |||||
| 2896 | # Private: Received-chain -> originating IP | |||||
| 2897 | # ----------------------------------------------------------------------- | |||||
| 2898 | ||||||
| 2899 | # _find_origin() | |||||
| 2900 | # | |||||
| 2901 | # Purpose: | |||||
| 2902 | # Walk the Received: chain (oldest-first) to find the first external IP, | |||||
| 2903 | # or fall back to X-Originating-IP. Enrich with rDNS and WHOIS. | |||||
| 2904 | # | |||||
| 2905 | # Entry criteria: | |||||
| 2906 | # $self->{_received} populated by _split_message(). | |||||
| 2907 | # $self->{trusted_relays} set by new(). | |||||
| 2908 | # | |||||
| 2909 | # Exit status: | |||||
| 2910 | # Returns { ip, rdns, org, abuse, country, confidence, note } on success. | |||||
| 2911 | # Returns undef if no usable IP can be identified. | |||||
| 2912 | # | |||||
| 2913 | # Side effects: | |||||
| 2914 | # Network I/O via _enrich_ip(): one PTR lookup, one RDAP/WHOIS query. | |||||
| 2915 | # Results are also stored in the CHI cross-message cache if available. | |||||
| 2916 | # | |||||
| 2917 | # Notes: | |||||
| 2918 | # confidence 'high' = 2+ distinct external IPs; | |||||
| 2919 | # 'medium' = exactly one external IP; | |||||
| 2920 | # 'low' = taken from X-Originating-IP. | |||||
| 2921 | ||||||
| 2922 | sub _find_origin { | |||||
| 2923 | 267 | 191 | my $self = $_[0]; | |||
| 2924 | ||||||
| 2925 | 267 | 203 | my @candidates; | |||
| 2926 | ||||||
| 2927 | # Walk oldest-first (reverse) to collect external IPs | |||||
| 2928 | 267 267 | 219 247 | for my $hdr (reverse @{ $self->{_received} }) { | |||
| 2929 | 1222 | 960 | my $ip = $self->_extract_ip_from_received($hdr) // next; | |||
| 2930 | 1221 | 974 | next if $self->_is_private($ip); | |||
| 2931 | 89 | 1078 | next if $self->_is_trusted($ip); | |||
| 2932 | 84 | 86 | push @candidates, $ip; | |||
| 2933 | } | |||||
| 2934 | ||||||
| 2935 | # Fall back to X-Originating-IP if no external IPs in Received: chain | |||||
| 2936 | 267 | 262 | unless (@candidates) { | |||
| 2937 | 187 | 196 | my $xoip = $self->_header_value('x-originating-ip'); | |||
| 2938 | 187 | 178 | if ($xoip) { | |||
| 2939 | 6 | 8 | $xoip =~ s/[\[\]\s]//g; | |||
| 2940 | 6 | 7 | return $self->_enrich_ip($xoip, 'low', | |||
| 2941 | 'Taken from X-Originating-IP (webmail, unverified)') | |||||
| 2942 | unless $self->_is_private($xoip); | |||||
| 2943 | } | |||||
| 2944 | 182 | 267 | return undef; | |||
| 2945 | } | |||||
| 2946 | ||||||
| 2947 | # Report the oldest (first) external IP; confidence depends on count | |||||
| 2948 | 80 | 149 | return $self->_enrich_ip( | |||
| 2949 | $candidates[0], | |||||
| 2950 | @candidates > 1 ? 'high' : 'medium', | |||||
| 2951 | 'First external hop in Received: chain', | |||||
| 2952 | ); | |||||
| 2953 | } | |||||
| 2954 | ||||||
| 2955 | # _extract_ip_from_received( $hdr ) -> ipv4_or_ipv6_string | undef | |||||
| 2956 | # | |||||
| 2957 | # Purpose: | |||||
| 2958 | # Extract the most-significant IP address from a raw Received: header | |||||
| 2959 | # value, trying patterns in priority order. Supports both IPv4 dotted- | |||||
| 2960 | # quad and IPv6 bracket notation. | |||||
| 2961 | # | |||||
| 2962 | # Entry criteria: | |||||
| 2963 | # $hdr -- a defined Received: header value string. | |||||
| 2964 | # | |||||
| 2965 | # Exit status: | |||||
| 2966 | # Returns the IP string on success, undef if no IP can be extracted. | |||||
| 2967 | # | |||||
| 2968 | # Notes: | |||||
| 2969 | # IPv4 addresses are validated (all octets <= 255). | |||||
| 2970 | # IPv6 addresses are returned as-is if they contain colons. | |||||
| 2971 | ||||||
| 2972 | sub _extract_ip_from_received { | |||||
| 2973 | 2653 | 3281 | my ($self, $hdr) = @_; | |||
| 2974 | 2653 | 1749 | for my $re (@RECEIVED_IP_RE) { | |||
| 2975 | 2719 | 4730 | if ($hdr =~ $re) { | |||
| 2976 | 2659 | 2086 | my $ip = $1; | |||
| 2977 | ||||||
| 2978 | # Accept IPv6 addresses (contain colons) without further validation | |||||
| 2979 | 2659 | 2076 | return $ip if $ip =~ /:/; | |||
| 2980 | ||||||
| 2981 | # Validate IPv4 format and octet range | |||||
| 2982 | 2656 | 3089 | next unless $ip =~ /^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$/; | |||
| 2983 | 2650 10600 | 2457 8556 | next if grep { $_ > 255 } split /\./, $ip; | |||
| 2984 | 2642 | 2942 | return $ip; | |||
| 2985 | } | |||||
| 2986 | } | |||||
| 2987 | 8 | 15 | return undef; | |||
| 2988 | } | |||||
| 2989 | ||||||
| 2990 | # _is_private( $ip ) -> bool | |||||
| 2991 | # | |||||
| 2992 | # Purpose: | |||||
| 2993 | # Test whether an IP address falls in any private, reserved, or special- | |||||
| 2994 | # use range (IPv4 or IPv6) that should never be reported as a spam origin. | |||||
| 2995 | # | |||||
| 2996 | # Entry criteria: | |||||
| 2997 | # $ip -- a scalar IP string (IPv4 or IPv6); may be undef. | |||||
| 2998 | # | |||||
| 2999 | # Exit status: | |||||
| 3000 | # Returns 1 (true) if the IP is private/reserved, 0 (false) otherwise. | |||||
| 3001 | # Returns 1 for undef or empty strings. | |||||
| 3002 | # | |||||
| 3003 | # Notes: | |||||
| 3004 | # Uses the module-level @PRIVATE_RANGES array of pre-compiled regexes. | |||||
| 3005 | # Covers all ranges listed in RFC 1122, 1918, 5737, 6598, and RFC 4193. | |||||
| 3006 | ||||||
| 3007 | sub _is_private { | |||||
| 3008 | 1263 | 1089 | my ($self, $ip) = @_; | |||
| 3009 | 1263 | 1404 | return 1 unless defined $ip && $ip ne ''; | |||
| 3010 | 1261 6320 | 829 7363 | for my $re (@PRIVATE_RANGES) { return 1 if $ip =~ $re } | |||
| 3011 | 101 | 119 | return 0; | |||
| 3012 | } | |||||
| 3013 | ||||||
| 3014 | # _is_trusted( $ip ) -> bool | |||||
| 3015 | # | |||||
| 3016 | # Purpose: | |||||
| 3017 | # Test whether an IP address matches any entry in the caller-supplied | |||||
| 3018 | # trusted_relays list (exact IP or CIDR block). | |||||
| 3019 | # | |||||
| 3020 | # Entry criteria: | |||||
| 3021 | # $ip -- a defined IPv4 address string. | |||||
| 3022 | # $self->{trusted_relays} -- arrayref of exact IPs or CIDR strings. | |||||
| 3023 | # | |||||
| 3024 | # Exit status: | |||||
| 3025 | # Returns 1 (true) if the IP matches any trusted relay, 0 otherwise. | |||||
| 3026 | ||||||
| 3027 | sub _is_trusted { | |||||
| 3028 | 95 | 83 | my ($self, $ip) = @_; | |||
| 3029 | 95 95 | 69 99 | for my $cidr (@{ $self->{trusted_relays} }) { | |||
| 3030 | 19 | 17 | return 1 if $self->_ip_in_cidr($ip, $cidr); | |||
| 3031 | } | |||||
| 3032 | 87 | 98 | return 0; | |||
| 3033 | } | |||||
| 3034 | ||||||
| 3035 | # ----------------------------------------------------------------------- | |||||
| 3036 | # Private: HTTP/HTTPS URL extraction and resolution | |||||
| 3037 | # ----------------------------------------------------------------------- | |||||
| 3038 | ||||||
| 3039 | # _extract_and_resolve_urls() -> arrayref of url hashrefs | |||||
| 3040 | # | |||||
| 3041 | # Purpose: | |||||
| 3042 | # Extract all HTTP/HTTPS URLs from the decoded body, resolve each unique | |||||
| 3043 | # hostname to an IP, and enrich with WHOIS/RDAP data. Optionally uses | |||||
| 3044 | # AnyEvent::DNS to parallelise the DNS resolution step. | |||||
| 3045 | # | |||||
| 3046 | # Entry criteria: | |||||
| 3047 | # $self->{_body_plain} and $self->{_body_html} populated by _split_message(). | |||||
| 3048 | # | |||||
| 3049 | # Exit status: | |||||
| 3050 | # Returns an arrayref of url hashrefs (possibly empty). | |||||
| 3051 | # | |||||
| 3052 | # Side effects: | |||||
| 3053 | # Network I/O per unique hostname: one A/AAAA lookup, one RDAP/WHOIS. | |||||
| 3054 | # Results stored in the CHI cross-message cache if available. | |||||
| 3055 | ||||||
| 3056 | sub _extract_and_resolve_urls { | |||||
| 3057 | 144 | 105 | my $self = $_[0]; | |||
| 3058 | 144 | 182 | my (%url_seen, %host_cache); | |||
| 3059 | 144 | 0 | my @results; | |||
| 3060 | 144 | 165 | my $combined = $self->{_body_plain} . "\n" . $self->{_body_html}; | |||
| 3061 | ||||||
| 3062 | # Collect unique URLs from body | |||||
| 3063 | 144 614 | 168 560 | my @urls = grep { !$url_seen{$_}++ } $self->_extract_http_urls($combined); | |||
| 3064 | ||||||
| 3065 | # Extract unique hostnames for parallel DNS resolution | |||||
| 3066 | 144 | 105 | my %hostname_needed; | |||
| 3067 | 144 | 122 | for my $url (@urls) { | |||
| 3068 | 614 | 805 | my ($host) = $url =~ m{https?://([^/:?\s#]+)}i; | |||
| 3069 | 614 | 586 | $hostname_needed{$host}++ if $host; | |||
| 3070 | } | |||||
| 3071 | ||||||
| 3072 | # Parallelise DNS lookups if AnyEvent::DNS is available | |||||
| 3073 | 144 | 176 | if ($HAS_ANYEVENT_DNS && scalar(keys %hostname_needed) > 1) { | |||
| 3074 | 0 | 0 | $self->_parallel_resolve_hosts(\%hostname_needed, \%host_cache); | |||
| 3075 | } | |||||
| 3076 | ||||||
| 3077 | # Process each URL: resolve hostname and WHOIS-enrich | |||||
| 3078 | 144 | 113 | for my $url (@urls) { | |||
| 3079 | 614 | 780 | my ($host) = $url =~ m{https?://([^/:?\s#]+)}i; | |||
| 3080 | 614 | 493 | next unless $host; | |||
| 3081 | ||||||
| 3082 | # Resolve and WHOIS once per unique hostname, then cache the result | |||||
| 3083 | 614 | 492 | unless (exists $host_cache{$host}) { | |||
| 3084 | # Check the cross-message CHI cache first | |||||
| 3085 | 99 | 94 | my $cached = $_cache ? $_cache->get("url:$host") : undef; | |||
| 3086 | 99 | 81 | if ($cached) { | |||
| 3087 | 0 | 0 | $host_cache{$host} = $cached; | |||
| 3088 | } else { | |||||
| 3089 | 99 | 132 | my $ip = $self->_resolve_host($host) // '(unresolved)'; | |||
| 3090 | 99 | 298 | my $whois = $ip ne '(unresolved)' | |||
| 3091 | ? $self->_whois_ip($ip) | |||||
| 3092 | : {}; | |||||
| 3093 | ||||||
| 3094 | # Fall back to domain WHOIS if IP lookup returned nothing | |||||
| 3095 | 99 | 191 | if (!$whois->{abuse}) { | |||
| 3096 | 56 | 84 | my $reg = _registrable($host) // $host; | |||
| 3097 | 56 | 76 | my $dw = $self->_parse_domain_whois_abuse($reg); | |||
| 3098 | 56 | 142 | $whois = $dw if $dw->{abuse}; | |||
| 3099 | } | |||||
| 3100 | ||||||
| 3101 | my $entry = { | |||||
| 3102 | ip => $ip, | |||||
| 3103 | org => $whois->{org} // '(unknown)', | |||||
| 3104 | abuse => $whois->{abuse} // '(unknown)', | |||||
| 3105 | country => $whois->{country} // undef, | |||||
| 3106 | 99 | 337 | }; | |||
| 3107 | 99 | 92 | $host_cache{$host} = $entry; | |||
| 3108 | ||||||
| 3109 | # Store in cross-message cache for reuse across messages | |||||
| 3110 | 99 | 139 | $_cache->set("url:$host", $entry) if $_cache; | |||
| 3111 | } | |||||
| 3112 | } | |||||
| 3113 | ||||||
| 3114 | 614 614 | 372 1028 | push @results, { url => $url, host => $host, %{ $host_cache{$host} } }; | |||
| 3115 | } | |||||
| 3116 | 144 | 377 | return \@results; | |||
| 3117 | } | |||||
| 3118 | ||||||
| 3119 | # _parallel_resolve_hosts( \%hostnames, \%cache ) | |||||
| 3120 | # | |||||
| 3121 | # Purpose: | |||||
| 3122 | # Resolve multiple hostnames to IPs in parallel using AnyEvent::DNS. | |||||
| 3123 | # Populates the cache with resolved IPs so the sequential loop in | |||||
| 3124 | # _extract_and_resolve_urls() can skip the DNS step for pre-resolved hosts. | |||||
| 3125 | # | |||||
| 3126 | # Entry criteria: | |||||
| 3127 | # $hostnames_ref -- hashref keyed by hostname (values ignored). | |||||
| 3128 | # $cache_ref -- hashref to populate with { ip => '...' } results. | |||||
| 3129 | # AnyEvent::DNS must be installed ($HAS_ANYEVENT_DNS is true). | |||||
| 3130 | # | |||||
| 3131 | # Exit status: | |||||
| 3132 | # Returns undef; all results written to %$cache_ref via side effects. | |||||
| 3133 | # | |||||
| 3134 | # Notes: | |||||
| 3135 | # Errors (NXDOMAIN, timeout) are silently swallowed; the sequential | |||||
| 3136 | # resolution loop will return '(unresolved)' for those hosts. | |||||
| 3137 | ||||||
| 3138 | sub _parallel_resolve_hosts { | |||||
| 3139 | 2 | 419 | my ($self, $hostnames_ref, $cache_ref) = @_; | |||
| 3140 | 2 | 6 | return unless $HAS_ANYEVENT_DNS; | |||
| 3141 | ||||||
| 3142 | # Build an AnyEvent condvar to wait for all lookups to complete | |||||
| 3143 | 0 | 0 | my $cv = AnyEvent->condvar; | |||
| 3144 | 0 | 0 | my $pending = scalar keys %$hostnames_ref; | |||
| 3145 | ||||||
| 3146 | 0 | 0 | for my $host (keys %$hostnames_ref) { | |||
| 3147 | # Fire an async A (and AAAA) query for each hostname | |||||
| 3148 | AnyEvent::DNS::resolve( | |||||
| 3149 | $host, 'A', | |||||
| 3150 | sub { | |||||
| 3151 | 0 | 0 | my @answers = @_; | |||
| 3152 | 0 | 0 | if (@answers) { | |||
| 3153 | # Cache the first A record result | |||||
| 3154 | 0 | 0 | $cache_ref->{$host} = { ip => $answers[0][4] }; | |||
| 3155 | } | |||||
| 3156 | # Decrement the pending counter; signal when all done | |||||
| 3157 | 0 | 0 | $cv->send if --$pending <= 0; | |||
| 3158 | }, | |||||
| 3159 | 0 | 0 | ); | |||
| 3160 | } | |||||
| 3161 | ||||||
| 3162 | # Block until all DNS queries complete (subject to AnyEvent's own timeouts) | |||||
| 3163 | 0 | 0 | $cv->recv; | |||
| 3164 | } | |||||
| 3165 | ||||||
| 3166 | # _extract_http_urls( $body ) -> list of url strings | |||||
| 3167 | # | |||||
| 3168 | # Purpose: | |||||
| 3169 | # Extract all HTTP and HTTPS URLs from a body string, using both | |||||
| 3170 | # structural HTML parsing (if HTML::LinkExtor is available) and a | |||||
| 3171 | # plain-text regex pass. Deduplicates and strips trailing punctuation. | |||||
| 3172 | # | |||||
| 3173 | # Entry criteria: | |||||
| 3174 | # $body -- combined plain+HTML body string. | |||||
| 3175 | # | |||||
| 3176 | # Exit status: | |||||
| 3177 | # Returns a list of URL strings (possibly empty), deduplicated. | |||||
| 3178 | ||||||
| 3179 | sub _extract_http_urls { | |||||
| 3180 | 147 | 693 | my ($self, $body) = @_; | |||
| 3181 | 147 | 102 | my @urls; | |||
| 3182 | ||||||
| 3183 | # Structural HTML link extraction (handles quoted attributes correctly) | |||||
| 3184 | 147 | 148 | if ($HAS_HTML_LINKEXTOR) { | |||
| 3185 | my $p = HTML::LinkExtor->new(sub { | |||||
| 3186 | 16 | 251 | my ($tag, %attrs) = @_; | |||
| 3187 | 16 | 14 | for my $attr (qw(href src action)) { | |||
| 3188 | 48 | 61 | my $val = $attrs{$attr} // ''; | |||
| 3189 | 48 | 120 | if ($val =~ m{^https?://}i) { | |||
| 3190 | 12 | 11 | push @urls, $val; | |||
| 3191 | } elsif ($val =~ m{^//[\w.-]}) { | |||||
| 3192 | # Protocol-relative -- assume https | |||||
| 3193 | 2 | 2 | push @urls, 'https:' . $val; | |||
| 3194 | } | |||||
| 3195 | } | |||||
| 3196 | 147 | 476 | }); | |||
| 3197 | 147 | 5285 | $p->parse($body); | |||
| 3198 | } | |||||
| 3199 | ||||||
| 3200 | # Plain-text regex pass for bare URLs not in HTML attributes | |||||
| 3201 | 147 | 1030 | while ($body =~ m{(https?://[^\s<>"'\)\]]+)}gi) { | |||
| 3202 | 1116 | 1702 | push @urls, $1; | |||
| 3203 | } | |||||
| 3204 | ||||||
| 3205 | # Protocol-relative URLs not caught above | |||||
| 3206 | 147 | 388 | while ($body =~ m{(?:^|[\s"'=])(//[\w.-][^\s<>"'\)\]]*)}gim) { | |||
| 3207 | 2 | 4 | push @urls, 'https:' . $1; | |||
| 3208 | } | |||||
| 3209 | ||||||
| 3210 | # Deduplicate and strip trailing punctuation | |||||
| 3211 | 147 | 104 | my %seen; | |||
| 3212 | 147 1132 | 147 999 | my @all = grep { !$seen{$_}++ } @urls; | |||
| 3213 | 147 | 335 | s/[.,;:!?\)>\]]+$// for @all; | |||
| 3214 | 147 | 292 | return @all; | |||
| 3215 | } | |||||
| 3216 | ||||||
| 3217 | # ----------------------------------------------------------------------- | |||||
| 3218 | # Private: domain extraction and full analysis | |||||
| 3219 | # ----------------------------------------------------------------------- | |||||
| 3220 | ||||||
| 3221 | # _extract_and_analyse_domains() -> arrayref of domain hashrefs | |||||
| 3222 | # | |||||
| 3223 | # Purpose: | |||||
| 3224 | # Collect all non-infrastructure contact domains from headers and body, | |||||
| 3225 | # run the full domain intelligence pipeline on each, and return an arrayref | |||||
| 3226 | # suitable for storage in $self->{_mailto_domains}. | |||||
| 3227 | # | |||||
| 3228 | # Entry criteria: | |||||
| 3229 | # _split_message() must have been called. | |||||
| 3230 | # | |||||
| 3231 | # Exit status: | |||||
| 3232 | # Always returns an arrayref; never undef. | |||||
| 3233 | # | |||||
| 3234 | # Side effects: | |||||
| 3235 | # Network I/O per domain via _analyse_domain(). | |||||
| 3236 | # Results stored in $self->{_domain_info} and CHI cache. | |||||
| 3237 | ||||||
| 3238 | sub _extract_and_analyse_domains { | |||||
| 3239 | 138 | 103 | my $self = $_[0]; | |||
| 3240 | 138 | 149 | my (%seen, @domains_with_source); | |||
| 3241 | ||||||
| 3242 | # Build a set of recipient domains to exclude (victims, not senders) | |||||
| 3243 | 138 | 0 | my %recipient_domains; | |||
| 3244 | 138 | 119 | for my $hname (qw(to cc)) { | |||
| 3245 | 276 | 272 | my $val = $self->_header_value($hname) // next; | |||
| 3246 | 129 | 168 | for my $dom ($self->_domains_from_text($val)) { | |||
| 3247 | 126 | 135 | my $reg = _registrable($dom) // $dom; | |||
| 3248 | 126 | 142 | $recipient_domains{$dom}++; | |||
| 3249 | 126 | 146 | $recipient_domains{$reg}++; | |||
| 3250 | } | |||||
| 3251 | } | |||||
| 3252 | ||||||
| 3253 | # Also exclude domains from Received: "for" envelope recipients | |||||
| 3254 | 138 138 | 87 132 | for my $hop (@{ $self->{_rcvd_tracking} }) { | |||
| 3255 | 135 | 179 | next unless $hop->{for} && $hop->{for} =~ /\@([\w.-]+)/; | |||
| 3256 | 0 | 0 | my $dom = lc $1; | |||
| 3257 | 0 | 0 | my $reg = _registrable($dom) // $dom; | |||
| 3258 | 0 | 0 | $recipient_domains{$dom}++; | |||
| 3259 | 0 | 0 | $recipient_domains{$reg}++; | |||
| 3260 | } | |||||
| 3261 | ||||||
| 3262 | # Inner closure: record a domain if it passes all filters | |||||
| 3263 | my $record = sub { | |||||
| 3264 | 624 | 511 | my ($dom, $source) = @_; | |||
| 3265 | 624 | 466 | $dom = lc $dom; | |||
| 3266 | 624 | 403 | $dom =~ s/\.$//; | |||
| 3267 | 624 | 570 | next if $self->{trusted_domains}->{$dom}; | |||
| 3268 | 624 | 520 | return if $TRUSTED_DOMAINS{$dom}; | |||
| 3269 | 585 | 422 | return if $recipient_domains{$dom}; | |||
| 3270 | 583 | 435 | return if $recipient_domains{ _registrable($dom) // $dom }; | |||
| 3271 | # Discard non-routable hostnames (single-label, pseudo-TLDs, etc.) | |||||
| 3272 | 583 | 713 | return unless $dom =~ /\.[a-zA-Z]{2,}$/; | |||
| 3273 | 519 | 613 | return if $dom =~ /\.(?:local|internal|lan|localdomain|arpa)$/i; | |||
| 3274 | 517 | 554 | return if $seen{$dom}++; | |||
| 3275 | 383 | 518 | push @domains_with_source, { domain => $dom, source => $source }; | |||
| 3276 | 138 | 289 | }; | |||
| 3277 | ||||||
| 3278 | # Collect from standard sender/reply headers | |||||
| 3279 | 138 | 274 | my %header_sources = ( | |||
| 3280 | 'from' => 'From: header', | |||||
| 3281 | 'reply-to' => 'Reply-To: header', | |||||
| 3282 | 'return-path' => 'Return-Path: header', | |||||
| 3283 | 'sender' => 'Sender: header', | |||||
| 3284 | ); | |||||
| 3285 | 138 | 244 | for my $hname (sort keys %header_sources) { | |||
| 3286 | 552 | 441 | my $val = $self->_header_value($hname) // next; | |||
| 3287 | $record->($_, $header_sources{$hname}) | |||||
| 3288 | 262 | 258 | for $self->_domains_from_text($val); | |||
| 3289 | } | |||||
| 3290 | ||||||
| 3291 | # Message-ID domain often reveals the real bulk-sending platform | |||||
| 3292 | 138 | 161 | my $mid = $self->_header_value('message-id'); | |||
| 3293 | 138 | 282 | if ($mid && $mid =~ /\@([\w.-]+)/) { | |||
| 3294 | 128 | 120 | my $mid_dom = lc $1; | |||
| 3295 | 128 | 118 | my $mid_reg = _registrable($mid_dom) // $mid_dom; | |||
| 3296 | $record->($mid_dom, 'Message-ID: header') | |||||
| 3297 | 128 | 421 | unless $TRUSTED_DOMAINS{$mid_dom} || $TRUSTED_DOMAINS{$mid_reg} || $self->{trusted_domains}->{$mid_dom} || $self->{trusted_domains}->{$mid_reg}; | |||
| 3298 | } | |||||
| 3299 | ||||||
| 3300 | # DKIM signing domain(s) -- the organisation that vouches for the message | |||||
| 3301 | 138 | 151 | my $auth = $self->_parse_auth_results_cached(); | |||
| 3302 | 138 138 | 101 229 | for my $dkim_d (@{ $auth->{dkim_domains} // [] }) { | |||
| 3303 | 5 | 5 | $record->($dkim_d, 'DKIM-Signature: d= (signing domain)'); | |||
| 3304 | } | |||||
| 3305 | ||||||
| 3306 | # List-Unsubscribe identifies the ESP or bulk sender | |||||
| 3307 | 138 | 147 | my $unsub = $self->_header_value('list-unsubscribe'); | |||
| 3308 | 138 | 132 | if ($unsub) { | |||
| 3309 | 0 | 0 | while ($unsub =~ m{https?://([^/:?\s>]+)}gi) { | |||
| 3310 | 0 | 0 | $record->(lc $1, 'List-Unsubscribe: header'); | |||
| 3311 | } | |||||
| 3312 | 0 | 0 | while ($unsub =~ m{mailto:[^@\s>]+\@([\w.-]+)}gi) { | |||
| 3313 | 0 | 0 | $record->(lc $1, 'List-Unsubscribe: header'); | |||
| 3314 | } | |||||
| 3315 | } | |||||
| 3316 | ||||||
| 3317 | # Body email addresses (mailto: and bare user@domain forms) | |||||
| 3318 | 138 | 144 | my $combined = $self->{_body_plain} . "\n" . $self->{_body_html}; | |||
| 3319 | $record->($_, 'email address / mailto in body') | |||||
| 3320 | 138 | 124 | for $self->_domains_from_text($combined); | |||
| 3321 | ||||||
| 3322 | # Run the full intelligence pipeline on each collected domain | |||||
| 3323 | 138 | 100 | my @results; | |||
| 3324 | 138 | 103 | for my $entry (@domains_with_source) { | |||
| 3325 | 383 | 358 | my $info = $self->_analyse_domain($entry->{domain}); | |||
| 3326 | 383 | 655 | push @results, { %$entry, %$info }; | |||
| 3327 | } | |||||
| 3328 | 138 | 803 | return \@results; | |||
| 3329 | } | |||||
| 3330 | ||||||
| 3331 | # _domains_from_text( $text ) -> list of domain strings | |||||
| 3332 | # | |||||
| 3333 | # Purpose: | |||||
| 3334 | # Extract unique domain names from mailto: links and bare user@domain | |||||
| 3335 | # addresses in a block of text. | |||||
| 3336 | # | |||||
| 3337 | # Entry criteria: | |||||
| 3338 | # $text -- a defined scalar of decoded body or header text. | |||||
| 3339 | # | |||||
| 3340 | # Exit status: | |||||
| 3341 | # Returns a list of lower-cased domain strings (possibly empty). | |||||
| 3342 | ||||||
| 3343 | sub _domains_from_text { | |||||
| 3344 | 674 | 2009 | my ($self, $text) = @_; | |||
| 3345 | 674 | 416 | my (%seen, @out); | |||
| 3346 | ||||||
| 3347 | # mailto: links (including HTML-entity-encoded @ signs from QP) | |||||
| 3348 | 674 | 738 | while ($text =~ /mailto:(?:[^@\s<>"]+)@([\w.-]+)/gi) { | |||
| 3349 | 14 14 | 18 12 | my $dom = lc $1; $dom =~ s/\.$//; | |||
| 3350 | 14 | 30 | push @out, $dom unless $seen{$dom}++; | |||
| 3351 | } | |||||
| 3352 | ||||||
| 3353 | # Bare user@domain patterns | |||||
| 3354 | 674 | 1239 | while ($text =~ /\b[\w.+%-]+@([\w.-]+\.[a-zA-Z]{2,})\b/g) { | |||
| 3355 | 662 662 | 563 468 | my $dom = lc $1; $dom =~ s/\.$//; | |||
| 3356 | 662 | 1132 | push @out, $dom unless $seen{$dom}++; | |||
| 3357 | } | |||||
| 3358 | 674 | 960 | return @out; | |||
| 3359 | } | |||||
| 3360 | ||||||
| 3361 | # _analyse_domain( $domain ) -> hashref | |||||
| 3362 | # | |||||
| 3363 | # Purpose: | |||||
| 3364 | # Run the complete intelligence pipeline for a single domain: A record | |||||
| 3365 | # (web hosting), MX record (mail hosting), NS record (DNS hosting), | |||||
| 3366 | # and WHOIS (registrar, creation/expiry dates, abuse contact). | |||||
| 3367 | # Each IP is enriched via RDAP/WHOIS. Results are cached per domain | |||||
| 3368 | # in $self->{_domain_info} and in the CHI cross-message cache. | |||||
| 3369 | # | |||||
| 3370 | # Entry criteria: | |||||
| 3371 | # $domain -- lower-cased, no trailing dot, not in TRUSTED_DOMAINS. | |||||
| 3372 | # $self->{timeout} used for all network operations. | |||||
| 3373 | # | |||||
| 3374 | # Exit status: | |||||
| 3375 | # Always returns a hashref reference; never undef; may be empty ({}). | |||||
| 3376 | # Possible keys: web_ip, web_org, web_abuse, mx_host, mx_ip, mx_org, | |||||
| 3377 | # mx_abuse, ns_host, ns_ip, ns_org, ns_abuse, registrar, | |||||
| 3378 | # registrar_abuse, registered, expires, recently_registered, whois_raw. | |||||
| 3379 | # | |||||
| 3380 | # Side effects: | |||||
| 3381 | # Network I/O; writes result to $self->{_domain_info}{$domain} and CHI. | |||||
| 3382 | # | |||||
| 3383 | # Notes: | |||||
| 3384 | # MX/NS lookups require Net::DNS; absent without it. | |||||
| 3385 | # recently_registered is set to 1 (not 0) when the threshold is met. | |||||
| 3386 | # whois_raw is truncated to WHOIS_RAW_MAX bytes. | |||||
| 3387 | ||||||
| 3388 | sub _analyse_domain { | |||||
| 3389 | 386 | 3654 | my ($self, $domain) = @_; | |||
| 3390 | ||||||
| 3391 | # Return the per-message cached result if already analysed | |||||
| 3392 | return $self->{_domain_info}{$domain} | |||||
| 3393 | 386 | 355 | if $self->{_domain_info}{$domain}; | |||
| 3394 | ||||||
| 3395 | # Check the cross-message CHI cache before hitting the network | |||||
| 3396 | 382 | 293 | if ($_cache) { | |||
| 3397 | 0 | 0 | my $cached = $_cache->get("dom:$domain"); | |||
| 3398 | 0 | 0 | if ($cached) { | |||
| 3399 | 0 | 0 | $self->{_domain_info}{$domain} = $cached; | |||
| 3400 | 0 | 0 | return $cached; | |||
| 3401 | } | |||||
| 3402 | } | |||||
| 3403 | ||||||
| 3404 | 382 | 452 | $self->_debug("Analysing domain: $domain"); | |||
| 3405 | 382 | 250 | my %info; | |||
| 3406 | ||||||
| 3407 | # --- A record -> web hosting IP --- | |||||
| 3408 | 382 | 367 | my $web_ip = $self->_resolve_host($domain); | |||
| 3409 | 382 | 608 | if ($web_ip) { | |||
| 3410 | 50 | 58 | $info{web_ip} = $web_ip; | |||
| 3411 | 50 | 53 | my $w = $self->_whois_ip($web_ip); | |||
| 3412 | 50 | 122 | $info{web_org} = $w->{org} if $w->{org}; | |||
| 3413 | 50 | 72 | $info{web_abuse} = $w->{abuse} if $w->{abuse}; | |||
| 3414 | } | |||||
| 3415 | ||||||
| 3416 | # MX and NS lookups require Net::DNS | |||||
| 3417 | 382 | 276 | if ($HAS_NET_DNS) { | |||
| 3418 | my $res = Net::DNS::Resolver->new( | |||||
| 3419 | tcp_timeout => $self->{timeout}, | |||||
| 3420 | udp_timeout => $self->{timeout}, | |||||
| 3421 | 0 | 0 | ); | |||
| 3422 | ||||||
| 3423 | # --- MX record -> mail hosting --- | |||||
| 3424 | 0 | 0 | my $mxq = $res->search($domain, 'MX'); | |||
| 3425 | 0 | 0 | if ($mxq) { | |||
| 3426 | 0 | 0 | my ($best) = sort { $a->preference <=> $b->preference } | |||
| 3427 | 0 0 | 0 0 | grep { $_->type eq 'MX' } $mxq->answer; | |||
| 3428 | 0 | 0 | if ($best) { | |||
| 3429 | 0 | 0 | (my $mx_host = lc $best->exchange) =~ s/\.$//; | |||
| 3430 | 0 | 0 | $info{mx_host} = $mx_host; | |||
| 3431 | 0 | 0 | my $mx_ip = $self->_resolve_host($mx_host); | |||
| 3432 | 0 | 0 | if ($mx_ip) { | |||
| 3433 | 0 | 0 | $info{mx_ip} = $mx_ip; | |||
| 3434 | 0 | 0 | my $mw = $self->_whois_ip($mx_ip); | |||
| 3435 | 0 | 0 | $info{mx_org} = $mw->{org} if $mw->{org}; | |||
| 3436 | 0 | 0 | $info{mx_abuse} = $mw->{abuse} if $mw->{abuse}; | |||
| 3437 | } | |||||
| 3438 | } | |||||
| 3439 | } | |||||
| 3440 | ||||||
| 3441 | # --- NS record -> DNS hosting --- | |||||
| 3442 | 0 | 0 | my $nsq = $res->search($domain, 'NS'); | |||
| 3443 | 0 | 0 | if ($nsq) { | |||
| 3444 | 0 0 | 0 0 | my ($first) = grep { $_->type eq 'NS' } $nsq->answer; | |||
| 3445 | 0 | 0 | if ($first) { | |||
| 3446 | 0 | 0 | (my $ns_host = lc $first->nsdname) =~ s/\.$//; | |||
| 3447 | 0 | 0 | $info{ns_host} = $ns_host; | |||
| 3448 | 0 | 0 | my $ns_ip = $self->_resolve_host($ns_host); | |||
| 3449 | 0 | 0 | if ($ns_ip) { | |||
| 3450 | 0 | 0 | $info{ns_ip} = $ns_ip; | |||
| 3451 | 0 | 0 | my $nw = $self->_whois_ip($ns_ip); | |||
| 3452 | 0 | 0 | $info{ns_org} = $nw->{org} if $nw->{org}; | |||
| 3453 | 0 | 0 | $info{ns_abuse} = $nw->{abuse} if $nw->{abuse}; | |||
| 3454 | } | |||||
| 3455 | } | |||||
| 3456 | } | |||||
| 3457 | } | |||||
| 3458 | ||||||
| 3459 | # --- Domain WHOIS -> registrar + dates --- | |||||
| 3460 | 382 | 339 | my $domain_whois = $self->_domain_whois($domain); | |||
| 3461 | 382 | 515 | if ($domain_whois) { | |||
| 3462 | # Truncate raw WHOIS for storage but parse structured fields from full text | |||||
| 3463 | 19 | 47 | $info{whois_raw} = substr($domain_whois, 0, $WHOIS_RAW_MAX); | |||
| 3464 | ||||||
| 3465 | # Registrar name | |||||
| 3466 | 19 | 42 | if ($domain_whois =~ /Registrar:\s*(.+)/i) { | |||
| 3467 | 15 | 35 | ($info{registrar} = $1) =~ s/\s+$//; | |||
| 3468 | } | |||||
| 3469 | ||||||
| 3470 | # Registrar abuse contact email (try multiple field names) | |||||
| 3471 | 19 | 37 | for my $pat ( | |||
| 3472 | qr/Registrar Abuse Contact Email:\s*(\S+@\S+)/i, | |||||
| 3473 | qr/Abuse Contact Email:\s*(\S+@\S+)/i, | |||||
| 3474 | qr/abuse-contact:\s*(\S+@\S+)/i, | |||||
| 3475 | ) { | |||||
| 3476 | 57 | 139 | if (!$info{registrar_abuse} && $domain_whois =~ $pat) { | |||
| 3477 | 12 | 25 | ($info{registrar_abuse} = $1) =~ s/\s+$//; | |||
| 3478 | } | |||||
| 3479 | } | |||||
| 3480 | ||||||
| 3481 | # Domain creation date (multiple registrar field name variations) | |||||
| 3482 | 19 | 49 | for my $pat ( | |||
| 3483 | qr/Creation Date:\s*(\S+)/i, | |||||
| 3484 | qr/Created(?:\s+On)?:\s*(\S+)/i, | |||||
| 3485 | qr/Registration Time:\s*(\S+)/i, | |||||
| 3486 | qr/^registered:\s*(\S+)/im, | |||||
| 3487 | ) { | |||||
| 3488 | 76 | 127 | if (!$info{registered} && $domain_whois =~ $pat) { | |||
| 3489 | 15 | 23 | ($info{registered} = $1) =~ s/[TZ].*//; | |||
| 3490 | } | |||||
| 3491 | } | |||||
| 3492 | ||||||
| 3493 | # Domain expiry date | |||||
| 3494 | 19 | 34 | for my $pat ( | |||
| 3495 | qr/Registry Expiry Date:\s*(\S+)/i, | |||||
| 3496 | qr/Expir(?:y|ation)(?: Date)?:\s*(\S+)/i, | |||||
| 3497 | qr/paid-till:\s*(\S+)/i, | |||||
| 3498 | ) { | |||||
| 3499 | 57 | 110 | if (!$info{expires} && $domain_whois =~ $pat) { | |||
| 3500 | 13 | 24 | ($info{expires} = $1) =~ s/[TZ].*//; | |||
| 3501 | } | |||||
| 3502 | } | |||||
| 3503 | ||||||
| 3504 | # Flag recently-registered domains (< RECENT_REG_DAYS old) | |||||
| 3505 | 19 | 35 | if ($info{registered}) { | |||
| 3506 | 15 | 34 | my $epoch = $self->_parse_date_to_epoch($info{registered}); | |||
| 3507 | 15 | 300 | $info{recently_registered} = 1 | |||
| 3508 | if $epoch && (time() - $epoch) < $RECENT_REG_DAYS * $SECS_PER_DAY; | |||||
| 3509 | } | |||||
| 3510 | } | |||||
| 3511 | ||||||
| 3512 | # Store in per-message and cross-message caches | |||||
| 3513 | 382 | 339 | $self->{_domain_info}{$domain} = \%info; | |||
| 3514 | 382 | 298 | $_cache->set("dom:$domain", \%info) if $_cache; | |||
| 3515 | ||||||
| 3516 | 382 | 303 | return \%info; | |||
| 3517 | } | |||||
| 3518 | ||||||
| 3519 | # ----------------------------------------------------------------------- | |||||
| 3520 | # Private: DNS helpers | |||||
| 3521 | # ----------------------------------------------------------------------- | |||||
| 3522 | ||||||
| 3523 | # _resolve_host( $host ) -> ip_string | undef | |||||
| 3524 | # | |||||
| 3525 | # Purpose: | |||||
| 3526 | # Resolve a hostname to an IPv4 (or IPv6) address. Uses Net::DNS for | |||||
| 3527 | # both A and AAAA queries when available; falls back to inet_aton for | |||||
| 3528 | # pure IPv4 resolution. | |||||
| 3529 | # | |||||
| 3530 | # Entry criteria: | |||||
| 3531 | # $host -- hostname string or already-numeric IP. | |||||
| 3532 | # | |||||
| 3533 | # Exit status: | |||||
| 3534 | # Returns the first resolved IP string, or undef on failure. | |||||
| 3535 | # | |||||
| 3536 | # Notes: | |||||
| 3537 | # When the input is already a dotted-quad IPv4 it is returned immediately. | |||||
| 3538 | # AAAA records are tried if the A query fails and Net::DNS is available. | |||||
| 3539 | ||||||
| 3540 | sub _resolve_host { | |||||
| 3541 | 13 | 212 | my ($self, $host) = @_; | |||
| 3542 | 13 | 23 | return $host if $host =~ /^\d{1,3}(?:\.\d{1,3}){3}$/; | |||
| 3543 | ||||||
| 3544 | # Check the CHI cache before hitting DNS | |||||
| 3545 | 12 | 16 | if ($_cache) { | |||
| 3546 | 0 | 0 | my $cached_ip = $_cache->get("resolve:$host"); | |||
| 3547 | 0 | 0 | return $cached_ip if defined $cached_ip; | |||
| 3548 | } | |||||
| 3549 | ||||||
| 3550 | 12 | 6 | my $ip; | |||
| 3551 | ||||||
| 3552 | 12 | 18 | if ($HAS_NET_DNS) { | |||
| 3553 | my $res = Net::DNS::Resolver->new( | |||||
| 3554 | tcp_timeout => $self->{timeout}, | |||||
| 3555 | udp_timeout => $self->{timeout}, | |||||
| 3556 | 0 | 0 | ); | |||
| 3557 | ||||||
| 3558 | # Try A record first, then AAAA for IPv6 | |||||
| 3559 | 0 | 0 | for my $type (qw(A AAAA)) { | |||
| 3560 | 0 | 0 | my $query = $res->search($host, $type); | |||
| 3561 | 0 | 0 | if ($query) { | |||
| 3562 | 0 | 0 | for my $rr ($query->answer) { | |||
| 3563 | 0 | 0 | if ($rr->type eq 'A') { | |||
| 3564 | 0 | 0 | $ip = $rr->address; | |||
| 3565 | 0 | 0 | last; | |||
| 3566 | } elsif ($rr->type eq 'AAAA') { | |||||
| 3567 | 0 | 0 | $ip = $rr->address; | |||
| 3568 | 0 | 0 | last; | |||
| 3569 | } | |||||
| 3570 | } | |||||
| 3571 | } | |||||
| 3572 | 0 | 0 | last if defined $ip; | |||
| 3573 | } | |||||
| 3574 | } else { | |||||
| 3575 | # Fallback: gethostbyname (IPv4 only) | |||||
| 3576 | 12 12 | 9 213098 | my $packed = eval { inet_aton($host) }; | |||
| 3577 | 12 | 52 | $ip = $packed ? inet_ntoa($packed) : undef; | |||
| 3578 | } | |||||
| 3579 | ||||||
| 3580 | # Cache the result (including undef as '' to avoid repeated failed lookups) | |||||
| 3581 | 12 | 19 | if ($_cache) { | |||
| 3582 | 0 | 0 | $_cache->set("resolve:$host", $ip // ''); | |||
| 3583 | } | |||||
| 3584 | ||||||
| 3585 | 12 | 29 | return $ip; | |||
| 3586 | } | |||||
| 3587 | ||||||
| 3588 | # _reverse_dns( $ip ) -> hostname | undef | |||||
| 3589 | # | |||||
| 3590 | # Purpose: | |||||
| 3591 | # Perform a PTR (reverse DNS) lookup for an IP address. Supports both | |||||
| 3592 | # IPv4 and IPv6 via Net::DNS when available; falls back to gethostbyaddr. | |||||
| 3593 | # | |||||
| 3594 | # Entry criteria: | |||||
| 3595 | # $ip -- a defined IPv4 or IPv6 address string. | |||||
| 3596 | # | |||||
| 3597 | # Exit status: | |||||
| 3598 | # Returns the PTR hostname string, or undef if no record exists. | |||||
| 3599 | ||||||
| 3600 | sub _reverse_dns { | |||||
| 3601 | 6 | 7 | my ($self, $ip) = @_; | |||
| 3602 | 6 | 35 | return undef unless $ip; | |||
| 3603 | ||||||
| 3604 | 6 | 13 | if ($HAS_NET_DNS) { | |||
| 3605 | 0 | 0 | my $res = Net::DNS::Resolver->new(tcp_timeout => $self->{timeout}); | |||
| 3606 | 0 | 0 | my $query = $res->search($ip, 'PTR'); | |||
| 3607 | 0 | 0 | if ($query) { | |||
| 3608 | 0 | 0 | for my $rr ($query->answer) { | |||
| 3609 | 0 | 0 | return $rr->ptrdname if $rr->type eq 'PTR'; | |||
| 3610 | } | |||||
| 3611 | } | |||||
| 3612 | 0 | 0 | return undef; | |||
| 3613 | } | |||||
| 3614 | ||||||
| 3615 | # Fallback for IPv4 only | |||||
| 3616 | 6 | 176465 | return scalar gethostbyaddr(inet_aton($ip), AF_INET); | |||
| 3617 | } | |||||
| 3618 | ||||||
| 3619 | # ----------------------------------------------------------------------- | |||||
| 3620 | # Private: WHOIS / RDAP | |||||
| 3621 | # ----------------------------------------------------------------------- | |||||
| 3622 | ||||||
| 3623 | # _whois_ip( $ip ) -> hashref | |||||
| 3624 | # | |||||
| 3625 | # Purpose: | |||||
| 3626 | # Enrich an IP address with organisation name, abuse contact, and country | |||||
| 3627 | # code. Tries RDAP first (if LWP is available), then falls back to raw | |||||
| 3628 | # WHOIS via IANA referral. Results are cached in CHI if available. | |||||
| 3629 | # | |||||
| 3630 | # Entry criteria: | |||||
| 3631 | # $ip -- a defined IPv4 or IPv6 address string. | |||||
| 3632 | # | |||||
| 3633 | # Exit status: | |||||
| 3634 | # Returns { org, abuse, country } hashref; keys absent when unknown. | |||||
| 3635 | ||||||
| 3636 | sub _whois_ip { | |||||
| 3637 | 9 | 1150 | my ($self, $ip) = @_; | |||
| 3638 | ||||||
| 3639 | # Check CHI cache before going to the network | |||||
| 3640 | 9 | 20 | if ($_cache) { | |||
| 3641 | 0 | 0 | my $cached = $_cache->get("whois_ip:$ip"); | |||
| 3642 | 0 | 0 | return $cached if $cached; | |||
| 3643 | } | |||||
| 3644 | ||||||
| 3645 | 9 | 22 | my $result = $HAS_LWP ? $self->_rdap_lookup($ip) : {}; | |||
| 3646 | ||||||
| 3647 | # Fall back to raw WHOIS if RDAP returned no organisation | |||||
| 3648 | 9 | 33 | unless ($result->{org}) { | |||
| 3649 | 9 | 24 | my $raw = $self->_raw_whois($ip, 'whois.iana.org'); | |||
| 3650 | 9 | 61 | if ($raw) { | |||
| 3651 | 8 | 61 | my ($ref) = $raw =~ /whois:\s*([\w.-]+)/i; | |||
| 3652 | 8 | 27 | my $detail = $ref ? $self->_raw_whois($ip, $ref) : $raw; | |||
| 3653 | 8 | 74 | $result = $self->_parse_whois_text($detail) if $detail; | |||
| 3654 | } | |||||
| 3655 | } | |||||
| 3656 | ||||||
| 3657 | # Cache the enrichment result | |||||
| 3658 | 9 | 24 | $_cache->set("whois_ip:$ip", $result) if $_cache && $result; | |||
| 3659 | ||||||
| 3660 | 9 | 17 | return $result; | |||
| 3661 | } | |||||
| 3662 | ||||||
| 3663 | # _domain_whois( $domain ) -> raw_whois_string | undef | |||||
| 3664 | # | |||||
| 3665 | # Purpose: | |||||
| 3666 | # Perform a two-step WHOIS lookup for a domain: first ask IANA for the | |||||
| 3667 | # TLD's authoritative WHOIS server, then query that server. | |||||
| 3668 | # | |||||
| 3669 | # Entry criteria: | |||||
| 3670 | # $domain -- a lower-cased domain name string. | |||||
| 3671 | # | |||||
| 3672 | # Exit status: | |||||
| 3673 | # Returns the raw WHOIS response string, or undef on failure. | |||||
| 3674 | ||||||
| 3675 | sub _domain_whois { | |||||
| 3676 | 16 | 590 | my ($self, $domain) = @_; | |||
| 3677 | 16 | 30 | my $iana = $self->_raw_whois($domain, 'whois.iana.org') // return undef; | |||
| 3678 | 15 | 122 | my ($server) = $iana =~ /whois:\s*([\w.-]+)/i; | |||
| 3679 | 15 | 40 | return undef unless $server; | |||
| 3680 | 1 | 2 | return $self->_raw_whois($domain, $server); | |||
| 3681 | } | |||||
| 3682 | ||||||
| 3683 | # _parse_domain_whois_abuse( $domain ) -> hashref | |||||
| 3684 | # | |||||
| 3685 | # Purpose: | |||||
| 3686 | # Lightweight domain WHOIS lookup to extract only registrar name and | |||||
| 3687 | # abuse contact. Used as a fallback in _extract_and_resolve_urls() when | |||||
| 3688 | # a URL host cannot be resolved to an IP. | |||||
| 3689 | # | |||||
| 3690 | # Entry criteria: | |||||
| 3691 | # $domain -- a registrable domain name string. | |||||
| 3692 | # | |||||
| 3693 | # Exit status: | |||||
| 3694 | # Returns { org, abuse } hashref; empty hashref on failure. | |||||
| 3695 | ||||||
| 3696 | sub _parse_domain_whois_abuse { | |||||
| 3697 | 56 | 56 | my ($self, $domain) = @_; | |||
| 3698 | 56 | 66 | my $raw = $self->_domain_whois($domain) // return {}; | |||
| 3699 | 7 | 17 | my %info; | |||
| 3700 | 7 | 13 | if ($raw =~ /Registrar:\s*(.+)/i) { | |||
| 3701 | 7 | 13 | ($info{org} = $1) =~ s/\s+$//; | |||
| 3702 | } | |||||
| 3703 | # Try multiple field name patterns for the abuse email | |||||
| 3704 | 7 | 10 | for my $pat ( | |||
| 3705 | qr/Registrar Abuse Contact Email:\s*(\S+\@\S+)/i, | |||||
| 3706 | qr/Abuse Contact Email:\s*(\S+\@\S+)/i, | |||||
| 3707 | qr/abuse-contact:\s*(\S+\@\S+)/i, | |||||
| 3708 | ) { | |||||
| 3709 | 21 | 37 | if (!$info{abuse} && $raw =~ $pat) { | |||
| 3710 | 7 | 9 | ($info{abuse} = $1) =~ s/\s+$//; | |||
| 3711 | } | |||||
| 3712 | } | |||||
| 3713 | 7 | 8 | return \%info; | |||
| 3714 | } | |||||
| 3715 | ||||||
| 3716 | # _rdap_lookup( $ip ) -> hashref | |||||
| 3717 | # | |||||
| 3718 | # Purpose: | |||||
| 3719 | # Query the ARIN RDAP API for IP block ownership information. RDAP is | |||||
| 3720 | # preferred over raw WHOIS because it returns structured JSON. | |||||
| 3721 | # | |||||
| 3722 | # Entry criteria: | |||||
| 3723 | # $ip -- a defined IPv4 or IPv6 address string. | |||||
| 3724 | # LWP::UserAgent must be installed. | |||||
| 3725 | # | |||||
| 3726 | # Exit status: | |||||
| 3727 | # Returns { org, abuse, country } hashref; empty hashref on failure. | |||||
| 3728 | ||||||
| 3729 | sub _rdap_lookup { | |||||
| 3730 | 0 | 0 | my ($self, $ip) = @_; | |||
| 3731 | 0 | 0 | return {} unless $HAS_LWP; | |||
| 3732 | ||||||
| 3733 | 0 | 0 | my $ua = $self->{ua}; | |||
| 3734 | 0 | 0 | if(!defined($ua)) { | |||
| 3735 | $ua = LWP::UserAgent->new( | |||||
| 3736 | timeout => $self->{timeout}, | |||||
| 3737 | 0 | 0 | agent => "Email-Abuse-Investigator/$VERSION", | |||
| 3738 | ); | |||||
| 3739 | ||||||
| 3740 | 0 | 0 | if($HAS_CONN_CACHE) { | |||
| 3741 | 0 | 0 | my $conn_cache = LWP::ConnCache->new(); | |||
| 3742 | 0 | 0 | $conn_cache->total_capacity(10); | |||
| 3743 | 0 | 0 | $ua->conn_cache($conn_cache); | |||
| 3744 | } | |||||
| 3745 | ||||||
| 3746 | 0 | 0 | $ua->env_proxy(1); | |||
| 3747 | 0 | 0 | $self->{ua} = $ua; | |||
| 3748 | } | |||||
| 3749 | ||||||
| 3750 | # Use the ARIN RDAP endpoint; it covers the ARIN region and redirects | |||||
| 3751 | # for RIPE/APNIC/LACNIC/AfriNIC allocations. | |||||
| 3752 | 0 0 | 0 0 | my $res = eval { $ua->get("https://rdap.arin.net/registry/ip/$ip") }; | |||
| 3753 | 0 | 0 | return {} unless $res && $res->is_success(); | |||
| 3754 | ||||||
| 3755 | 0 | 0 | my $j = $res->decoded_content(); | |||
| 3756 | 0 | 0 | my %info; | |||
| 3757 | ||||||
| 3758 | # Extract organisation name from the JSON response | |||||
| 3759 | 0 | 0 | $info{org} = $1 if $j =~ /"name"\s*:\s*"([^"]+)"/; | |||
| 3760 | 0 | 0 | $info{handle} = $1 if $j =~ /"handle"\s*:\s*"([^"]+)"/; | |||
| 3761 | ||||||
| 3762 | # Extract abuse email from the vcardArray contact block | |||||
| 3763 | 0 | 0 | if ($j =~ /"abuse".*?"email"\s*:\s*"([^"]+)"/s) { | |||
| 3764 | 0 | 0 | $info{abuse} = $1; | |||
| 3765 | } elsif ($j =~ /"email"\s*:\s*"([^@"]+@[^"]+)"/) { | |||||
| 3766 | 0 | 0 | $info{abuse} = $1; | |||
| 3767 | } | |||||
| 3768 | ||||||
| 3769 | # Country code from the network's country field | |||||
| 3770 | 0 | 0 | $info{country} = $1 if $j =~ /"country"\s*:\s*"([A-Z]{2})"/; | |||
| 3771 | ||||||
| 3772 | 0 | 0 | return \%info; | |||
| 3773 | } | |||||
| 3774 | ||||||
| 3775 | # _raw_whois( $query, $server ) -> string | undef | |||||
| 3776 | # | |||||
| 3777 | # Purpose: | |||||
| 3778 | # Open a TCP connection to a WHOIS server on port 43, send the query, | |||||
| 3779 | # and return the full response as a string. Uses IO::Select for read | |||||
| 3780 | # timeouts so that alarm() is never needed (alarm() is unreliable on | |||||
| 3781 | # Windows and in threaded Perl). Supports IPv6 WHOIS servers via | |||||
| 3782 | # IO::Socket::IP when that module is available. | |||||
| 3783 | # | |||||
| 3784 | # Entry criteria: | |||||
| 3785 | # $query -- the domain name or IP to query (defined, non-empty). | |||||
| 3786 | # $server -- the WHOIS server hostname (default: 'whois.iana.org'). | |||||
| 3787 | # $self->{timeout} -- seconds used for connect and per-read waits. | |||||
| 3788 | # | |||||
| 3789 | # Exit status: | |||||
| 3790 | # Returns the raw WHOIS response string, or undef on connection/write failure. | |||||
| 3791 | # | |||||
| 3792 | # Notes: | |||||
| 3793 | # Uses IO::Socket::IP (dual-stack) when available, falling back to | |||||
| 3794 | # IO::Socket::INET (IPv4 only) otherwise. The IO::Select loop reads | |||||
| 3795 | # until the server closes the connection or the per-read timeout expires. | |||||
| 3796 | ||||||
| 3797 | sub _raw_whois { | |||||
| 3798 | 23 | 274 | my ($self, $query, $server) = @_; | |||
| 3799 | 23 | 37 | $server //= 'whois.iana.org'; | |||
| 3800 | 23 | 71 | $self->_debug("WHOIS $server -> $query"); | |||
| 3801 | ||||||
| 3802 | # Choose the socket class based on what is installed. | |||||
| 3803 | # IO::Socket::IP supports both IPv4 and IPv6 WHOIS servers. | |||||
| 3804 | 23 | 46 | my $sock_class = $HAS_IO_SOCKET_IP ? 'IO::Socket::IP' : 'IO::Socket::INET'; | |||
| 3805 | ||||||
| 3806 | # Attempt TCP connection to port 43 on the WHOIS server | |||||
| 3807 | 23 | 28 | my $sock = eval { | |||
| 3808 | $sock_class->new( | |||||
| 3809 | PeerAddr => $server, | |||||
| 3810 | PeerPort => $WHOIS_PORT, | |||||
| 3811 | Proto => 'tcp', | |||||
| 3812 | Timeout => $self->{timeout}, | |||||
| 3813 | 23 | 154 | ); | |||
| 3814 | }; | |||||
| 3815 | 23 | 2419586 | return undef unless $sock; | |||
| 3816 | ||||||
| 3817 | # Send the WHOIS query in wire format (CRLF-terminated per RFC 3912) | |||||
| 3818 | 23 0 0 | 123 0 0 | $sock->print("$query\r\n") or do { $sock->close(); return undef }; | |||
| 3819 | ||||||
| 3820 | # Use IO::Select to implement per-read timeouts without alarm() | |||||
| 3821 | 23 | 1138 | my $sel = IO::Select->new($sock); | |||
| 3822 | 23 | 1095 | my $response = ''; | |||
| 3823 | 23 | 95 | my $buf = ''; | |||
| 3824 | ||||||
| 3825 | # Read until EOF (server closes) or timeout | |||||
| 3826 | 23 | 87 | while ($sel->can_read($self->{timeout})) { | |||
| 3827 | # Wrap in eval to catch 'Connection reset by peer' thrown by Fatal/autodie | |||||
| 3828 | 75 75 | 1289566 180 | my $n = eval { sysread($sock, $buf, $WHOIS_READ_CHUNK) }; | |||
| 3829 | ||||||
| 3830 | 75 | 8443 | if ($@ || !defined $n || $n <= 0) { | |||
| 3831 | 23 | 37 | $self->_debug("WHOIS read failed: $@") if $@; | |||
| 3832 | 23 | 28 | last; | |||
| 3833 | } | |||||
| 3834 | 52 | 132 | last unless defined $n && $n > 0; | |||
| 3835 | 52 | 243 | $response .= $buf; | |||
| 3836 | } | |||||
| 3837 | ||||||
| 3838 | 23 | 111 | $sock->close(); | |||
| 3839 | 23 | 1252 | return $response || undef; | |||
| 3840 | } | |||||
| 3841 | ||||||
| 3842 | # _parse_whois_text( $text ) -> hashref | |||||
| 3843 | # | |||||
| 3844 | # Purpose: | |||||
| 3845 | # Parse a raw WHOIS IP block response to extract organisation name, | |||||
| 3846 | # abuse contact email, and country code. | |||||
| 3847 | # | |||||
| 3848 | # Entry criteria: | |||||
| 3849 | # $text -- a defined WHOIS response string. | |||||
| 3850 | # | |||||
| 3851 | # Exit status: | |||||
| 3852 | # Returns { org, abuse, country } hashref; keys absent when not found. | |||||
| 3853 | ||||||
| 3854 | sub _parse_whois_text { | |||||
| 3855 | 37 | 5885 | my ($self, $text) = @_; | |||
| 3856 | 37 | 47 | return {} unless $text; | |||
| 3857 | 35 | 31 | my %info; | |||
| 3858 | ||||||
| 3859 | # Try multiple field names for the organisation name | |||||
| 3860 | 35 | 110 | for my $pat ( | |||
| 3861 | qr/^OrgName:\s*(.+)/mi, qr/^org-name:\s*(.+)/mi, | |||||
| 3862 | qr/^owner:\s*(.+)/mi, qr/^descr:\s*(.+)/mi, | |||||
| 3863 | ) { | |||||
| 3864 | 140 | 417 | if (!$info{org} && $text =~ $pat) { | |||
| 3865 | 20 | 69 | ($info{org} = $1) =~ s/\s+$//; | |||
| 3866 | } | |||||
| 3867 | } | |||||
| 3868 | ||||||
| 3869 | # Try multiple field names for the abuse email | |||||
| 3870 | 35 | 75 | for my $pat ( | |||
| 3871 | qr/OrgAbuseEmail:\s*(\S+@\S+)/mi, | |||||
| 3872 | qr/abuse-mailbox:\s*(\S+@\S+)/mi, | |||||
| 3873 | ) { | |||||
| 3874 | 70 | 178 | if (!$info{abuse} && $text =~ $pat) { | |||
| 3875 | 6 | 14 | ($info{abuse} = $1) =~ s/\s+$//; | |||
| 3876 | } | |||||
| 3877 | } | |||||
| 3878 | ||||||
| 3879 | # Last-resort: any abuse@ address in the response | |||||
| 3880 | 35 | 108 | $info{abuse} //= $1 if $text =~ /(abuse\@[\w.-]+)/i; | |||
| 3881 | ||||||
| 3882 | # Country code (case-insensitive match, normalised to uppercase) | |||||
| 3883 | 35 | 66 | if ($text =~ /^country:\s*([A-Za-z]{2})\s*$/m) { | |||
| 3884 | 12 | 35 | $info{country} = uc $1; | |||
| 3885 | } | |||||
| 3886 | 35 | 63 | return \%info; | |||
| 3887 | } | |||||
| 3888 | ||||||
| 3889 | # ----------------------------------------------------------------------- | |||||
| 3890 | # Private: authentication results parsing | |||||
| 3891 | # ----------------------------------------------------------------------- | |||||
| 3892 | ||||||
| 3893 | # _parse_auth_results_cached() -> hashref | |||||
| 3894 | # | |||||
| 3895 | # Purpose: | |||||
| 3896 | # Parse the Authentication-Results: header(s) from the message once, | |||||
| 3897 | # cache the result, and return it. Extracts SPF, DKIM, DMARC, ARC | |||||
| 3898 | # results and the DKIM signing domain(s). | |||||
| 3899 | # | |||||
| 3900 | # Entry criteria: | |||||
| 3901 | # $self->{_headers} populated by _split_message(). | |||||
| 3902 | # | |||||
| 3903 | # Exit status: | |||||
| 3904 | # Returns { spf, dkim, dmarc, arc, dkim_domain, dkim_domains } hashref. | |||||
| 3905 | # Keys absent when the corresponding header or field is not present. | |||||
| 3906 | ||||||
| 3907 | sub _parse_auth_results_cached { | |||||
| 3908 | 609 | 1828 | my $self = $_[0]; | |||
| 3909 | 609 | 686 | return $self->{_auth_results} if $self->{_auth_results}; | |||
| 3910 | ||||||
| 3911 | 339 | 231 | my %auth; | |||
| 3912 | ||||||
| 3913 | # Concatenate all Authentication-Results: header values | |||||
| 3914 | my $raw = join('; ', | |||||
| 3915 | 37 | 48 | map { $_->{value} } | |||
| 3916 | 2571 | 2064 | grep { $_->{name} eq 'authentication-results' } | |||
| 3917 | 339 339 | 301 341 | @{ $self->{_headers} } | |||
| 3918 | ); | |||||
| 3919 | ||||||
| 3920 | # Extract individual authentication mechanism results | |||||
| 3921 | 339 | 401 | $auth{spf} = $1 if $raw =~ /\bspf=(\S+)/i; | |||
| 3922 | 339 | 329 | $auth{dkim} = $1 if $raw =~ /\bdkim=(\S+)/i; | |||
| 3923 | 339 | 328 | $auth{dmarc} = $1 if $raw =~ /\bdmarc=(\S+)/i; | |||
| 3924 | 339 | 782 | $auth{arc} = $1 if $raw =~ /\barc=(\S+)/i; | |||
| 3925 | ||||||
| 3926 | # Strip trailing punctuation captured by the greedy \S+ | |||||
| 3927 | 339 | 295 | for my $k (qw(spf dkim dmarc arc)) { | |||
| 3928 | 1356 | 1115 | $auth{$k} =~ s/[;,\s]+$// if defined $auth{$k}; | |||
| 3929 | } | |||||
| 3930 | ||||||
| 3931 | # Extract DKIM signing domains from all DKIM-Signature: d= tags. | |||||
| 3932 | # Prefer the first domain that matches the provider table (identifies ESP). | |||||
| 3933 | 339 | 226 | my @dkim_domains; | |||
| 3934 | 339 2571 339 | 243 1846 287 | for my $h (grep { $_->{name} eq 'dkim-signature' } @{ $self->{_headers} }) { | |||
| 3935 | 14 | 25 | if ($h->{value} =~ /\bd=([^;,\s]+)/) { | |||
| 3936 | 14 | 17 | push @dkim_domains, lc $1; | |||
| 3937 | } | |||||
| 3938 | } | |||||
| 3939 | ||||||
| 3940 | 339 | 298 | if (@dkim_domains) { | |||
| 3941 | # Check if any signing domain matches a known provider | |||||
| 3942 | 9 | 6 | my $preferred; | |||
| 3943 | 9 | 8 | for my $d (@dkim_domains) { | |||
| 3944 | 14 | 12 | if ($self->_provider_abuse_for_host($d)) { | |||
| 3945 | 4 | 2 | $preferred = $d; | |||
| 3946 | 4 | 5 | last; | |||
| 3947 | } | |||||
| 3948 | } | |||||
| 3949 | 9 | 17 | $auth{dkim_domain} = $preferred // $dkim_domains[0]; | |||
| 3950 | 9 | 11 | $auth{dkim_domains} = \@dkim_domains; | |||
| 3951 | } | |||||
| 3952 | ||||||
| 3953 | 339 | 295 | $self->{_auth_results} = \%auth; | |||
| 3954 | 339 | 355 | return \%auth; | |||
| 3955 | } | |||||
| 3956 | ||||||
| 3957 | # ----------------------------------------------------------------------- | |||||
| 3958 | # Private: provider-table lookups | |||||
| 3959 | # ----------------------------------------------------------------------- | |||||
| 3960 | ||||||
| 3961 | # _provider_abuse_for_host( $host ) -> hashref | undef | |||||
| 3962 | # | |||||
| 3963 | # Purpose: | |||||
| 3964 | # Look up a hostname (and each of its parent domains, stripping one label | |||||
| 3965 | # at a time from the left) in the %PROVIDER_ABUSE table. | |||||
| 3966 | # | |||||
| 3967 | # Entry criteria: | |||||
| 3968 | # $host -- a defined hostname or domain string. | |||||
| 3969 | # | |||||
| 3970 | # Exit status: | |||||
| 3971 | # Returns the %PROVIDER_ABUSE entry hashref on match, undef otherwise. | |||||
| 3972 | ||||||
| 3973 | sub _provider_abuse_for_host { | |||||
| 3974 | 1131 | 6535 | my ($self, $host) = @_; | |||
| 3975 | 1131 | 845 | $host = lc $host; | |||
| 3976 | # Strip successive subdomains until we find a match or exhaust labels | |||||
| 3977 | 1131 | 1140 | while ($host =~ /\./) { | |||
| 3978 | 1253 | 1055 | return $self->{provider_abuse}->{$host} if $self->{provider_abuse}->{$host}; | |||
| 3979 | 1253 | 1164 | return $PROVIDER_ABUSE{$host} if $PROVIDER_ABUSE{$host}; | |||
| 3980 | 1003 | 1513 | $host =~ s/^[^.]+\.//; | |||
| 3981 | } | |||||
| 3982 | 881 | 689 | return undef; | |||
| 3983 | } | |||||
| 3984 | ||||||
| 3985 | # _provider_abuse_for_ip( $ip, $rdns ) -> hashref | undef | |||||
| 3986 | # | |||||
| 3987 | # Purpose: | |||||
| 3988 | # Look up an IP's reverse-DNS hostname in the %PROVIDER_ABUSE table to | |||||
| 3989 | # identify well-known provider networks by rDNS pattern. | |||||
| 3990 | # | |||||
| 3991 | # Entry criteria: | |||||
| 3992 | # $ip -- IPv4 or IPv6 address string (used as fallback if $rdns absent). | |||||
| 3993 | # $rdns -- optional rDNS hostname string. | |||||
| 3994 | # | |||||
| 3995 | # Exit status: | |||||
| 3996 | # Returns the %PROVIDER_ABUSE entry on match, undef otherwise. | |||||
| 3997 | ||||||
| 3998 | sub _provider_abuse_for_ip { | |||||
| 3999 | 125 | 870 | my ($self, $ip, $rdns) = @_; | |||
| 4000 | 125 | 181 | return $self->_provider_abuse_for_host($rdns) if $rdns; | |||
| 4001 | 2 | 5 | return undef; | |||
| 4002 | } | |||||
| 4003 | ||||||
| 4004 | # ----------------------------------------------------------------------- | |||||
| 4005 | # Private: eTLD+1 normalisation | |||||
| 4006 | # ----------------------------------------------------------------------- | |||||
| 4007 | ||||||
| 4008 | # _registrable( $host ) -> string | undef | |||||
| 4009 | # | |||||
| 4010 | # Purpose: | |||||
| 4011 | # Return the registrable eTLD+1 form of a hostname. Uses | |||||
| 4012 | # Domain::PublicSuffix when installed for accurate results; falls back | |||||
| 4013 | # to a built-in heuristic for the common two-letter ccTLD+2 pattern. | |||||
| 4014 | # | |||||
| 4015 | # Entry criteria: | |||||
| 4016 | # $host -- a hostname string (may include subdomains). | |||||
| 4017 | # | |||||
| 4018 | # Exit status: | |||||
| 4019 | # Returns the registrable domain string, or undef for single-label | |||||
| 4020 | # hostnames (e.g. 'localhost'). | |||||
| 4021 | # | |||||
| 4022 | # Notes: | |||||
| 4023 | # The heuristic handles co.uk, com.au, net.jp, org.nz etc. but not | |||||
| 4024 | # uncommon second-level delegations like ltd.uk or plc.uk. | |||||
| 4025 | ||||||
| 4026 | sub _registrable { | |||||
| 4027 | 976 | 12405 | my ($host) = @_; | |||
| 4028 | 976 | 1481 | return undef unless $host && $host =~ /\./; | |||
| 4029 | ||||||
| 4030 | # Use Domain::PublicSuffix for accurate PSL-based normalisation | |||||
| 4031 | 839 | 644 | if ($HAS_PUBLIC_SUFFIX) { | |||
| 4032 | 0 | 0 | my $psl = Domain::PublicSuffix->new(); | |||
| 4033 | 0 | 0 | my $root = $psl->get_root_domain(lc $host); | |||
| 4034 | 0 | 0 | return $root if $root; | |||
| 4035 | } | |||||
| 4036 | ||||||
| 4037 | # Built-in heuristic fallback | |||||
| 4038 | 839 | 811 | my @labels = split /\./, lc $host; | |||
| 4039 | 839 | 1172 | return $host if @labels <= 2; | |||
| 4040 | ||||||
| 4041 | # Detect common ccTLD second-level patterns (e.g. co.uk, com.au) | |||||
| 4042 | 95 | 231 | if ($labels[-1] =~ /^[a-z]{2}$/ && | |||
| 4043 | $labels[-2] =~ /^(?:co|com|net|org|gov|edu|ac|me)$/) { | |||||
| 4044 | 47 | 106 | return join('.', @labels[-3..-1]); | |||
| 4045 | } | |||||
| 4046 | 48 | 151 | return join('.', @labels[-2..-1]); | |||
| 4047 | } | |||||
| 4048 | ||||||
| 4049 | # ----------------------------------------------------------------------- | |||||
| 4050 | # Private: utilities | |||||
| 4051 | # ----------------------------------------------------------------------- | |||||
| 4052 | ||||||
| 4053 | # _enrich_ip( $ip, $confidence, $note ) -> origin hashref | |||||
| 4054 | # | |||||
| 4055 | # Purpose: | |||||
| 4056 | # Perform rDNS and WHOIS/RDAP for a single IP and package the results | |||||
| 4057 | # into the standard origin hashref returned by originating_ip(). | |||||
| 4058 | # | |||||
| 4059 | # Entry criteria: | |||||
| 4060 | # $ip -- a defined, non-private IPv4 or IPv6 address string. | |||||
| 4061 | # $confidence -- 'high', 'medium', or 'low'. | |||||
| 4062 | # $note -- human-readable explanation of why this IP was chosen. | |||||
| 4063 | # | |||||
| 4064 | # Exit status: | |||||
| 4065 | # Returns { ip, rdns, org, abuse, country, confidence, note } hashref. | |||||
| 4066 | ||||||
| 4067 | sub _enrich_ip { | |||||
| 4068 | 89 | 2371 | my ($self, $ip, $confidence, $note) = @_; | |||
| 4069 | 89 | 129 | my $rdns = $self->_reverse_dns($ip); | |||
| 4070 | 89 | 223 | my $whois = $self->_whois_ip($ip); | |||
| 4071 | return { | |||||
| 4072 | ip => $ip, | |||||
| 4073 | rdns => $rdns // '(no reverse DNS)', | |||||
| 4074 | org => $whois->{org} // '(unknown)', | |||||
| 4075 | abuse => $whois->{abuse} // '(unknown)', | |||||
| 4076 | country => $whois->{country} // undef, | |||||
| 4077 | 89 | 714 | confidence => $confidence, | |||
| 4078 | note => $note, | |||||
| 4079 | }; | |||||
| 4080 | } | |||||
| 4081 | ||||||
| 4082 | # _header_value( $name ) -> value_string | undef | |||||
| 4083 | # | |||||
| 4084 | # Purpose: | |||||
| 4085 | # Return the value of the first header matching the given lower-cased | |||||
| 4086 | # header name. | |||||
| 4087 | # | |||||
| 4088 | # Entry criteria: | |||||
| 4089 | # $name -- a lower-cased header name string. | |||||
| 4090 | # $self->{_headers} populated by _split_message(). | |||||
| 4091 | # | |||||
| 4092 | # Exit status: | |||||
| 4093 | # Returns the value string, or undef if the header is not present. | |||||
| 4094 | ||||||
| 4095 | sub _header_value { | |||||
| 4096 | 3892 | 3997 | my ($self, $name) = @_; | |||
| 4097 | 3892 3892 | 2279 2969 | for my $h (@{ $self->{_headers} }) { | |||
| 4098 | 22126 | 18296 | return $h->{value} if $h->{name} eq lc($name); | |||
| 4099 | } | |||||
| 4100 | 1883 | 1755 | return undef; | |||
| 4101 | } | |||||
| 4102 | ||||||
| 4103 | # _ip_in_cidr( $ip, $cidr ) -> bool | |||||
| 4104 | # | |||||
| 4105 | # Purpose: | |||||
| 4106 | # Test whether an IPv4 address falls within a CIDR block or is an exact | |||||
| 4107 | # match (when $cidr contains no '/' separator). | |||||
| 4108 | # | |||||
| 4109 | # Entry criteria: | |||||
| 4110 | # $ip -- a defined dotted-quad IPv4 address string. | |||||
| 4111 | # $cidr -- a CIDR string like '10.0.0.0/8' or an exact IP. | |||||
| 4112 | # | |||||
| 4113 | # Exit status: | |||||
| 4114 | # Returns 1 (true) if the IP is within the CIDR block, 0 otherwise. | |||||
| 4115 | ||||||
| 4116 | sub _ip_in_cidr { | |||||
| 4117 | 51 | 838 | my ($self, $ip, $cidr) = @_; | |||
| 4118 | 51 | 105 | return $ip eq $cidr unless $cidr =~ m{/}; | |||
| 4119 | 45 | 61 | my ($net_addr, $prefix) = split m{/}, $cidr; | |||
| 4120 | 45 | 186 | return 0 unless defined $prefix && $prefix =~ /^\d+$/ && $prefix <= 32; | |||
| 4121 | ||||||
| 4122 | # Compute the network mask and compare masked network addresses | |||||
| 4123 | 44 | 43 | my $mask = ~0 << (32 - $prefix); | |||
| 4124 | 44 | 157 | my $net_n = unpack 'N', (inet_aton($net_addr) // return 0); | |||
| 4125 | 44 | 79 | my $ip_n = unpack 'N', (inet_aton($ip) // return 0); | |||
| 4126 | 44 | 97 | return ($ip_n & $mask) == ($net_n & $mask); | |||
| 4127 | } | |||||
| 4128 | ||||||
| 4129 | # _decode_mime_words( $str ) -> decoded_string | |||||
| 4130 | # | |||||
| 4131 | # Purpose: | |||||
| 4132 | # Decode MIME encoded-words (=?charset?B/Q?...?=) in a header value | |||||
| 4133 | # string for human-readable display in reports. | |||||
| 4134 | # | |||||
| 4135 | # Entry criteria: | |||||
| 4136 | # $str -- a defined header value string; may be undef. | |||||
| 4137 | # | |||||
| 4138 | # Exit status: | |||||
| 4139 | # Returns the decoded string, or '' if $str is undef. | |||||
| 4140 | ||||||
| 4141 | sub _decode_mime_words { | |||||
| 4142 | 473 | 2628 | my ($self, $str) = @_; | |||
| 4143 | 473 | 413 | return '' unless defined $str; | |||
| 4144 | # Replace each encoded-word with its decoded equivalent | |||||
| 4145 | 471 33 | 428 37 | $str =~ s/=\?([^?]+)\?([BbQq])\?([^?]*)\?=/_decode_ew($1,$2,$3)/ge; | |||
| 4146 | 471 | 385 | return $str; | |||
| 4147 | } | |||||
| 4148 | ||||||
| 4149 | # _decode_ew( $charset, $enc, $text ) -> decoded_bytes | |||||
| 4150 | # | |||||
| 4151 | # Purpose: | |||||
| 4152 | # Decode a single MIME encoded-word component (base64 or quoted-printable). | |||||
| 4153 | # | |||||
| 4154 | # Notes: | |||||
| 4155 | # Non-UTF-8 charsets return raw bytes; good enough for display-name spoof | |||||
| 4156 | # detection which only needs ASCII matching. | |||||
| 4157 | ||||||
| 4158 | sub _decode_ew { | |||||
| 4159 | 33 | 53 | my ($charset, $enc, $text) = @_; | |||
| 4160 | 33 | 25 | my $raw; | |||
| 4161 | 33 | 42 | if (uc($enc) eq 'B') { | |||
| 4162 | 31 | 55 | $raw = decode_base64($text); | |||
| 4163 | } else { | |||||
| 4164 | # Quoted-printable encoded-word uses underscore for space | |||||
| 4165 | 2 | 3 | $text =~ s/_/ /g; | |||
| 4166 | 2 | 4 | $raw = decode_qp($text); | |||
| 4167 | } | |||||
| 4168 | 33 | 66 | return $raw; | |||
| 4169 | } | |||||
| 4170 | ||||||
| 4171 | # _parse_date_to_epoch( $str ) -> epoch_int | undef | |||||
| 4172 | # | |||||
| 4173 | # Purpose: | |||||
| 4174 | # Parse common WHOIS date strings to a Unix epoch integer. | |||||
| 4175 | # Handles YYYY-MM-DD, YYYY-MM-DDThh:mm:ssZ, and DD-Mon-YYYY formats. | |||||
| 4176 | # | |||||
| 4177 | # Entry criteria: | |||||
| 4178 | # $str -- a defined date string; may be undef. | |||||
| 4179 | # | |||||
| 4180 | # Exit status: | |||||
| 4181 | # Returns epoch integer on success, undef if the string cannot be parsed. | |||||
| 4182 | ||||||
| 4183 | sub _parse_date_to_epoch { | |||||
| 4184 | 61 | 5112 | my ($self, $str) = @_; | |||
| 4185 | 61 | 86 | return undef unless $str; | |||
| 4186 | ||||||
| 4187 | # Clean the string of trailing whitespace/newlines | |||||
| 4188 | 60 | 132 | $str =~ s/^\s+|\s+$//g; | |||
| 4189 | ||||||
| 4190 | # Guard Regex: Validates the strict YYYY-MM-DDThh:mm:ssZ format | |||||
| 4191 | 60 | 67 | if ($str =~ /^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})(?:\.\d+)?Z$/) { | |||
| 4192 | # Parse the string | |||||
| 4193 | # We use 'strptime' to create a Time::Piece object. | |||||
| 4194 | # The 'Z' indicates UTC (Zulu time). | |||||
| 4195 | 2 | 2 | my $epoch = eval { | |||
| 4196 | 2 | 11 | my $t = Time::Piece->strptime($1, '%Y-%m-%dT%H:%M:%S'); | |||
| 4197 | ||||||
| 4198 | # Return seconds since the epoch | |||||
| 4199 | # Time::Piece handles the timezone offset internally when calling ->epoch | |||||
| 4200 | ||||||
| 4201 | # strptime returns a local time object. | |||||
| 4202 | # We must subtract the local timezone offset to get the true UTC epoch. | |||||
| 4203 | 2 | 217 | return $t->epoch - $t->tzoffset->seconds; | |||
| 4204 | }; | |||||
| 4205 | 2 | 99 | return $epoch if defined $epoch; | |||
| 4206 | } | |||||
| 4207 | 58 | 42 | my ($y, $m, $d); | |||
| 4208 | ||||||
| 4209 | 58 39 | 124 71 | if ($str =~ /^(\d{4})-(\d{2})-(\d{2})/) { ($y,$m,$d)=($1,$2,$3) } | |||
| 4210 | 17 | 42 | elsif ($str =~ /^(\d{2})-([A-Za-z]{3})-(\d{4})/) { ($d,$m,$y)=($1,$Readonly::Values::Months::months{lc$2}//0,$3) } | |||
| 4211 | 1 | 2 | elsif ($str =~ /^(\d{2})\/(\d{2})\/(\d{4})/) { ($m,$d,$y)=($1,$2,$3) } | |||
| 4212 | ||||||
| 4213 | 58 | 185 | return undef unless $y && $m && $d; | |||
| 4214 | ||||||
| 4215 | 56 56 56 | 42 103 61 | if (eval { require Time::Local; 1 }) { | |||
| 4216 | 56 56 | 32 105 | return eval { Time::Local::timegm(0,0,0,$d,$m-1,$y-1900) }; | |||
| 4217 | } | |||||
| 4218 | # Approximate fallback without Time::Local | |||||
| 4219 | 0 | 0 | return ($y-1970)*365.25*$SECS_PER_DAY + ($m-1)*30.5*$SECS_PER_DAY + ($d-1)*$SECS_PER_DAY; | |||
| 4220 | } | |||||
| 4221 | ||||||
| 4222 | # _parse_rfc2822_date( $str ) -> epoch_int | undef | |||||
| 4223 | # | |||||
| 4224 | # Purpose: | |||||
| 4225 | # Parse an RFC 2822 Date: header value to a Unix epoch integer. | |||||
| 4226 | # Timezone offsets are intentionally ignored; the function returns a | |||||
| 4227 | # UTC-equivalent value. For the 7-day suspicious_date window the | |||||
| 4228 | # maximum error is ~14 hours, well within the tolerance. | |||||
| 4229 | # | |||||
| 4230 | # Entry criteria: | |||||
| 4231 | # $str -- a defined Date: header value string. | |||||
| 4232 | # | |||||
| 4233 | # Exit status: | |||||
| 4234 | # Returns epoch integer on success, undef if the string cannot be parsed. | |||||
| 4235 | ||||||
| 4236 | sub _parse_rfc2822_date { | |||||
| 4237 | 206 | 172 | my ($str) = @_; | |||
| 4238 | 206 | 168 | return undef unless $str; | |||
| 4239 | ||||||
| 4240 | # Match: DD Mon YYYY HH:MM:SS (timezone offset ignored) | |||||
| 4241 | 206 | 475 | if ($str =~ /(\d{1,2})\s+([A-Za-z]{3})\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})/) { | |||
| 4242 | my ($d, $m, $y, $H, $M, $S) = | |||||
| 4243 | 206 | 555 | ($1, $Readonly::Values::Months::months{ lc $2 } // 0, $3, $4, $5, $6); | |||
| 4244 | 206 | 1038 | return undef unless $m; | |||
| 4245 | 206 206 206 | 184 485 216 | if (eval { require Time::Local; 1 }) { | |||
| 4246 | 206 206 | 138 372 | return eval { Time::Local::timegm($S, $M, $H, $d, $m - 1, $y - 1900) }; | |||
| 4247 | } | |||||
| 4248 | } | |||||
| 4249 | 0 | 0 | return undef; | |||
| 4250 | } | |||||
| 4251 | ||||||
| 4252 | # _country_name( $cc ) -> country_name_string | |||||
| 4253 | # | |||||
| 4254 | # Purpose: | |||||
| 4255 | # Return a human-readable country name for a two-letter ISO 3166-1 | |||||
| 4256 | # alpha-2 country code. Only the small set of statistically high-volume | |||||
| 4257 | # spam-originating countries is covered; other codes are returned as-is. | |||||
| 4258 | # | |||||
| 4259 | # Entry criteria: | |||||
| 4260 | # $cc -- a two-letter uppercase country code string. | |||||
| 4261 | # | |||||
| 4262 | # Exit status: | |||||
| 4263 | # Returns the country name string, or the code itself if not in the table. | |||||
| 4264 | ||||||
| 4265 | sub _country_name { | |||||
| 4266 | 35 | 4711 | my ($cc) = @_; | |||
| 4267 | 35 | 74 | my %names = ( | |||
| 4268 | CN => 'China', RU => 'Russia', NG => 'Nigeria', | |||||
| 4269 | VN => 'Vietnam', IN => 'India', PK => 'Pakistan', | |||||
| 4270 | BD => 'Bangladesh', | |||||
| 4271 | ); | |||||
| 4272 | 35 | 95 | return $names{$cc} // $cc; | |||
| 4273 | } | |||||
| 4274 | ||||||
| 4275 | # _debug( $msg ) | |||||
| 4276 | # | |||||
| 4277 | # Purpose: | |||||
| 4278 | # Write a diagnostic message to STDERR when verbose mode is enabled. | |||||
| 4279 | # | |||||
| 4280 | # Entry criteria: | |||||
| 4281 | # $msg -- a defined message string. | |||||
| 4282 | # | |||||
| 4283 | # Notes: | |||||
| 4284 | # Messages are prefixed with the class name for easy grepping. | |||||
| 4285 | ||||||
| 4286 | sub _debug { | |||||
| 4287 | 888 | 709 | my ($self, $msg) = @_; | |||
| 4288 | ||||||
| 4289 | 888 | 1007 | if($self->{verbose}) { | |||
| 4290 | 2 | 4 | if(my $logger = $self->{logger}) { # May have been set in Object::Configure | |||
| 4291 | 0 | 0 | $logger->debug("[Email::Abuse::Investigator] $msg"); | |||
| 4292 | } else { | |||||
| 4293 | 2 | 4 | print STDERR "[Email::Abuse::Investigator] $msg\n"; | |||
| 4294 | } | |||||
| 4295 | } | |||||
| 4296 | } | |||||
| 4297 | ||||||
| 4298 | 1; | |||||
| 4299 | ||||||