| File: | blib/lib/Data/Random/String/Matches.pm | 
| Coverage: | 87.3% | 
| line | stmt | bran | cond | sub | time | code | 
|---|---|---|---|---|---|---|
| 1 | package Data::Random::String::Matches; | |||||
| 2 | ||||||
| 3 | 31 31 | 901758 49 | use 5.010; | |||
| 4 | ||||||
| 5 | 31 31 31 | 52 25 272 | use strict; | |||
| 6 | 31 31 31 | 47 26 601 | use warnings; | |||
| 7 | ||||||
| 8 | 31 31 31 | 48 25 682 | use Carp qw(carp croak); | |||
| 9 | 31 31 31 | 5530 153280 606 | use Params::Get; | |||
| 10 | 31 31 31 | 5512 2733 57 | use utf8; | |||
| 11 | ||||||
| 12 | our $VERSION = '0.03'; | |||||
| 13 | ||||||
| 14 - 182 | =head1 NAME
Data::Random::String::Matches - Generate random strings matching a regex
=head1 SYNOPSIS
        use Data::Random::String::Matches;
        # Create a generator with regex and optional length
        my $gen = Data::Random::String::Matches->new(qr/[A-Z]{3}\d{4}/, 7);
        # Generate a matching string
        my $str = $gen->generate();
        print $str;  # e.g., "XYZ1234"
        # Alternation
        my $gen2 = Data::Random::String::Matches->new(qr/(cat|dog|bird)/);
        my $animal = $gen2->generate_smart();  # "cat", "dog", or "bird"
        # Backreferences
        my $gen3 = Data::Random::String::Matches->new(qr/(\w{3})-\1/);
        my $str3 = $gen3->generate_smart();  # e.g., "abc-abc"
        # Groups and quantifiers
        my $gen4 = Data::Random::String::Matches->new(qr/(ha){2,4}/);
        my $laugh = $gen4->generate_smart();  # "haha", "hahaha", or "hahahaha"
        # Unicode
        $gen = Data::Random::String::Matches->new(qr/\p{L}{5}/);
        # Named captures
        $gen = Data::Random::String::Matches->new(qr/(?<year>\d{4})-\k<year>/);
        # Possessive
        $gen = Data::Random::String::Matches->new(qr/\d++[A-Z]/);
        # Lookaheads
        $gen = Data::Random::String::Matches->new(qr/\d{3}(?=[A-Z])/);
        # Combined
        $gen = Data::Random::String::Matches->new(
                qr/(?<prefix>\p{Lu}{2})\d++\k<prefix>(?=[A-Z])/
        );
        # Consistency with Legacy software
        print Data::Random::String::Matches->create_random_string(length => 3, regex => '\d{3}'), "\n";
=head1 DESCRIPTION
This module generates random strings that match a given regular expression pattern.
It parses the regex pattern and intelligently builds matching strings, supporting
a wide range of regex features.
=head1 SUPPORTED REGEX FEATURES
=head2 Character Classes
=over 4
=item * Basic classes: C<[a-z]>, C<[A-Z]>, C<[0-9]>, C<[abc]>
=item * Negated classes: C<[^a-z]>
=item * Ranges: C<[a-zA-Z0-9]>
=item * Escape sequences in classes: C<[\d\w]>
=back
=head2 Escape Sequences
=over 4
=item * C<\d> - digit [0-9]
=item * C<\w> - word character [a-zA-Z0-9_]
=item * C<\s> - whitespace
=item * C<\D> - non-digit
=item * C<\W> - non-word character
=item * C<\t>, C<\n>, C<\r> - tab, newline, carriage return
=back
=head2 Quantifiers
=over 4
=item * C<{n}> - exactly n times
=item * C<{n,m}> - between n and m times
=item * C<{n,}> - n or more times
=item * C<+> - one or more (1-5 times)
=item * C<*> - zero or more (0-5 times)
=item * C<?> - zero or one
=back
=head2 Grouping and Alternation
=over 4
=item * C<(...)> - capturing group
=item * C<(?:...)> - non-capturing group
=item * C<|> - alternation (e.g., C<cat|dog|bird>)
=item * C<\1>, C<\2>, etc. - backreferences
=back
=head2 Other
=over 4
=item * C<.> - any character (printable ASCII)
=item * Literal characters
=item * C<^> and C<$> anchors (stripped during parsing)
=back
=head1 LIMITATIONS
=over 4
=item * Lookaheads and lookbehinds ((?=...), (?!...)) are not supported
=item * Named groups ((?<name>...)) are not supported
=item * Possessive quantifiers (*+, ++) are not supported
=item * Unicode properties (\p{L}, \p{N}) are not supported
=item * Some complex nested patterns may not work correctly with smart parsing
=back
=head1 EXAMPLES
        # Email-like pattern
        my $gen = Data::Random::String::Matches->new(qr/[a-z]+@[a-z]+\.com/);
        # API key pattern
        my $gen = Data::Random::String::Matches->new(qr/^AIza[0-9A-Za-z_-]{35}$/);
        # Phone number
        my $gen = Data::Random::String::Matches->new(qr/\d{3}-\d{3}-\d{4}/);
        # Repeated pattern
        my $gen = Data::Random::String::Matches->new(qr/(\w{4})-\1/);
=head1 METHODS
=head2 new($regex, $length)
Creates a new generator. C<$regex> can be a compiled regex (qr//) or a string.
C<$length> is optional and defaults to 10 (used for fallback generation).
=cut | |||||
| 183 | ||||||
| 184 | sub new { | |||||
| 185 | 198 | 415246 | my ($class, $regex, $length) = @_; | |||
| 186 | ||||||
| 187 | 198 | 260 | croak 'Regex pattern is required' unless defined $regex; | |||
| 188 | ||||||
| 189 | # Convert string to regex if needed | |||||
| 190 | 197 | 395 | my $regex_obj = ref($regex) eq 'Regexp' ? $regex : qr/$regex/; | |||
| 191 | ||||||
| 192 | 197 | 539 | my $self = { | |||
| 193 | regex => $regex_obj, | |||||
| 194 | regex_str => "$regex", | |||||
| 195 | length => $length || 10, | |||||
| 196 | backrefs => {}, # Store backreferences | |||||
| 197 | named_refs => {}, # Store named captures | |||||
| 198 | }; | |||||
| 199 | ||||||
| 200 | 197 | 270 | return bless $self, $class; | |||
| 201 | } | |||||
| 202 | ||||||
| 203 - 209 | =head2 generate($max_attempts) Generates a random string matching the regex. First tries smart parsing, then falls back to brute force if needed. Tries up to C<$max_attempts> times (default 1000) before croaking. =cut | |||||
| 210 | ||||||
| 211 | sub generate { | |||||
| 212 | 3939 | 11480 | my ($self, $max_attempts) = @_; | |||
| 213 | 3939 | 5208 | $max_attempts //= 1000; | |||
| 214 | ||||||
| 215 | 3939 | 2569 | my $regex = $self->{regex}; | |||
| 216 | 3939 | 2406 | my $length = $self->{length}; | |||
| 217 | ||||||
| 218 | # First try the smart approach | |||||
| 219 | 3939 3939 | 2404 2978 | my $str = eval { $self->_build_from_pattern($self->{regex_str}) }; | |||
| 220 | 3939 | 7825 | if (defined $str && $str =~ /^$regex$/) { | |||
| 221 | 3937 | 4275 | return $str; | |||
| 222 | } | |||||
| 223 | ||||||
| 224 | # If smart approach failed, show warning in debug mode | |||||
| 225 | 2 | 4 | if ($ENV{DEBUG_REGEX_GEN} && $@) { | |||
| 226 | 0 | 0 | warn "Smart generation failed: $@"; | |||
| 227 | } | |||||
| 228 | ||||||
| 229 | # Fall back to brute force with character set matching | |||||
| 230 | 2 | 21 | for (1 .. $max_attempts) { | |||
| 231 | 1010 | 741 | $str = $self->_random_string_smart($length); | |||
| 232 | 1010 | 1691 | return $str if $str =~ /^$regex$/; | |||
| 233 | } | |||||
| 234 | ||||||
| 235 | 2 | 74 | croak "Failed to generate matching string after $max_attempts attempts. Pattern: $self->{regex_str}"; | |||
| 236 | } | |||||
| 237 | ||||||
| 238 | sub _random_string_smart { | |||||
| 239 | 1010 | 633 | my ($self, $len) = @_; | |||
| 240 | ||||||
| 241 | 1010 | 635 | my $regex_str = $self->{regex_str}; | |||
| 242 | ||||||
| 243 | # Detect common patterns and generate appropriate characters | |||||
| 244 | 1010 | 530 | my @chars; | |||
| 245 | ||||||
| 246 | 1010 | 2432 | if ($regex_str =~ /\\d/ || $regex_str =~ /\[0-9\]/ || $regex_str =~ /\[\^[^\]]*[A-Za-z]/) { | |||
| 247 | # Digit patterns | |||||
| 248 | 0 | 0 | @chars = ('0'..'9'); | |||
| 249 | } elsif ($regex_str =~ /\[A-Z\]/ || $regex_str =~ /\[A-Z[^\]]*\]/) { | |||||
| 250 | # Uppercase patterns | |||||
| 251 | 0 | 0 | @chars = ('A'..'Z'); | |||
| 252 | } elsif ($regex_str =~ /\[a-z\]/ || $regex_str =~ /\[a-z[^\]]*\]/) { | |||||
| 253 | # Lowercase patterns | |||||
| 254 | 0 | 0 | @chars = ('a'..'z'); | |||
| 255 | } elsif ($regex_str =~ /\\w/ || $regex_str =~ /\[a-zA-Z0-9_\]/) { | |||||
| 256 | # Word characters | |||||
| 257 | 0 | 0 | @chars = ('a'..'z', 'A'..'Z', '0'..'9', '_'); | |||
| 258 | } else { | |||||
| 259 | # Default to printable ASCII | |||||
| 260 | 1010 94940 | 711 68992 | @chars = map { chr($_) } (33 .. 126); | |||
| 261 | } | |||||
| 262 | ||||||
| 263 | 1010 | 707 | my $str = ''; | |||
| 264 | 1010 | 2342 | $str .= $chars[int(rand(@chars))] for (1 .. $len); | |||
| 265 | ||||||
| 266 | 1010 | 2498 | return $str; | |||
| 267 | } | |||||
| 268 | ||||||
| 269 - 274 | =head2 generate_smart() Parses the regex and builds a matching string directly. Faster and more reliable than brute force, but may not handle all edge cases. =cut | |||||
| 275 | ||||||
| 276 | sub generate_smart { | |||||
| 277 | 132 | 5841 | my $self = $_[0]; | |||
| 278 | 132 | 170 | return $self->_build_from_pattern($self->{regex_str}); | |||
| 279 | } | |||||
| 280 | ||||||
| 281 - 311 | =head2 generate_many($count, $unique)
Generates multiple random strings matching the regex.
    my @strings = $gen->generate_many(10);           # 10 strings (may have duplicates)
    my @strings = $gen->generate_many(10, 1);        # 10 unique strings
    my @strings = $gen->generate_many(10, 'unique'); # 10 unique strings
    # Generate until you have 1000 unique codes
    my $gen = Data::Random::String::Matches->new(qr/[A-Z]{3}\d{4}/);
    my @codes = $gen->generate_many(1000, 'unique');
Parameters:
=over 4
=item * C<$count> - Number of strings to generate (required, must be positive)
=item * C<$unique> - If true, ensures all generated strings are unique. May return fewer
than C<$count> strings if uniqueness cannot be achieved within reasonable attempts.
Accepts any true value (1, 'unique', etc.)
=back
Returns: List of generated strings
Dies: If count is not a positive integer
Warns: If unable to generate the requested number of unique strings
=cut | |||||
| 312 | ||||||
| 313 | sub generate_many { | |||||
| 314 | 23 | 2682 | my ($self, $count, $unique) = @_; | |||
| 315 | ||||||
| 316 | 23 | 48 | croak 'Count must be a positive integer' unless defined $count && $count > 0; | |||
| 317 | ||||||
| 318 | 20 | 13 | my @results; | |||
| 319 | ||||||
| 320 | 20 | 16 | if ($unique) { | |||
| 321 | # Generate unique strings | |||||
| 322 | 5 | 6 | my %seen; | |||
| 323 | 5 | 3 | my $attempts = 0; | |||
| 324 | 5 | 5 | my $max_attempts = $count * 100; # Reasonable limit | |||
| 325 | ||||||
| 326 | 5 | 10 | while (keys %seen < $count && $attempts < $max_attempts) { | |||
| 327 | 1628 | 1186 | my $str = $self->generate(); | |||
| 328 | 1628 | 1237 | $seen{$str} = 1; | |||
| 329 | 1628 | 1921 | $attempts++; | |||
| 330 | } | |||||
| 331 | ||||||
| 332 | 5 | 4 | if (keys %seen < $count) { | |||
| 333 | 1 | 12 | carp 'Only generated ', (scalar keys %seen), " unique strings out of $count requested"; | |||
| 334 | } | |||||
| 335 | ||||||
| 336 | 5 | 296 | @results = keys %seen; | |||
| 337 | } else { | |||||
| 338 | # Generate any strings (may have duplicates) | |||||
| 339 | 15 | 23 | push @results, $self->generate() for (1 .. $count); | |||
| 340 | } | |||||
| 341 | ||||||
| 342 | 20 | 250 | return @results; | |||
| 343 | } | |||||
| 344 | ||||||
| 345 - 349 | =head2 get_seed() Gets the random seed for reproducible generation =cut | |||||
| 350 | ||||||
| 351 | sub get_seed { | |||||
| 352 | 2 | 338 | my $self = shift; | |||
| 353 | ||||||
| 354 | 2 | 6 | return $self->{seed}; | |||
| 355 | } | |||||
| 356 | ||||||
| 357 - 361 | =head2 set_seed($seed) Sets the random seed for reproducible generation =cut | |||||
| 362 | ||||||
| 363 | sub set_seed { | |||||
| 364 | 13 | 1512 | my $self = shift; | |||
| 365 | 13 | 19 | my $params = Params::Get::get_params('seed', \@_); | |||
| 366 | 12 | 130 | my $seed = $params->{'seed'}; | |||
| 367 | ||||||
| 368 | 12 | 70 | croak 'Seed must be defined' unless defined $seed; | |||
| 369 | ||||||
| 370 | 11 | 12 | srand($seed); | |||
| 371 | 11 | 11 | $self->{seed} = $seed; | |||
| 372 | ||||||
| 373 | 11 | 18 | return $self; | |||
| 374 | } | |||||
| 375 | ||||||
| 376 - 389 | =head2 suggest_simpler_pattern()
Analyzes patterns and suggests improvements.
  my $suggestion = $gen->suggest_simpler_pattern();
  if ($suggestion) {
    print "Reason: $suggestion->{reason}\n";
    print "Better pattern: $suggestion->{pattern}\n" if $suggestion->{pattern};
    print "Tips:\n";
    print "  - $_\n" for @{$suggestion->{tips}};
  }
=cut | |||||
| 390 | ||||||
| 391 | sub suggest_simpler_pattern { | |||||
| 392 | 21 | 36 | my $self = $_[0]; | |||
| 393 | ||||||
| 394 | 21 | 16 | my $pattern = $self->{regex_str}; | |||
| 395 | 21 | 25 | my $info = $self->pattern_info(); | |||
| 396 | ||||||
| 397 | # Check for patterns that are too complex | |||||
| 398 | 21 | 21 | if ($info->{complexity} eq 'very_complex') { | |||
| 399 | return { | |||||
| 400 | 1 | 6 | pattern => undef, | |||
| 401 | reason => 'Pattern is very complex. Consider breaking it into multiple simpler patterns.', | |||||
| 402 | tips => [ | |||||
| 403 | 'Split alternations into separate generators', | |||||
| 404 | 'Avoid deeply nested groups', | |||||
| 405 | 'Use fixed-length patterns when possible', | |||||
| 406 | ], | |||||
| 407 | }; | |||||
| 408 | } | |||||
| 409 | ||||||
| 410 | # Suggest removing unnecessary backreferences | |||||
| 411 | 20 | 21 | if ($info->{features}{has_backreferences} && $pattern =~ /(\(\w+\)).*\\\d+/) { | |||
| 412 | 0 | 0 | my $simpler = $pattern; | |||
| 413 | # Can't automatically simplify backreferences, but can suggest | |||||
| 414 | return { | |||||
| 415 | 0 | 0 | pattern => undef, | |||
| 416 | reason => 'Backreferences add complexity. Consider if you really need repeated groups.', | |||||
| 417 | tips => [ | |||||
| 418 | 'If the repeated part doesn\'t need to match, use two separate patterns', | |||||
| 419 | 'For validation, backreferences are great; for generation, they limit variation', | |||||
| 420 | ], | |||||
| 421 | }; | |||||
| 422 | } | |||||
| 423 | ||||||
| 424 | # Suggest fixed quantifiers instead of ranges | |||||
| 425 | 20 | 24 | if ($pattern =~ /\{(\d+),(\d+)\}/) { | |||
| 426 | 8 | 12 | my ($min, $max) = ($1, $2); | |||
| 427 | 8 | 10 | if ($max - $min > 10) { | |||
| 428 | 6 | 4 | my $mid = int(($min + $max) / 2); | |||
| 429 | 6 | 5 | my $simpler = $pattern; | |||
| 430 | 6 | 15 | $simpler =~ s/\{\d+,\d+\}/\{$mid\}/; | |||
| 431 | return { | |||||
| 432 | 6 | 17 | pattern => $simpler, | |||
| 433 | reason => "Large quantifier range {$min,$max} creates high variability. Consider fixed length {$mid}.", | |||||
| 434 | tips => [ | |||||
| 435 | 'Fixed lengths are faster to generate', | |||||
| 436 | 'If you need variety, generate multiple patterns with different fixed lengths', | |||||
| 437 | ], | |||||
| 438 | }; | |||||
| 439 | } | |||||
| 440 | } | |||||
| 441 | ||||||
| 442 | # Suggest limiting alternations | |||||
| 443 | 14 | 16 | if ($info->{features}{has_alternation}) { | |||
| 444 | 2 | 4 | my @alts = split /\|/, $pattern; | |||
| 445 | 2 | 3 | if (@alts > 10) { | |||
| 446 | return { | |||||
| 447 | 1 | 4 | pattern => undef, | |||
| 448 | reason => 'Too many alternations (' . scalar(@alts) . '). Consider splitting into multiple patterns.', | |||||
| 449 | tips => [ | |||||
| 450 | 'Create separate generators for different alternatives', | |||||
| 451 | 'Group similar patterns together', | |||||
| 452 | 'Use character classes [abc] instead of (a|b|c)', | |||||
| 453 | ], | |||||
| 454 | }; | |||||
| 455 | } | |||||
| 456 | ||||||
| 457 | # Check if alternations could be a character class | |||||
| 458 | 1 | 2 | if ($pattern =~ /\(([a-zA-Z])\|([a-zA-Z])\|([a-zA-Z])\)/) { | |||
| 459 | 1 | 2 | my $chars = join('', $1, $2, $3); | |||
| 460 | 1 | 1 | my $simpler = $pattern; | |||
| 461 | 1 | 4 | $simpler =~ s/\([a-zA-Z]\|[a-zA-Z]\|[a-zA-Z]\)/[$chars]/; | |||
| 462 | return { | |||||
| 463 | 1 | 3 | pattern => $simpler, | |||
| 464 | reason => 'Single-character alternations can be simplified to character classes.', | |||||
| 465 | tips => [ | |||||
| 466 | 'Use [abc] instead of (a|b|c)', | |||||
| 467 | 'Character classes are faster to process', | |||||
| 468 | ], | |||||
| 469 | }; | |||||
| 470 | } | |||||
| 471 | } | |||||
| 472 | ||||||
| 473 | # Suggest removing lookaheads/lookbehinds for generation | |||||
| 474 | 12 | 19 | if ($info->{features}{has_lookahead} || $info->{features}{has_lookbehind}) { | |||
| 475 | 3 | 3 | my $simpler = $pattern; | |||
| 476 | 3 | 6 | $simpler =~ s/\(\?[=!].*?\)//g; # Remove lookaheads | |||
| 477 | 3 | 5 | $simpler =~ s/\(\?<[=!].*?\)//g; # Remove lookbehinds | |||
| 478 | ||||||
| 479 | 3 | 4 | if ($simpler ne $pattern) { | |||
| 480 | return { | |||||
| 481 | 3 | 7 | pattern => $simpler, | |||
| 482 | reason => 'Lookaheads/lookbehinds add complexity but don\'t contribute to generated strings.', | |||||
| 483 | tips => [ | |||||
| 484 | 'Lookaheads are great for validation, not generation', | |||||
| 485 | 'The simplified pattern generates the same strings', | |||||
| 486 | ], | |||||
| 487 | }; | |||||
| 488 | } | |||||
| 489 | } | |||||
| 490 | ||||||
| 491 | # Check for Unicode when ASCII would work | |||||
| 492 | 9 | 13 | if ($info->{features}{has_unicode} && $pattern =~ /\\p\{L\}/) { | |||
| 493 | 2 | 2 | my $simpler = $pattern; | |||
| 494 | 2 | 4 | $simpler =~ s/\\p\{L\}/[A-Za-z]/g; | |||
| 495 | return { | |||||
| 496 | 2 | 6 | pattern => $simpler, | |||
| 497 | reason => 'Unicode \\p{L} can be simplified to [A-Za-z] if you only need ASCII letters.', | |||||
| 498 | tips => [ | |||||
| 499 | 'ASCII patterns are faster', | |||||
| 500 | 'Only use Unicode if you need non-ASCII characters', | |||||
| 501 | ], | |||||
| 502 | }; | |||||
| 503 | } | |||||
| 504 | ||||||
| 505 | # Check for overly long fixed strings | |||||
| 506 | 7 | 7 | if ($pattern =~ /([a-zA-Z]{20,})/) { | |||
| 507 | return { | |||||
| 508 | 0 | 0 | pattern => undef, | |||
| 509 | reason => 'Pattern contains very long fixed literal strings. Consider if you need such specific patterns.', | |||||
| 510 | tips => [ | |||||
| 511 | 'Use variables instead of long literals', | |||||
| 512 | 'Break into smaller patterns', | |||||
| 513 | ], | |||||
| 514 | }; | |||||
| 515 | } | |||||
| 516 | ||||||
| 517 | # Pattern seems reasonable | |||||
| 518 | 7 | 6 | return undef; | |||
| 519 | } | |||||
| 520 | ||||||
| 521 - 529 | =head2 validate($string)
Checks if a string matches the pattern without generating.
  if ($gen->validate('1234')) {
    print "Valid!\n";
  }
=cut | |||||
| 530 | ||||||
| 531 | sub validate { | |||||
| 532 | 61 | 1800 | my $self = shift; | |||
| 533 | 61 | 78 | my $params = Params::Get::get_params('string', \@_); | |||
| 534 | 60 | 578 | my $string = $params->{'string'}; | |||
| 535 | ||||||
| 536 | 60 | 96 | croak('String must be defined') unless defined $string; | |||
| 537 | ||||||
| 538 | 59 | 51 | my $regex = $self->{regex}; | |||
| 539 | 59 | 311 | return $string =~ /^$regex$/; | |||
| 540 | } | |||||
| 541 | ||||||
| 542 - 558 | =head2 pattern_info()
Returns detailed information about the pattern.
  my $info = $gen->pattern_info();
  print "Complexity: $info->{complexity}\n";
  print "Min length: $info->{min_length}\n";
  print "Has Unicode: ", $info->{features}{has_unicode} ? "Yes" : "No", "\n";
C<pattern_info> analyzes a regular expression to produce a structured summary of its characteristics,
including estimated string lengths, detected features, and an overall complexity rating.
It first calls C<_estimate_length> to heuristically compute the minimum and maximum possible lengths of strings matching the pattern by scanning for literals,
character classes, and quantifiers.
It then detects the presence of advanced regex constructions such as alternation, lookahead or lookbehind assertions, named groups, and Unicode properties, storing them in a feature hash.
Finally, it calculates a rough "complexity" classification based on pattern length and detected features-returning a hash reference that describes the regex's structure, estimated lengths, and complexity level.
=cut | |||||
| 559 | ||||||
| 560 | sub pattern_info { | |||||
| 561 | 40 | 1907 | my $self = $_[0]; | |||
| 562 | ||||||
| 563 | 40 | 41 | return $self->{'_pattern_info_cache'} if $self->{'_pattern_info_cache'}; | |||
| 564 | ||||||
| 565 | 37 | 23 | my $pattern = $self->{'regex_str'}; | |||
| 566 | ||||||
| 567 | # Calculate approximate min/max lengths | |||||
| 568 | 37 | 38 | my ($min_len, $max_len) = $self->_estimate_length($pattern); | |||
| 569 | ||||||
| 570 | # Detect pattern features | |||||
| 571 | 37 | 257 | my %features = ( | |||
| 572 | has_alternation => ($pattern =~ /\|/ ? 1 : 0), | |||||
| 573 | has_backreferences => ($pattern =~ /(\\[1-9]|\\k<)/ ? 1 : 0), | |||||
| 574 | has_unicode => ($pattern =~ /\\p\{/ ? 1 : 0), | |||||
| 575 | has_lookahead => ($pattern =~ /\(\?[=!]/ ? 1 : 0), | |||||
| 576 | has_lookbehind => ($pattern =~ /\(\?<[=!]/ ? 1 : 0), | |||||
| 577 | has_named_groups => ($pattern =~ /\(\?</ ? 1 : 0), | |||||
| 578 | has_possessive => ($pattern =~ /(?:[+*?]\+|\{\d+(?:,\d*)?\}\+)/ ? 1 : 0), | |||||
| 579 | ); | |||||
| 580 | ||||||
| 581 | 37 | 77 | my $info = { | |||
| 582 | pattern => $pattern, | |||||
| 583 | min_length => $min_len, | |||||
| 584 | max_length => $max_len, | |||||
| 585 | estimated_length => int(($min_len + $max_len) / 2), | |||||
| 586 | features => \%features, | |||||
| 587 | complexity => $self->_calculate_complexity(\%features, $pattern), | |||||
| 588 | }; | |||||
| 589 | ||||||
| 590 | 37 | 37 | $self->{'_pattern_info_cache'} = $info; | |||
| 591 | ||||||
| 592 | 37 | 41 | return $info; | |||
| 593 | } | |||||
| 594 | ||||||
| 595 | sub _estimate_length { | |||||
| 596 | 37 | 34 | my ($self, $pattern) = @_; | |||
| 597 | ||||||
| 598 | # Remove anchors and modifiers | |||||
| 599 | 37 | 107 | $pattern =~ s/^\(\?\^?[iumsx-]*:(.*)\)$/$1/; | |||
| 600 | 37 | 38 | $pattern =~ s/^\^//; | |||
| 601 | 37 | 22 | $pattern =~ s/\$//; | |||
| 602 | ||||||
| 603 | 37 | 25 | my $min = 0; | |||
| 604 | 37 | 23 | my $max = 0; | |||
| 605 | ||||||
| 606 | # Simple heuristic - count fixed characters and quantifiers | |||||
| 607 | 37 | 22 | my $last_was_atom = 0; # Handle cases like \d{3} where the quantifier modifies the atom count | |||
| 608 | 37 | 74 | while ($pattern =~ /([^+*?{}\[\]\\])|\\[dwsWDN]|\[([^\]]+)\]|\{(\d+)(?:,(\d+))?\}/g) { | |||
| 609 | 221 | 300 | if (defined $1 || (defined $2 && $2)) { | |||
| 610 | 155 | 91 | $min++; | |||
| 611 | 155 | 89 | $max++; | |||
| 612 | 155 | 162 | $last_was_atom = 1; | |||
| 613 | } elsif (defined $3) { | |||||
| 614 | 36 | 35 | if ($last_was_atom) { | |||
| 615 | # Replace the last atomâs contribution | |||||
| 616 | 16 | 17 | $min += $3 - 1; | |||
| 617 | 16 | 20 | $max += (defined $4 ? $4 : $3) - 1; | |||
| 618 | 16 | 19 | $last_was_atom = 0; | |||
| 619 | } else { | |||||
| 620 | # No preceding atom? assume standalone | |||||
| 621 | 20 | 29 | $min += $3; | |||
| 622 | 20 | 35 | $max += defined $4 ? $4 : $3; | |||
| 623 | } | |||||
| 624 | } | |||||
| 625 | } | |||||
| 626 | ||||||
| 627 | # Account for +, *, ? | |||||
| 628 | 37 | 32 | my $plus_count = () = $pattern =~ /\+/g; | |||
| 629 | 37 | 35 | my $star_count = () = $pattern =~ /\*/g; | |||
| 630 | 37 | 26 | my $question_count = () = $pattern =~ /\?/g; | |||
| 631 | ||||||
| 632 | 37 | 24 | $min += $plus_count; # + means at least 1 | |||
| 633 | 37 | 35 | $max += ($plus_count * 5) + ($star_count * 5); # Assume max 5 repetitions | |||
| 634 | 37 | 21 | $min -= $question_count; # ? makes things optional | |||
| 635 | ||||||
| 636 | 37 | 36 | $min = 0 if $min < 0; | |||
| 637 | 37 | 26 | $max = $min + 50 if $max < $min; # Ensure max >= min | |||
| 638 | ||||||
| 639 | 37 | 46 | return ($min, $max); | |||
| 640 | } | |||||
| 641 | ||||||
| 642 | sub _calculate_complexity { | |||||
| 643 | 37 | 32 | my ($self, $features, $pattern) = @_; | |||
| 644 | ||||||
| 645 | 37 | 24 | my $score = 0; | |||
| 646 | ||||||
| 647 | # Base complexity from pattern length | |||||
| 648 | 37 | 37 | $score += length($pattern) / 10; | |||
| 649 | ||||||
| 650 | # Add complexity for features | |||||
| 651 | 37 | 53 | $score += 2 if $features->{has_alternation}; | |||
| 652 | 37 | 34 | $score += 3 if $features->{has_backreferences}; | |||
| 653 | 37 | 29 | $score += 2 if $features->{has_unicode}; | |||
| 654 | 37 | 34 | $score += 2 if $features->{has_lookahead}; | |||
| 655 | 37 | 29 | $score += 2 if $features->{has_lookbehind}; | |||
| 656 | 37 | 33 | $score += 1 if $features->{has_named_groups}; | |||
| 657 | 37 | 24 | $score += 1 if $features->{has_possessive}; | |||
| 658 | ||||||
| 659 | # Classify | |||||
| 660 | 37 | 61 | return 'simple' if $score < 3; | |||
| 661 | 14 | 30 | return 'moderate' if $score < 7; | |||
| 662 | 3 | 6 | return 'complex' if $score < 12; | |||
| 663 | 1 | 2 | return 'very_complex'; | |||
| 664 | } | |||||
| 665 | ||||||
| 666 | sub _build_from_pattern { | |||||
| 667 | 4070 | 2780 | my ($self, $pattern) = @_; | |||
| 668 | ||||||
| 669 | # Reset backreferences for each generation | |||||
| 670 | 4070 | 2876 | $self->{backrefs} = {}; | |||
| 671 | 4070 | 2764 | $self->{named_refs} = {}; | |||
| 672 | 4070 | 2655 | $self->{group_counter} = 0; | |||
| 673 | ||||||
| 674 | # Remove regex delimiters and modifiers | |||||
| 675 | # Handle (?^:...), (?i:...), (?-i:...) etc | |||||
| 676 | 4070 | 5248 | $pattern =~ s/^\(\?\^?[iumsx-]*:(.*)\)$/$1/; | |||
| 677 | ||||||
| 678 | # Remove anchors (they're handled by the regex match itself) | |||||
| 679 | 4070 | 2756 | $pattern =~ s/^\^//; | |||
| 680 | 4070 | 2597 | $pattern =~ s/\$//; | |||
| 681 | ||||||
| 682 | 4070 | 3180 | return $self->_parse_sequence($pattern); | |||
| 683 | } | |||||
| 684 | ||||||
| 685 | sub _parse_sequence { | |||||
| 686 | 4212 | 2847 | my ($self, $pattern) = @_; | |||
| 687 | ||||||
| 688 | 4212 | 2541 | my $result = ''; | |||
| 689 | 4212 | 2405 | my $i = 0; | |||
| 690 | 4212 | 2449 | my $len = length($pattern); | |||
| 691 | ||||||
| 692 | 4212 | 3141 | while ($i < $len) { | |||
| 693 | 5649 | 3696 | my $char = substr($pattern, $i, 1); | |||
| 694 | ||||||
| 695 | 5649 | 5692 | if ($char eq '\\') { | |||
| 696 | # Escape sequence | |||||
| 697 | 940 | 584 | $i++; | |||
| 698 | 940 | 596 | my $next = substr($pattern, $i, 1); | |||
| 699 | ||||||
| 700 | 940 | 1495 | if ($next =~ /[1-9]/) { | |||
| 701 | # Backreference | |||||
| 702 | 30 | 17 | my $ref_num = $next; | |||
| 703 | 30 | 26 | if (exists $self->{backrefs}{$ref_num}) { | |||
| 704 | 30 | 28 | $result .= $self->{backrefs}{$ref_num}; | |||
| 705 | } else { | |||||
| 706 | 0 | 0 | croak "Backreference \\$ref_num used before group defined"; | |||
| 707 | } | |||||
| 708 | } elsif ($next eq 'k' && substr($pattern, $i+1, 1) eq '<') { | |||||
| 709 | # Named backreference \k<name> | |||||
| 710 | 7 | 4 | my $end = index($pattern, '>', $i+2); | |||
| 711 | 7 | 6 | my $name = substr($pattern, $i+2, $end-$i-2); | |||
| 712 | 7 | 8 | if (exists $self->{named_refs}{$name}) { | |||
| 713 | 7 | 7 | $result .= $self->{named_refs}{$name}; | |||
| 714 | } else { | |||||
| 715 | 0 | 0 | croak "Named backreference \\k<$name> used before group defined"; | |||
| 716 | } | |||||
| 717 | 7 | 5 | $i = $end; | |||
| 718 | } elsif ($next eq 'p' && substr($pattern, $i+1, 1) eq '{') { | |||||
| 719 | # Unicode property \p{L}, \p{N}, etc. | |||||
| 720 | 25 | 20 | my $end = index($pattern, '}', $i+2); | |||
| 721 | 25 | 23 | my $prop = substr($pattern, $i+2, $end-$i-2); | |||
| 722 | my ($generated, $new_i) = $self->_handle_quantifier($pattern, $end, sub { | |||||
| 723 | 95 | 73 | $self->_unicode_property_char($prop); | |||
| 724 | 25 | 40 | }); | |||
| 725 | 25 | 34 | $result .= $generated; | |||
| 726 | 25 | 17 | $i = $new_i; | |||
| 727 | } elsif ($next eq 'd') { | |||||
| 728 | 730 2297 | 904 2260 | my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub { int(rand(10)) }, 1); | |||
| 729 | 730 | 716 | $result .= $generated; | |||
| 730 | 730 | 459 | $i = $new_i; | |||
| 731 | } elsif ($next eq 'w') { | |||||
| 732 | my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub { | |||||
| 733 | 115 | 434 | my @chars = ('a'..'z', 'A'..'Z', '0'..'9', '_'); | |||
| 734 | 115 | 304 | $chars[int(rand(@chars))]; | |||
| 735 | 35 | 68 | }, 1); | |||
| 736 | 35 | 48 | $result .= $generated; | |||
| 737 | 35 | 29 | $i = $new_i; | |||
| 738 | } elsif ($next eq 's') { | |||||
| 739 | 2 2 | 5 2 | my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub { ' ' }, 1); | |||
| 740 | 2 | 4 | $result .= $generated; | |||
| 741 | 2 | 2 | $i = $new_i; | |||
| 742 | } elsif ($next eq 'D') { | |||||
| 743 | my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub { | |||||
| 744 | 6 504 564 | 8 335 439 | my @chars = map { chr($_) } grep { chr($_) !~ /\d/ } (33..126); | |||
| 745 | 6 | 26 | $chars[int(rand(@chars))]; | |||
| 746 | 2 | 4 | }); | |||
| 747 | 2 | 5 | $result .= $generated; | |||
| 748 | 2 | 2 | $i = $new_i; | |||
| 749 | } elsif ($next eq 'W') { | |||||
| 750 | my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub { | |||||
| 751 | 6 186 564 | 7 124 481 | my @chars = map { chr($_) } grep { chr($_) !~ /\w/ } (33..126); | |||
| 752 | 6 | 21 | $chars[int(rand(@chars))]; | |||
| 753 | 2 | 6 | }); | |||
| 754 | 2 | 5 | $result .= $generated; | |||
| 755 | 2 | 2 | $i = $new_i; | |||
| 756 | } elsif ($next eq 't') { | |||||
| 757 | 2 | 2 | $result .= "\t"; | |||
| 758 | } elsif ($next eq 'n') { | |||||
| 759 | 2 | 2 | $result .= "\n"; | |||
| 760 | } elsif ($next eq 'r') { | |||||
| 761 | 0 | 0 | $result .= "\r"; | |||
| 762 | } else { | |||||
| 763 | 103 | 65 | $result .= $next; | |||
| 764 | } | |||||
| 765 | 940 | 786 | $i++; | |||
| 766 | } elsif ($char eq '[') { | |||||
| 767 | # Character class | |||||
| 768 | 3750 | 2965 | my $end = $self->_find_matching_bracket($pattern, $i); | |||
| 769 | 3750 | 3006 | croak 'Unmatched [' if $end == -1; | |||
| 770 | ||||||
| 771 | 3750 | 2639 | my $class = substr($pattern, $i+1, $end-$i-1); | |||
| 772 | my ($generated, $new_i) = $self->_handle_quantifier($pattern, $end, sub { | |||||
| 773 | 16988 | 12694 | $self->_random_from_class($class); | |||
| 774 | 3750 | 4370 | }, 1); | |||
| 775 | 3750 | 3780 | $result .= $generated; | |||
| 776 | 3750 | 3227 | $i = $new_i + 1; | |||
| 777 | } elsif ($char eq '(') { | |||||
| 778 | # Group - could be various types | |||||
| 779 | 151 | 143 | my $end = $self->_find_matching_paren($pattern, $i); | |||
| 780 | 151 | 132 | croak 'Unmatched (' if $end == -1; | |||
| 781 | ||||||
| 782 | 151 | 147 | my $group_content = substr($pattern, $i+1, $end-$i-1); | |||
| 783 | ||||||
| 784 | # Check for special group types | |||||
| 785 | 151 | 83 | my $is_capturing = 1; | |||
| 786 | 151 | 91 | my $is_lookahead = 0; | |||
| 787 | 151 | 79 | my $is_lookbehind = 0; | |||
| 788 | 151 | 88 | my $is_negative = 0; | |||
| 789 | 151 | 105 | my $group_name = undef; | |||
| 790 | ||||||
| 791 | 151 | 260 | if ($group_content =~ /^\?:/) { | |||
| 792 | # Non-capturing group | |||||
| 793 | 2 | 1 | $is_capturing = 0; | |||
| 794 | 2 | 3 | $group_content = substr($group_content, 2); | |||
| 795 | } elsif ($group_content =~ /^\?<([^>]+)>/) { | |||||
| 796 | # Named capture (?<name>...) | |||||
| 797 | 15 | 11 | $group_name = $1; | |||
| 798 | 15 | 17 | $group_content = substr($group_content, length($1) + 3); | |||
| 799 | } elsif ($group_content =~ /^\?=/) { | |||||
| 800 | # Positive lookahead (?=...) | |||||
| 801 | 6 | 5 | $is_lookahead = 1; | |||
| 802 | 6 | 3 | $is_capturing = 0; | |||
| 803 | 6 | 6 | $group_content = substr($group_content, 2); | |||
| 804 | } elsif ($group_content =~ /^\?!/) { | |||||
| 805 | # Negative lookahead (?!...) | |||||
| 806 | 1 | 0 | $is_lookahead = 1; | |||
| 807 | 1 | 1 | $is_negative = 1; | |||
| 808 | 1 | 1 | $is_capturing = 0; | |||
| 809 | 1 | 1 | $group_content = substr($group_content, 2); | |||
| 810 | } elsif ($group_content =~ /^\?<=/) { | |||||
| 811 | # Positive lookbehind (?<=...) | |||||
| 812 | 1 | 1 | $is_lookbehind = 1; | |||
| 813 | 1 | 0 | $is_capturing = 0; | |||
| 814 | 1 | 1 | $group_content = substr($group_content, 3); | |||
| 815 | } elsif ($group_content =~ /^\?<!/) { | |||||
| 816 | # Negative lookbehind (?<!...) | |||||
| 817 | 1 | 1 | $is_lookbehind = 1; | |||
| 818 | 1 | 1 | $is_negative = 1; | |||
| 819 | 1 | 1 | $is_capturing = 0; | |||
| 820 | 1 | 1 | $group_content = substr($group_content, 3); | |||
| 821 | } | |||||
| 822 | ||||||
| 823 | # Handle lookaheads/lookbehinds | |||||
| 824 | 151 | 134 | if ($is_lookahead) { | |||
| 825 | # For positive lookahead, generate the pattern but don't advance | |||||
| 826 | # For negative lookahead, avoid the pattern | |||||
| 827 | 7 | 6 | if (!$is_negative) { | |||
| 828 | # Generate what the lookahead expects but don't consume it | |||||
| 829 | # This is a simplification - we just note the constraint | |||||
| 830 | } | |||||
| 831 | # Lookaheads don't add to the result | |||||
| 832 | 7 | 7 | $i = $end + 1; | |||
| 833 | 7 | 8 | next; | |||
| 834 | } elsif ($is_lookbehind) { | |||||
| 835 | # Lookbehinds check what came before | |||||
| 836 | # For generation, we can mostly ignore them | |||||
| 837 | 2 | 2 | $i = $end + 1; | |||
| 838 | 2 | 2 | next; | |||
| 839 | } | |||||
| 840 | ||||||
| 841 | # Check for alternation | |||||
| 842 | 142 | 93 | my $generated; | |||
| 843 | 142 | 126 | if ($group_content =~ /\|/) { | |||
| 844 | 88 | 100 | $generated = $self->_handle_alternation($group_content); | |||
| 845 | } else { | |||||
| 846 | 54 | 69 | $generated = $self->_parse_sequence($group_content); | |||
| 847 | } | |||||
| 848 | ||||||
| 849 | # Store backreference if capturing | |||||
| 850 | 142 | 129 | if ($is_capturing) { | |||
| 851 | 140 | 127 | $self->{group_counter}++; | |||
| 852 | 140 | 152 | $self->{backrefs}{$self->{group_counter}} = $generated; | |||
| 853 | ||||||
| 854 | 140 | 123 | if (defined $group_name) { | |||
| 855 | 15 | 15 | $self->{named_refs}{$group_name} = $generated; | |||
| 856 | } | |||||
| 857 | } | |||||
| 858 | ||||||
| 859 | # Handle quantifier after group (including possessive) | |||||
| 860 | 142 151 | 172 159 | my ($final_generated, $new_i) = $self->_handle_quantifier($pattern, $end, sub { $generated }, 1); | |||
| 861 | 142 | 138 | $result .= $final_generated; | |||
| 862 | 142 | 150 | $i = $new_i + 1; | |||
| 863 | } elsif ($char eq '.') { | |||||
| 864 | # Any character (except newline) | |||||
| 865 | my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub { | |||||
| 866 | 2 188 | 3 140 | my @chars = map { chr($_) } (33 .. 126); | |||
| 867 | 2 | 9 | $chars[int(rand(@chars))]; | |||
| 868 | 2 | 4 | }); | |||
| 869 | 2 | 4 | $result .= $generated; | |||
| 870 | 2 | 2 | $i = $new_i + 1; | |||
| 871 | } elsif ($char eq '|') { | |||||
| 872 | # Alternation at top level - just return what we have | |||||
| 873 | # (This is handled by _handle_alternation for groups) | |||||
| 874 | 0 | 0 | last; | |||
| 875 | } elsif ($char =~ /[+*?]/ || $char eq '{') { | |||||
| 876 | # Quantifier without preceding element - shouldn't happen in valid regex | |||||
| 877 | 1 | 90 | croak "$pattern: Quantifier '$char' without preceding element"; | |||
| 878 | } elsif ($char =~ /[\w ]/) { | |||||
| 879 | # Literal character | |||||
| 880 | 736 737 | 840 673 | my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub { $char }); | |||
| 881 | 736 | 733 | $result .= $generated; | |||
| 882 | 736 | 662 | $i = $new_i + 1; | |||
| 883 | } else { | |||||
| 884 | # Other literal characters | |||||
| 885 | 69 | 53 | $result .= $char; | |||
| 886 | 69 | 57 | $i++; | |||
| 887 | } | |||||
| 888 | } | |||||
| 889 | ||||||
| 890 | 4211 | 3761 | return $result; | |||
| 891 | } | |||||
| 892 | ||||||
| 893 | sub _handle_quantifier { | |||||
| 894 | 5426 | 4054 | my ($self, $pattern, $pos, $generator, $check_possessive) = @_; | |||
| 895 | 5426 | 4367 | $check_possessive //= 1; # Default to checking for possessive | |||
| 896 | ||||||
| 897 | 5426 | 3601 | my $next = substr($pattern, $pos + 1, 1); | |||
| 898 | 5426 | 2995 | my $is_possessive = 0; | |||
| 899 | ||||||
| 900 | # Check for possessive quantifier (+) | |||||
| 901 | 5426 | 5683 | if ($check_possessive && $pos + 2 < length($pattern)) { | |||
| 902 | 3936 | 2393 | my $after_next = substr($pattern, $pos + 2, 1); | |||
| 903 | 3936 | 4837 | if (($next =~ /[+*?]/ || $next eq '}') && $after_next eq '+') { | |||
| 904 | 6 | 6 | $is_possessive = 1; | |||
| 905 | } | |||||
| 906 | } | |||||
| 907 | ||||||
| 908 | 5426 | 4474 | if ($next eq '{') { | |||
| 909 | 3458 | 2342 | my $end = index($pattern, '}', $pos + 2); | |||
| 910 | 3458 | 2384 | croak "Unmatched '{' at position $pos in pattern: $pattern" if ($end == -1); | |||
| 911 | 3458 | 2495 | my $quant = substr($pattern, $pos + 2, $end - $pos - 2); | |||
| 912 | ||||||
| 913 | # Check for possessive after } | |||||
| 914 | 3458 | 4132 | if ($check_possessive && $end + 1 < length($pattern) && substr($pattern, $end + 1, 1) eq '+') { | |||
| 915 | 2 | 0 | $is_possessive = 1; | |||
| 916 | 2 | 2 | $end++; | |||
| 917 | } | |||||
| 918 | ||||||
| 919 | 3458 | 2066 | my $result = ''; | |||
| 920 | 3458 | 3238 | if ($quant =~ /^(\d+)$/) { | |||
| 921 | # Exact: {n} | |||||
| 922 | 3399 | 3815 | $result .= $generator->() for (1 .. $1); | |||
| 923 | } elsif ($quant =~ /^(\d+),(\d+)$/) { | |||||
| 924 | # Range: {n,m} | |||||
| 925 | 59 | 76 | my $count = $1 + int(rand($2 - $1 + 1)); | |||
| 926 | 59 | 62 | $result .= $generator->() for (1 .. $count); | |||
| 927 | } elsif ($quant =~ /^(\d+),$/) { | |||||
| 928 | # Minimum: {n,} | |||||
| 929 | 0 | 0 | my $count = $1 + int(rand(5)); | |||
| 930 | 0 | 0 | $result .= $generator->() for (1 .. $count); | |||
| 931 | } | |||||
| 932 | 3458 | 3355 | return ($result, $end); | |||
| 933 | } elsif ($next eq '+') { | |||||
| 934 | # One or more (possessive: ++) | |||||
| 935 | 11 | 11 | my $actual_end = $pos + 1; | |||
| 936 | 11 | 20 | if ($is_possessive) { | |||
| 937 | 4 | 4 | $actual_end++; | |||
| 938 | } | |||||
| 939 | 11 | 31 | my $count = 1 + int(rand(5)); | |||
| 940 | 11 | 9 | my $result = ''; | |||
| 941 | 11 | 17 | $result .= $generator->() for (1 .. $count); | |||
| 942 | 11 | 14 | return ($result, $actual_end); | |||
| 943 | } elsif ($next eq '*') { | |||||
| 944 | # Zero or more (possessive: *+) | |||||
| 945 | 3 | 3 | my $actual_end = $pos + 1; | |||
| 946 | 3 | 4 | if ($is_possessive) { | |||
| 947 | 1 | 0 | $actual_end++; | |||
| 948 | } | |||||
| 949 | 3 | 6 | my $count = int(rand(6)); | |||
| 950 | 3 | 9 | my $result = ''; | |||
| 951 | 3 | 4 | $result .= $generator->() for (1 .. $count); | |||
| 952 | 3 | 4 | return ($result, $actual_end); | |||
| 953 | } elsif ($next eq '?') { | |||||
| 954 | # Zero or one (possessive: ?+) | |||||
| 955 | 4 | 4 | my $actual_end = $pos + 1; | |||
| 956 | 4 | 5 | if ($is_possessive) { | |||
| 957 | 1 | 1 | $actual_end++; | |||
| 958 | } | |||||
| 959 | 4 | 9 | my $result = rand() < 0.5 ? $generator->() : ''; | |||
| 960 | 4 | 4 | return ($result, $actual_end); | |||
| 961 | } else { | |||||
| 962 | # No quantifier | |||||
| 963 | 1950 | 1354 | return ($generator->(), $pos); | |||
| 964 | } | |||||
| 965 | } | |||||
| 966 | ||||||
| 967 | sub _handle_alternation { | |||||
| 968 | 88 | 85 | my ($self, $pattern) = @_; | |||
| 969 | ||||||
| 970 | # Split on | but respect groups | |||||
| 971 | 88 | 55 | my @alternatives; | |||
| 972 | 88 | 49 | my $current = ''; | |||
| 973 | 88 | 50 | my $depth = 0; | |||
| 974 | ||||||
| 975 | 88 | 124 | for my $char (split //, $pattern) { | |||
| 976 | 1008 | 974 | if ($char eq '(') { | |||
| 977 | 4 | 4 | $depth++; | |||
| 978 | 4 | 3 | $current .= $char; | |||
| 979 | } elsif ($char eq ')') { | |||||
| 980 | 4 | 4 | $depth--; | |||
| 981 | 4 | 2 | $current .= $char; | |||
| 982 | } elsif ($char eq '|' && $depth == 0) { | |||||
| 983 | 162 | 123 | push @alternatives, $current; | |||
| 984 | 162 | 97 | $current = ''; | |||
| 985 | } else { | |||||
| 986 | 838 | 504 | $current .= $char; | |||
| 987 | } | |||||
| 988 | } | |||||
| 989 | 88 | 108 | push @alternatives, $current if length($current); | |||
| 990 | ||||||
| 991 | # Choose one alternative randomly | |||||
| 992 | 88 | 134 | my $chosen = $alternatives[int(rand(@alternatives))]; | |||
| 993 | 88 | 115 | return $self->_parse_sequence($chosen); | |||
| 994 | } | |||||
| 995 | ||||||
| 996 | sub _find_matching_bracket { | |||||
| 997 | 3750 | 2619 | my ($self, $pattern, $start) = @_; | |||
| 998 | ||||||
| 999 | 3750 | 2161 | my $depth = 0; | |||
| 1000 | 3750 | 3019 | for (my $i = $start; $i < length($pattern); $i++) { | |||
| 1001 | 23803 | 14552 | my $char = substr($pattern, $i, 1); | |||
| 1002 | 23803 | 28542 | if ($char eq '[' && ($i == $start || substr($pattern, $i-1, 1) ne '\\')) { | |||
| 1003 | 3750 | 2911 | $depth++; | |||
| 1004 | } elsif ($char eq ']' && substr($pattern, $i-1, 1) ne '\\') { | |||||
| 1005 | 3750 | 2248 | $depth--; | |||
| 1006 | 3750 | 3458 | return $i if $depth == 0; | |||
| 1007 | } | |||||
| 1008 | } | |||||
| 1009 | 0 | 0 | return -1; | |||
| 1010 | } | |||||
| 1011 | ||||||
| 1012 | sub _find_matching_paren { | |||||
| 1013 | 151 | 121 | my ($self, $pattern, $start) = @_; | |||
| 1014 | ||||||
| 1015 | 151 | 90 | my $depth = 0; | |||
| 1016 | 151 | 144 | for (my $i = $start; $i < length($pattern); $i++) { | |||
| 1017 | 1750 | 1085 | my $char = substr($pattern, $i, 1); | |||
| 1018 | 1750 | 1259 | my $prev = $i > 0 ? substr($pattern, $i-1, 1) : ''; | |||
| 1019 | ||||||
| 1020 | 1750 | 2141 | if ($char eq '(' && $prev ne '\\') { | |||
| 1021 | 156 | 140 | $depth++; | |||
| 1022 | } elsif ($char eq ')' && $prev ne '\\') { | |||||
| 1023 | 156 | 113 | $depth--; | |||
| 1024 | 156 | 168 | return $i if $depth == 0; | |||
| 1025 | } | |||||
| 1026 | } | |||||
| 1027 | 0 | 0 | return -1; | |||
| 1028 | } | |||||
| 1029 | ||||||
| 1030 | sub _random_from_class { | |||||
| 1031 | 16988 | 10924 | my ($self, $class) = @_; | |||
| 1032 | ||||||
| 1033 | 16988 | 9883 | my @chars; | |||
| 1034 | ||||||
| 1035 | # Handle negation | |||||
| 1036 | 16988 | 9920 | my $negate = 0; | |||
| 1037 | 16988 | 13129 | if (substr($class, 0, 1) eq '^') { | |||
| 1038 | 10 | 6 | $negate = 1; | |||
| 1039 | 10 | 9 | $class = substr($class, 1); | |||
| 1040 | } | |||||
| 1041 | ||||||
| 1042 | # Parse character class with escape sequences | |||||
| 1043 | 16988 | 9585 | my $i = 0; | |||
| 1044 | 16988 | 12000 | while ($i < length($class)) { | |||
| 1045 | 32570 | 20509 | my $char = substr($class, $i, 1); | |||
| 1046 | ||||||
| 1047 | 32570 | 35656 | if ($char eq '\\') { | |||
| 1048 | 32 | 18 | $i++; | |||
| 1049 | 32 | 20 | my $next = substr($class, $i, 1); | |||
| 1050 | 32 | 33 | if ($next eq 'd') { | |||
| 1051 | 16 | 19 | push @chars, ('0'..'9'); | |||
| 1052 | } elsif ($next eq 'w') { | |||||
| 1053 | 10 | 39 | push @chars, ('a'..'z', 'A'..'Z', '0'..'9', '_'); | |||
| 1054 | } elsif ($next eq 's') { | |||||
| 1055 | 0 | 0 | push @chars, (' ', "\t", "\n"); | |||
| 1056 | } elsif ($next eq 'p' && substr($class, $i+1, 1) eq '{') { | |||||
| 1057 | # Unicode property in character class | |||||
| 1058 | 6 | 3 | my $end = index($class, '}', $i+2); | |||
| 1059 | 6 | 4 | my $prop = substr($class, $i+2, $end-$i-2); | |||
| 1060 | 6 | 6 | push @chars, $self->_unicode_property_chars($prop); | |||
| 1061 | 6 | 11 | $i = $end; | |||
| 1062 | } else { | |||||
| 1063 | 0 | 0 | push @chars, $next; | |||
| 1064 | } | |||||
| 1065 | } elsif ($i + 2 < length($class) && substr($class, $i+1, 1) eq '-') { | |||||
| 1066 | # Range | |||||
| 1067 | 30268 | 19155 | my $end = substr($class, $i+2, 1); | |||
| 1068 | 30268 | 64253 | push @chars, ($char .. $end); | |||
| 1069 | 30268 | 24314 | $i += 2; # Will be incremented again by loop, total +3 | |||
| 1070 | } else { | |||||
| 1071 | 2270 | 1480 | push @chars, $char; | |||
| 1072 | } | |||||
| 1073 | 32570 | 24450 | $i++; | |||
| 1074 | } | |||||
| 1075 | ||||||
| 1076 | 16988 | 11529 | if ($negate) { | |||
| 1077 | 10 100 | 9 78 | my %excluded = map { $_ => 1 } @chars; | |||
| 1078 | 10 940 940 | 11 672 698 | @chars = grep { !$excluded{$_} } map { chr($_) } (33 .. 126); | |||
| 1079 | } | |||||
| 1080 | ||||||
| 1081 | 16988 | 29954 | return @chars ? $chars[int(rand(@chars))] : 'X'; | |||
| 1082 | } | |||||
| 1083 | ||||||
| 1084 | sub _unicode_property_char { | |||||
| 1085 | 95 | 68 | my ($self, $prop) = @_; | |||
| 1086 | 95 | 78 | my @chars = $self->_unicode_property_chars($prop); | |||
| 1087 | 95 | 354 | return @chars ? $chars[int(rand(@chars))] : 'X'; | |||
| 1088 | } | |||||
| 1089 | ||||||
| 1090 | sub _unicode_property_chars { | |||||
| 1091 | 101 | 63 | my ($self, $prop) = @_; | |||
| 1092 | ||||||
| 1093 | # Common Unicode properties | |||||
| 1094 | 101 | 113 | if ($prop eq 'L' || $prop eq 'Letter') { | |||
| 1095 | # Letters, skip à and ÷ which are symbols | |||||
| 1096 | 88 5456 | 367 4082 | return ('a' .. 'z', 'A' .. 'Z', map { chr($_) } ((ord'Ã')..ord('Ã'), ord('Ã')..ord('ö'), ord('ø')..ord('ÿ'))); | |||
| 1097 | } elsif ($prop eq 'N' || $prop eq 'Number') { | |||||
| 1098 | # Numbers | |||||
| 1099 | # return ('0' .. '9', map { chr($_) } (ord('â ').. ord('â³'))); | |||||
| 1100 | 3 | 5 | return ('0' .. '9'); | |||
| 1101 | } elsif ($prop eq 'Lu' || $prop eq 'Uppercase_Letter') { | |||||
| 1102 | # Uppercase letters, skip à which is not a letter | |||||
| 1103 | 6 180 | 19 167 | return ('A' .. 'Z', map { chr($_) } (ord('Ã') .. ord('Ã'), ord('Ã') .. ord('Ã'))); | |||
| 1104 | } elsif ($prop eq 'Ll' || $prop eq 'Lowercase_Letter') { | |||||
| 1105 | # Lowercase letters, skip ÷ which is not a letter | |||||
| 1106 | 4 124 | 10 95 | return ('a' .. 'z', map { chr($_) } (ord('à ') .. ord('ö'), ord('ø') .. ord('ÿ'))); | |||
| 1107 | } elsif ($prop eq 'P' || $prop eq 'Punctuation') { | |||||
| 1108 | # Punctuation | |||||
| 1109 | 0 | 0 | return ('.', ',', '!', '?', ';', ':', '-', 'â', 'â¦'); | |||
| 1110 | } elsif ($prop eq 'S' || $prop eq 'Symbol') { | |||||
| 1111 | # Symbols | |||||
| 1112 | 0 | 0 | return ('$', 'â¬', '£', 'Â¥', '©', '®', 'â¢', '°', '±', 'Ã', '÷'); | |||
| 1113 | } elsif ($prop eq 'Z' || $prop eq 'Separator') { | |||||
| 1114 | # Separators | |||||
| 1115 | 0 | 0 | return (' ', "\t", "\n"); | |||
| 1116 | } elsif ($prop eq 'Nd' || $prop eq 'Decimal_Number') { | |||||
| 1117 | # Decimal numbers | |||||
| 1118 | 0 | 0 | return ('0'..'9'); | |||
| 1119 | } else { | |||||
| 1120 | # Unknown property - return letters as default | |||||
| 1121 | 0 | 0 | return ('a'..'z', 'A'..'Z'); | |||
| 1122 | } | |||||
| 1123 | } | |||||
| 1124 | ||||||
| 1125 - 1131 | =head2 create_random_string
For consistency with L<Data::Random::String>.
  print Data::Random::String::Matches->create_random_string(length => 3, regex => '\d{3}'), "\n";
=cut | |||||
| 1132 | ||||||
| 1133 | sub create_random_string | |||||
| 1134 | { | |||||
| 1135 | 2 | 893 | my $class = shift; | |||
| 1136 | 2 | 5 | my $params = Params::Get::get_params(undef, @_); | |||
| 1137 | ||||||
| 1138 | 2 | 25 | my $regex = $params->{'regex'}; | |||
| 1139 | 2 | 2 | my $length = $params->{'length'}; | |||
| 1140 | ||||||
| 1141 | 2 | 4 | return $class->new($regex, $length)->generate(); | |||
| 1142 | } | |||||
| 1143 | ||||||
| 1144 - 1178 | =head1 AUTHOR Nigel Horne, C<< <njh at nigelhorne.com> >> =head1 SEE ALSO =over 4 =item * Test coverage report: L<https://nigelhorne.github.io/Data-Random-String-Matches/coverage/> =item * L<String::Random> =item * L<Regexp::Genex> =back =head1 LICENCE AND COPYRIGHT Copyright 2025 Nigel Horne. Usage is subject to licence terms. The licence terms of this software are as follows: =over 4 =item * Personal single user, single computer use: GPL2 =item * All other users (including Commercial, Charity, Educational, Government) must apply in writing for a licence for use from Nigel Horne at the above e-mail. =back =cut | |||||
| 1179 | ||||||
| 1180 | 1; | |||||