File Coverage

File:blib/lib/Data/Random/String/Matches.pm
Coverage:87.1%

linestmtbrancondsubtimecode
1package Data::Random::String::Matches;
2
3
32
32
953226
50
use 5.014;
4
5
32
32
32
48
27
295
use strict;
6
32
32
32
49
27
608
use warnings;
7
8
32
32
32
59
24
695
use Carp qw(carp croak);
9
32
32
32
5429
159147
590
use Params::Get;
10
32
32
32
5426
2906
64
use utf8;
11
12our $VERSION = '0.04';
13
14 - 182
=head1 NAME

Data::Random::String::Matches - Generate random strings matching a regex

=head1 SYNOPSIS

        use Data::Random::String::Matches;

        # Create a generator with regex and optional length
        my $gen = Data::Random::String::Matches->new(qr/[A-Z]{3}\d{4}/, 7);

        # Generate a matching string
        my $str = $gen->generate();
        print $str;  # e.g., "XYZ1234"

        # Alternation
        my $gen2 = Data::Random::String::Matches->new(qr/(cat|dog|bird)/);
        my $animal = $gen2->generate_smart();  # "cat", "dog", or "bird"

        # Backreferences
        my $gen3 = Data::Random::String::Matches->new(qr/(\w{3})-\1/);
        my $str3 = $gen3->generate_smart();  # e.g., "abc-abc"

        # Groups and quantifiers
        my $gen4 = Data::Random::String::Matches->new(qr/(ha){2,4}/);
        my $laugh = $gen4->generate_smart();  # "haha", "hahaha", or "hahahaha"

        # Unicode
        $gen = Data::Random::String::Matches->new(qr/\p{L}{5}/);

        # Named captures
        $gen = Data::Random::String::Matches->new(qr/(?<year>\d{4})-\k<year>/);

        # Possessive
        $gen = Data::Random::String::Matches->new(qr/\d++[A-Z]/);

        # Lookaheads
        $gen = Data::Random::String::Matches->new(qr/\d{3}(?=[A-Z])/);

        # Combined
        $gen = Data::Random::String::Matches->new(
                qr/(?<prefix>\p{Lu}{2})\d++\k<prefix>(?=[A-Z])/
        );

        # Consistency with Legacy software
        print Data::Random::String::Matches->create_random_string(length => 3, regex => '\d{3}'), "\n";

=head1 DESCRIPTION

This module generates random strings that match a given regular expression pattern.
It parses the regex pattern and intelligently builds matching strings, supporting
a wide range of regex features.

=head1 SUPPORTED REGEX FEATURES

=head2 Character Classes

=over 4

=item * Basic classes: C<[a-z]>, C<[A-Z]>, C<[0-9]>, C<[abc]>

=item * Negated classes: C<[^a-z]>

=item * Ranges: C<[a-zA-Z0-9]>

=item * Escape sequences in classes: C<[\d\w]>

=back

=head2 Escape Sequences

=over 4

=item * C<\d> - digit [0-9]

=item * C<\w> - word character [a-zA-Z0-9_]

=item * C<\s> - whitespace

=item * C<\D> - non-digit

=item * C<\W> - non-word character

=item * C<\t>, C<\n>, C<\r> - tab, newline, carriage return

=back

=head2 Quantifiers

=over 4

=item * C<{n}> - exactly n times

=item * C<{n,m}> - between n and m times

=item * C<{n,}> - n or more times

=item * C<+> - one or more (1-5 times)

=item * C<*> - zero or more (0-5 times)

=item * C<?> - zero or one

=back

=head2 Grouping and Alternation

=over 4

=item * C<(...)> - capturing group

=item * C<(?:...)> - non-capturing group

=item * C<|> - alternation (e.g., C<cat|dog|bird>)

=item * C<\1>, C<\2>, etc. - backreferences

=back

=head2 Other

=over 4

=item * C<.> - any character (printable ASCII)

=item * Literal characters

=item * C<^> and C<$> anchors (stripped during parsing)

=back

=head1 LIMITATIONS

=over 4

=item * Lookaheads and lookbehinds ((?=...), (?!...)) are not supported

=item * Named groups ((?<name>...)) are not supported

=item * Possessive quantifiers (*+, ++) are not supported

=item * Unicode properties (\p{L}, \p{N}) are not supported

=item * Some complex nested patterns may not work correctly with smart parsing

=back

=head1 EXAMPLES

        # Email-like pattern
        my $gen = Data::Random::String::Matches->new(qr/[a-z]+@[a-z]+\.com/);

        # API key pattern
        my $gen = Data::Random::String::Matches->new(qr/^AIza[0-9A-Za-z_-]{35}$/);

        # Phone number
        my $gen = Data::Random::String::Matches->new(qr/\d{3}-\d{3}-\d{4}/);

        # Repeated pattern
        my $gen = Data::Random::String::Matches->new(qr/(\w{4})-\1/);

=head1 METHODS

=head2 new($regex, $length)

Creates a new generator. C<$regex> can be a compiled regex (qr//) or a string.
C<$length> is optional and defaults to 10 (used for fallback generation).

=cut
183
184sub new {
185
215
518129
        my ($class, $regex, $length) = @_;
186
187
215
299
        croak 'Regex pattern is required' unless defined $regex;
188
189        # Convert string to regex if needed
190
214
383
        my $regex_obj = ref($regex) eq 'Regexp' ? $regex : qr/$regex/;
191
192
214
558
        my $self = {
193                regex    => $regex_obj,
194                regex_str => "$regex",
195                length  => $length || 10,
196                backrefs        => {},       # Store backreferences
197                named_refs => {},     # Store named captures
198        };
199
200
214
277
        return bless $self, $class;
201}
202
203 - 209
=head2 generate($max_attempts)

Generates a random string matching the regex. First tries smart parsing, then
falls back to brute force if needed. Tries up to C<$max_attempts> times
(default 1000) before croaking.

=cut
210
211sub generate {
212
4135
18763
        my ($self, $max_attempts) = @_;
213
4135
5856
        $max_attempts //= 1000;
214
215
4135
2880
        my $regex = $self->{regex};
216
4135
2552
        my $length = $self->{length};
217
218        # First try the smart approach
219
4135
4135
2601
3184
        my $str = eval { $self->_build_from_pattern($self->{regex_str}) };
220
4135
8913
        if (defined $str && $str =~ /^$regex$/) {
221
4133
4811
                return $str;
222        }
223
224        # If smart approach failed, show warning in debug mode
225
2
2
        if ($ENV{DEBUG_REGEX_GEN} && $@) {
226
0
0
                warn "Smart generation failed: $@";
227        }
228
229        # Fall back to brute force with character set matching
230
2
3
        for (1 .. $max_attempts) {
231
1010
755
                $str = $self->_random_string_smart($length);
232
1010
1710
                return $str if $str =~ /^$regex$/;
233        }
234
235
2
66
        croak "Failed to generate matching string after $max_attempts attempts. Pattern: $self->{regex_str}";
236}
237
238sub _random_string_smart {
239
1010
645
        my ($self, $len) = @_;
240
241
1010
678
        my $regex_str = $self->{regex_str};
242
243        # Detect common patterns and generate appropriate characters
244
1010
563
        my @chars;
245
246
1010
2376
        if ($regex_str =~ /\\d/ || $regex_str =~ /\[0-9\]/ || $regex_str =~ /\[\^[^\]]*[A-Za-z]/) {
247                # Digit patterns
248
0
0
                @chars = ('0'..'9');
249        } elsif ($regex_str =~ /\[A-Z\]/ || $regex_str =~ /\[A-Z[^\]]*\]/) {
250                # Uppercase patterns
251
0
0
                @chars = ('A'..'Z');
252        } elsif ($regex_str =~ /\[a-z\]/ || $regex_str =~ /\[a-z[^\]]*\]/) {
253                # Lowercase patterns
254
0
0
                @chars = ('a'..'z');
255        } elsif ($regex_str =~ /\\w/ || $regex_str =~ /\[a-zA-Z0-9_\]/) {
256                # Word characters
257
0
0
                @chars = ('a'..'z', 'A'..'Z', '0'..'9', '_');
258        } else {
259                # Default to printable ASCII
260
1010
94940
764
69137
                @chars = map { chr($_) } (33 .. 126);
261        }
262
263
1010
736
        my $str = '';
264
1010
2400
        $str .= $chars[int(rand(@chars))] for (1 .. $len);
265
266
1010
2474
        return $str;
267}
268
269 - 274
=head2 generate_smart()

Parses the regex and builds a matching string directly. Faster and more reliable
than brute force, but may not handle all edge cases.

=cut
275
276sub generate_smart {
277
132
5880
        my $self = $_[0];
278
132
170
        return $self->_build_from_pattern($self->{regex_str});
279}
280
281 - 311
=head2 generate_many($count, $unique)

Generates multiple random strings matching the regex.

    my @strings = $gen->generate_many(10);           # 10 strings (may have duplicates)
    my @strings = $gen->generate_many(10, 1);        # 10 unique strings
    my @strings = $gen->generate_many(10, 'unique'); # 10 unique strings

    # Generate until you have 1000 unique codes
    my $gen = Data::Random::String::Matches->new(qr/[A-Z]{3}\d{4}/);
    my @codes = $gen->generate_many(1000, 'unique');

Parameters:

=over 4

=item * C<$count> - Number of strings to generate (required, must be positive)

=item * C<$unique> - If true, ensures all generated strings are unique. May return fewer
than C<$count> strings if uniqueness cannot be achieved within reasonable attempts.
Accepts any true value (1, 'unique', etc.)

=back

Returns: List of generated strings

Dies: If count is not a positive integer

Warns: If unable to generate the requested number of unique strings

=cut
312
313sub generate_many {
314
24
2660
        my ($self, $count, $unique) = @_;
315
316
24
53
        croak 'Count must be a positive integer' unless defined $count && $count > 0;
317
318
21
15
        my @results;
319
320
21
19
        if ($unique) {
321                # Generate unique strings
322
6
5
                my %seen;
323
6
5
                my $attempts = 0;
324
6
5
                my $max_attempts = $count * 100;        # Reasonable limit
325
326
6
15
                while (keys %seen < $count && $attempts < $max_attempts) {
327
1645
1250
                        my $str = $self->generate();
328
1645
1278
                        $seen{$str} = 1;
329
1645
1956
                        $attempts++;
330                }
331
332
6
7
                if (keys %seen < $count) {
333
1
9
                        carp 'Only generated ', (scalar keys %seen), " unique strings out of $count requested";
334                }
335
336
6
304
                @results = keys %seen;
337        } else {
338                # Generate any strings (may have duplicates)
339
15
24
                push @results, $self->generate() for (1 .. $count);
340        }
341
342
21
289
        return @results;
343}
344
345 - 349
=head2 get_seed()

Gets the random seed for reproducible generation

=cut
350
351sub get_seed {
352
2
376
        my $self = shift;
353
354
2
6
        return $self->{seed};
355}
356
357 - 361
=head2 set_seed($seed)

Sets the random seed for reproducible generation

=cut
362
363sub set_seed {
364
13
1562
        my $self = shift;
365
13
20
        my $params = Params::Get::get_params('seed', \@_);
366
12
133
        my $seed = $params->{'seed'};
367
368
12
75
        croak 'Seed must be defined' unless defined $seed;
369
370
11
12
        srand($seed);
371
11
10
        $self->{seed} = $seed;
372
373
11
20
        return $self;
374}
375
376 - 389
=head2 suggest_simpler_pattern()

Analyzes patterns and suggests improvements.

  my $suggestion = $gen->suggest_simpler_pattern();

  if ($suggestion) {
    print "Reason: $suggestion->{reason}\n";
    print "Better pattern: $suggestion->{pattern}\n" if $suggestion->{pattern};
    print "Tips:\n";
    print "  - $_\n" for @{$suggestion->{tips}};
  }

=cut
390
391sub suggest_simpler_pattern {
392
21
30
        my $self = $_[0];
393
394
21
19
        my $pattern = $self->{regex_str};
395
21
20
        my $info = $self->pattern_info();
396
397        # Check for patterns that are too complex
398
21
35
        if ($info->{complexity} eq 'very_complex') {
399                return {
400
1
9
                        pattern => undef,
401                        reason  => 'Pattern is very complex. Consider breaking it into multiple simpler patterns.',
402                        tips    => [
403                                'Split alternations into separate generators',
404                                'Avoid deeply nested groups',
405                                'Use fixed-length patterns when possible',
406                        ],
407                };
408        }
409
410        # Suggest removing unnecessary backreferences
411
20
26
        if ($info->{features}{has_backreferences} && $pattern =~ /(\(\w+\)).*\\\d+/) {
412
0
0
                my $simpler = $pattern;
413                # Can't automatically simplify backreferences, but can suggest
414                return {
415
0
0
                        pattern => undef,
416                        reason  => 'Backreferences add complexity. Consider if you really need repeated groups.',
417                        tips    => [
418                                'If the repeated part doesn\'t need to match, use two separate patterns',
419                                'For validation, backreferences are great; for generation, they limit variation',
420                        ],
421                };
422        }
423
424        # Suggest fixed quantifiers instead of ranges
425
20
23
        if ($pattern =~ /\{(\d+),(\d+)\}/) {
426
8
8
                my ($min, $max) = ($1, $2);
427
8
9
                if ($max - $min > 10) {
428
6
6
                        my $mid = int(($min + $max) / 2);
429
6
5
                        my $simpler = $pattern;
430
6
14
                        $simpler =~ s/\{\d+,\d+\}/\{$mid\}/;
431                        return {
432
6
15
                                pattern => $simpler,
433                                reason  => "Large quantifier range {$min,$max} creates high variability. Consider fixed length {$mid}.",
434                                tips    => [
435                                        'Fixed lengths are faster to generate',
436                                        'If you need variety, generate multiple patterns with different fixed lengths',
437                                ],
438                        };
439                }
440        }
441
442        # Suggest limiting alternations
443
14
14
        if ($info->{features}{has_alternation}) {
444
2
3
                my @alts = split /\|/, $pattern;
445
2
3
                if (@alts > 10) {
446                        return {
447
1
3
                                pattern => undef,
448                                reason  => 'Too many alternations (' . scalar(@alts) . '). Consider splitting into multiple patterns.',
449                                tips    => [
450                                        'Create separate generators for different alternatives',
451                                        'Group similar patterns together',
452                                        'Use character classes [abc] instead of (a|b|c)',
453                                ],
454                        };
455                }
456
457                # Check if alternations could be a character class
458
1
2
                if ($pattern =~ /\(([a-zA-Z])\|([a-zA-Z])\|([a-zA-Z])\)/) {
459
1
2
                        my $chars = join('', $1, $2, $3);
460
1
1
                        my $simpler = $pattern;
461
1
2
                        $simpler =~ s/\([a-zA-Z]\|[a-zA-Z]\|[a-zA-Z]\)/[$chars]/;
462                        return {
463
1
3
                                pattern => $simpler,
464                                reason  => 'Single-character alternations can be simplified to character classes.',
465                                tips    => [
466                                        'Use [abc] instead of (a|b|c)',
467                                        'Character classes are faster to process',
468                                ],
469                        };
470                }
471        }
472
473        # Suggest removing lookaheads/lookbehinds for generation
474
12
16
        if ($info->{features}{has_lookahead} || $info->{features}{has_lookbehind}) {
475
3
2
                my $simpler = $pattern;
476
3
5
                $simpler =~ s/\(\?[=!].*?\)//g;   # Remove lookaheads
477
3
3
                $simpler =~ s/\(\?<[=!].*?\)//g;  # Remove lookbehinds
478
479
3
4
                if ($simpler ne $pattern) {
480                        return {
481
3
6
                                pattern => $simpler,
482                                reason  => 'Lookaheads/lookbehinds add complexity but don\'t contribute to generated strings.',
483                                tips    => [
484                                        'Lookaheads are great for validation, not generation',
485                                        'The simplified pattern generates the same strings',
486                                ],
487                        };
488                }
489        }
490
491        # Check for Unicode when ASCII would work
492
9
11
        if ($info->{features}{has_unicode} && $pattern =~ /\\p\{L\}/) {
493
2
2
                my $simpler = $pattern;
494
2
2
                $simpler =~ s/\\p\{L\}/[A-Za-z]/g;
495                return {
496
2
5
                        pattern => $simpler,
497                        reason  => 'Unicode \\p{L} can be simplified to [A-Za-z] if you only need ASCII letters.',
498                        tips    => [
499                                'ASCII patterns are faster',
500                                'Only use Unicode if you need non-ASCII characters',
501                        ],
502                };
503        }
504
505        # Check for overly long fixed strings
506
7
5
        if ($pattern =~ /([a-zA-Z]{20,})/) {
507                return {
508
0
0
                        pattern => undef,
509                        reason  => 'Pattern contains very long fixed literal strings. Consider if you need such specific patterns.',
510                        tips    => [
511                                'Use variables instead of long literals',
512                                'Break into smaller patterns',
513                        ],
514                };
515        }
516
517        # Pattern seems reasonable
518
7
7
        return undef;
519}
520
521 - 529
=head2 validate($string)

Checks if a string matches the pattern without generating.

  if ($gen->validate('1234')) {
    print "Valid!\n";
  }

=cut
530
531sub validate {
532
69
2146
        my $self = shift;
533
69
120
        my $params = Params::Get::get_params('string', \@_);
534
68
814
        my $string = $params->{'string'};
535
536
68
156
        croak('String must be defined') unless defined $string;
537
538
67
55
        my $regex = $self->{regex};
539
67
408
        return $string =~ /^$regex$/;
540}
541
542 - 558
=head2 pattern_info()

Returns detailed information about the pattern.

  my $info = $gen->pattern_info();
  print "Complexity: $info->{complexity}\n";
  print "Min length: $info->{min_length}\n";
  print "Has Unicode: ", $info->{features}{has_unicode} ? "Yes" : "No", "\n";

C<pattern_info> analyzes a regular expression to produce a structured summary of its characteristics,
including estimated string lengths, detected features, and an overall complexity rating.
It first calls C<_estimate_length> to heuristically compute the minimum and maximum possible lengths of strings matching the pattern by scanning for literals,
character classes, and quantifiers.
It then detects the presence of advanced regex constructions such as alternation, lookahead or lookbehind assertions, named groups, and Unicode properties, storing them in a feature hash.
Finally, it calculates a rough "complexity" classification based on pattern length and detected features-returning a hash reference that describes the regex's structure, estimated lengths, and complexity level.

=cut
559
560sub pattern_info {
561
41
2489
        my $self = $_[0];
562
563
41
45
        return $self->{'_pattern_info_cache'} if $self->{'_pattern_info_cache'};
564
565
38
33
        my $pattern = $self->{'regex_str'};
566
567        # Calculate approximate min/max lengths
568
38
40
        my ($min_len, $max_len) = $self->_estimate_length($pattern);
569
570        # Detect pattern features
571
38
274
        my %features = (
572                has_alternation     => ($pattern =~ /\|/ ? 1 : 0),
573                has_backreferences  => ($pattern =~ /(\\[1-9]|\\k<)/ ? 1 : 0),
574                has_unicode         => ($pattern =~ /\\p\{/ ? 1 : 0),
575                has_lookahead       => ($pattern =~ /\(\?[=!]/ ? 1 : 0),
576                has_lookbehind      => ($pattern =~ /\(\?<[=!]/ ? 1 : 0),
577                has_named_groups    => ($pattern =~ /\(\?</ ? 1 : 0),
578                has_possessive      => ($pattern =~ /(?:[+*?]\+|\{\d+(?:,\d*)?\}\+)/ ? 1 : 0),
579        );
580
581
38
79
        my $info = {
582                pattern             => $pattern,
583                min_length          => $min_len,
584                max_length          => $max_len,
585                estimated_length    => int(($min_len + $max_len) / 2),
586                features            => \%features,
587                complexity          => $self->_calculate_complexity(\%features, $pattern),
588        };
589
590
38
39
        $self->{'_pattern_info_cache'} = $info;
591
592
38
38
        return $info;
593}
594
595sub _estimate_length {
596
38
29
        my ($self, $pattern) = @_;
597
598        # Remove anchors and modifiers
599
38
119
        $pattern =~ s/^\(\?\^?[iumsx-]*:(.*)\)$/$1/;
600
38
34
        $pattern =~ s/^\^//;
601
38
34
        $pattern =~ s/\$//;
602
603
38
26
        my $min = 0;
604
38
29
        my $max = 0;
605
606        # Simple heuristic - count fixed characters and quantifiers
607
38
25
        my $last_was_atom = 0;  # Handle cases like \d{3} where the quantifier modifies the atom count
608
38
82
        while ($pattern =~ /([^+*?{}\[\]\\])|\\[dwsWDN]|\[([^\]]+)\]|\{(\d+)(?:,(\d+))?\}/g) {
609
222
341
                if (defined $1 || (defined $2 && $2)) {
610
156
93
                        $min++;
611
156
88
                        $max++;
612
156
175
                        $last_was_atom = 1;
613                } elsif (defined $3) {
614
36
31
                        if ($last_was_atom) {
615                                # Replace the last atom’s contribution
616
16
15
                                $min += $3 - 1;
617
16
21
                                $max += (defined $4 ? $4 : $3) - 1;
618
16
18
                                $last_was_atom = 0;
619                        } else {
620                                # No preceding atom? assume standalone
621
20
16
                                $min += $3;
622
20
31
                                $max += defined $4 ? $4 : $3;
623                        }
624                }
625        }
626
627        # Account for +, *, ?
628
38
42
        my $plus_count = () = $pattern =~ /\+/g;
629
38
33
        my $star_count = () = $pattern =~ /\*/g;
630
38
32
        my $question_count = () = $pattern =~ /\?/g;
631
632
38
25
        $min += $plus_count;  # + means at least 1
633
38
39
        $max += ($plus_count * 5) + ($star_count * 5);  # Assume max 5 repetitions
634
38
25
        $min -= $question_count;  # ? makes things optional
635
636
38
35
        $min = 0 if $min < 0;
637
38
32
        $max = $min + 50 if $max < $min;  # Ensure max >= min
638
639
38
47
        return ($min, $max);
640}
641
642sub _calculate_complexity {
643
38
35
        my ($self, $features, $pattern) = @_;
644
645
38
26
        my $score = 0;
646
647        # Base complexity from pattern length
648
38
38
        $score += length($pattern) / 10;
649
650        # Add complexity for features
651
38
45
        $score += 2 if $features->{has_alternation};
652
38
40
        $score += 3 if $features->{has_backreferences};
653
38
33
        $score += 2 if $features->{has_unicode};
654
38
34
        $score += 2 if $features->{has_lookahead};
655
38
29
        $score += 2 if $features->{has_lookbehind};
656
38
32
        $score += 1 if $features->{has_named_groups};
657
38
33
        $score += 1 if $features->{has_possessive};
658
659        # Classify
660
38
62
        return 'simple'   if $score < 3;
661
15
27
        return 'moderate' if $score < 7;
662
3
6
        return 'complex'  if $score < 12;
663
1
2
        return 'very_complex';
664}
665
666sub _build_from_pattern {
667
4266
2965
        my ($self, $pattern) = @_;
668
669        # Reset backreferences for each generation
670
4266
3059
        $self->{backrefs} = {};
671
4266
2967
        $self->{named_refs} = {};
672
4266
2757
        $self->{group_counter} = 0;
673
674        # Remove regex delimiters and modifiers
675        # Handle (?^:...), (?i:...), (?-i:...) etc
676
4266
6051
        $pattern =~ s/^\(\?\^?[iumsx-]*:(.*)\)$/$1/;
677
678        # Remove anchors (they're handled by the regex match itself)
679
4266
2875
        $pattern =~ s/^\^//;
680
4266
2709
        $pattern =~ s/\$//;
681
682
4266
3488
        return $self->_parse_sequence($pattern);
683}
684
685sub _parse_sequence {
686
4408
2988
        my ($self, $pattern) = @_;
687
688
4408
2752
        my $result = '';
689
4408
2467
        my $i = 0;
690
4408
2822
        my $len = length($pattern);
691
692
4408
3191
        while ($i < $len) {
693
5834
4182
                my $char = substr($pattern, $i, 1);
694
695
5834
6015
                if ($char eq '\\') {
696                        # Escape sequence
697
941
557
                        $i++;
698
941
638
                        my $next = substr($pattern, $i, 1);
699
700
941
1555
                        if ($next =~ /[1-9]/) {
701                                # Backreference
702
30
21
                                my $ref_num = $next;
703
30
25
                                if (exists $self->{backrefs}{$ref_num}) {
704
30
28
                                        $result .= $self->{backrefs}{$ref_num};
705                                } else {
706
0
0
                                        croak "Backreference \\$ref_num used before group defined";
707                                }
708                        } elsif ($next eq 'k' && substr($pattern, $i+1, 1) eq '<') {
709                                # Named backreference \k<name>
710
7
6
                                my $end = index($pattern, '>', $i+2);
711
7
8
                                my $name = substr($pattern, $i+2, $end-$i-2);
712
7
7
                                if (exists $self->{named_refs}{$name}) {
713
7
4
                                        $result .= $self->{named_refs}{$name};
714                                } else {
715
0
0
                                        croak "Named backreference \\k<$name> used before group defined";
716                                }
717
7
7
                                $i = $end;
718                        } elsif ($next eq 'p' && substr($pattern, $i+1, 1) eq '{') {
719                                # Unicode property \p{L}, \p{N}, etc.
720
25
21
                                my $end = index($pattern, '}', $i+2);
721
25
21
                                my $prop = substr($pattern, $i+2, $end-$i-2);
722                                my ($generated, $new_i) = $self->_handle_quantifier($pattern, $end, sub {
723
95
75
                                        $self->_unicode_property_char($prop);
724
25
38
                                });
725
25
35
                                $result .= $generated;
726
25
15
                                $i = $new_i;
727                        } elsif ($next eq 'd') {
728
731
2302
927
2320
                                my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub { int(rand(10)) }, 1);
729
731
704
                                $result .= $generated;
730
731
466
                                $i = $new_i;
731                        } elsif ($next eq 'w') {
732                                my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub {
733
115
414
                                        my @chars = ('a'..'z', 'A'..'Z', '0'..'9', '_');
734
115
305
                                        $chars[int(rand(@chars))];
735
35
63
                                }, 1);
736
35
51
                                $result .= $generated;
737
35
22
                                $i = $new_i;
738                        } elsif ($next eq 's') {
739
2
2
6
4
                                my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub { ' ' }, 1);
740
2
3
                                $result .= $generated;
741
2
2
                                $i = $new_i;
742                        } elsif ($next eq 'D') {
743                                my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub {
744
6
504
564
4
340
461
                                        my @chars = map { chr($_) } grep { chr($_) !~ /\d/ } (33..126);
745
6
29
                                        $chars[int(rand(@chars))];
746
2
5
                                });
747
2
5
                                $result .= $generated;
748
2
2
                                $i = $new_i;
749                        } elsif ($next eq 'W') {
750                                my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub {
751
6
186
564
6
132
506
                                        my @chars = map { chr($_) } grep { chr($_) !~ /\w/ } (33..126);
752
6
21
                                        $chars[int(rand(@chars))];
753
2
8
                                });
754
2
5
                                $result .= $generated;
755
2
2
                                $i = $new_i;
756                        } elsif ($next eq 't') {
757
2
2
                                $result .= "\t";
758                        } elsif ($next eq 'n') {
759
2
2
                                $result .= "\n";
760                        } elsif ($next eq 'r') {
761
0
0
                                $result .= "\r";
762                        } else {
763
103
77
                                $result .= $next;
764                        }
765
941
789
                        $i++;
766                } elsif ($char eq '[') {
767                        # Character class
768
3946
3232
                        my $end = $self->_find_matching_bracket($pattern, $i);
769
3946
3169
                        croak 'Unmatched [' if $end == -1;
770
771
3946
2795
                        my $class = substr($pattern, $i+1, $end-$i-1);
772                        my ($generated, $new_i) = $self->_handle_quantifier($pattern, $end, sub {
773
17206
13394
                                $self->_random_from_class($class);
774
3946
4829
                        }, 1);
775
3946
4076
                        $result .= $generated;
776
3946
3489
                        $i = $new_i + 1;
777                } elsif ($char eq '(') {
778                        # Group - could be various types
779
151
136
                        my $end = $self->_find_matching_paren($pattern, $i);
780
151
156
                        croak 'Unmatched (' if $end == -1;
781
782
151
132
                        my $group_content = substr($pattern, $i+1, $end-$i-1);
783
784                        # Check for special group types
785
151
99
                        my $is_capturing = 1;
786
151
89
                        my $is_lookahead = 0;
787
151
114
                        my $is_lookbehind = 0;
788
151
83
                        my $is_negative = 0;
789
151
99
                        my $group_name = undef;
790
791
151
266
                        if ($group_content =~ /^\?:/) {
792                                # Non-capturing group
793
2
3
                                $is_capturing = 0;
794
2
2
                                $group_content = substr($group_content, 2);
795                        } elsif ($group_content =~ /^\?<([^>]+)>/) {
796                                # Named capture (?<name>...)
797
15
13
                                $group_name = $1;
798
15
16
                                $group_content = substr($group_content, length($1) + 3);
799                        } elsif ($group_content =~ /^\?=/) {
800                                # Positive lookahead (?=...)
801
6
3
                                $is_lookahead = 1;
802
6
4
                                $is_capturing = 0;
803
6
6
                                $group_content = substr($group_content, 2);
804                        } elsif ($group_content =~ /^\?!/) {
805                                # Negative lookahead (?!...)
806
1
0
                                $is_lookahead = 1;
807
1
1
                                $is_negative = 1;
808
1
1
                                $is_capturing = 0;
809
1
1
                                $group_content = substr($group_content, 2);
810                        } elsif ($group_content =~ /^\?<=/) {
811                                # Positive lookbehind (?<=...)
812
1
1
                                $is_lookbehind = 1;
813
1
1
                                $is_capturing = 0;
814
1
1
                                $group_content = substr($group_content, 3);
815                        } elsif ($group_content =~ /^\?<!/) {
816                                # Negative lookbehind (?<!...)
817
1
1
                                $is_lookbehind = 1;
818
1
1
                                $is_negative = 1;
819
1
0
                                $is_capturing = 0;
820
1
1
                                $group_content = substr($group_content, 3);
821                        }
822
823                        # Handle lookaheads/lookbehinds
824
151
152
                        if ($is_lookahead) {
825                                # For positive lookahead, generate the pattern but don't advance
826                                # For negative lookahead, avoid the pattern
827
7
7
                                if (!$is_negative) {
828                                        # Generate what the lookahead expects but don't consume it
829                                        # This is a simplification - we just note the constraint
830                                }
831                                # Lookaheads don't add to the result
832
7
6
                                $i = $end + 1;
833
7
8
                                next;
834                        } elsif ($is_lookbehind) {
835                                # Lookbehinds check what came before
836                                # For generation, we can mostly ignore them
837
2
1
                                $i = $end + 1;
838
2
3
                                next;
839                        }
840
841                        # Check for alternation
842
142
82
                        my $generated;
843
142
120
                        if ($group_content =~ /\|/) {
844
88
78
                                $generated = $self->_handle_alternation($group_content);
845                        } else {
846
54
58
                                $generated = $self->_parse_sequence($group_content);
847                        }
848
849                        # Store backreference if capturing
850
142
130
                        if ($is_capturing) {
851
140
104
                                $self->{group_counter}++;
852
140
162
                                $self->{backrefs}{$self->{group_counter}} = $generated;
853
854
140
125
                                if (defined $group_name) {
855
15
16
                                        $self->{named_refs}{$group_name} = $generated;
856                                }
857                        }
858
859                        # Handle quantifier after group (including possessive)
860
142
159
172
161
                        my ($final_generated, $new_i) = $self->_handle_quantifier($pattern, $end, sub { $generated }, 1);
861
142
158
                        $result .= $final_generated;
862
142
146
                        $i = $new_i + 1;
863                } elsif ($char eq '.') {
864                        # Any character (except newline)
865                        my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub {
866
2
188
4
140
                                my @chars = map { chr($_) } (33 .. 126);
867
2
8
                                $chars[int(rand(@chars))];
868
2
5
                        });
869
2
4
                        $result .= $generated;
870
2
2
                        $i = $new_i + 1;
871                } elsif ($char eq '|') {
872                        # Alternation at top level - just return what we have
873                        # (This is handled by _handle_alternation for groups)
874
0
0
                        last;
875                } elsif ($char =~ /[+*?]/ || $char eq '{') {
876                        # Quantifier without preceding element - shouldn't happen in valid regex
877
1
112
                        croak "$pattern: Quantifier '$char' without preceding element";
878                } elsif ($char =~ /[\w ]/) {
879                        # Literal character
880
724
731
877
668
                        my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub { $char });
881
724
685
                        $result .= $generated;
882
724
629
                        $i = $new_i + 1;
883                } else {
884                        # Other literal characters
885
69
63
                        $result .= $char;
886
69
63
                        $i++;
887                }
888        }
889
890
4407
3919
        return $result;
891}
892
893sub _handle_quantifier {
894
5611
4423
        my ($self, $pattern, $pos, $generator, $check_possessive) = @_;
895
5611
4713
        $check_possessive //= 1;        # Default to checking for possessive
896
897
5611
3706
        my $next = substr($pattern, $pos + 1, 1);
898
5611
3310
        my $is_possessive = 0;
899
900        # Check for possessive quantifier (+)
901
5611
6019
        if ($check_possessive && $pos + 2 < length($pattern)) {
902
3934
2577
                my $after_next = substr($pattern, $pos + 2, 1);
903
3934
4984
                if (($next =~ /[+*?]/ || $next eq '}') && $after_next eq '+') {
904
6
4
                        $is_possessive = 1;
905                }
906        }
907
908
5611
4630
        if ($next eq '{') {
909
3467
2428
                my $end = index($pattern, '}', $pos + 2);
910
3467
2588
                croak "Unmatched '{' at position $pos in pattern: $pattern" if ($end == -1);
911
3467
2607
                my $quant = substr($pattern, $pos + 2, $end - $pos - 2);
912
913                # Check for possessive after }
914
3467
4133
                if ($check_possessive && $end + 1 < length($pattern) && substr($pattern, $end + 1, 1) eq '+') {
915
2
2
                        $is_possessive = 1;
916
2
2
                        $end++;
917                }
918
919
3467
2080
                my $result = '';
920
3467
3518
                if ($quant =~ /^(\d+)$/) {
921                        # Exact: {n}
922
3408
4130
                        $result .= $generator->() for (1 .. $1);
923                } elsif ($quant =~ /^(\d+),(\d+)$/) {
924                        # Range: {n,m}
925
59
79
                        my $count = $1 + int(rand($2 - $1 + 1));
926
59
62
                        $result .= $generator->() for (1 .. $count);
927                } elsif ($quant =~ /^(\d+),$/) {
928                        # Minimum: {n,}
929
0
0
                        my $count = $1 + int(rand(5));
930
0
0
                        $result .= $generator->() for (1 .. $count);
931                }
932
3467
3528
                return ($result, $end);
933        } elsif ($next eq '+') {
934                # One or more (possessive: ++)
935
22
16
                my $actual_end = $pos + 1;
936
22
34
                if ($is_possessive) {
937
4
2
                        $actual_end++;
938                }
939
22
47
                my $count = 1 + int(rand(5));
940
22
15
                my $result = '';
941
22
32
                $result .= $generator->() for (1 .. $count);
942
22
30
                return ($result, $actual_end);
943        } elsif ($next eq '*') {
944                # Zero or more (possessive: *+)
945
3
8
                my $actual_end = $pos + 1;
946
3
4
                if ($is_possessive) {
947
1
1
                        $actual_end++;
948                }
949
3
5
                my $count = int(rand(6));
950
3
3
                my $result = '';
951
3
5
                $result .= $generator->() for (1 .. $count);
952
3
5
                return ($result, $actual_end);
953        } elsif ($next eq '?') {
954                # Zero or one (possessive: ?+)
955
4
4
                my $actual_end = $pos + 1;
956
4
8
                if ($is_possessive) {
957
1
1
                        $actual_end++;
958                }
959
4
7
                my $result = rand() < 0.5 ? $generator->() : '';
960
4
5
                return ($result, $actual_end);
961        } else {
962                # No quantifier
963
2115
1429
                return ($generator->(), $pos);
964        }
965}
966
967sub _handle_alternation {
968
88
74
        my ($self, $pattern) = @_;
969
970        # Split on | but respect groups
971
88
49
        my @alternatives;
972
88
55
        my $current = '';
973
88
56
        my $depth = 0;
974
975
88
117
        for my $char (split //, $pattern) {
976
1008
1003
                if ($char eq '(') {
977
4
4
                        $depth++;
978
4
3
                        $current .= $char;
979                } elsif ($char eq ')') {
980
4
2
                        $depth--;
981
4
4
                        $current .= $char;
982                } elsif ($char eq '|' && $depth == 0) {
983
162
115
                        push @alternatives, $current;
984
162
112
                        $current = '';
985                } else {
986
838
539
                        $current .= $char;
987                }
988        }
989
88
113
        push @alternatives, $current if length($current);
990
991        # Choose one alternative randomly
992
88
130
        my $chosen = $alternatives[int(rand(@alternatives))];
993
88
116
        return $self->_parse_sequence($chosen);
994}
995
996sub _find_matching_bracket {
997
3946
2808
        my ($self, $pattern, $start) = @_;
998
999
3946
2249
        my $depth = 0;
1000
3946
3474
        for (my $i = $start; $i < length($pattern); $i++) {
1001
25098
15559
                my $char = substr($pattern, $i, 1);
1002
25098
30832
                if ($char eq '[' && ($i == $start || substr($pattern, $i-1, 1) ne '\\')) {
1003
3946
3242
                        $depth++;
1004                } elsif ($char eq ']' && substr($pattern, $i-1, 1) ne '\\') {
1005
3946
2334
                        $depth--;
1006
3946
3677
                        return $i if $depth == 0;
1007                }
1008        }
1009
0
0
        return -1;
1010}
1011
1012sub _find_matching_paren {
1013
151
125
        my ($self, $pattern, $start) = @_;
1014
1015
151
94
        my $depth = 0;
1016
151
183
        for (my $i = $start; $i < length($pattern); $i++) {
1017
1750
1109
                my $char = substr($pattern, $i, 1);
1018
1750
1301
                my $prev = $i > 0 ? substr($pattern, $i-1, 1) : '';
1019
1020
1750
2234
                if ($char eq '(' && $prev ne '\\') {
1021
156
148
                        $depth++;
1022                } elsif ($char eq ')' && $prev ne '\\') {
1023
156
98
                        $depth--;
1024
156
179
                        return $i if $depth == 0;
1025                }
1026        }
1027
0
0
        return -1;
1028}
1029
1030sub _random_from_class {
1031
17206
11189
        my ($self, $class) = @_;
1032
1033
17206
9788
        my @chars;
1034
1035        # Debugging this regex: qr/!#-'*+\\-\\.\\^_`|~0-9A-Za-z/
1036        # which used to give this error: 'Argument "#" isn't numeric in range (or flop)'
1037
17206
12904
        warn "DEBUG: class = '$class', length = ", length($class) if ($ENV{DEBUG_REGEX_GEN});
1038
1039        # Handle negation
1040
17206
10004
        my $negate = 0;
1041
17206
13386
        if (substr($class, 0, 1) eq '^') {
1042
10
8
                $negate = 1;
1043
10
8
                $class = substr($class, 1);
1044        }
1045
1046        # Parse character class with escape sequences
1047
17206
9539
        my $i = 0;
1048
17206
11960
        while ($i < length($class)) {
1049
33407
21318
                my $char = substr($class, $i, 1);
1050
1051
33407
23396
                warn "DEBUG: i=$i, char='$char' (ord=", ord($char), ')' if ($ENV{DEBUG_REGEX_GEN});
1052
1053
33407
39398
                if ($char eq '\\') {
1054
165
93
                        $i++;
1055
165
99
                        my $next = substr($class, $i, 1);
1056
165
115
                        warn "DEBUG: Escaped char: $next" if ($ENV{DEBUG_REGEX_GEN});
1057
165
184
                        if ($next eq 'd') {
1058
16
14
                                push @chars, ('0'..'9');
1059                        } elsif ($next eq 'w') {
1060
10
39
                                push @chars, ('a'..'z', 'A'..'Z', '0'..'9', '_');
1061                        } elsif ($next eq 's') {
1062
0
0
                                push @chars, (' ', "\t", "\n");
1063                        } elsif ($next eq 'p' && substr($class, $i+1, 1) eq '{') {
1064                                # Unicode property in character class
1065
6
5
                                my $end = index($class, '}', $i+2);
1066
6
5
                                my $prop = substr($class, $i+2, $end-$i-2);
1067
6
4
                                push @chars, $self->_unicode_property_chars($prop);
1068
6
11
                                $i = $end;
1069                        } else {
1070                                # Escaped literal character (including \-, \., \^, etc.)
1071
133
84
                                push @chars, $next;
1072                        }
1073                } elsif ($i + 2 < length($class) && substr($class, $i+1, 1) eq '-') {
1074                        # Potential range
1075
30555
20154
                        my $end_char = substr($class, $i+2, 1);
1076
1077                        # Check if end is escaped or if this is valid range
1078
30555
33182
                        if ($end_char eq '\\' || $end_char eq ']') {
1079                                # Not a range, dash is literal
1080
0
0
                                push @chars, $char;
1081                        } elsif (ord($end_char) >= ord($char)) {
1082                                # Valid range - use ord/chr to avoid quote interpolation issues
1083
30555
18031
                                my $start_ord = ord($char);
1084
30555
17270
                                my $end_ord = ord($end_char);
1085
30555
561022
23177
415344
                                push @chars, map { chr($_) } ($start_ord .. $end_ord);
1086
30555
31873
                                $i += 2;        # Will be incremented again by loop, total +3
1087                        } else {
1088                                # Invalid range order
1089
0
0
                                push @chars, $char;
1090                        }
1091                } else {
1092
2687
1741
                        push @chars, $char;
1093                }
1094
33407
26152
                $i++;
1095        }
1096
1097
17206
12349
        warn 'DEBUG: Final chars array has ', scalar(@chars), ' elements' if ($ENV{DEBUG_REGEX_GEN});
1098
1099
17206
11996
        if ($negate) {
1100
10
100
8
77
                my %excluded = map { $_ => 1 } @chars;
1101
10
940
940
13
684
698
                @chars = grep { !$excluded{$_} } map { chr($_) } (33 .. 126);
1102        }
1103
1104
17206
32454
        return @chars ? $chars[int(rand(@chars))] : 'X';
1105}
1106
1107sub _unicode_property_char {
1108
95
77
        my ($self, $prop) = @_;
1109
95
76
        my @chars = $self->_unicode_property_chars($prop);
1110
95
358
        return @chars ? $chars[int(rand(@chars))] : 'X';
1111}
1112
1113sub _unicode_property_chars {
1114
101
65
        my ($self, $prop) = @_;
1115
1116        # Common Unicode properties
1117
101
111
        if ($prop eq 'L' || $prop eq 'Letter') {
1118                # Letters, skip × and ÷ which are symbols
1119
88
5456
407
4150
                return ('a' .. 'z', 'A' .. 'Z', map { chr($_) } ((ord'À')..ord('Ö'), ord('Ø')..ord('ö'), ord('ø')..ord('ÿ')));
1120        } elsif ($prop eq 'N' || $prop eq 'Number') {
1121                # Numbers
1122                # return ('0' .. '9', map { chr($_) } (ord('â‘ ').. ord('⑳')));
1123
3
5
                return ('0' .. '9');
1124        } elsif ($prop eq 'Lu' || $prop eq 'Uppercase_Letter') {
1125                # Uppercase letters, skip × which is not a letter
1126
6
180
17
138
                return ('A' .. 'Z', map { chr($_) } (ord('À') .. ord('Ö'), ord('Ø') .. ord('Þ')));
1127        } elsif ($prop eq 'Ll' || $prop eq 'Lowercase_Letter') {
1128                # Lowercase letters, skip ÷ which is not a letter
1129
4
124
12
95
                return ('a' .. 'z', map { chr($_) } (ord('à') .. ord('ö'), ord('ø') .. ord('ÿ')));
1130        } elsif ($prop eq 'P' || $prop eq 'Punctuation') {
1131                # Punctuation
1132
0
0
                return ('.', ',', '!', '?', ';', ':', '-', '—', '…');
1133        } elsif ($prop eq 'S' || $prop eq 'Symbol') {
1134                # Symbols
1135
0
0
                return ('$', '€', '£', 'Â¥', '©', '®', 'â„¢', '°', '±', '×', '÷');
1136        } elsif ($prop eq 'Z' || $prop eq 'Separator') {
1137                # Separators
1138
0
0
                return (' ', "\t", "\n");
1139        } elsif ($prop eq 'Nd' || $prop eq 'Decimal_Number') {
1140                # Decimal numbers
1141
0
0
                return ('0'..'9');
1142        } else {
1143                # Unknown property - return letters as default
1144
0
0
                return ('a'..'z', 'A'..'Z');
1145        }
1146}
1147
1148 - 1154
=head2 create_random_string

For consistency with L<Data::Random::String>.

  print Data::Random::String::Matches->create_random_string(length => 3, regex => '\d{3}'), "\n";

=cut
1155
1156sub create_random_string
1157{
1158
2
865
        my $class = shift;
1159
2
3
        my $params = Params::Get::get_params(undef, @_);
1160
1161
2
24
        my $regex = $params->{'regex'};
1162
2
2
        my $length = $params->{'length'};
1163
1164
2
3
        return $class->new($regex, $length)->generate();
1165}
1166
1167 - 1201
=head1 AUTHOR

Nigel Horne, C<< <njh at nigelhorne.com> >>

=head1 SEE ALSO

=over 4

=item * Test coverage report: L<https://nigelhorne.github.io/Data-Random-String-Matches/coverage/>

=item * L<String::Random>

=item * L<Regexp::Genex>

=back

=head1 LICENCE AND COPYRIGHT

Copyright 2025 Nigel Horne.

Usage is subject to licence terms.

The licence terms of this software are as follows:

=over 4

=item * Personal single user, single computer use: GPL2

=item * All other users (including Commercial, Charity, Educational, Government)
  must apply in writing for a licence for use from Nigel Horne at the
  above e-mail.

=back

=cut
1202
12031;