File Coverage

File:blib/lib/Data/Random/String/Matches.pm
Coverage:87.3%

linestmtbrancondsubtimecode
1package Data::Random::String::Matches;
2
3
31
31
901758
49
use 5.010;
4
5
31
31
31
52
25
272
use strict;
6
31
31
31
47
26
601
use warnings;
7
8
31
31
31
48
25
682
use Carp qw(carp croak);
9
31
31
31
5530
153280
606
use Params::Get;
10
31
31
31
5512
2733
57
use utf8;
11
12our $VERSION = '0.03';
13
14 - 182
=head1 NAME

Data::Random::String::Matches - Generate random strings matching a regex

=head1 SYNOPSIS

        use Data::Random::String::Matches;

        # Create a generator with regex and optional length
        my $gen = Data::Random::String::Matches->new(qr/[A-Z]{3}\d{4}/, 7);

        # Generate a matching string
        my $str = $gen->generate();
        print $str;  # e.g., "XYZ1234"

        # Alternation
        my $gen2 = Data::Random::String::Matches->new(qr/(cat|dog|bird)/);
        my $animal = $gen2->generate_smart();  # "cat", "dog", or "bird"

        # Backreferences
        my $gen3 = Data::Random::String::Matches->new(qr/(\w{3})-\1/);
        my $str3 = $gen3->generate_smart();  # e.g., "abc-abc"

        # Groups and quantifiers
        my $gen4 = Data::Random::String::Matches->new(qr/(ha){2,4}/);
        my $laugh = $gen4->generate_smart();  # "haha", "hahaha", or "hahahaha"

        # Unicode
        $gen = Data::Random::String::Matches->new(qr/\p{L}{5}/);

        # Named captures
        $gen = Data::Random::String::Matches->new(qr/(?<year>\d{4})-\k<year>/);

        # Possessive
        $gen = Data::Random::String::Matches->new(qr/\d++[A-Z]/);

        # Lookaheads
        $gen = Data::Random::String::Matches->new(qr/\d{3}(?=[A-Z])/);

        # Combined
        $gen = Data::Random::String::Matches->new(
                qr/(?<prefix>\p{Lu}{2})\d++\k<prefix>(?=[A-Z])/
        );

        # Consistency with Legacy software
        print Data::Random::String::Matches->create_random_string(length => 3, regex => '\d{3}'), "\n";

=head1 DESCRIPTION

This module generates random strings that match a given regular expression pattern.
It parses the regex pattern and intelligently builds matching strings, supporting
a wide range of regex features.

=head1 SUPPORTED REGEX FEATURES

=head2 Character Classes

=over 4

=item * Basic classes: C<[a-z]>, C<[A-Z]>, C<[0-9]>, C<[abc]>

=item * Negated classes: C<[^a-z]>

=item * Ranges: C<[a-zA-Z0-9]>

=item * Escape sequences in classes: C<[\d\w]>

=back

=head2 Escape Sequences

=over 4

=item * C<\d> - digit [0-9]

=item * C<\w> - word character [a-zA-Z0-9_]

=item * C<\s> - whitespace

=item * C<\D> - non-digit

=item * C<\W> - non-word character

=item * C<\t>, C<\n>, C<\r> - tab, newline, carriage return

=back

=head2 Quantifiers

=over 4

=item * C<{n}> - exactly n times

=item * C<{n,m}> - between n and m times

=item * C<{n,}> - n or more times

=item * C<+> - one or more (1-5 times)

=item * C<*> - zero or more (0-5 times)

=item * C<?> - zero or one

=back

=head2 Grouping and Alternation

=over 4

=item * C<(...)> - capturing group

=item * C<(?:...)> - non-capturing group

=item * C<|> - alternation (e.g., C<cat|dog|bird>)

=item * C<\1>, C<\2>, etc. - backreferences

=back

=head2 Other

=over 4

=item * C<.> - any character (printable ASCII)

=item * Literal characters

=item * C<^> and C<$> anchors (stripped during parsing)

=back

=head1 LIMITATIONS

=over 4

=item * Lookaheads and lookbehinds ((?=...), (?!...)) are not supported

=item * Named groups ((?<name>...)) are not supported

=item * Possessive quantifiers (*+, ++) are not supported

=item * Unicode properties (\p{L}, \p{N}) are not supported

=item * Some complex nested patterns may not work correctly with smart parsing

=back

=head1 EXAMPLES

        # Email-like pattern
        my $gen = Data::Random::String::Matches->new(qr/[a-z]+@[a-z]+\.com/);

        # API key pattern
        my $gen = Data::Random::String::Matches->new(qr/^AIza[0-9A-Za-z_-]{35}$/);

        # Phone number
        my $gen = Data::Random::String::Matches->new(qr/\d{3}-\d{3}-\d{4}/);

        # Repeated pattern
        my $gen = Data::Random::String::Matches->new(qr/(\w{4})-\1/);

=head1 METHODS

=head2 new($regex, $length)

Creates a new generator. C<$regex> can be a compiled regex (qr//) or a string.
C<$length> is optional and defaults to 10 (used for fallback generation).

=cut
183
184sub new {
185
198
415246
        my ($class, $regex, $length) = @_;
186
187
198
260
        croak 'Regex pattern is required' unless defined $regex;
188
189        # Convert string to regex if needed
190
197
395
        my $regex_obj = ref($regex) eq 'Regexp' ? $regex : qr/$regex/;
191
192
197
539
        my $self = {
193                regex    => $regex_obj,
194                regex_str => "$regex",
195                length  => $length || 10,
196                backrefs        => {},  # Store backreferences
197                named_refs => {},     # Store named captures
198        };
199
200
197
270
        return bless $self, $class;
201}
202
203 - 209
=head2 generate($max_attempts)

Generates a random string matching the regex. First tries smart parsing, then
falls back to brute force if needed. Tries up to C<$max_attempts> times
(default 1000) before croaking.

=cut
210
211sub generate {
212
3939
11480
        my ($self, $max_attempts) = @_;
213
3939
5208
        $max_attempts //= 1000;
214
215
3939
2569
        my $regex = $self->{regex};
216
3939
2406
        my $length = $self->{length};
217
218        # First try the smart approach
219
3939
3939
2404
2978
        my $str = eval { $self->_build_from_pattern($self->{regex_str}) };
220
3939
7825
        if (defined $str && $str =~ /^$regex$/) {
221
3937
4275
                return $str;
222        }
223
224        # If smart approach failed, show warning in debug mode
225
2
4
        if ($ENV{DEBUG_REGEX_GEN} && $@) {
226
0
0
                warn "Smart generation failed: $@";
227        }
228
229        # Fall back to brute force with character set matching
230
2
21
        for (1 .. $max_attempts) {
231
1010
741
                $str = $self->_random_string_smart($length);
232
1010
1691
                return $str if $str =~ /^$regex$/;
233        }
234
235
2
74
        croak "Failed to generate matching string after $max_attempts attempts. Pattern: $self->{regex_str}";
236}
237
238sub _random_string_smart {
239
1010
633
        my ($self, $len) = @_;
240
241
1010
635
        my $regex_str = $self->{regex_str};
242
243        # Detect common patterns and generate appropriate characters
244
1010
530
        my @chars;
245
246
1010
2432
        if ($regex_str =~ /\\d/ || $regex_str =~ /\[0-9\]/ || $regex_str =~ /\[\^[^\]]*[A-Za-z]/) {
247                # Digit patterns
248
0
0
                @chars = ('0'..'9');
249        } elsif ($regex_str =~ /\[A-Z\]/ || $regex_str =~ /\[A-Z[^\]]*\]/) {
250                # Uppercase patterns
251
0
0
                @chars = ('A'..'Z');
252        } elsif ($regex_str =~ /\[a-z\]/ || $regex_str =~ /\[a-z[^\]]*\]/) {
253                # Lowercase patterns
254
0
0
                @chars = ('a'..'z');
255        } elsif ($regex_str =~ /\\w/ || $regex_str =~ /\[a-zA-Z0-9_\]/) {
256                # Word characters
257
0
0
                @chars = ('a'..'z', 'A'..'Z', '0'..'9', '_');
258        } else {
259                # Default to printable ASCII
260
1010
94940
711
68992
                @chars = map { chr($_) } (33 .. 126);
261        }
262
263
1010
707
        my $str = '';
264
1010
2342
        $str .= $chars[int(rand(@chars))] for (1 .. $len);
265
266
1010
2498
        return $str;
267}
268
269 - 274
=head2 generate_smart()

Parses the regex and builds a matching string directly. Faster and more reliable
than brute force, but may not handle all edge cases.

=cut
275
276sub generate_smart {
277
132
5841
        my $self = $_[0];
278
132
170
        return $self->_build_from_pattern($self->{regex_str});
279}
280
281 - 311
=head2 generate_many($count, $unique)

Generates multiple random strings matching the regex.

    my @strings = $gen->generate_many(10);           # 10 strings (may have duplicates)
    my @strings = $gen->generate_many(10, 1);        # 10 unique strings
    my @strings = $gen->generate_many(10, 'unique'); # 10 unique strings

    # Generate until you have 1000 unique codes
    my $gen = Data::Random::String::Matches->new(qr/[A-Z]{3}\d{4}/);
    my @codes = $gen->generate_many(1000, 'unique');

Parameters:

=over 4

=item * C<$count> - Number of strings to generate (required, must be positive)

=item * C<$unique> - If true, ensures all generated strings are unique. May return fewer
than C<$count> strings if uniqueness cannot be achieved within reasonable attempts.
Accepts any true value (1, 'unique', etc.)

=back

Returns: List of generated strings

Dies: If count is not a positive integer

Warns: If unable to generate the requested number of unique strings

=cut
312
313sub generate_many {
314
23
2682
        my ($self, $count, $unique) = @_;
315
316
23
48
        croak 'Count must be a positive integer' unless defined $count && $count > 0;
317
318
20
13
        my @results;
319
320
20
16
        if ($unique) {
321                # Generate unique strings
322
5
6
                my %seen;
323
5
3
                my $attempts = 0;
324
5
5
                my $max_attempts = $count * 100;        # Reasonable limit
325
326
5
10
                while (keys %seen < $count && $attempts < $max_attempts) {
327
1628
1186
                        my $str = $self->generate();
328
1628
1237
                        $seen{$str} = 1;
329
1628
1921
                        $attempts++;
330                }
331
332
5
4
                if (keys %seen < $count) {
333
1
12
                        carp 'Only generated ', (scalar keys %seen), " unique strings out of $count requested";
334                }
335
336
5
296
                @results = keys %seen;
337        } else {
338                # Generate any strings (may have duplicates)
339
15
23
                push @results, $self->generate() for (1 .. $count);
340        }
341
342
20
250
        return @results;
343}
344
345 - 349
=head2 get_seed()

Gets the random seed for reproducible generation

=cut
350
351sub get_seed {
352
2
338
        my $self = shift;
353
354
2
6
        return $self->{seed};
355}
356
357 - 361
=head2 set_seed($seed)

Sets the random seed for reproducible generation

=cut
362
363sub set_seed {
364
13
1512
        my $self = shift;
365
13
19
        my $params = Params::Get::get_params('seed', \@_);
366
12
130
        my $seed = $params->{'seed'};
367
368
12
70
        croak 'Seed must be defined' unless defined $seed;
369
370
11
12
        srand($seed);
371
11
11
        $self->{seed} = $seed;
372
373
11
18
        return $self;
374}
375
376 - 389
=head2 suggest_simpler_pattern()

Analyzes patterns and suggests improvements.

  my $suggestion = $gen->suggest_simpler_pattern();

  if ($suggestion) {
    print "Reason: $suggestion->{reason}\n";
    print "Better pattern: $suggestion->{pattern}\n" if $suggestion->{pattern};
    print "Tips:\n";
    print "  - $_\n" for @{$suggestion->{tips}};
  }

=cut
390
391sub suggest_simpler_pattern {
392
21
36
        my $self = $_[0];
393
394
21
16
        my $pattern = $self->{regex_str};
395
21
25
        my $info = $self->pattern_info();
396
397        # Check for patterns that are too complex
398
21
21
        if ($info->{complexity} eq 'very_complex') {
399                return {
400
1
6
                        pattern => undef,
401                        reason  => 'Pattern is very complex. Consider breaking it into multiple simpler patterns.',
402                        tips    => [
403                                'Split alternations into separate generators',
404                                'Avoid deeply nested groups',
405                                'Use fixed-length patterns when possible',
406                        ],
407                };
408        }
409
410        # Suggest removing unnecessary backreferences
411
20
21
        if ($info->{features}{has_backreferences} && $pattern =~ /(\(\w+\)).*\\\d+/) {
412
0
0
                my $simpler = $pattern;
413                # Can't automatically simplify backreferences, but can suggest
414                return {
415
0
0
                        pattern => undef,
416                        reason  => 'Backreferences add complexity. Consider if you really need repeated groups.',
417                        tips    => [
418                                'If the repeated part doesn\'t need to match, use two separate patterns',
419                                'For validation, backreferences are great; for generation, they limit variation',
420                        ],
421                };
422        }
423
424        # Suggest fixed quantifiers instead of ranges
425
20
24
        if ($pattern =~ /\{(\d+),(\d+)\}/) {
426
8
12
                my ($min, $max) = ($1, $2);
427
8
10
                if ($max - $min > 10) {
428
6
4
                        my $mid = int(($min + $max) / 2);
429
6
5
                        my $simpler = $pattern;
430
6
15
                        $simpler =~ s/\{\d+,\d+\}/\{$mid\}/;
431                        return {
432
6
17
                                pattern => $simpler,
433                                reason  => "Large quantifier range {$min,$max} creates high variability. Consider fixed length {$mid}.",
434                                tips    => [
435                                        'Fixed lengths are faster to generate',
436                                        'If you need variety, generate multiple patterns with different fixed lengths',
437                                ],
438                        };
439                }
440        }
441
442        # Suggest limiting alternations
443
14
16
        if ($info->{features}{has_alternation}) {
444
2
4
                my @alts = split /\|/, $pattern;
445
2
3
                if (@alts > 10) {
446                        return {
447
1
4
                                pattern => undef,
448                                reason  => 'Too many alternations (' . scalar(@alts) . '). Consider splitting into multiple patterns.',
449                                tips    => [
450                                        'Create separate generators for different alternatives',
451                                        'Group similar patterns together',
452                                        'Use character classes [abc] instead of (a|b|c)',
453                                ],
454                        };
455                }
456
457                # Check if alternations could be a character class
458
1
2
                if ($pattern =~ /\(([a-zA-Z])\|([a-zA-Z])\|([a-zA-Z])\)/) {
459
1
2
                        my $chars = join('', $1, $2, $3);
460
1
1
                        my $simpler = $pattern;
461
1
4
                        $simpler =~ s/\([a-zA-Z]\|[a-zA-Z]\|[a-zA-Z]\)/[$chars]/;
462                        return {
463
1
3
                                pattern => $simpler,
464                                reason  => 'Single-character alternations can be simplified to character classes.',
465                                tips    => [
466                                        'Use [abc] instead of (a|b|c)',
467                                        'Character classes are faster to process',
468                                ],
469                        };
470                }
471        }
472
473        # Suggest removing lookaheads/lookbehinds for generation
474
12
19
        if ($info->{features}{has_lookahead} || $info->{features}{has_lookbehind}) {
475
3
3
                my $simpler = $pattern;
476
3
6
                $simpler =~ s/\(\?[=!].*?\)//g;   # Remove lookaheads
477
3
5
                $simpler =~ s/\(\?<[=!].*?\)//g;  # Remove lookbehinds
478
479
3
4
                if ($simpler ne $pattern) {
480                        return {
481
3
7
                                pattern => $simpler,
482                                reason  => 'Lookaheads/lookbehinds add complexity but don\'t contribute to generated strings.',
483                                tips    => [
484                                        'Lookaheads are great for validation, not generation',
485                                        'The simplified pattern generates the same strings',
486                                ],
487                        };
488                }
489        }
490
491        # Check for Unicode when ASCII would work
492
9
13
        if ($info->{features}{has_unicode} && $pattern =~ /\\p\{L\}/) {
493
2
2
                my $simpler = $pattern;
494
2
4
                $simpler =~ s/\\p\{L\}/[A-Za-z]/g;
495                return {
496
2
6
                        pattern => $simpler,
497                        reason  => 'Unicode \\p{L} can be simplified to [A-Za-z] if you only need ASCII letters.',
498                        tips    => [
499                                'ASCII patterns are faster',
500                                'Only use Unicode if you need non-ASCII characters',
501                        ],
502                };
503        }
504
505        # Check for overly long fixed strings
506
7
7
        if ($pattern =~ /([a-zA-Z]{20,})/) {
507                return {
508
0
0
                        pattern => undef,
509                        reason  => 'Pattern contains very long fixed literal strings. Consider if you need such specific patterns.',
510                        tips    => [
511                                'Use variables instead of long literals',
512                                'Break into smaller patterns',
513                        ],
514                };
515        }
516
517        # Pattern seems reasonable
518
7
6
        return undef;
519}
520
521 - 529
=head2 validate($string)

Checks if a string matches the pattern without generating.

  if ($gen->validate('1234')) {
    print "Valid!\n";
  }

=cut
530
531sub validate {
532
61
1800
        my $self = shift;
533
61
78
        my $params = Params::Get::get_params('string', \@_);
534
60
578
        my $string = $params->{'string'};
535
536
60
96
        croak('String must be defined') unless defined $string;
537
538
59
51
        my $regex = $self->{regex};
539
59
311
        return $string =~ /^$regex$/;
540}
541
542 - 558
=head2 pattern_info()

Returns detailed information about the pattern.

  my $info = $gen->pattern_info();
  print "Complexity: $info->{complexity}\n";
  print "Min length: $info->{min_length}\n";
  print "Has Unicode: ", $info->{features}{has_unicode} ? "Yes" : "No", "\n";

C<pattern_info> analyzes a regular expression to produce a structured summary of its characteristics,
including estimated string lengths, detected features, and an overall complexity rating.
It first calls C<_estimate_length> to heuristically compute the minimum and maximum possible lengths of strings matching the pattern by scanning for literals,
character classes, and quantifiers.
It then detects the presence of advanced regex constructions such as alternation, lookahead or lookbehind assertions, named groups, and Unicode properties, storing them in a feature hash.
Finally, it calculates a rough "complexity" classification based on pattern length and detected features-returning a hash reference that describes the regex's structure, estimated lengths, and complexity level.

=cut
559
560sub pattern_info {
561
40
1907
        my $self = $_[0];
562
563
40
41
        return $self->{'_pattern_info_cache'} if $self->{'_pattern_info_cache'};
564
565
37
23
        my $pattern = $self->{'regex_str'};
566
567        # Calculate approximate min/max lengths
568
37
38
        my ($min_len, $max_len) = $self->_estimate_length($pattern);
569
570        # Detect pattern features
571
37
257
        my %features = (
572                has_alternation     => ($pattern =~ /\|/ ? 1 : 0),
573                has_backreferences  => ($pattern =~ /(\\[1-9]|\\k<)/ ? 1 : 0),
574                has_unicode         => ($pattern =~ /\\p\{/ ? 1 : 0),
575                has_lookahead       => ($pattern =~ /\(\?[=!]/ ? 1 : 0),
576                has_lookbehind      => ($pattern =~ /\(\?<[=!]/ ? 1 : 0),
577                has_named_groups    => ($pattern =~ /\(\?</ ? 1 : 0),
578                has_possessive      => ($pattern =~ /(?:[+*?]\+|\{\d+(?:,\d*)?\}\+)/ ? 1 : 0),
579        );
580
581
37
77
        my $info = {
582                pattern             => $pattern,
583                min_length          => $min_len,
584                max_length          => $max_len,
585                estimated_length    => int(($min_len + $max_len) / 2),
586                features            => \%features,
587                complexity          => $self->_calculate_complexity(\%features, $pattern),
588        };
589
590
37
37
        $self->{'_pattern_info_cache'} = $info;
591
592
37
41
        return $info;
593}
594
595sub _estimate_length {
596
37
34
        my ($self, $pattern) = @_;
597
598        # Remove anchors and modifiers
599
37
107
        $pattern =~ s/^\(\?\^?[iumsx-]*:(.*)\)$/$1/;
600
37
38
        $pattern =~ s/^\^//;
601
37
22
        $pattern =~ s/\$//;
602
603
37
25
        my $min = 0;
604
37
23
        my $max = 0;
605
606        # Simple heuristic - count fixed characters and quantifiers
607
37
22
        my $last_was_atom = 0;  # Handle cases like \d{3} where the quantifier modifies the atom count
608
37
74
        while ($pattern =~ /([^+*?{}\[\]\\])|\\[dwsWDN]|\[([^\]]+)\]|\{(\d+)(?:,(\d+))?\}/g) {
609
221
300
                if (defined $1 || (defined $2 && $2)) {
610
155
91
                        $min++;
611
155
89
                        $max++;
612
155
162
                        $last_was_atom = 1;
613                } elsif (defined $3) {
614
36
35
                        if ($last_was_atom) {
615                                # Replace the last atom’s contribution
616
16
17
                                $min += $3 - 1;
617
16
20
                                $max += (defined $4 ? $4 : $3) - 1;
618
16
19
                                $last_was_atom = 0;
619                        } else {
620                                # No preceding atom? assume standalone
621
20
29
                                $min += $3;
622
20
35
                                $max += defined $4 ? $4 : $3;
623                        }
624                }
625        }
626
627        # Account for +, *, ?
628
37
32
        my $plus_count = () = $pattern =~ /\+/g;
629
37
35
        my $star_count = () = $pattern =~ /\*/g;
630
37
26
        my $question_count = () = $pattern =~ /\?/g;
631
632
37
24
        $min += $plus_count;  # + means at least 1
633
37
35
        $max += ($plus_count * 5) + ($star_count * 5);  # Assume max 5 repetitions
634
37
21
        $min -= $question_count;  # ? makes things optional
635
636
37
36
        $min = 0 if $min < 0;
637
37
26
        $max = $min + 50 if $max < $min;  # Ensure max >= min
638
639
37
46
        return ($min, $max);
640}
641
642sub _calculate_complexity {
643
37
32
        my ($self, $features, $pattern) = @_;
644
645
37
24
        my $score = 0;
646
647        # Base complexity from pattern length
648
37
37
        $score += length($pattern) / 10;
649
650        # Add complexity for features
651
37
53
        $score += 2 if $features->{has_alternation};
652
37
34
        $score += 3 if $features->{has_backreferences};
653
37
29
        $score += 2 if $features->{has_unicode};
654
37
34
        $score += 2 if $features->{has_lookahead};
655
37
29
        $score += 2 if $features->{has_lookbehind};
656
37
33
        $score += 1 if $features->{has_named_groups};
657
37
24
        $score += 1 if $features->{has_possessive};
658
659        # Classify
660
37
61
        return 'simple'   if $score < 3;
661
14
30
        return 'moderate' if $score < 7;
662
3
6
        return 'complex'  if $score < 12;
663
1
2
        return 'very_complex';
664}
665
666sub _build_from_pattern {
667
4070
2780
        my ($self, $pattern) = @_;
668
669        # Reset backreferences for each generation
670
4070
2876
        $self->{backrefs} = {};
671
4070
2764
        $self->{named_refs} = {};
672
4070
2655
        $self->{group_counter} = 0;
673
674        # Remove regex delimiters and modifiers
675        # Handle (?^:...), (?i:...), (?-i:...) etc
676
4070
5248
        $pattern =~ s/^\(\?\^?[iumsx-]*:(.*)\)$/$1/;
677
678        # Remove anchors (they're handled by the regex match itself)
679
4070
2756
        $pattern =~ s/^\^//;
680
4070
2597
        $pattern =~ s/\$//;
681
682
4070
3180
        return $self->_parse_sequence($pattern);
683}
684
685sub _parse_sequence {
686
4212
2847
        my ($self, $pattern) = @_;
687
688
4212
2541
        my $result = '';
689
4212
2405
        my $i = 0;
690
4212
2449
        my $len = length($pattern);
691
692
4212
3141
        while ($i < $len) {
693
5649
3696
                my $char = substr($pattern, $i, 1);
694
695
5649
5692
                if ($char eq '\\') {
696                        # Escape sequence
697
940
584
                        $i++;
698
940
596
                        my $next = substr($pattern, $i, 1);
699
700
940
1495
                        if ($next =~ /[1-9]/) {
701                                # Backreference
702
30
17
                                my $ref_num = $next;
703
30
26
                                if (exists $self->{backrefs}{$ref_num}) {
704
30
28
                                        $result .= $self->{backrefs}{$ref_num};
705                                } else {
706
0
0
                                        croak "Backreference \\$ref_num used before group defined";
707                                }
708                        } elsif ($next eq 'k' && substr($pattern, $i+1, 1) eq '<') {
709                                # Named backreference \k<name>
710
7
4
                                my $end = index($pattern, '>', $i+2);
711
7
6
                                my $name = substr($pattern, $i+2, $end-$i-2);
712
7
8
                                if (exists $self->{named_refs}{$name}) {
713
7
7
                                        $result .= $self->{named_refs}{$name};
714                                } else {
715
0
0
                                        croak "Named backreference \\k<$name> used before group defined";
716                                }
717
7
5
                                $i = $end;
718                        } elsif ($next eq 'p' && substr($pattern, $i+1, 1) eq '{') {
719                                # Unicode property \p{L}, \p{N}, etc.
720
25
20
                                my $end = index($pattern, '}', $i+2);
721
25
23
                                my $prop = substr($pattern, $i+2, $end-$i-2);
722                                my ($generated, $new_i) = $self->_handle_quantifier($pattern, $end, sub {
723
95
73
                                        $self->_unicode_property_char($prop);
724
25
40
                                });
725
25
34
                                $result .= $generated;
726
25
17
                                $i = $new_i;
727                        } elsif ($next eq 'd') {
728
730
2297
904
2260
                                my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub { int(rand(10)) }, 1);
729
730
716
                                $result .= $generated;
730
730
459
                                $i = $new_i;
731                        } elsif ($next eq 'w') {
732                                my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub {
733
115
434
                                        my @chars = ('a'..'z', 'A'..'Z', '0'..'9', '_');
734
115
304
                                        $chars[int(rand(@chars))];
735
35
68
                                }, 1);
736
35
48
                                $result .= $generated;
737
35
29
                                $i = $new_i;
738                        } elsif ($next eq 's') {
739
2
2
5
2
                                my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub { ' ' }, 1);
740
2
4
                                $result .= $generated;
741
2
2
                                $i = $new_i;
742                        } elsif ($next eq 'D') {
743                                my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub {
744
6
504
564
8
335
439
                                        my @chars = map { chr($_) } grep { chr($_) !~ /\d/ } (33..126);
745
6
26
                                        $chars[int(rand(@chars))];
746
2
4
                                });
747
2
5
                                $result .= $generated;
748
2
2
                                $i = $new_i;
749                        } elsif ($next eq 'W') {
750                                my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub {
751
6
186
564
7
124
481
                                        my @chars = map { chr($_) } grep { chr($_) !~ /\w/ } (33..126);
752
6
21
                                        $chars[int(rand(@chars))];
753
2
6
                                });
754
2
5
                                $result .= $generated;
755
2
2
                                $i = $new_i;
756                        } elsif ($next eq 't') {
757
2
2
                                $result .= "\t";
758                        } elsif ($next eq 'n') {
759
2
2
                                $result .= "\n";
760                        } elsif ($next eq 'r') {
761
0
0
                                $result .= "\r";
762                        } else {
763
103
65
                                $result .= $next;
764                        }
765
940
786
                        $i++;
766                } elsif ($char eq '[') {
767                        # Character class
768
3750
2965
                        my $end = $self->_find_matching_bracket($pattern, $i);
769
3750
3006
                        croak 'Unmatched [' if $end == -1;
770
771
3750
2639
                        my $class = substr($pattern, $i+1, $end-$i-1);
772                        my ($generated, $new_i) = $self->_handle_quantifier($pattern, $end, sub {
773
16988
12694
                                $self->_random_from_class($class);
774
3750
4370
                        }, 1);
775
3750
3780
                        $result .= $generated;
776
3750
3227
                        $i = $new_i + 1;
777                } elsif ($char eq '(') {
778                        # Group - could be various types
779
151
143
                        my $end = $self->_find_matching_paren($pattern, $i);
780
151
132
                        croak 'Unmatched (' if $end == -1;
781
782
151
147
                        my $group_content = substr($pattern, $i+1, $end-$i-1);
783
784                        # Check for special group types
785
151
83
                        my $is_capturing = 1;
786
151
91
                        my $is_lookahead = 0;
787
151
79
                        my $is_lookbehind = 0;
788
151
88
                        my $is_negative = 0;
789
151
105
                        my $group_name = undef;
790
791
151
260
                        if ($group_content =~ /^\?:/) {
792                                # Non-capturing group
793
2
1
                                $is_capturing = 0;
794
2
3
                                $group_content = substr($group_content, 2);
795                        } elsif ($group_content =~ /^\?<([^>]+)>/) {
796                                # Named capture (?<name>...)
797
15
11
                                $group_name = $1;
798
15
17
                                $group_content = substr($group_content, length($1) + 3);
799                        } elsif ($group_content =~ /^\?=/) {
800                                # Positive lookahead (?=...)
801
6
5
                                $is_lookahead = 1;
802
6
3
                                $is_capturing = 0;
803
6
6
                                $group_content = substr($group_content, 2);
804                        } elsif ($group_content =~ /^\?!/) {
805                                # Negative lookahead (?!...)
806
1
0
                                $is_lookahead = 1;
807
1
1
                                $is_negative = 1;
808
1
1
                                $is_capturing = 0;
809
1
1
                                $group_content = substr($group_content, 2);
810                        } elsif ($group_content =~ /^\?<=/) {
811                                # Positive lookbehind (?<=...)
812
1
1
                                $is_lookbehind = 1;
813
1
0
                                $is_capturing = 0;
814
1
1
                                $group_content = substr($group_content, 3);
815                        } elsif ($group_content =~ /^\?<!/) {
816                                # Negative lookbehind (?<!...)
817
1
1
                                $is_lookbehind = 1;
818
1
1
                                $is_negative = 1;
819
1
1
                                $is_capturing = 0;
820
1
1
                                $group_content = substr($group_content, 3);
821                        }
822
823                        # Handle lookaheads/lookbehinds
824
151
134
                        if ($is_lookahead) {
825                                # For positive lookahead, generate the pattern but don't advance
826                                # For negative lookahead, avoid the pattern
827
7
6
                                if (!$is_negative) {
828                                        # Generate what the lookahead expects but don't consume it
829                                        # This is a simplification - we just note the constraint
830                                }
831                                # Lookaheads don't add to the result
832
7
7
                                $i = $end + 1;
833
7
8
                                next;
834                        } elsif ($is_lookbehind) {
835                                # Lookbehinds check what came before
836                                # For generation, we can mostly ignore them
837
2
2
                                $i = $end + 1;
838
2
2
                                next;
839                        }
840
841                        # Check for alternation
842
142
93
                        my $generated;
843
142
126
                        if ($group_content =~ /\|/) {
844
88
100
                                $generated = $self->_handle_alternation($group_content);
845                        } else {
846
54
69
                                $generated = $self->_parse_sequence($group_content);
847                        }
848
849                        # Store backreference if capturing
850
142
129
                        if ($is_capturing) {
851
140
127
                                $self->{group_counter}++;
852
140
152
                                $self->{backrefs}{$self->{group_counter}} = $generated;
853
854
140
123
                                if (defined $group_name) {
855
15
15
                                        $self->{named_refs}{$group_name} = $generated;
856                                }
857                        }
858
859                        # Handle quantifier after group (including possessive)
860
142
151
172
159
                        my ($final_generated, $new_i) = $self->_handle_quantifier($pattern, $end, sub { $generated }, 1);
861
142
138
                        $result .= $final_generated;
862
142
150
                        $i = $new_i + 1;
863                } elsif ($char eq '.') {
864                        # Any character (except newline)
865                        my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub {
866
2
188
3
140
                                my @chars = map { chr($_) } (33 .. 126);
867
2
9
                                $chars[int(rand(@chars))];
868
2
4
                        });
869
2
4
                        $result .= $generated;
870
2
2
                        $i = $new_i + 1;
871                } elsif ($char eq '|') {
872                        # Alternation at top level - just return what we have
873                        # (This is handled by _handle_alternation for groups)
874
0
0
                        last;
875                } elsif ($char =~ /[+*?]/ || $char eq '{') {
876                        # Quantifier without preceding element - shouldn't happen in valid regex
877
1
90
                        croak "$pattern: Quantifier '$char' without preceding element";
878                } elsif ($char =~ /[\w ]/) {
879                        # Literal character
880
736
737
840
673
                        my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub { $char });
881
736
733
                        $result .= $generated;
882
736
662
                        $i = $new_i + 1;
883                } else {
884                        # Other literal characters
885
69
53
                        $result .= $char;
886
69
57
                        $i++;
887                }
888        }
889
890
4211
3761
        return $result;
891}
892
893sub _handle_quantifier {
894
5426
4054
        my ($self, $pattern, $pos, $generator, $check_possessive) = @_;
895
5426
4367
        $check_possessive //= 1;  # Default to checking for possessive
896
897
5426
3601
        my $next = substr($pattern, $pos + 1, 1);
898
5426
2995
        my $is_possessive = 0;
899
900        # Check for possessive quantifier (+)
901
5426
5683
        if ($check_possessive && $pos + 2 < length($pattern)) {
902
3936
2393
                my $after_next = substr($pattern, $pos + 2, 1);
903
3936
4837
                if (($next =~ /[+*?]/ || $next eq '}') && $after_next eq '+') {
904
6
6
                        $is_possessive = 1;
905                }
906        }
907
908
5426
4474
        if ($next eq '{') {
909
3458
2342
                my $end = index($pattern, '}', $pos + 2);
910
3458
2384
                croak "Unmatched '{' at position $pos in pattern: $pattern" if ($end == -1);
911
3458
2495
                my $quant = substr($pattern, $pos + 2, $end - $pos - 2);
912
913                # Check for possessive after }
914
3458
4132
                if ($check_possessive && $end + 1 < length($pattern) && substr($pattern, $end + 1, 1) eq '+') {
915
2
0
                        $is_possessive = 1;
916
2
2
                        $end++;
917                }
918
919
3458
2066
                my $result = '';
920
3458
3238
                if ($quant =~ /^(\d+)$/) {
921                        # Exact: {n}
922
3399
3815
                        $result .= $generator->() for (1 .. $1);
923                } elsif ($quant =~ /^(\d+),(\d+)$/) {
924                        # Range: {n,m}
925
59
76
                        my $count = $1 + int(rand($2 - $1 + 1));
926
59
62
                        $result .= $generator->() for (1 .. $count);
927                } elsif ($quant =~ /^(\d+),$/) {
928                        # Minimum: {n,}
929
0
0
                        my $count = $1 + int(rand(5));
930
0
0
                        $result .= $generator->() for (1 .. $count);
931                }
932
3458
3355
                return ($result, $end);
933        } elsif ($next eq '+') {
934                # One or more (possessive: ++)
935
11
11
                my $actual_end = $pos + 1;
936
11
20
                if ($is_possessive) {
937
4
4
                        $actual_end++;
938                }
939
11
31
                my $count = 1 + int(rand(5));
940
11
9
                my $result = '';
941
11
17
                $result .= $generator->() for (1 .. $count);
942
11
14
                return ($result, $actual_end);
943        } elsif ($next eq '*') {
944                # Zero or more (possessive: *+)
945
3
3
                my $actual_end = $pos + 1;
946
3
4
                if ($is_possessive) {
947
1
0
                        $actual_end++;
948                }
949
3
6
                my $count = int(rand(6));
950
3
9
                my $result = '';
951
3
4
                $result .= $generator->() for (1 .. $count);
952
3
4
                return ($result, $actual_end);
953        } elsif ($next eq '?') {
954                # Zero or one (possessive: ?+)
955
4
4
                my $actual_end = $pos + 1;
956
4
5
                if ($is_possessive) {
957
1
1
                        $actual_end++;
958                }
959
4
9
                my $result = rand() < 0.5 ? $generator->() : '';
960
4
4
                return ($result, $actual_end);
961        } else {
962                # No quantifier
963
1950
1354
                return ($generator->(), $pos);
964        }
965}
966
967sub _handle_alternation {
968
88
85
        my ($self, $pattern) = @_;
969
970        # Split on | but respect groups
971
88
55
        my @alternatives;
972
88
49
        my $current = '';
973
88
50
        my $depth = 0;
974
975
88
124
        for my $char (split //, $pattern) {
976
1008
974
                if ($char eq '(') {
977
4
4
                        $depth++;
978
4
3
                        $current .= $char;
979                } elsif ($char eq ')') {
980
4
4
                        $depth--;
981
4
2
                        $current .= $char;
982                } elsif ($char eq '|' && $depth == 0) {
983
162
123
                        push @alternatives, $current;
984
162
97
                        $current = '';
985                } else {
986
838
504
                        $current .= $char;
987                }
988        }
989
88
108
        push @alternatives, $current if length($current);
990
991        # Choose one alternative randomly
992
88
134
        my $chosen = $alternatives[int(rand(@alternatives))];
993
88
115
        return $self->_parse_sequence($chosen);
994}
995
996sub _find_matching_bracket {
997
3750
2619
        my ($self, $pattern, $start) = @_;
998
999
3750
2161
        my $depth = 0;
1000
3750
3019
        for (my $i = $start; $i < length($pattern); $i++) {
1001
23803
14552
                my $char = substr($pattern, $i, 1);
1002
23803
28542
                if ($char eq '[' && ($i == $start || substr($pattern, $i-1, 1) ne '\\')) {
1003
3750
2911
                        $depth++;
1004                } elsif ($char eq ']' && substr($pattern, $i-1, 1) ne '\\') {
1005
3750
2248
                        $depth--;
1006
3750
3458
                        return $i if $depth == 0;
1007                }
1008        }
1009
0
0
        return -1;
1010}
1011
1012sub _find_matching_paren {
1013
151
121
        my ($self, $pattern, $start) = @_;
1014
1015
151
90
        my $depth = 0;
1016
151
144
        for (my $i = $start; $i < length($pattern); $i++) {
1017
1750
1085
                my $char = substr($pattern, $i, 1);
1018
1750
1259
                my $prev = $i > 0 ? substr($pattern, $i-1, 1) : '';
1019
1020
1750
2141
                if ($char eq '(' && $prev ne '\\') {
1021
156
140
                        $depth++;
1022                } elsif ($char eq ')' && $prev ne '\\') {
1023
156
113
                        $depth--;
1024
156
168
                        return $i if $depth == 0;
1025                }
1026        }
1027
0
0
        return -1;
1028}
1029
1030sub _random_from_class {
1031
16988
10924
        my ($self, $class) = @_;
1032
1033
16988
9883
        my @chars;
1034
1035        # Handle negation
1036
16988
9920
        my $negate = 0;
1037
16988
13129
        if (substr($class, 0, 1) eq '^') {
1038
10
6
                $negate = 1;
1039
10
9
                $class = substr($class, 1);
1040        }
1041
1042        # Parse character class with escape sequences
1043
16988
9585
        my $i = 0;
1044
16988
12000
        while ($i < length($class)) {
1045
32570
20509
                my $char = substr($class, $i, 1);
1046
1047
32570
35656
                if ($char eq '\\') {
1048
32
18
                        $i++;
1049
32
20
                        my $next = substr($class, $i, 1);
1050
32
33
                        if ($next eq 'd') {
1051
16
19
                                push @chars, ('0'..'9');
1052                        } elsif ($next eq 'w') {
1053
10
39
                                push @chars, ('a'..'z', 'A'..'Z', '0'..'9', '_');
1054                        } elsif ($next eq 's') {
1055
0
0
                                push @chars, (' ', "\t", "\n");
1056                        } elsif ($next eq 'p' && substr($class, $i+1, 1) eq '{') {
1057                                # Unicode property in character class
1058
6
3
                                my $end = index($class, '}', $i+2);
1059
6
4
                                my $prop = substr($class, $i+2, $end-$i-2);
1060
6
6
                                push @chars, $self->_unicode_property_chars($prop);
1061
6
11
                                $i = $end;
1062                        } else {
1063
0
0
                                push @chars, $next;
1064                        }
1065                } elsif ($i + 2 < length($class) && substr($class, $i+1, 1) eq '-') {
1066                        # Range
1067
30268
19155
                        my $end = substr($class, $i+2, 1);
1068
30268
64253
                        push @chars, ($char .. $end);
1069
30268
24314
                        $i += 2;        # Will be incremented again by loop, total +3
1070                } else {
1071
2270
1480
                        push @chars, $char;
1072                }
1073
32570
24450
                $i++;
1074        }
1075
1076
16988
11529
        if ($negate) {
1077
10
100
9
78
                my %excluded = map { $_ => 1 } @chars;
1078
10
940
940
11
672
698
                @chars = grep { !$excluded{$_} } map { chr($_) } (33 .. 126);
1079        }
1080
1081
16988
29954
        return @chars ? $chars[int(rand(@chars))] : 'X';
1082}
1083
1084sub _unicode_property_char {
1085
95
68
        my ($self, $prop) = @_;
1086
95
78
        my @chars = $self->_unicode_property_chars($prop);
1087
95
354
        return @chars ? $chars[int(rand(@chars))] : 'X';
1088}
1089
1090sub _unicode_property_chars {
1091
101
63
        my ($self, $prop) = @_;
1092
1093        # Common Unicode properties
1094
101
113
        if ($prop eq 'L' || $prop eq 'Letter') {
1095                # Letters, skip × and ÷ which are symbols
1096
88
5456
367
4082
                return ('a' .. 'z', 'A' .. 'Z', map { chr($_) } ((ord'À')..ord('Ö'), ord('Ø')..ord('ö'), ord('ø')..ord('ÿ')));
1097        } elsif ($prop eq 'N' || $prop eq 'Number') {
1098                # Numbers
1099                # return ('0' .. '9', map { chr($_) } (ord('â‘ ').. ord('⑳')));
1100
3
5
                return ('0' .. '9');
1101        } elsif ($prop eq 'Lu' || $prop eq 'Uppercase_Letter') {
1102                # Uppercase letters, skip × which is not a letter
1103
6
180
19
167
                return ('A' .. 'Z', map { chr($_) } (ord('À') .. ord('Ö'), ord('Ø') .. ord('Þ')));
1104        } elsif ($prop eq 'Ll' || $prop eq 'Lowercase_Letter') {
1105                # Lowercase letters, skip ÷ which is not a letter
1106
4
124
10
95
                return ('a' .. 'z', map { chr($_) } (ord('à') .. ord('ö'), ord('ø') .. ord('ÿ')));
1107        } elsif ($prop eq 'P' || $prop eq 'Punctuation') {
1108                # Punctuation
1109
0
0
                return ('.', ',', '!', '?', ';', ':', '-', '—', '…');
1110        } elsif ($prop eq 'S' || $prop eq 'Symbol') {
1111                # Symbols
1112
0
0
                return ('$', '€', '£', 'Â¥', '©', '®', 'â„¢', '°', '±', '×', '÷');
1113        } elsif ($prop eq 'Z' || $prop eq 'Separator') {
1114                # Separators
1115
0
0
                return (' ', "\t", "\n");
1116        } elsif ($prop eq 'Nd' || $prop eq 'Decimal_Number') {
1117                # Decimal numbers
1118
0
0
                return ('0'..'9');
1119        } else {
1120                # Unknown property - return letters as default
1121
0
0
                return ('a'..'z', 'A'..'Z');
1122        }
1123}
1124
1125 - 1131
=head2 create_random_string

For consistency with L<Data::Random::String>.

  print Data::Random::String::Matches->create_random_string(length => 3, regex => '\d{3}'), "\n";

=cut
1132
1133sub create_random_string
1134{
1135
2
893
        my $class = shift;
1136
2
5
        my $params = Params::Get::get_params(undef, @_);
1137
1138
2
25
        my $regex = $params->{'regex'};
1139
2
2
        my $length = $params->{'length'};
1140
1141
2
4
        return $class->new($regex, $length)->generate();
1142}
1143
1144 - 1178
=head1 AUTHOR

Nigel Horne, C<< <njh at nigelhorne.com> >>

=head1 SEE ALSO

=over 4

=item * Test coverage report: L<https://nigelhorne.github.io/Data-Random-String-Matches/coverage/>

=item * L<String::Random>

=item * L<Regexp::Genex>

=back

=head1 LICENCE AND COPYRIGHT

Copyright 2025 Nigel Horne.

Usage is subject to licence terms.

The licence terms of this software are as follows:

=over 4

=item * Personal single user, single computer use: GPL2

=item * All other users (including Commercial, Charity, Educational, Government)
  must apply in writing for a licence for use from Nigel Horne at the
  above e-mail.

=back

=cut
1179
11801;