File Coverage

File:blib/lib/Data/Random/String/Matches.pm
Coverage:87.1%

linestmtbrancondsubtimecode
1package Data::Random::String::Matches;
2
3
32
32
934207
51
use 5.014;
4
5
32
32
32
56
23
271
use strict;
6
32
32
32
50
26
615
use warnings;
7
8
32
32
32
53
29
715
use Carp qw(carp croak);
9
32
32
32
5540
157437
575
use Params::Get;
10
32
32
32
5699
2784
65
use utf8;
11
12our $VERSION = '0.04';
13
14 - 182
=head1 NAME

Data::Random::String::Matches - Generate random strings matching a regex

=head1 SYNOPSIS

        use Data::Random::String::Matches;

        # Create a generator with regex and optional length
        my $gen = Data::Random::String::Matches->new(qr/[A-Z]{3}\d{4}/, 7);

        # Generate a matching string
        my $str = $gen->generate();
        print $str;  # e.g., "XYZ1234"

        # Alternation
        my $gen2 = Data::Random::String::Matches->new(qr/(cat|dog|bird)/);
        my $animal = $gen2->generate_smart();  # "cat", "dog", or "bird"

        # Backreferences
        my $gen3 = Data::Random::String::Matches->new(qr/(\w{3})-\1/);
        my $str3 = $gen3->generate_smart();  # e.g., "abc-abc"

        # Groups and quantifiers
        my $gen4 = Data::Random::String::Matches->new(qr/(ha){2,4}/);
        my $laugh = $gen4->generate_smart();  # "haha", "hahaha", or "hahahaha"

        # Unicode
        $gen = Data::Random::String::Matches->new(qr/\p{L}{5}/);

        # Named captures
        $gen = Data::Random::String::Matches->new(qr/(?<year>\d{4})-\k<year>/);

        # Possessive
        $gen = Data::Random::String::Matches->new(qr/\d++[A-Z]/);

        # Lookaheads
        $gen = Data::Random::String::Matches->new(qr/\d{3}(?=[A-Z])/);

        # Combined
        $gen = Data::Random::String::Matches->new(
                qr/(?<prefix>\p{Lu}{2})\d++\k<prefix>(?=[A-Z])/
        );

        # Consistency with Legacy software
        print Data::Random::String::Matches->create_random_string(length => 3, regex => '\d{3}'), "\n";

=head1 DESCRIPTION

This module generates random strings that match a given regular expression pattern.
It parses the regex pattern and intelligently builds matching strings, supporting
a wide range of regex features.

=head1 SUPPORTED REGEX FEATURES

=head2 Character Classes

=over 4

=item * Basic classes: C<[a-z]>, C<[A-Z]>, C<[0-9]>, C<[abc]>

=item * Negated classes: C<[^a-z]>

=item * Ranges: C<[a-zA-Z0-9]>

=item * Escape sequences in classes: C<[\d\w]>

=back

=head2 Escape Sequences

=over 4

=item * C<\d> - digit [0-9]

=item * C<\w> - word character [a-zA-Z0-9_]

=item * C<\s> - whitespace

=item * C<\D> - non-digit

=item * C<\W> - non-word character

=item * C<\t>, C<\n>, C<\r> - tab, newline, carriage return

=back

=head2 Quantifiers

=over 4

=item * C<{n}> - exactly n times

=item * C<{n,m}> - between n and m times

=item * C<{n,}> - n or more times

=item * C<+> - one or more (1-5 times)

=item * C<*> - zero or more (0-5 times)

=item * C<?> - zero or one

=back

=head2 Grouping and Alternation

=over 4

=item * C<(...)> - capturing group

=item * C<(?:...)> - non-capturing group

=item * C<|> - alternation (e.g., C<cat|dog|bird>)

=item * C<\1>, C<\2>, etc. - backreferences

=back

=head2 Other

=over 4

=item * C<.> - any character (printable ASCII)

=item * Literal characters

=item * C<^> and C<$> anchors (stripped during parsing)

=back

=head1 LIMITATIONS

=over 4

=item * Lookaheads and lookbehinds ((?=...), (?!...)) are not supported

=item * Named groups ((?<name>...)) are not supported

=item * Possessive quantifiers (*+, ++) are not supported

=item * Unicode properties (\p{L}, \p{N}) are not supported

=item * Some complex nested patterns may not work correctly with smart parsing

=back

=head1 EXAMPLES

        # Email-like pattern
        my $gen = Data::Random::String::Matches->new(qr/[a-z]+@[a-z]+\.com/);

        # API key pattern
        my $gen = Data::Random::String::Matches->new(qr/^AIza[0-9A-Za-z_-]{35}$/);

        # Phone number
        my $gen = Data::Random::String::Matches->new(qr/\d{3}-\d{3}-\d{4}/);

        # Repeated pattern
        my $gen = Data::Random::String::Matches->new(qr/(\w{4})-\1/);

=head1 METHODS

=head2 new($regex, $length)

Creates a new generator. C<$regex> can be a compiled regex (qr//) or a string.
C<$length> is optional and defaults to 10 (used for fallback generation).

=cut
183
184sub new {
185
215
512314
        my ($class, $regex, $length) = @_;
186
187
215
280
        croak 'Regex pattern is required' unless defined $regex;
188
189        # Convert string to regex if needed
190
214
390
        my $regex_obj = ref($regex) eq 'Regexp' ? $regex : qr/$regex/;
191
192
214
563
        my $self = {
193                regex    => $regex_obj,
194                regex_str => "$regex",
195                length  => $length || 10,
196                backrefs        => {},       # Store backreferences
197                named_refs => {},     # Store named captures
198        };
199
200
214
279
        return bless $self, $class;
201}
202
203 - 209
=head2 generate($max_attempts)

Generates a random string matching the regex. First tries smart parsing, then
falls back to brute force if needed. Tries up to C<$max_attempts> times
(default 1000) before croaking.

=cut
210
211sub generate {
212
4139
17023
        my ($self, $max_attempts) = @_;
213
4139
5631
        $max_attempts //= 1000;
214
215
4139
2720
        my $regex = $self->{regex};
216
4139
2454
        my $length = $self->{length};
217
218        # First try the smart approach
219
4139
4139
2483
3427
        my $str = eval { $self->_build_from_pattern($self->{regex_str}) };
220
4139
8681
        if (defined $str && $str =~ /^$regex$/) {
221
4136
4748
                return $str;
222        }
223
224        # If smart approach failed, show warning in debug mode
225
3
4
        if ($ENV{DEBUG_REGEX_GEN} && $@) {
226
0
0
                warn "Smart generation failed: $@";
227        }
228
229        # Fall back to brute force with character set matching
230
3
5
        for (1 .. $max_attempts) {
231
1014
801
                $str = $self->_random_string_smart($length);
232
1014
1749
                return $str if $str =~ /^$regex$/;
233        }
234
235
2
56
        croak "Failed to generate matching string after $max_attempts attempts. Pattern: $self->{regex_str}";
236}
237
238sub _random_string_smart {
239
1014
633
        my ($self, $len) = @_;
240
241
1014
630
        my $regex_str = $self->{regex_str};
242
243        # Detect common patterns and generate appropriate characters
244
1014
568
        my @chars;
245
246
1014
2405
        if ($regex_str =~ /\\d/ || $regex_str =~ /\[0-9\]/ || $regex_str =~ /\[\^[^\]]*[A-Za-z]/) {
247                # Digit patterns
248
0
0
                @chars = ('0'..'9');
249        } elsif ($regex_str =~ /\[A-Z\]/ || $regex_str =~ /\[A-Z[^\]]*\]/) {
250                # Uppercase patterns
251
0
0
                @chars = ('A'..'Z');
252        } elsif ($regex_str =~ /\[a-z\]/ || $regex_str =~ /\[a-z[^\]]*\]/) {
253                # Lowercase patterns
254
0
0
                @chars = ('a'..'z');
255        } elsif ($regex_str =~ /\\w/ || $regex_str =~ /\[a-zA-Z0-9_\]/) {
256                # Word characters
257
0
0
                @chars = ('a'..'z', 'A'..'Z', '0'..'9', '_');
258        } else {
259                # Default to printable ASCII
260
1014
95316
792
69416
                @chars = map { chr($_) } (33 .. 126);
261        }
262
263
1014
686
        my $str = '';
264
1014
2392
        $str .= $chars[int(rand(@chars))] for (1 .. $len);
265
266
1014
2501
        return $str;
267}
268
269 - 274
=head2 generate_smart()

Parses the regex and builds a matching string directly. Faster and more reliable
than brute force, but may not handle all edge cases.

=cut
275
276sub generate_smart {
277
132
6522
        my $self = $_[0];
278
132
155
        return $self->_build_from_pattern($self->{regex_str});
279}
280
281 - 311
=head2 generate_many($count, $unique)

Generates multiple random strings matching the regex.

    my @strings = $gen->generate_many(10);           # 10 strings (may have duplicates)
    my @strings = $gen->generate_many(10, 1);        # 10 unique strings
    my @strings = $gen->generate_many(10, 'unique'); # 10 unique strings

    # Generate until you have 1000 unique codes
    my $gen = Data::Random::String::Matches->new(qr/[A-Z]{3}\d{4}/);
    my @codes = $gen->generate_many(1000, 'unique');

Parameters:

=over 4

=item * C<$count> - Number of strings to generate (required, must be positive)

=item * C<$unique> - If true, ensures all generated strings are unique. May return fewer
than C<$count> strings if uniqueness cannot be achieved within reasonable attempts.
Accepts any true value (1, 'unique', etc.)

=back

Returns: List of generated strings

Dies: If count is not a positive integer

Warns: If unable to generate the requested number of unique strings

=cut
312
313sub generate_many {
314
24
2601
        my ($self, $count, $unique) = @_;
315
316
24
47
        croak 'Count must be a positive integer' unless defined $count && $count > 0;
317
318
21
16
        my @results;
319
320
21
18
        if ($unique) {
321                # Generate unique strings
322
6
5
                my %seen;
323
6
3
                my $attempts = 0;
324
6
6
                my $max_attempts = $count * 100;        # Reasonable limit
325
326
6
12
                while (keys %seen < $count && $attempts < $max_attempts) {
327
1649
1238
                        my $str = $self->generate();
328
1649
1225
                        $seen{$str} = 1;
329
1649
1938
                        $attempts++;
330                }
331
332
6
7
                if (keys %seen < $count) {
333
1
7
                        carp 'Only generated ', (scalar keys %seen), " unique strings out of $count requested";
334                }
335
336
6
285
                @results = keys %seen;
337        } else {
338                # Generate any strings (may have duplicates)
339
15
22
                push @results, $self->generate() for (1 .. $count);
340        }
341
342
21
226
        return @results;
343}
344
345 - 349
=head2 get_seed()

Gets the random seed for reproducible generation

=cut
350
351sub get_seed {
352
2
347
        my $self = shift;
353
354
2
4
        return $self->{seed};
355}
356
357 - 361
=head2 set_seed($seed)

Sets the random seed for reproducible generation

=cut
362
363sub set_seed {
364
13
1399
        my $self = shift;
365
13
15
        my $params = Params::Get::get_params('seed', \@_);
366
12
110
        my $seed = $params->{'seed'};
367
368
12
52
        croak 'Seed must be defined' unless defined $seed;
369
370
11
11
        srand($seed);
371
11
10
        $self->{seed} = $seed;
372
373
11
19
        return $self;
374}
375
376 - 389
=head2 suggest_simpler_pattern()

Analyzes patterns and suggests improvements.

  my $suggestion = $gen->suggest_simpler_pattern();

  if ($suggestion) {
    print "Reason: $suggestion->{reason}\n";
    print "Better pattern: $suggestion->{pattern}\n" if $suggestion->{pattern};
    print "Tips:\n";
    print "  - $_\n" for @{$suggestion->{tips}};
  }

=cut
390
391sub suggest_simpler_pattern {
392
21
31
        my $self = $_[0];
393
394
21
17
        my $pattern = $self->{regex_str};
395
21
21
        my $info = $self->pattern_info();
396
397        # Check for patterns that are too complex
398
21
19
        if ($info->{complexity} eq 'very_complex') {
399                return {
400
1
5
                        pattern => undef,
401                        reason  => 'Pattern is very complex. Consider breaking it into multiple simpler patterns.',
402                        tips    => [
403                                'Split alternations into separate generators',
404                                'Avoid deeply nested groups',
405                                'Use fixed-length patterns when possible',
406                        ],
407                };
408        }
409
410        # Suggest removing unnecessary backreferences
411
20
32
        if ($info->{features}{has_backreferences} && $pattern =~ /(\(\w+\)).*\\\d+/) {
412
0
0
                my $simpler = $pattern;
413                # Can't automatically simplify backreferences, but can suggest
414                return {
415
0
0
                        pattern => undef,
416                        reason  => 'Backreferences add complexity. Consider if you really need repeated groups.',
417                        tips    => [
418                                'If the repeated part doesn\'t need to match, use two separate patterns',
419                                'For validation, backreferences are great; for generation, they limit variation',
420                        ],
421                };
422        }
423
424        # Suggest fixed quantifiers instead of ranges
425
20
25
        if ($pattern =~ /\{(\d+),(\d+)\}/) {
426
8
8
                my ($min, $max) = ($1, $2);
427
8
9
                if ($max - $min > 10) {
428
6
6
                        my $mid = int(($min + $max) / 2);
429
6
6
                        my $simpler = $pattern;
430
6
13
                        $simpler =~ s/\{\d+,\d+\}/\{$mid\}/;
431                        return {
432
6
14
                                pattern => $simpler,
433                                reason  => "Large quantifier range {$min,$max} creates high variability. Consider fixed length {$mid}.",
434                                tips    => [
435                                        'Fixed lengths are faster to generate',
436                                        'If you need variety, generate multiple patterns with different fixed lengths',
437                                ],
438                        };
439                }
440        }
441
442        # Suggest limiting alternations
443
14
13
        if ($info->{features}{has_alternation}) {
444
2
4
                my @alts = split /\|/, $pattern;
445
2
2
                if (@alts > 10) {
446                        return {
447
1
3
                                pattern => undef,
448                                reason  => 'Too many alternations (' . scalar(@alts) . '). Consider splitting into multiple patterns.',
449                                tips    => [
450                                        'Create separate generators for different alternatives',
451                                        'Group similar patterns together',
452                                        'Use character classes [abc] instead of (a|b|c)',
453                                ],
454                        };
455                }
456
457                # Check if alternations could be a character class
458
1
2
                if ($pattern =~ /\(([a-zA-Z])\|([a-zA-Z])\|([a-zA-Z])\)/) {
459
1
2
                        my $chars = join('', $1, $2, $3);
460
1
1
                        my $simpler = $pattern;
461
1
3
                        $simpler =~ s/\([a-zA-Z]\|[a-zA-Z]\|[a-zA-Z]\)/[$chars]/;
462                        return {
463
1
2
                                pattern => $simpler,
464                                reason  => 'Single-character alternations can be simplified to character classes.',
465                                tips    => [
466                                        'Use [abc] instead of (a|b|c)',
467                                        'Character classes are faster to process',
468                                ],
469                        };
470                }
471        }
472
473        # Suggest removing lookaheads/lookbehinds for generation
474
12
14
        if ($info->{features}{has_lookahead} || $info->{features}{has_lookbehind}) {
475
3
3
                my $simpler = $pattern;
476
3
5
                $simpler =~ s/\(\?[=!].*?\)//g;   # Remove lookaheads
477
3
2
                $simpler =~ s/\(\?<[=!].*?\)//g;  # Remove lookbehinds
478
479
3
4
                if ($simpler ne $pattern) {
480                        return {
481
3
6
                                pattern => $simpler,
482                                reason  => 'Lookaheads/lookbehinds add complexity but don\'t contribute to generated strings.',
483                                tips    => [
484                                        'Lookaheads are great for validation, not generation',
485                                        'The simplified pattern generates the same strings',
486                                ],
487                        };
488                }
489        }
490
491        # Check for Unicode when ASCII would work
492
9
13
        if ($info->{features}{has_unicode} && $pattern =~ /\\p\{L\}/) {
493
2
2
                my $simpler = $pattern;
494
2
2
                $simpler =~ s/\\p\{L\}/[A-Za-z]/g;
495                return {
496
2
4
                        pattern => $simpler,
497                        reason  => 'Unicode \\p{L} can be simplified to [A-Za-z] if you only need ASCII letters.',
498                        tips    => [
499                                'ASCII patterns are faster',
500                                'Only use Unicode if you need non-ASCII characters',
501                        ],
502                };
503        }
504
505        # Check for overly long fixed strings
506
7
8
        if ($pattern =~ /([a-zA-Z]{20,})/) {
507                return {
508
0
0
                        pattern => undef,
509                        reason  => 'Pattern contains very long fixed literal strings. Consider if you need such specific patterns.',
510                        tips    => [
511                                'Use variables instead of long literals',
512                                'Break into smaller patterns',
513                        ],
514                };
515        }
516
517        # Pattern seems reasonable
518
7
7
        return undef;
519}
520
521 - 529
=head2 validate($string)

Checks if a string matches the pattern without generating.

  if ($gen->validate('1234')) {
    print "Valid!\n";
  }

=cut
530
531sub validate {
532
69
1792
        my $self = shift;
533
69
83
        my $params = Params::Get::get_params('string', \@_);
534
68
610
        my $string = $params->{'string'};
535
536
68
102
        croak('String must be defined') unless defined $string;
537
538
67
49
        my $regex = $self->{regex};
539
67
315
        return $string =~ /^$regex$/;
540}
541
542 - 558
=head2 pattern_info()

Returns detailed information about the pattern.

  my $info = $gen->pattern_info();
  print "Complexity: $info->{complexity}\n";
  print "Min length: $info->{min_length}\n";
  print "Has Unicode: ", $info->{features}{has_unicode} ? "Yes" : "No", "\n";

C<pattern_info> analyzes a regular expression to produce a structured summary of its characteristics,
including estimated string lengths, detected features, and an overall complexity rating.
It first calls C<_estimate_length> to heuristically compute the minimum and maximum possible lengths of strings matching the pattern by scanning for literals,
character classes, and quantifiers.
It then detects the presence of advanced regex constructions such as alternation, lookahead or lookbehind assertions, named groups, and Unicode properties, storing them in a feature hash.
Finally, it calculates a rough "complexity" classification based on pattern length and detected features-returning a hash reference that describes the regex's structure, estimated lengths, and complexity level.

=cut
559
560sub pattern_info {
561
41
1886
        my $self = $_[0];
562
563
41
44
        return $self->{'_pattern_info_cache'} if $self->{'_pattern_info_cache'};
564
565
38
31
        my $pattern = $self->{'regex_str'};
566
567        # Calculate approximate min/max lengths
568
38
57
        my ($min_len, $max_len) = $self->_estimate_length($pattern);
569
570        # Detect pattern features
571
38
263
        my %features = (
572                has_alternation     => ($pattern =~ /\|/ ? 1 : 0),
573                has_backreferences  => ($pattern =~ /(\\[1-9]|\\k<)/ ? 1 : 0),
574                has_unicode         => ($pattern =~ /\\p\{/ ? 1 : 0),
575                has_lookahead       => ($pattern =~ /\(\?[=!]/ ? 1 : 0),
576                has_lookbehind      => ($pattern =~ /\(\?<[=!]/ ? 1 : 0),
577                has_named_groups    => ($pattern =~ /\(\?</ ? 1 : 0),
578                has_possessive      => ($pattern =~ /(?:[+*?]\+|\{\d+(?:,\d*)?\}\+)/ ? 1 : 0),
579        );
580
581
38
70
        my $info = {
582                pattern             => $pattern,
583                min_length          => $min_len,
584                max_length          => $max_len,
585                estimated_length    => int(($min_len + $max_len) / 2),
586                features            => \%features,
587                complexity          => $self->_calculate_complexity(\%features, $pattern),
588        };
589
590
38
49
        $self->{'_pattern_info_cache'} = $info;
591
592
38
41
        return $info;
593}
594
595sub _estimate_length {
596
38
30
        my ($self, $pattern) = @_;
597
598        # Remove anchors and modifiers
599
38
105
        $pattern =~ s/^\(\?\^?[iumsx-]*:(.*)\)$/$1/;
600
38
31
        $pattern =~ s/^\^//;
601
38
27
        $pattern =~ s/\$//;
602
603
38
27
        my $min = 0;
604
38
24
        my $max = 0;
605
606        # Simple heuristic - count fixed characters and quantifiers
607
38
24
        my $last_was_atom = 0;  # Handle cases like \d{3} where the quantifier modifies the atom count
608
38
72
        while ($pattern =~ /([^+*?{}\[\]\\])|\\[dwsWDN]|\[([^\]]+)\]|\{(\d+)(?:,(\d+))?\}/g) {
609
222
290
                if (defined $1 || (defined $2 && $2)) {
610
156
93
                        $min++;
611
156
77
                        $max++;
612
156
171
                        $last_was_atom = 1;
613                } elsif (defined $3) {
614
36
29
                        if ($last_was_atom) {
615                                # Replace the last atom’s contribution
616
16
15
                                $min += $3 - 1;
617
16
17
                                $max += (defined $4 ? $4 : $3) - 1;
618
16
16
                                $last_was_atom = 0;
619                        } else {
620                                # No preceding atom? assume standalone
621
20
13
                                $min += $3;
622
20
36
                                $max += defined $4 ? $4 : $3;
623                        }
624                }
625        }
626
627        # Account for +, *, ?
628
38
35
        my $plus_count = () = $pattern =~ /\+/g;
629
38
25
        my $star_count = () = $pattern =~ /\*/g;
630
38
32
        my $question_count = () = $pattern =~ /\?/g;
631
632
38
27
        $min += $plus_count;  # + means at least 1
633
38
27
        $max += ($plus_count * 5) + ($star_count * 5);  # Assume max 5 repetitions
634
38
37
        $min -= $question_count;  # ? makes things optional
635
636
38
43
        $min = 0 if $min < 0;
637
38
33
        $max = $min + 50 if $max < $min;  # Ensure max >= min
638
639
38
43
        return ($min, $max);
640}
641
642sub _calculate_complexity {
643
38
36
        my ($self, $features, $pattern) = @_;
644
645
38
21
        my $score = 0;
646
647        # Base complexity from pattern length
648
38
33
        $score += length($pattern) / 10;
649
650        # Add complexity for features
651
38
39
        $score += 2 if $features->{has_alternation};
652
38
34
        $score += 3 if $features->{has_backreferences};
653
38
27
        $score += 2 if $features->{has_unicode};
654
38
34
        $score += 2 if $features->{has_lookahead};
655
38
32
        $score += 2 if $features->{has_lookbehind};
656
38
32
        $score += 1 if $features->{has_named_groups};
657
38
30
        $score += 1 if $features->{has_possessive};
658
659        # Classify
660
38
57
        return 'simple'   if $score < 3;
661
15
26
        return 'moderate' if $score < 7;
662
3
6
        return 'complex'  if $score < 12;
663
1
2
        return 'very_complex';
664}
665
666sub _build_from_pattern {
667
4270
2905
        my ($self, $pattern) = @_;
668
669        # Reset backreferences for each generation
670
4270
2937
        $self->{backrefs} = {};
671
4270
2993
        $self->{named_refs} = {};
672
4270
2889
        $self->{group_counter} = 0;
673
674        # Remove regex delimiters and modifiers
675        # Handle (?^:...), (?i:...), (?-i:...) etc
676
4270
5730
        $pattern =~ s/^\(\?\^?[iumsx-]*:(.*)\)$/$1/;
677
678        # Remove anchors (they're handled by the regex match itself)
679
4270
2791
        $pattern =~ s/^\^//;
680
4270
2678
        $pattern =~ s/\$//;
681
682
4270
3266
        return $self->_parse_sequence($pattern);
683}
684
685sub _parse_sequence {
686
4412
3037
        my ($self, $pattern) = @_;
687
688
4412
2671
        my $result = '';
689
4412
2573
        my $i = 0;
690
4412
2609
        my $len = length($pattern);
691
692
4412
3392
        while ($i < $len) {
693
5840
3893
                my $char = substr($pattern, $i, 1);
694
695
5840
6042
                if ($char eq '\\') {
696                        # Escape sequence
697
939
548
                        $i++;
698
939
597
                        my $next = substr($pattern, $i, 1);
699
700
939
1637
                        if ($next =~ /[1-9]/) {
701                                # Backreference
702
30
16
                                my $ref_num = $next;
703
30
28
                                if (exists $self->{backrefs}{$ref_num}) {
704
30
26
                                        $result .= $self->{backrefs}{$ref_num};
705                                } else {
706
0
0
                                        croak "Backreference \\$ref_num used before group defined";
707                                }
708                        } elsif ($next eq 'k' && substr($pattern, $i+1, 1) eq '<') {
709                                # Named backreference \k<name>
710
7
7
                                my $end = index($pattern, '>', $i+2);
711
7
5
                                my $name = substr($pattern, $i+2, $end-$i-2);
712
7
8
                                if (exists $self->{named_refs}{$name}) {
713
7
6
                                        $result .= $self->{named_refs}{$name};
714                                } else {
715
0
0
                                        croak "Named backreference \\k<$name> used before group defined";
716                                }
717
7
3
                                $i = $end;
718                        } elsif ($next eq 'p' && substr($pattern, $i+1, 1) eq '{') {
719                                # Unicode property \p{L}, \p{N}, etc.
720
25
35
                                my $end = index($pattern, '}', $i+2);
721
25
21
                                my $prop = substr($pattern, $i+2, $end-$i-2);
722                                my ($generated, $new_i) = $self->_handle_quantifier($pattern, $end, sub {
723
95
74
                                        $self->_unicode_property_char($prop);
724
25
38
                                });
725
25
31
                                $result .= $generated;
726
25
20
                                $i = $new_i;
727                        } elsif ($next eq 'd') {
728
729
2305
949
2252
                                my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub { int(rand(10)) }, 1);
729
729
728
                                $result .= $generated;
730
729
473
                                $i = $new_i;
731                        } elsif ($next eq 'w') {
732                                my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub {
733
115
412
                                        my @chars = ('a'..'z', 'A'..'Z', '0'..'9', '_');
734
115
298
                                        $chars[int(rand(@chars))];
735
35
66
                                }, 1);
736
35
52
                                $result .= $generated;
737
35
24
                                $i = $new_i;
738                        } elsif ($next eq 's') {
739
2
2
6
4
                                my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub { ' ' }, 1);
740
2
5
                                $result .= $generated;
741
2
2
                                $i = $new_i;
742                        } elsif ($next eq 'D') {
743                                my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub {
744
6
504
564
8
335
452
                                        my @chars = map { chr($_) } grep { chr($_) !~ /\d/ } (33..126);
745
6
26
                                        $chars[int(rand(@chars))];
746
2
4
                                });
747
2
4
                                $result .= $generated;
748
2
2
                                $i = $new_i;
749                        } elsif ($next eq 'W') {
750                                my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub {
751
6
186
564
5
125
492
                                        my @chars = map { chr($_) } grep { chr($_) !~ /\w/ } (33..126);
752
6
22
                                        $chars[int(rand(@chars))];
753
2
7
                                });
754
2
5
                                $result .= $generated;
755
2
2
                                $i = $new_i;
756                        } elsif ($next eq 't') {
757
2
2
                                $result .= "\t";
758                        } elsif ($next eq 'n') {
759
2
2
                                $result .= "\n";
760                        } elsif ($next eq 'r') {
761
0
0
                                $result .= "\r";
762                        } else {
763
103
62
                                $result .= $next;
764                        }
765
939
794
                        $i++;
766                } elsif ($char eq '[') {
767                        # Character class
768
3951
3300
                        my $end = $self->_find_matching_bracket($pattern, $i);
769
3951
2963
                        croak 'Unmatched [' if $end == -1;
770
771
3951
2735
                        my $class = substr($pattern, $i+1, $end-$i-1);
772                        my ($generated, $new_i) = $self->_handle_quantifier($pattern, $end, sub {
773
17212
13191
                                $self->_random_from_class($class);
774
3951
4973
                        }, 1);
775
3951
4137
                        $result .= $generated;
776
3951
3580
                        $i = $new_i + 1;
777                } elsif ($char eq '(') {
778                        # Group - could be various types
779
151
145
                        my $end = $self->_find_matching_paren($pattern, $i);
780
151
127
                        croak 'Unmatched (' if $end == -1;
781
782
151
132
                        my $group_content = substr($pattern, $i+1, $end-$i-1);
783
784                        # Check for special group types
785
151
88
                        my $is_capturing = 1;
786
151
106
                        my $is_lookahead = 0;
787
151
80
                        my $is_lookbehind = 0;
788
151
94
                        my $is_negative = 0;
789
151
95
                        my $group_name = undef;
790
791
151
258
                        if ($group_content =~ /^\?:/) {
792                                # Non-capturing group
793
2
2
                                $is_capturing = 0;
794
2
2
                                $group_content = substr($group_content, 2);
795                        } elsif ($group_content =~ /^\?<([^>]+)>/) {
796                                # Named capture (?<name>...)
797
15
15
                                $group_name = $1;
798
15
14
                                $group_content = substr($group_content, length($1) + 3);
799                        } elsif ($group_content =~ /^\?=/) {
800                                # Positive lookahead (?=...)
801
6
4
                                $is_lookahead = 1;
802
6
5
                                $is_capturing = 0;
803
6
5
                                $group_content = substr($group_content, 2);
804                        } elsif ($group_content =~ /^\?!/) {
805                                # Negative lookahead (?!...)
806
1
0
                                $is_lookahead = 1;
807
1
1
                                $is_negative = 1;
808
1
1
                                $is_capturing = 0;
809
1
1
                                $group_content = substr($group_content, 2);
810                        } elsif ($group_content =~ /^\?<=/) {
811                                # Positive lookbehind (?<=...)
812
1
1
                                $is_lookbehind = 1;
813
1
1
                                $is_capturing = 0;
814
1
1
                                $group_content = substr($group_content, 3);
815                        } elsif ($group_content =~ /^\?<!/) {
816                                # Negative lookbehind (?<!...)
817
1
2
                                $is_lookbehind = 1;
818
1
1
                                $is_negative = 1;
819
1
1
                                $is_capturing = 0;
820
1
2
                                $group_content = substr($group_content, 3);
821                        }
822
823                        # Handle lookaheads/lookbehinds
824
151
159
                        if ($is_lookahead) {
825                                # For positive lookahead, generate the pattern but don't advance
826                                # For negative lookahead, avoid the pattern
827
7
6
                                if (!$is_negative) {
828                                        # Generate what the lookahead expects but don't consume it
829                                        # This is a simplification - we just note the constraint
830                                }
831                                # Lookaheads don't add to the result
832
7
4
                                $i = $end + 1;
833
7
7
                                next;
834                        } elsif ($is_lookbehind) {
835                                # Lookbehinds check what came before
836                                # For generation, we can mostly ignore them
837
2
2
                                $i = $end + 1;
838
2
4
                                next;
839                        }
840
841                        # Check for alternation
842
142
90
                        my $generated;
843
142
135
                        if ($group_content =~ /\|/) {
844
88
82
                                $generated = $self->_handle_alternation($group_content);
845                        } else {
846
54
55
                                $generated = $self->_parse_sequence($group_content);
847                        }
848
849                        # Store backreference if capturing
850
142
119
                        if ($is_capturing) {
851
140
110
                                $self->{group_counter}++;
852
140
143
                                $self->{backrefs}{$self->{group_counter}} = $generated;
853
854
140
131
                                if (defined $group_name) {
855
15
15
                                        $self->{named_refs}{$group_name} = $generated;
856                                }
857                        }
858
859                        # Handle quantifier after group (including possessive)
860
142
158
172
157
                        my ($final_generated, $new_i) = $self->_handle_quantifier($pattern, $end, sub { $generated }, 1);
861
142
149
                        $result .= $final_generated;
862
142
142
                        $i = $new_i + 1;
863                } elsif ($char eq '.') {
864                        # Any character (except newline)
865                        my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub {
866
2
188
4
141
                                my @chars = map { chr($_) } (33 .. 126);
867
2
8
                                $chars[int(rand(@chars))];
868
2
4
                        });
869
2
4
                        $result .= $generated;
870
2
2
                        $i = $new_i + 1;
871                } elsif ($char eq '|') {
872                        # Alternation at top level - just return what we have
873                        # (This is handled by _handle_alternation for groups)
874
0
0
                        last;
875                } elsif ($char =~ /[+*?]/ || $char eq '{') {
876                        # Quantifier without preceding element - shouldn't happen in valid regex
877
1
74
                        croak "$pattern: Quantifier '$char' without preceding element";
878                } elsif ($char =~ /[\w ]/) {
879                        # Literal character
880
727
730
883
657
                        my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub { $char });
881
727
691
                        $result .= $generated;
882
727
625
                        $i = $new_i + 1;
883                } else {
884                        # Other literal characters
885
69
40
                        $result .= $char;
886
69
54
                        $i++;
887                }
888        }
889
890
4411
3959
        return $result;
891}
892
893sub _handle_quantifier {
894
5617
4296
        my ($self, $pattern, $pos, $generator, $check_possessive) = @_;
895
5617
4659
        $check_possessive //= 1;  # Default to checking for possessive
896
897
5617
3725
        my $next = substr($pattern, $pos + 1, 1);
898
5617
3230
        my $is_possessive = 0;
899
900        # Check for possessive quantifier (+)
901
5617
6192
        if ($check_possessive && $pos + 2 < length($pattern)) {
902
3929
2738
                my $after_next = substr($pattern, $pos + 2, 1);
903
3929
4928
                if (($next =~ /[+*?]/ || $next eq '}') && $after_next eq '+') {
904
6
5
                        $is_possessive = 1;
905                }
906        }
907
908
5617
4780
        if ($next eq '{') {
909
3459
2530
                my $end = index($pattern, '}', $pos + 2);
910
3459
2444
                croak "Unmatched '{' at position $pos in pattern: $pattern" if ($end == -1);
911
3459
2619
                my $quant = substr($pattern, $pos + 2, $end - $pos - 2);
912
913                # Check for possessive after }
914
3459
4101
                if ($check_possessive && $end + 1 < length($pattern) && substr($pattern, $end + 1, 1) eq '+') {
915
2
0
                        $is_possessive = 1;
916
2
2
                        $end++;
917                }
918
919
3459
2109
                my $result = '';
920
3459
3363
                if ($quant =~ /^(\d+)$/) {
921                        # Exact: {n}
922
3400
4125
                        $result .= $generator->() for (1 .. $1);
923                } elsif ($quant =~ /^(\d+),(\d+)$/) {
924                        # Range: {n,m}
925
59
88
                        my $count = $1 + int(rand($2 - $1 + 1));
926
59
64
                        $result .= $generator->() for (1 .. $count);
927                } elsif ($quant =~ /^(\d+),$/) {
928                        # Minimum: {n,}
929
0
0
                        my $count = $1 + int(rand(5));
930
0
0
                        $result .= $generator->() for (1 .. $count);
931                }
932
3459
3554
                return ($result, $end);
933        } elsif ($next eq '+') {
934                # One or more (possessive: ++)
935
22
12
                my $actual_end = $pos + 1;
936
22
28
                if ($is_possessive) {
937
4
4
                        $actual_end++;
938                }
939
22
49
                my $count = 1 + int(rand(5));
940
22
20
                my $result = '';
941
22
28
                $result .= $generator->() for (1 .. $count);
942
22
31
                return ($result, $actual_end);
943        } elsif ($next eq '*') {
944                # Zero or more (possessive: *+)
945
3
3
                my $actual_end = $pos + 1;
946
3
10
                if ($is_possessive) {
947
1
1
                        $actual_end++;
948                }
949
3
6
                my $count = int(rand(6));
950
3
3
                my $result = '';
951
3
4
                $result .= $generator->() for (1 .. $count);
952
3
6
                return ($result, $actual_end);
953        } elsif ($next eq '?') {
954                # Zero or one (possessive: ?+)
955
4
4
                my $actual_end = $pos + 1;
956
4
7
                if ($is_possessive) {
957
1
1
                        $actual_end++;
958                }
959
4
6
                my $result = rand() < 0.5 ? $generator->() : '';
960
4
5
                return ($result, $actual_end);
961        } else {
962                # No quantifier
963
2129
1480
                return ($generator->(), $pos);
964        }
965}
966
967sub _handle_alternation {
968
88
72
        my ($self, $pattern) = @_;
969
970        # Split on | but respect groups
971
88
46
        my @alternatives;
972
88
62
        my $current = '';
973
88
47
        my $depth = 0;
974
975
88
125
        for my $char (split //, $pattern) {
976
1008
988
                if ($char eq '(') {
977
4
3
                        $depth++;
978
4
3
                        $current .= $char;
979                } elsif ($char eq ')') {
980
4
2
                        $depth--;
981
4
3
                        $current .= $char;
982                } elsif ($char eq '|' && $depth == 0) {
983
162
125
                        push @alternatives, $current;
984
162
120
                        $current = '';
985                } else {
986
838
527
                        $current .= $char;
987                }
988        }
989
88
113
        push @alternatives, $current if length($current);
990
991        # Choose one alternative randomly
992
88
133
        my $chosen = $alternatives[int(rand(@alternatives))];
993
88
126
        return $self->_parse_sequence($chosen);
994}
995
996sub _find_matching_bracket {
997
3951
2790
        my ($self, $pattern, $start) = @_;
998
999
3951
2300
        my $depth = 0;
1000
3951
3299
        for (my $i = $start; $i < length($pattern); $i++) {
1001
25134
15229
                my $char = substr($pattern, $i, 1);
1002
25134
30873
                if ($char eq '[' && ($i == $start || substr($pattern, $i-1, 1) ne '\\')) {
1003
3951
3208
                        $depth++;
1004                } elsif ($char eq ']' && substr($pattern, $i-1, 1) ne '\\') {
1005
3951
2335
                        $depth--;
1006
3951
3654
                        return $i if $depth == 0;
1007                }
1008        }
1009
0
0
        return -1;
1010}
1011
1012sub _find_matching_paren {
1013
151
134
        my ($self, $pattern, $start) = @_;
1014
1015
151
104
        my $depth = 0;
1016
151
179
        for (my $i = $start; $i < length($pattern); $i++) {
1017
1750
1174
                my $char = substr($pattern, $i, 1);
1018
1750
1334
                my $prev = $i > 0 ? substr($pattern, $i-1, 1) : '';
1019
1020
1750
2385
                if ($char eq '(' && $prev ne '\\') {
1021
156
166
                        $depth++;
1022                } elsif ($char eq ')' && $prev ne '\\') {
1023
156
91
                        $depth--;
1024
156
188
                        return $i if $depth == 0;
1025                }
1026        }
1027
0
0
        return -1;
1028}
1029
1030sub _random_from_class {
1031
17212
11122
        my ($self, $class) = @_;
1032
1033
17212
9520
        my @chars;
1034
1035        # Debugging this regex: qr/!#-'*+\\-\\.\\^_`|~0-9A-Za-z/
1036        # which used to give this error: 'Argument "#" isn't numeric in range (or flop)'
1037
17212
12761
        warn "DEBUG: class = '$class', length = ", length($class) if ($ENV{DEBUG_REGEX_GEN});
1038
1039        # Handle negation
1040
17212
10226
        my $negate = 0;
1041
17212
12847
        if (substr($class, 0, 1) eq '^') {
1042
10
6
                $negate = 1;
1043
10
7
                $class = substr($class, 1);
1044        }
1045
1046        # Parse character class with escape sequences
1047
17212
10083
        my $i = 0;
1048
17212
12363
        while ($i < length($class)) {
1049
33440
20848
                my $char = substr($class, $i, 1);
1050
1051
33440
23944
                warn "DEBUG: i=$i, char='$char' (ord=", ord($char), ')' if ($ENV{DEBUG_REGEX_GEN});
1052
1053
33440
38601
                if ($char eq '\\') {
1054
168
93
                        $i++;
1055
168
120
                        my $next = substr($class, $i, 1);
1056
168
119
                        warn "DEBUG: Escaped char: $next" if ($ENV{DEBUG_REGEX_GEN});
1057
168
172
                        if ($next eq 'd') {
1058
16
16
                                push @chars, ('0'..'9');
1059                        } elsif ($next eq 'w') {
1060
10
39
                                push @chars, ('a'..'z', 'A'..'Z', '0'..'9', '_');
1061                        } elsif ($next eq 's') {
1062
0
0
                                push @chars, (' ', "\t", "\n");
1063                        } elsif ($next eq 'p' && substr($class, $i+1, 1) eq '{') {
1064                                # Unicode property in character class
1065
6
4
                                my $end = index($class, '}', $i+2);
1066
6
6
                                my $prop = substr($class, $i+2, $end-$i-2);
1067
6
5
                                push @chars, $self->_unicode_property_chars($prop);
1068
6
12
                                $i = $end;
1069                        } else {
1070                                # Escaped literal character (including \-, \., \^, etc.)
1071
136
88
                                push @chars, $next;
1072                        }
1073                } elsif ($i + 2 < length($class) && substr($class, $i+1, 1) eq '-') {
1074                        # Potential range
1075
30564
20683
                        my $end_char = substr($class, $i+2, 1);
1076
1077                        # Check if end is escaped or if this is valid range
1078
30564
33494
                        if ($end_char eq '\\' || $end_char eq ']') {
1079                                # Not a range, dash is literal
1080
0
0
                                push @chars, $char;
1081                        } elsif (ord($end_char) >= ord($char)) {
1082                                # Valid range - use ord/chr to avoid quote interpolation issues
1083
30564
17497
                                my $start_ord = ord($char);
1084
30564
18040
                                my $end_ord = ord($end_char);
1085
30564
561218
23296
413042
                                push @chars, map { chr($_) } ($start_ord .. $end_ord);
1086
30564
31281
                                $i += 2;  # Will be incremented again by loop, total +3
1087                        } else {
1088                                # Invalid range order
1089
0
0
                                push @chars, $char;
1090                        }
1091                } else {
1092
2708
1726
                        push @chars, $char;
1093                }
1094
33440
26014
                $i++;
1095        }
1096
1097
17212
12304
        warn 'DEBUG: Final chars array has ', scalar(@chars), ' elements' if ($ENV{DEBUG_REGEX_GEN});
1098
1099
17212
12036
        if ($negate) {
1100
10
100
9
76
                my %excluded = map { $_ => 1 } @chars;
1101
10
940
940
12
666
670
                @chars = grep { !$excluded{$_} } map { chr($_) } (33 .. 126);
1102        }
1103
1104
17212
32526
        return @chars ? $chars[int(rand(@chars))] : 'X';
1105}
1106
1107sub _unicode_property_char {
1108
95
66
        my ($self, $prop) = @_;
1109
95
76
        my @chars = $self->_unicode_property_chars($prop);
1110
95
367
        return @chars ? $chars[int(rand(@chars))] : 'X';
1111}
1112
1113sub _unicode_property_chars {
1114
101
69
        my ($self, $prop) = @_;
1115
1116        # Common Unicode properties
1117
101
113
        if ($prop eq 'L' || $prop eq 'Letter') {
1118                # Letters, skip × and ÷ which are symbols
1119
88
5456
378
4089
                return ('a' .. 'z', 'A' .. 'Z', map { chr($_) } ((ord'À')..ord('Ö'), ord('Ø')..ord('ö'), ord('ø')..ord('ÿ')));
1120        } elsif ($prop eq 'N' || $prop eq 'Number') {
1121                # Numbers
1122                # return ('0' .. '9', map { chr($_) } (ord('â‘ ').. ord('⑳')));
1123
3
5
                return ('0' .. '9');
1124        } elsif ($prop eq 'Lu' || $prop eq 'Uppercase_Letter') {
1125                # Uppercase letters, skip × which is not a letter
1126
6
180
17
163
                return ('A' .. 'Z', map { chr($_) } (ord('À') .. ord('Ö'), ord('Ø') .. ord('Þ')));
1127        } elsif ($prop eq 'Ll' || $prop eq 'Lowercase_Letter') {
1128                # Lowercase letters, skip ÷ which is not a letter
1129
4
124
12
99
                return ('a' .. 'z', map { chr($_) } (ord('à') .. ord('ö'), ord('ø') .. ord('ÿ')));
1130        } elsif ($prop eq 'P' || $prop eq 'Punctuation') {
1131                # Punctuation
1132
0
0
                return ('.', ',', '!', '?', ';', ':', '-', '—', '…');
1133        } elsif ($prop eq 'S' || $prop eq 'Symbol') {
1134                # Symbols
1135
0
0
                return ('$', '€', '£', 'Â¥', '©', '®', 'â„¢', '°', '±', '×', '÷');
1136        } elsif ($prop eq 'Z' || $prop eq 'Separator') {
1137                # Separators
1138
0
0
                return (' ', "\t", "\n");
1139        } elsif ($prop eq 'Nd' || $prop eq 'Decimal_Number') {
1140                # Decimal numbers
1141
0
0
                return ('0'..'9');
1142        } else {
1143                # Unknown property - return letters as default
1144
0
0
                return ('a'..'z', 'A'..'Z');
1145        }
1146}
1147
1148 - 1154
=head2 create_random_string

For consistency with L<Data::Random::String>.

  print Data::Random::String::Matches->create_random_string(length => 3, regex => '\d{3}'), "\n";

=cut
1155
1156sub create_random_string
1157{
1158
2
630
        my $class = shift;
1159
2
4
        my $params = Params::Get::get_params(undef, @_);
1160
1161
2
21
        my $regex = $params->{'regex'};
1162
2
3
        my $length = $params->{'length'};
1163
1164
2
3
        return $class->new($regex, $length)->generate();
1165}
1166
1167 - 1201
=head1 AUTHOR

Nigel Horne, C<< <njh at nigelhorne.com> >>

=head1 SEE ALSO

=over 4

=item * Test coverage report: L<https://nigelhorne.github.io/Data-Random-String-Matches/coverage/>

=item * L<String::Random>

=item * L<Regexp::Genex>

=back

=head1 LICENCE AND COPYRIGHT

Copyright 2025 Nigel Horne.

Usage is subject to licence terms.

The licence terms of this software are as follows:

=over 4

=item * Personal single user, single computer use: GPL2

=item * All other users (including Commercial, Charity, Educational, Government)
  must apply in writing for a licence for use from Nigel Horne at the
  above e-mail.

=back

=cut
1202
12031;