File Coverage

File:	blib/lib/Data/Random/String/Matches.pm
Coverage:	87.1%

line	stmt	bran	cond	sub	time	code
1						package Data::Random::String::Matches;
2
3	32 32			32	934207 51	use 5.014;
4
5	32 32 32			32	56 23 271	use strict;
6	32 32 32			32	50 26 615	use warnings;
7
8	32 32 32			32	53 29 715	use Carp qw(carp croak);
9	32 32 32			32	5540 157437 575	use Params::Get;
10	32 32 32			32	5699 2784 65	use utf8;
11
12						our $VERSION = '0.04';
13
14 - 182						=head1 NAME Data::Random::String::Matches - Generate random strings matching a regex =head1 SYNOPSIS use Data::Random::String::Matches; # Create a generator with regex and optional length my $gen = Data::Random::String::Matches->new(qr/[A-Z]{3}\d{4}/, 7); # Generate a matching string my $str = $gen->generate(); print $str; # e.g., "XYZ1234" # Alternation my $gen2 = Data::Random::String::Matches->new(qr/(cat\|dog\|bird)/); my $animal = $gen2->generate_smart(); # "cat", "dog", or "bird" # Backreferences my $gen3 = Data::Random::String::Matches->new(qr/(\w{3})-\1/); my $str3 = $gen3->generate_smart(); # e.g., "abc-abc" # Groups and quantifiers my $gen4 = Data::Random::String::Matches->new(qr/(ha){2,4}/); my $laugh = $gen4->generate_smart(); # "haha", "hahaha", or "hahahaha" # Unicode $gen = Data::Random::String::Matches->new(qr/\p{L}{5}/); # Named captures $gen = Data::Random::String::Matches->new(qr/(?<year>\d{4})-\k<year>/); # Possessive $gen = Data::Random::String::Matches->new(qr/\d++[A-Z]/); # Lookaheads $gen = Data::Random::String::Matches->new(qr/\d{3}(?=[A-Z])/); # Combined $gen = Data::Random::String::Matches->new( qr/(?<prefix>\p{Lu}{2})\d++\k<prefix>(?=[A-Z])/ ); # Consistency with Legacy software print Data::Random::String::Matches->create_random_string(length => 3, regex => '\d{3}'), "\n"; =head1 DESCRIPTION This module generates random strings that match a given regular expression pattern. It parses the regex pattern and intelligently builds matching strings, supporting a wide range of regex features. =head1 SUPPORTED REGEX FEATURES =head2 Character Classes =over 4 =item * Basic classes: C<[a-z]>, C<[A-Z]>, C<[0-9]>, C<[abc]> =item * Negated classes: C<[^a-z]> =item * Ranges: C<[a-zA-Z0-9]> =item * Escape sequences in classes: C<[\d\w]> =back =head2 Escape Sequences =over 4 =item * C<\d> - digit [0-9] =item * C<\w> - word character [a-zA-Z0-9_] =item * C<\s> - whitespace =item * C<\D> - non-digit =item * C<\W> - non-word character =item * C<\t>, C<\n>, C<\r> - tab, newline, carriage return =back =head2 Quantifiers =over 4 =item * C<{n}> - exactly n times =item * C<{n,m}> - between n and m times =item * C<{n,}> - n or more times =item * C<+> - one or more (1-5 times) =item * C<> - zero or more (0-5 times) =item C<?> - zero or one =back =head2 Grouping and Alternation =over 4 =item * C<(...)> - capturing group =item * C<(?:...)> - non-capturing group =item * C<\|> - alternation (e.g., C<cat\|dog\|bird>) =item * C<\1>, C<\2>, etc. - backreferences =back =head2 Other =over 4 =item * C<.> - any character (printable ASCII) =item * Literal characters =item * C<^> and C<$> anchors (stripped during parsing) =back =head1 LIMITATIONS =over 4 =item * Lookaheads and lookbehinds ((?=...), (?!...)) are not supported =item * Named groups ((?<name>...)) are not supported =item * Possessive quantifiers (+, ++) are not supported =item Unicode properties (\p{L}, \p{N}) are not supported =item * Some complex nested patterns may not work correctly with smart parsing =back =head1 EXAMPLES # Email-like pattern my $gen = Data::Random::String::Matches->new(qr/[a-z]+@[a-z]+\.com/); # API key pattern my $gen = Data::Random::String::Matches->new(qr/^AIza[0-9A-Za-z_-]{35}$/); # Phone number my $gen = Data::Random::String::Matches->new(qr/\d{3}-\d{3}-\d{4}/); # Repeated pattern my $gen = Data::Random::String::Matches->new(qr/(\w{4})-\1/); =head1 METHODS =head2 new($regex, $length) Creates a new generator. C<$regex> can be a compiled regex (qr//) or a string. C<$length> is optional and defaults to 10 (used for fallback generation). =cut
183
184						sub new {
185	215			215	512314	my ($class, $regex, $length) = @_;
186
187	215	100			280	croak 'Regex pattern is required' unless defined $regex;
188
189						# Convert string to regex if needed
190	214	100			390	my $regex_obj = ref($regex) eq 'Regexp' ? $regex : qr/$regex/;
191
192	214		100		563	my $self = {
193						regex => $regex_obj,
194						regex_str => "$regex",
195						length => $length \|\| 10,
196						backrefs => {}, # Store backreferences
197						named_refs => {}, # Store named captures
198						};
199
200	214				279	return bless $self, $class;
201						}
202
203 - 209						=head2 generate($max_attempts) Generates a random string matching the regex. First tries smart parsing, then falls back to brute force if needed. Tries up to C<$max_attempts> times (default 1000) before croaking. =cut
210
211						sub generate {
212	4139			4139	17023	my ($self, $max_attempts) = @_;
213	4139		100		5631	$max_attempts //= 1000;
214
215	4139				2720	my $regex = $self->{regex};
216	4139				2454	my $length = $self->{length};
217
218						# First try the smart approach
219	4139 4139				2483 3427	my $str = eval { $self->_build_from_pattern($self->{regex_str}) };
220	4139	100	100		8681	if (defined $str && $str =~ /^$regex$/) {
221	4136				4748	return $str;
222						}
223
224						# If smart approach failed, show warning in debug mode
225	3	0	33		4	if ($ENV{DEBUG_REGEX_GEN} && $@) {
226	0				0	warn "Smart generation failed: $@";
227						}
228
229						# Fall back to brute force with character set matching
230	3				5	for (1 .. $max_attempts) {
231	1014				801	$str = $self->_random_string_smart($length);
232	1014	100			1749	return $str if $str =~ /^$regex$/;
233						}
234
235	2				56	croak "Failed to generate matching string after $max_attempts attempts. Pattern: $self->{regex_str}";
236						}
237
238						sub _random_string_smart {
239	1014			1014	633	my ($self, $len) = @_;
240
241	1014				630	my $regex_str = $self->{regex_str};
242
243						# Detect common patterns and generate appropriate characters
244	1014				568	my @chars;
245
246	1014	50 50 50 50	33 25 33 33		2405	if ($regex_str =~ /\\d/ \|\| $regex_str =~ /\[0-9\]/ \|\| $regex_str =~ /\[\^[^\]]*[A-Za-z]/) {
247						# Digit patterns
248	0				0	@chars = ('0'..'9');
249						} elsif ($regex_str =~ /\[A-Z\]/ \|\| $regex_str =~ /\[A-Z[^\]]*\]/) {
250						# Uppercase patterns
251	0				0	@chars = ('A'..'Z');
252						} elsif ($regex_str =~ /\[a-z\]/ \|\| $regex_str =~ /\[a-z[^\]]*\]/) {
253						# Lowercase patterns
254	0				0	@chars = ('a'..'z');
255						} elsif ($regex_str =~ /\\w/ \|\| $regex_str =~ /\[a-zA-Z0-9_\]/) {
256						# Word characters
257	0				0	@chars = ('a'..'z', 'A'..'Z', '0'..'9', '_');
258						} else {
259						# Default to printable ASCII
260	1014 95316				792 69416	@chars = map { chr($_) } (33 .. 126);
261						}
262
263	1014				686	my $str = '';
264	1014				2392	$str .= $chars[int(rand(@chars))] for (1 .. $len);
265
266	1014				2501	return $str;
267						}
268
269 - 274						=head2 generate_smart() Parses the regex and builds a matching string directly. Faster and more reliable than brute force, but may not handle all edge cases. =cut
275
276						sub generate_smart {
277	132			132	6522	my $self = $_[0];
278	132				155	return $self->_build_from_pattern($self->{regex_str});
279						}
280
281 - 311						=head2 generate_many($count, $unique) Generates multiple random strings matching the regex. my @strings = $gen->generate_many(10); # 10 strings (may have duplicates) my @strings = $gen->generate_many(10, 1); # 10 unique strings my @strings = $gen->generate_many(10, 'unique'); # 10 unique strings # Generate until you have 1000 unique codes my $gen = Data::Random::String::Matches->new(qr/[A-Z]{3}\d{4}/); my @codes = $gen->generate_many(1000, 'unique'); Parameters: =over 4 =item * C<$count> - Number of strings to generate (required, must be positive) =item * C<$unique> - If true, ensures all generated strings are unique. May return fewer than C<$count> strings if uniqueness cannot be achieved within reasonable attempts. Accepts any true value (1, 'unique', etc.) =back Returns: List of generated strings Dies: If count is not a positive integer Warns: If unable to generate the requested number of unique strings =cut
312
313						sub generate_many {
314	24			24	2601	my ($self, $count, $unique) = @_;
315
316	24	100	100		47	croak 'Count must be a positive integer' unless defined $count && $count > 0;
317
318	21				16	my @results;
319
320	21	100			18	if ($unique) {
321						# Generate unique strings
322	6				5	my %seen;
323	6				3	my $attempts = 0;
324	6				6	my $max_attempts = $count * 100; # Reasonable limit
325
326	6		100		12	while (keys %seen < $count && $attempts < $max_attempts) {
327	1649				1238	my $str = $self->generate();
328	1649				1225	$seen{$str} = 1;
329	1649				1938	$attempts++;
330						}
331
332	6	100			7	if (keys %seen < $count) {
333	1				7	carp 'Only generated ', (scalar keys %seen), " unique strings out of $count requested";
334						}
335
336	6				285	@results = keys %seen;
337						} else {
338						# Generate any strings (may have duplicates)
339	15				22	push @results, $self->generate() for (1 .. $count);
340						}
341
342	21				226	return @results;
343						}
344
345 - 349						=head2 get_seed() Gets the random seed for reproducible generation =cut
350
351						sub get_seed {
352	2			2	347	my $self = shift;
353
354	2				4	return $self->{seed};
355						}
356
357 - 361						=head2 set_seed($seed) Sets the random seed for reproducible generation =cut
362
363						sub set_seed {
364	13			13	1399	my $self = shift;
365	13				15	my $params = Params::Get::get_params('seed', \@_);
366	12				110	my $seed = $params->{'seed'};
367
368	12	100			52	croak 'Seed must be defined' unless defined $seed;
369
370	11				11	srand($seed);
371	11				10	$self->{seed} = $seed;
372
373	11				19	return $self;
374						}
375
376 - 389						=head2 suggest_simpler_pattern() Analyzes patterns and suggests improvements. my $suggestion = $gen->suggest_simpler_pattern(); if ($suggestion) { print "Reason: $suggestion->{reason}\n"; print "Better pattern: $suggestion->{pattern}\n" if $suggestion->{pattern}; print "Tips:\n"; print " - $_\n" for @{$suggestion->{tips}}; } =cut
390
391						sub suggest_simpler_pattern {
392	21			21	31	my $self = $_[0];
393
394	21				17	my $pattern = $self->{regex_str};
395	21				21	my $info = $self->pattern_info();
396
397						# Check for patterns that are too complex
398	21	100			19	if ($info->{complexity} eq 'very_complex') {
399						return {
400	1				5	pattern => undef,
401						reason => 'Pattern is very complex. Consider breaking it into multiple simpler patterns.',
402						tips => [
403						'Split alternations into separate generators',
404						'Avoid deeply nested groups',
405						'Use fixed-length patterns when possible',
406						],
407						};
408						}
409
410						# Suggest removing unnecessary backreferences
411	20	50	67		32	if ($info->{features}{has_backreferences} && $pattern =~ /($\w+$).*\\\d+/) {
412	0				0	my $simpler = $pattern;
413						# Can't automatically simplify backreferences, but can suggest
414						return {
415	0				0	pattern => undef,
416						reason => 'Backreferences add complexity. Consider if you really need repeated groups.',
417						tips => [
418						'If the repeated part doesn\'t need to match, use two separate patterns',
419						'For validation, backreferences are great; for generation, they limit variation',
420						],
421						};
422						}
423
424						# Suggest fixed quantifiers instead of ranges
425	20	100			25	if ($pattern =~ /\{(\d+),(\d+)\}/) {
426	8				8	my ($min, $max) = ($1, $2);
427	8	100			9	if ($max - $min > 10) {
428	6				6	my $mid = int(($min + $max) / 2);
429	6				6	my $simpler = $pattern;
430	6				13	$simpler =~ s/\{\d+,\d+\}/\{$mid\}/;
431						return {
432	6				14	pattern => $simpler,
433						reason => "Large quantifier range {$min,$max} creates high variability. Consider fixed length {$mid}.",
434						tips => [
435						'Fixed lengths are faster to generate',
436						'If you need variety, generate multiple patterns with different fixed lengths',
437						],
438						};
439						}
440						}
441
442						# Suggest limiting alternations
443	14	100			13	if ($info->{features}{has_alternation}) {
444	2				4	my @alts = split /\\|/, $pattern;
445	2	100			2	if (@alts > 10) {
446						return {
447	1				3	pattern => undef,
448						reason => 'Too many alternations (' . scalar(@alts) . '). Consider splitting into multiple patterns.',
449						tips => [
450						'Create separate generators for different alternatives',
451						'Group similar patterns together',
452						'Use character classes [abc] instead of (a\|b\|c)',
453						],
454						};
455						}
456
457						# Check if alternations could be a character class
458	1	50			2	if ($pattern =~ /$([a-zA-Z])\\|([a-zA-Z])\\|([a-zA-Z])$/) {
459	1				2	my $chars = join('', $1, $2, $3);
460	1				1	my $simpler = $pattern;
461	1				3	$simpler =~ s/$[a-zA-Z]\\|[a-zA-Z]\\|[a-zA-Z]$/[$chars]/;
462						return {
463	1				2	pattern => $simpler,
464						reason => 'Single-character alternations can be simplified to character classes.',
465						tips => [
466						'Use [abc] instead of (a\|b\|c)',
467						'Character classes are faster to process',
468						],
469						};
470						}
471						}
472
473						# Suggest removing lookaheads/lookbehinds for generation
474	12	100	100		14	if ($info->{features}{has_lookahead} \|\| $info->{features}{has_lookbehind}) {
475	3				3	my $simpler = $pattern;
476	3				5	$simpler =~ s/$\?[=!].*?$//g; # Remove lookaheads
477	3				2	$simpler =~ s/$\?<[=!].*?$//g; # Remove lookbehinds
478
479	3	50			4	if ($simpler ne $pattern) {
480						return {
481	3				6	pattern => $simpler,
482						reason => 'Lookaheads/lookbehinds add complexity but don\'t contribute to generated strings.',
483						tips => [
484						'Lookaheads are great for validation, not generation',
485						'The simplified pattern generates the same strings',
486						],
487						};
488						}
489						}
490
491						# Check for Unicode when ASCII would work
492	9	100	67		13	if ($info->{features}{has_unicode} && $pattern =~ /\\p\{L\}/) {
493	2				2	my $simpler = $pattern;
494	2				2	$simpler =~ s/\\p\{L\}/[A-Za-z]/g;
495						return {
496	2				4	pattern => $simpler,
497						reason => 'Unicode \\p{L} can be simplified to [A-Za-z] if you only need ASCII letters.',
498						tips => [
499						'ASCII patterns are faster',
500						'Only use Unicode if you need non-ASCII characters',
501						],
502						};
503						}
504
505						# Check for overly long fixed strings
506	7	50			8	if ($pattern =~ /([a-zA-Z]{20,})/) {
507						return {
508	0				0	pattern => undef,
509						reason => 'Pattern contains very long fixed literal strings. Consider if you need such specific patterns.',
510						tips => [
511						'Use variables instead of long literals',
512						'Break into smaller patterns',
513						],
514						};
515						}
516
517						# Pattern seems reasonable
518	7				7	return undef;
519						}
520
521 - 529						=head2 validate($string) Checks if a string matches the pattern without generating. if ($gen->validate('1234')) { print "Valid!\n"; } =cut
530
531						sub validate {
532	69			69	1792	my $self = shift;
533	69				83	my $params = Params::Get::get_params('string', \@_);
534	68				610	my $string = $params->{'string'};
535
536	68	100			102	croak('String must be defined') unless defined $string;
537
538	67				49	my $regex = $self->{regex};
539	67				315	return $string =~ /^$regex$/;
540						}
541
542 - 558						=head2 pattern_info() Returns detailed information about the pattern. my $info = $gen->pattern_info(); print "Complexity: $info->{complexity}\n"; print "Min length: $info->{min_length}\n"; print "Has Unicode: ", $info->{features}{has_unicode} ? "Yes" : "No", "\n"; C<pattern_info> analyzes a regular expression to produce a structured summary of its characteristics, including estimated string lengths, detected features, and an overall complexity rating. It first calls C<_estimate_length> to heuristically compute the minimum and maximum possible lengths of strings matching the pattern by scanning for literals, character classes, and quantifiers. It then detects the presence of advanced regex constructions such as alternation, lookahead or lookbehind assertions, named groups, and Unicode properties, storing them in a feature hash. Finally, it calculates a rough "complexity" classification based on pattern length and detected features-returning a hash reference that describes the regex's structure, estimated lengths, and complexity level. =cut
559
560						sub pattern_info {
561	41			41	1886	my $self = $_[0];
562
563	41	100			44	return $self->{'_pattern_info_cache'} if $self->{'_pattern_info_cache'};
564
565	38				31	my $pattern = $self->{'regex_str'};
566
567						# Calculate approximate min/max lengths
568	38				57	my ($min_len, $max_len) = $self->_estimate_length($pattern);
569
570						# Detect pattern features
571	38	100 100 100 100 100 100 100			263	my %features = (
572						has_alternation => ($pattern =~ /\\|/ ? 1 : 0),
573						has_backreferences => ($pattern =~ /(\\[1-9]\|\\k<)/ ? 1 : 0),
574						has_unicode => ($pattern =~ /\\p\{/ ? 1 : 0),
575						has_lookahead => ($pattern =~ /\(\?[=!]/ ? 1 : 0),
576						has_lookbehind => ($pattern =~ /\(\?<[=!]/ ? 1 : 0),
577						has_named_groups => ($pattern =~ /\(\?</ ? 1 : 0),
578						has_possessive => ($pattern =~ /(?:[+?]\+\|\{\d+(?:,\d)?\}\+)/ ? 1 : 0),
579						);
580
581	38				70	my $info = {
582						pattern => $pattern,
583						min_length => $min_len,
584						max_length => $max_len,
585						estimated_length => int(($min_len + $max_len) / 2),
586						features => \%features,
587						complexity => $self->_calculate_complexity(\%features, $pattern),
588						};
589
590	38				49	$self->{'_pattern_info_cache'} = $info;
591
592	38				41	return $info;
593						}
594
595						sub _estimate_length {
596	38			38	30	my ($self, $pattern) = @_;
597
598						# Remove anchors and modifiers
599	38				105	$pattern =~ s/^$\?\^?[iumsx-]:(.)$$/$1/;
600	38				31	$pattern =~ s/^\^//;
601	38				27	$pattern =~ s/\$//;
602
603	38				27	my $min = 0;
604	38				24	my $max = 0;
605
606						# Simple heuristic - count fixed characters and quantifiers
607	38				24	my $last_was_atom = 0; # Handle cases like \d{3} where the quantifier modifies the atom count
608	38				72	while ($pattern =~ /([^+*?{}\[\]\\])\|\\[dwsWDN]\|\[([^\]]+)\]\|\{(\d+)(?:,(\d+))?\}/g) {
609	222	100 100	75		290	if (defined $1 \|\| (defined $2 && $2)) {
610	156				93	$min++;
611	156				77	$max++;
612	156				171	$last_was_atom = 1;
613						} elsif (defined $3) {
614	36	100			29	if ($last_was_atom) {
615						# Replace the last atomâ€™s contribution
616	16				15	$min += $3 - 1;
617	16	100			17	$max += (defined $4 ? $4 : $3) - 1;
618	16				16	$last_was_atom = 0;
619						} else {
620						# No preceding atom? assume standalone
621	20				13	$min += $3;
622	20	100			36	$max += defined $4 ? $4 : $3;
623						}
624						}
625						}
626
627						# Account for +, *, ?
628	38				35	my $plus_count = () = $pattern =~ /\+/g;
629	38				25	my $star_count = () = $pattern =~ /\*/g;
630	38				32	my $question_count = () = $pattern =~ /\?/g;
631
632	38				27	$min += $plus_count; # + means at least 1
633	38				27	$max += ($plus_count * 5) + ($star_count * 5); # Assume max 5 repetitions
634	38				37	$min -= $question_count; # ? makes things optional
635
636	38	50			43	$min = 0 if $min < 0;
637	38	50			33	$max = $min + 50 if $max < $min; # Ensure max >= min
638
639	38				43	return ($min, $max);
640						}
641
642						sub _calculate_complexity {
643	38			38	36	my ($self, $features, $pattern) = @_;
644
645	38				21	my $score = 0;
646
647						# Base complexity from pattern length
648	38				33	$score += length($pattern) / 10;
649
650						# Add complexity for features
651	38	100			39	$score += 2 if $features->{has_alternation};
652	38	100			34	$score += 3 if $features->{has_backreferences};
653	38	100			27	$score += 2 if $features->{has_unicode};
654	38	100			34	$score += 2 if $features->{has_lookahead};
655	38	100			32	$score += 2 if $features->{has_lookbehind};
656	38	100			32	$score += 1 if $features->{has_named_groups};
657	38	100			30	$score += 1 if $features->{has_possessive};
658
659						# Classify
660	38	100			57	return 'simple' if $score < 3;
661	15	100			26	return 'moderate' if $score < 7;
662	3	100			6	return 'complex' if $score < 12;
663	1				2	return 'very_complex';
664						}
665
666						sub _build_from_pattern {
667	4270			4270	2905	my ($self, $pattern) = @_;
668
669						# Reset backreferences for each generation
670	4270				2937	$self->{backrefs} = {};
671	4270				2993	$self->{named_refs} = {};
672	4270				2889	$self->{group_counter} = 0;
673
674						# Remove regex delimiters and modifiers
675						# Handle (?^:...), (?i:...), (?-i:...) etc
676	4270				5730	$pattern =~ s/^$\?\^?[iumsx-]:(.)$$/$1/;
677
678						# Remove anchors (they're handled by the regex match itself)
679	4270				2791	$pattern =~ s/^\^//;
680	4270				2678	$pattern =~ s/\$//;
681
682	4270				3266	return $self->_parse_sequence($pattern);
683						}
684
685						sub _parse_sequence {
686	4412			4412	3037	my ($self, $pattern) = @_;
687
688	4412				2671	my $result = '';
689	4412				2573	my $i = 0;
690	4412				2609	my $len = length($pattern);
691
692	4412				3392	while ($i < $len) {
693	5840				3893	my $char = substr($pattern, $i, 1);
694
695	5840	100 100 100 100 50 100 100	67		6042	if ($char eq '\\') {
696						# Escape sequence
697	939				548	$i++;
698	939				597	my $next = substr($pattern, $i, 1);
699
700	939	100 100 100 100 100 100 100 100 100 100 50	67 67		1637	if ($next =~ /[1-9]/) {
701						# Backreference
702	30				16	my $ref_num = $next;
703	30	50			28	if (exists $self->{backrefs}{$ref_num}) {
704	30				26	$result .= $self->{backrefs}{$ref_num};
705						} else {
706	0				0	croak "Backreference \\$ref_num used before group defined";
707						}
708						} elsif ($next eq 'k' && substr($pattern, $i+1, 1) eq '<') {
709						# Named backreference \k<name>
710	7				7	my $end = index($pattern, '>', $i+2);
711	7				5	my $name = substr($pattern, $i+2, $end-$i-2);
712	7	50			8	if (exists $self->{named_refs}{$name}) {
713	7				6	$result .= $self->{named_refs}{$name};
714						} else {
715	0				0	croak "Named backreference \\k<$name> used before group defined";
716						}
717	7				3	$i = $end;
718						} elsif ($next eq 'p' && substr($pattern, $i+1, 1) eq '{') {
719						# Unicode property \p{L}, \p{N}, etc.
720	25				35	my $end = index($pattern, '}', $i+2);
721	25				21	my $prop = substr($pattern, $i+2, $end-$i-2);
722						my ($generated, $new_i) = $self->_handle_quantifier($pattern, $end, sub {
723	95			95	74	$self->_unicode_property_char($prop);
724	25				38	});
725	25				31	$result .= $generated;
726	25				20	$i = $new_i;
727						} elsif ($next eq 'd') {
728	729 2305			2305	949 2252	my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub { int(rand(10)) }, 1);
729	729				728	$result .= $generated;
730	729				473	$i = $new_i;
731						} elsif ($next eq 'w') {
732						my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub {
733	115			115	412	my @chars = ('a'..'z', 'A'..'Z', '0'..'9', '_');
734	115				298	$chars[int(rand(@chars))];
735	35				66	}, 1);
736	35				52	$result .= $generated;
737	35				24	$i = $new_i;
738						} elsif ($next eq 's') {
739	2 2			2	6 4	my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub { ' ' }, 1);
740	2				5	$result .= $generated;
741	2				2	$i = $new_i;
742						} elsif ($next eq 'D') {
743						my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub {
744	6 504 564			6	8 335 452	my @chars = map { chr($_) } grep { chr($_) !~ /\d/ } (33..126);
745	6				26	$chars[int(rand(@chars))];
746	2				4	});
747	2				4	$result .= $generated;
748	2				2	$i = $new_i;
749						} elsif ($next eq 'W') {
750						my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub {
751	6 186 564			6	5 125 492	my @chars = map { chr($_) } grep { chr($_) !~ /\w/ } (33..126);
752	6				22	$chars[int(rand(@chars))];
753	2				7	});
754	2				5	$result .= $generated;
755	2				2	$i = $new_i;
756						} elsif ($next eq 't') {
757	2				2	$result .= "\t";
758						} elsif ($next eq 'n') {
759	2				2	$result .= "\n";
760						} elsif ($next eq 'r') {
761	0				0	$result .= "\r";
762						} else {
763	103				62	$result .= $next;
764						}
765	939				794	$i++;
766						} elsif ($char eq '[') {
767						# Character class
768	3951				3300	my $end = $self->_find_matching_bracket($pattern, $i);
769	3951	50			2963	croak 'Unmatched [' if $end == -1;
770
771	3951				2735	my $class = substr($pattern, $i+1, $end-$i-1);
772						my ($generated, $new_i) = $self->_handle_quantifier($pattern, $end, sub {
773	17212			17212	13191	$self->_random_from_class($class);
774	3951				4973	}, 1);
775	3951				4137	$result .= $generated;
776	3951				3580	$i = $new_i + 1;
777						} elsif ($char eq '(') {
778						# Group - could be various types
779	151				145	my $end = $self->_find_matching_paren($pattern, $i);
780	151	50			127	croak 'Unmatched (' if $end == -1;
781
782	151				132	my $group_content = substr($pattern, $i+1, $end-$i-1);
783
784						# Check for special group types
785	151				88	my $is_capturing = 1;
786	151				106	my $is_lookahead = 0;
787	151				80	my $is_lookbehind = 0;
788	151				94	my $is_negative = 0;
789	151				95	my $group_name = undef;
790
791	151	100 100 100 100 100 100			258	if ($group_content =~ /^\?:/) {
792						# Non-capturing group
793	2				2	$is_capturing = 0;
794	2				2	$group_content = substr($group_content, 2);
795						} elsif ($group_content =~ /^\?<([^>]+)>/) {
796						# Named capture (?<name>...)
797	15				15	$group_name = $1;
798	15				14	$group_content = substr($group_content, length($1) + 3);
799						} elsif ($group_content =~ /^\?=/) {
800						# Positive lookahead (?=...)
801	6				4	$is_lookahead = 1;
802	6				5	$is_capturing = 0;
803	6				5	$group_content = substr($group_content, 2);
804						} elsif ($group_content =~ /^\?!/) {
805						# Negative lookahead (?!...)
806	1				0	$is_lookahead = 1;
807	1				1	$is_negative = 1;
808	1				1	$is_capturing = 0;
809	1				1	$group_content = substr($group_content, 2);
810						} elsif ($group_content =~ /^\?<=/) {
811						# Positive lookbehind (?<=...)
812	1				1	$is_lookbehind = 1;
813	1				1	$is_capturing = 0;
814	1				1	$group_content = substr($group_content, 3);
815						} elsif ($group_content =~ /^\?<!/) {
816						# Negative lookbehind (?<!...)
817	1				2	$is_lookbehind = 1;
818	1				1	$is_negative = 1;
819	1				1	$is_capturing = 0;
820	1				2	$group_content = substr($group_content, 3);
821						}
822
823						# Handle lookaheads/lookbehinds
824	151	100 100			159	if ($is_lookahead) {
825						# For positive lookahead, generate the pattern but don't advance
826						# For negative lookahead, avoid the pattern
827	7	100			6	if (!$is_negative) {
828						# Generate what the lookahead expects but don't consume it
829						# This is a simplification - we just note the constraint
830						}
831						# Lookaheads don't add to the result
832	7				4	$i = $end + 1;
833	7				7	next;
834						} elsif ($is_lookbehind) {
835						# Lookbehinds check what came before
836						# For generation, we can mostly ignore them
837	2				2	$i = $end + 1;
838	2				4	next;
839						}
840
841						# Check for alternation
842	142				90	my $generated;
843	142	100			135	if ($group_content =~ /\\|/) {
844	88				82	$generated = $self->_handle_alternation($group_content);
845						} else {
846	54				55	$generated = $self->_parse_sequence($group_content);
847						}
848
849						# Store backreference if capturing
850	142	100			119	if ($is_capturing) {
851	140				110	$self->{group_counter}++;
852	140				143	$self->{backrefs}{$self->{group_counter}} = $generated;
853
854	140	100			131	if (defined $group_name) {
855	15				15	$self->{named_refs}{$group_name} = $generated;
856						}
857						}
858
859						# Handle quantifier after group (including possessive)
860	142 158			158	172 157	my ($final_generated, $new_i) = $self->_handle_quantifier($pattern, $end, sub { $generated }, 1);
861	142				149	$result .= $final_generated;
862	142				142	$i = $new_i + 1;
863						} elsif ($char eq '.') {
864						# Any character (except newline)
865						my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub {
866	2 188			2	4 141	my @chars = map { chr($_) } (33 .. 126);
867	2				8	$chars[int(rand(@chars))];
868	2				4	});
869	2				4	$result .= $generated;
870	2				2	$i = $new_i + 1;
871						} elsif ($char eq '\|') {
872						# Alternation at top level - just return what we have
873						# (This is handled by _handle_alternation for groups)
874	0				0	last;
875						} elsif ($char =~ /[+*?]/ \|\| $char eq '{') {
876						# Quantifier without preceding element - shouldn't happen in valid regex
877	1				74	croak "$pattern: Quantifier '$char' without preceding element";
878						} elsif ($char =~ /[\w ]/) {
879						# Literal character
880	727 730			730	883 657	my ($generated, $new_i) = $self->_handle_quantifier($pattern, $i, sub { $char });
881	727				691	$result .= $generated;
882	727				625	$i = $new_i + 1;
883						} else {
884						# Other literal characters
885	69				40	$result .= $char;
886	69				54	$i++;
887						}
888						}
889
890	4411				3959	return $result;
891						}
892
893						sub _handle_quantifier {
894	5617			5617	4296	my ($self, $pattern, $pos, $generator, $check_possessive) = @_;
895	5617		100		4659	$check_possessive //= 1; # Default to checking for possessive
896
897	5617				3725	my $next = substr($pattern, $pos + 1, 1);
898	5617				3230	my $is_possessive = 0;
899
900						# Check for possessive quantifier (+)
901	5617	100	67		6192	if ($check_possessive && $pos + 2 < length($pattern)) {
902	3929				2738	my $after_next = substr($pattern, $pos + 2, 1);
903	3929	100	60		4928	if (($next =~ /[+*?]/ \|\| $next eq '}') && $after_next eq '+') {
904	6				5	$is_possessive = 1;
905						}
906						}
907
908	5617	100 100 100 100			4780	if ($next eq '{') {
909	3459				2530	my $end = index($pattern, '}', $pos + 2);
910	3459	50			2444	croak "Unmatched '{' at position $pos in pattern: $pattern" if ($end == -1);
911	3459				2619	my $quant = substr($pattern, $pos + 2, $end - $pos - 2);
912
913						# Check for possessive after }
914	3459	100	75		4101	if ($check_possessive && $end + 1 < length($pattern) && substr($pattern, $end + 1, 1) eq '+') {
915	2				0	$is_possessive = 1;
916	2				2	$end++;
917						}
918
919	3459				2109	my $result = '';
920	3459	100 50 0			3363	if ($quant =~ /^(\d+)$/) {
921						# Exact: {n}
922	3400				4125	$result .= $generator->() for (1 .. $1);
923						} elsif ($quant =~ /^(\d+),(\d+)$/) {
924						# Range: {n,m}
925	59				88	my $count = $1 + int(rand($2 - $1 + 1));
926	59				64	$result .= $generator->() for (1 .. $count);
927						} elsif ($quant =~ /^(\d+),$/) {
928						# Minimum: {n,}
929	0				0	my $count = $1 + int(rand(5));
930	0				0	$result .= $generator->() for (1 .. $count);
931						}
932	3459				3554	return ($result, $end);
933						} elsif ($next eq '+') {
934						# One or more (possessive: ++)
935	22				12	my $actual_end = $pos + 1;
936	22	100			28	if ($is_possessive) {
937	4				4	$actual_end++;
938						}
939	22				49	my $count = 1 + int(rand(5));
940	22				20	my $result = '';
941	22				28	$result .= $generator->() for (1 .. $count);
942	22				31	return ($result, $actual_end);
943						} elsif ($next eq '*') {
944						# Zero or more (possessive: *+)
945	3				3	my $actual_end = $pos + 1;
946	3	100			10	if ($is_possessive) {
947	1				1	$actual_end++;
948						}
949	3				6	my $count = int(rand(6));
950	3				3	my $result = '';
951	3				4	$result .= $generator->() for (1 .. $count);
952	3				6	return ($result, $actual_end);
953						} elsif ($next eq '?') {
954						# Zero or one (possessive: ?+)
955	4				4	my $actual_end = $pos + 1;
956	4	100			7	if ($is_possessive) {
957	1				1	$actual_end++;
958						}
959	4	50			6	my $result = rand() < 0.5 ? $generator->() : '';
960	4				5	return ($result, $actual_end);
961						} else {
962						# No quantifier
963	2129				1480	return ($generator->(), $pos);
964						}
965						}
966
967						sub _handle_alternation {
968	88			88	72	my ($self, $pattern) = @_;
969
970						# Split on \| but respect groups
971	88				46	my @alternatives;
972	88				62	my $current = '';
973	88				47	my $depth = 0;
974
975	88				125	for my $char (split //, $pattern) {
976	1008	100 100 100	100		988	if ($char eq '(') {
977	4				3	$depth++;
978	4				3	$current .= $char;
979						} elsif ($char eq ')') {
980	4				2	$depth--;
981	4				3	$current .= $char;
982						} elsif ($char eq '\|' && $depth == 0) {
983	162				125	push @alternatives, $current;
984	162				120	$current = '';
985						} else {
986	838				527	$current .= $char;
987						}
988						}
989	88	50			113	push @alternatives, $current if length($current);
990
991						# Choose one alternative randomly
992	88				133	my $chosen = $alternatives[int(rand(@alternatives))];
993	88				126	return $self->_parse_sequence($chosen);
994						}
995
996						sub _find_matching_bracket {
997	3951			3951	2790	my ($self, $pattern, $start) = @_;
998
999	3951				2300	my $depth = 0;
1000	3951				3299	for (my $i = $start; $i < length($pattern); $i++) {
1001	25134				15229	my $char = substr($pattern, $i, 1);
1002	25134	100 100	67 50		30873	if ($char eq '[' && ($i == $start \|\| substr($pattern, $i-1, 1) ne '\\')) {
1003	3951				3208	$depth++;
1004						} elsif ($char eq ']' && substr($pattern, $i-1, 1) ne '\\') {
1005	3951				2335	$depth--;
1006	3951	50			3654	return $i if $depth == 0;
1007						}
1008						}
1009	0				0	return -1;
1010						}
1011
1012						sub _find_matching_paren {
1013	151			151	134	my ($self, $pattern, $start) = @_;
1014
1015	151				104	my $depth = 0;
1016	151				179	for (my $i = $start; $i < length($pattern); $i++) {
1017	1750				1174	my $char = substr($pattern, $i, 1);
1018	1750	100			1334	my $prev = $i > 0 ? substr($pattern, $i-1, 1) : '';
1019
1020	1750	100 100	67 67		2385	if ($char eq '(' && $prev ne '\\') {
1021	156				166	$depth++;
1022						} elsif ($char eq ')' && $prev ne '\\') {
1023	156				91	$depth--;
1024	156	100			188	return $i if $depth == 0;
1025						}
1026						}
1027	0				0	return -1;
1028						}
1029
1030						sub _random_from_class {
1031	17212			17212	11122	my ($self, $class) = @_;
1032
1033	17212				9520	my @chars;
1034
1035						# Debugging this regex: qr/!#-'*+\\-\\.\\^_`\|~0-9A-Za-z/
1036						# which used to give this error: 'Argument "#" isn't numeric in range (or flop)'
1037	17212	50			12761	warn "DEBUG: class = '$class', length = ", length($class) if ($ENV{DEBUG_REGEX_GEN});
1038
1039						# Handle negation
1040	17212				10226	my $negate = 0;
1041	17212	100			12847	if (substr($class, 0, 1) eq '^') {
1042	10				6	$negate = 1;
1043	10				7	$class = substr($class, 1);
1044						}
1045
1046						# Parse character class with escape sequences
1047	17212				10083	my $i = 0;
1048	17212				12363	while ($i < length($class)) {
1049	33440				20848	my $char = substr($class, $i, 1);
1050
1051	33440	50			23944	warn "DEBUG: i=$i, char='$char' (ord=", ord($char), ')' if ($ENV{DEBUG_REGEX_GEN});
1052
1053	33440	100 100	100		38601	if ($char eq '\\') {
1054	168				93	$i++;
1055	168				120	my $next = substr($class, $i, 1);
1056	168	50			119	warn "DEBUG: Escaped char: $next" if ($ENV{DEBUG_REGEX_GEN});
1057	168	100 100 50 100	67		172	if ($next eq 'd') {
1058	16				16	push @chars, ('0'..'9');
1059						} elsif ($next eq 'w') {
1060	10				39	push @chars, ('a'..'z', 'A'..'Z', '0'..'9', '_');
1061						} elsif ($next eq 's') {
1062	0				0	push @chars, (' ', "\t", "\n");
1063						} elsif ($next eq 'p' && substr($class, $i+1, 1) eq '{') {
1064						# Unicode property in character class
1065	6				4	my $end = index($class, '}', $i+2);
1066	6				6	my $prop = substr($class, $i+2, $end-$i-2);
1067	6				5	push @chars, $self->_unicode_property_chars($prop);
1068	6				12	$i = $end;
1069						} else {
1070						# Escaped literal character (including \-, \., \^, etc.)
1071	136				88	push @chars, $next;
1072						}
1073						} elsif ($i + 2 < length($class) && substr($class, $i+1, 1) eq '-') {
1074						# Potential range
1075	30564				20683	my $end_char = substr($class, $i+2, 1);
1076
1077						# Check if end is escaped or if this is valid range
1078	30564	50 50	33		33494	if ($end_char eq '\\' \|\| $end_char eq ']') {
1079						# Not a range, dash is literal
1080	0				0	push @chars, $char;
1081						} elsif (ord($end_char) >= ord($char)) {
1082						# Valid range - use ord/chr to avoid quote interpolation issues
1083	30564				17497	my $start_ord = ord($char);
1084	30564				18040	my $end_ord = ord($end_char);
1085	30564 561218				23296 413042	push @chars, map { chr($_) } ($start_ord .. $end_ord);
1086	30564				31281	$i += 2; # Will be incremented again by loop, total +3
1087						} else {
1088						# Invalid range order
1089	0				0	push @chars, $char;
1090						}
1091						} else {
1092	2708				1726	push @chars, $char;
1093						}
1094	33440				26014	$i++;
1095						}
1096
1097	17212	50			12304	warn 'DEBUG: Final chars array has ', scalar(@chars), ' elements' if ($ENV{DEBUG_REGEX_GEN});
1098
1099	17212	100			12036	if ($negate) {
1100	10 100				9 76	my %excluded = map { $_ => 1 } @chars;
1101	10 940 940				12 666 670	@chars = grep { !$excluded{$_} } map { chr($_) } (33 .. 126);
1102						}
1103
1104	17212	50			32526	return @chars ? $chars[int(rand(@chars))] : 'X';
1105						}
1106
1107						sub _unicode_property_char {
1108	95			95	66	my ($self, $prop) = @_;
1109	95				76	my @chars = $self->_unicode_property_chars($prop);
1110	95	50			367	return @chars ? $chars[int(rand(@chars))] : 'X';
1111						}
1112
1113						sub _unicode_property_chars {
1114	101			101	69	my ($self, $prop) = @_;
1115
1116						# Common Unicode properties
1117	101	100 100 100 50 0 0 0 0	0 100 67 67 33 0 0 0		113	if ($prop eq 'L' \|\| $prop eq 'Letter') {
1118						# Letters, skip Ã— and Ã· which are symbols
1119	88 5456				378 4089	return ('a' .. 'z', 'A' .. 'Z', map { chr($_) } ((ord'Ã€')..ord('Ã–'), ord('Ã˜')..ord('Ã¶'), ord('Ã¸')..ord('Ã¿')));
1120						} elsif ($prop eq 'N' \|\| $prop eq 'Number') {
1121						# Numbers
1122						# return ('0' .. '9', map { chr($_) } (ord('â‘ ').. ord('â‘³')));
1123	3				5	return ('0' .. '9');
1124						} elsif ($prop eq 'Lu' \|\| $prop eq 'Uppercase_Letter') {
1125						# Uppercase letters, skip Ã— which is not a letter
1126	6 180				17 163	return ('A' .. 'Z', map { chr($_) } (ord('Ã€') .. ord('Ã–'), ord('Ã˜') .. ord('Ãž')));
1127						} elsif ($prop eq 'Ll' \|\| $prop eq 'Lowercase_Letter') {
1128						# Lowercase letters, skip Ã· which is not a letter
1129	4 124				12 99	return ('a' .. 'z', map { chr($_) } (ord('Ã ') .. ord('Ã¶'), ord('Ã¸') .. ord('Ã¿')));
1130						} elsif ($prop eq 'P' \|\| $prop eq 'Punctuation') {
1131						# Punctuation
1132	0				0	return ('.', ',', '!', '?', ';', ':', '-', 'â€”', 'â€¦');
1133						} elsif ($prop eq 'S' \|\| $prop eq 'Symbol') {
1134						# Symbols
1135	0				0	return ('$', 'â‚¬', 'Â£', 'Â¥', 'Â©', 'Â®', 'â„¢', 'Â°', 'Â±', 'Ã—', 'Ã·');
1136						} elsif ($prop eq 'Z' \|\| $prop eq 'Separator') {
1137						# Separators
1138	0				0	return (' ', "\t", "\n");
1139						} elsif ($prop eq 'Nd' \|\| $prop eq 'Decimal_Number') {
1140						# Decimal numbers
1141	0				0	return ('0'..'9');
1142						} else {
1143						# Unknown property - return letters as default
1144	0				0	return ('a'..'z', 'A'..'Z');
1145						}
1146						}
1147
1148 - 1154						=head2 create_random_string For consistency with L<Data::Random::String>. print Data::Random::String::Matches->create_random_string(length => 3, regex => '\d{3}'), "\n"; =cut
1155
1156						sub create_random_string
1157						{
1158	2			2	630	my $class = shift;
1159	2				4	my $params = Params::Get::get_params(undef, @_);
1160
1161	2				21	my $regex = $params->{'regex'};
1162	2				3	my $length = $params->{'length'};
1163
1164	2				3	return $class->new($regex, $length)->generate();
1165						}
1166
1167 - 1201						=head1 AUTHOR Nigel Horne, C<< <njh at nigelhorne.com> >> =head1 SEE ALSO =over 4 =item * Test coverage report: L<https://nigelhorne.github.io/Data-Random-String-Matches/coverage/> =item * L<String::Random> =item * L<Regexp::Genex> =back =head1 LICENCE AND COPYRIGHT Copyright 2025 Nigel Horne. Usage is subject to licence terms. The licence terms of this software are as follows: =over 4 =item * Personal single user, single computer use: GPL2 =item * All other users (including Commercial, Charity, Educational, Government) must apply in writing for a licence for use from Nigel Horne at the above e-mail. =back =cut
1202
1203						1;