File Coverage

File:	bin/extract-schemas
Coverage:	45.9%

line	stmt	bran	cond	sub	time	code
1						#!/usr/bin/env perl
2
3	5 5 5			5	9519 4 103	use strict;
4	5 5 5			5	10 3 96	use warnings;
5
6	5 5 5			5	1113 16374 161	use Data::Dumper;
7	5 5 5			5	14 6 129	use File::Path qw(make_path);
8	5 5 5			5	9 4 56	use File::Spec;
9	5 5 5			5	1543 22760 10	use Getopt::Long;
10	5 5 5			5	1067 2417 139	use FindBin;
11	5 5 5			5	828 1397 18	use lib "$FindBin::Bin/../lib";
12	5 5 5			5	1326 129065 233	use Pod::Usage;
13
14	5 5 5			5	3200 9 7850	use App::Test::Generator::SchemaExtractor;
15
16 - 114						=head1 NAME extract-schemas - Extract test schemas from Perl modules =head1 SYNOPSIS extract-schemas [options] <module.pm> Options: --output-dir DIR Output directory for schema files (default: schemas/) --strict-pod=off\|warn\|fatal --verbose Show detailed analysis --fuzz Run coverage-guided fuzzing on extracted schemas --fuzz-iters N Iterations per method when fuzzing (default: 100) (no short form, to avoid conflict with --fuzz/-f) --fuzz-all Fuzz all methods, including those with no input schema --corpus-dir DIR Directory to persist fuzz corpora (default: schemas/corpus/) --help Show this help message --man Show full documentation Examples: extract-schemas lib/MyModule.pm extract-schemas --output-dir my_schemas --verbose lib/MyModule.pm extract-schemas --fuzz lib/MyModule.pm extract-schemas --fuzz --fuzz-iters 300 --corpus-dir t/corpus lib/MyModule.pm extract-schemas --fuzz --fuzz-all lib/MyModule.pm =head1 QUICK START Run C<extract-schemas --strict-pod=warn -v --fuzz lib/MyModule.pm> to analyse your module and automatically probe each method with hundreds of fuzzed inputs, looking for crashes caused by inputs that should be valid. Anything suspicious is saved to C<schemas/corpus/>. If genuine bugs are found, run C<fuzz-harness-generator --replay-corpus schemas/corpus/ -o t/fuzz_replay.t> to turn them into regression tests that will fail until you fix the underlying code and pass forever after. Run C<extract-schemas --fuzz> regularly - each run builds on the last, probing deeper into your code each time. Otherwise, for each of the functions in MyModule.pm, C<fuzz-harness-generator -r schemas/function.yml> =head1 DESCRIPTION This tool analyzes a Perl module and generates YAML schema files for each method, suitable for use with L<App::Test::Generator> using the C<fuzz-harness-generator> program which will create the C<.t> file to run through C<prove>. The extractor uses three sources of information: =over 4 =item 1. POD Documentation Parses parameter descriptions from POD to extract types and constraints. =item 2. Code Analysis Analyzes validation patterns in the code (ref checks, length checks, etc.) =item 3. Method Signatures Extracts parameter names from method signatures. =back The tool assigns a confidence level (high/medium/low) to each schema based on how much information it could infer. =head1 FUZZING When C<--fuzz> is specified, the tool will additionally run C<App::Test::Generator::CoverageGuidedFuzzer> against each method after schema extraction. By default all methods with at least one known input parameter are fuzzed, regardless of confidence level. Use C<--fuzz-all> to also attempt fuzzing methods with no input schema (these will use purely random generation). The fuzzer will: =over 4 =item * Load and C<require> the target module at runtime =item * Run coverage-guided fuzzing using the extracted schema as input spec =item * Report any crashes or unexpected errors found =item * Persist a corpus to C<--corpus-dir> for incremental improvement across runs =back Corpus files are named C<< <corpus-dir>/<method>.json >> and are automatically loaded on subsequent runs, so each run builds on the last. =cut
115
116						# ---------------------------------------------------------------------------
117						# Option parsing
118						# ---------------------------------------------------------------------------
119
120	5				383228	my %cli_opts = (
121						help => 0,
122						man => 0,
123						);
124
125	5				10	my %extractor_opts = (
126						output_dir => 'schemas',
127						strict_pod => 'warn',
128						verbose => 0,
129						);
130
131	5				4	my $fuzz = 0;
132	5				3	my $fuzz_all = 0;
133	5				6	my $fuzz_iters = 100;
134	5				5	my $corpus_dir; # default set after output_dir is known
135
136						GetOptions(
137						'output-dir\|o=s' => \$extractor_opts{output_dir},
138						'strict-pod\|s=s' => \$extractor_opts{strict_pod},
139						'verbose\|v' => \$extractor_opts{verbose},
140						'fuzz\|f' => \$fuzz,
141						'fuzz-all' => \$fuzz_all,
142						'fuzz-iters=i' => \$fuzz_iters,
143						'corpus-dir\|c=s' => \$corpus_dir,
144						'help\|h' => \$cli_opts{help},
145						'man\|m' => \$cli_opts{man},
146	5	50			18	) or pod2usage(2);
147
148	5	100			2460	pod2usage(-exitval => 0, -verbose => 1) if $cli_opts{help};
149	4	50			6	pod2usage(-exitval => 0, -verbose => 2) if $cli_opts{man};
150
151	4	100			9	if ($extractor_opts{strict_pod} !~ /^(off\|warn\|fatal)$/) {
152	1				8	die "Invalid --strict-pod value '$extractor_opts{strict_pod}'. Expected off, warn, or fatal";
153						}
154
155	3	100			6	my $input_file = shift @ARGV or pod2usage('Error: No input file specified');
156	2	50			14	die "Error: File not found: $input_file" unless -f $input_file;
157
158						# Default corpus dir sits under the output dir
159	2		33		17	$corpus_dir //= File::Spec->catdir($extractor_opts{output_dir}, 'corpus');
160
161						# ---------------------------------------------------------------------------
162						# Schema extraction
163						# ---------------------------------------------------------------------------
164
165	2				8	print "Extracting schemas from: $input_file\n";
166	2				2	print "Output directory: $extractor_opts{output_dir}\n\n";
167
168	2	100			146	make_path($extractor_opts{output_dir}) unless -d $extractor_opts{output_dir};
169
170	2				7	my $extractor = App::Test::Generator::SchemaExtractor->new(
171						input_file => $input_file,
172						%extractor_opts,
173						);
174
175	2				4	my $schemas = $extractor->extract_all();
176
177						# ---------------------------------------------------------------------------
178						# Optional: coverage-guided fuzzing
179						# ---------------------------------------------------------------------------
180
181	2				2	my %fuzz_results; # method_name => report hashref
182
183	2	50			2	if ($fuzz) {
184	0				0	require App::Test::Generator::CoverageGuidedFuzzer;
185	0	0			0	make_path($corpus_dir) unless -d $corpus_dir;
186
187						# Load the target module once so all methods are callable
188	0				0	my $package = _load_target_module($input_file, $schemas);
189
190						# Try to build a default instance for object method calls.
191						# Most OO modules need a $self as the first argument.
192						# We try new() with no args, then new({}), then give up and fuzz as functions.
193	0				0	my $instance = _try_construct($package);
194	0	0			0	if ($instance) {
195	0				0	print "Constructed $package instance for method calls.\n";
196						} else {
197	0				0	print "Could not construct $package instance; fuzzing as functions.\n";
198						}
199
200	0	0			0	print "Fuzzing with $fuzz_iters iterations per method",
201						($fuzz_all ? ' (all methods)' : ' (methods with known inputs)'),
202						"...\n\n";
203
204	0				0	foreach my $method (sort keys %$schemas) {
205	0				0	my $schema = $schemas->{$method};
206	0		0		0	my $iconf = $schema->{_confidence}{input}{level} // 'low';
207
208	0	0			0	unless ($fuzz_all) {
209						# Skip methods with no input schema at all â€” there is nothing to fuzz
210	0 0	0	0 0		0 0	next if $iconf eq 'none' && !%{ $schema->{input} // {} };
211						}
212
213	0				0	my $sub_ref = $package->can($method);
214	0	0			0	unless ($sub_ref) {
215	0				0	warn " Skipping $method: not callable in $package\n";
216	0				0	next;
217						}
218
219						# Skip constructors and AUTOLOAD â€” not suitable for direct fuzzing
220	0	0			0	if ($method =~ /^(new\|AUTOLOAD\|DESTROY\|import)$/) {
221						print " Skipping $method (constructor/special method)\n"
222	0	0			0	if $extractor_opts{verbose};
223	0				0	next;
224						}
225
226	0				0	my $corpus_file = File::Spec->catfile($corpus_dir, "$method.json");
227
228	0				0	print " Fuzzing $method ($iconf confidence)... ";
229
230	0				0	my $fuzzer = App::Test::Generator::CoverageGuidedFuzzer->new(
231						schema => $schema,
232						target_sub => $sub_ref,
233						instance => $instance,
234						iterations => $fuzz_iters,
235						);
236
237	0	0			0	$fuzzer->load_corpus($corpus_file) if -f $corpus_file;
238
239	0				0	my $report = $fuzzer->run();
240	0				0	$fuzzer->save_corpus($corpus_file);
241
242	0				0	$fuzz_results{$method} = $report;
243
244						printf "%d bugs, %d branches covered\n",
245						$report->{bugs_found},
246	0				0	$report->{branches_covered};
247						}
248
249	0				0	print "\n";
250						}
251
252						# ---------------------------------------------------------------------------
253						# Summary report
254						# ---------------------------------------------------------------------------
255
256	2				3	print '=' x 70, "\n",
257						"EXTRACTION SUMMARY\n",
258						'=' x 70, "\n\n";
259
260	2				5	my %input_confidence_counts = (high => 0, medium => 0, low => 0, none => 0);
261	2				4	my %output_confidence_counts = (high => 0, medium => 0, low => 0, none => 0);
262
263	2				4	foreach my $method (sort keys %$schemas) {
264	2				2	my $schema = $schemas->{$method};
265	2		50		4	my $iconf = $schema->{_confidence}{input}{level} // 'low';
266	2		50		2	my $oconf = $schema->{_confidence}{output}{level} // 'low';
267	2				2	$input_confidence_counts{$iconf}++;
268	2				1	$output_confidence_counts{$oconf}++;
269
270	2 4 2				2 5 4	my $param_count = scalar grep { $_ !~ /^_/ } keys %{ $schema->{input} };
271
272	2				2	my $fuzz_col = '';
273	2	50			2	if (exists $fuzz_results{$method}) {
274	0				0	my $r = $fuzz_results{$method};
275						$fuzz_col = $r->{bugs_found}
276						? sprintf(' BUGS: %d', $r->{bugs_found})
277	0	0			0	: ' fuzz: ok';
278						}
279
280	2				8	printf "%-30s %d params [%s input confidence] [%s output confidence]%s\n",
281						$method, $param_count, uc($iconf), uc($oconf), $fuzz_col;
282						}
283
284	2				3	print "\n";
285	2				6	print 'Total methods: ', (scalar keys %$schemas), "\n";
286	2				2	print " Input:\n";
287	2				3	print " High confidence: $input_confidence_counts{high}\n";
288	2				3	print " Medium confidence: $input_confidence_counts{medium}\n";
289	2				2	print " Low confidence: $input_confidence_counts{low}\n";
290	2				2	print " Output:\n";
291	2				2	print " High confidence: $output_confidence_counts{high}\n";
292	2				2	print " Medium confidence: $output_confidence_counts{medium}\n";
293	2				2	print " Low confidence: $output_confidence_counts{low}\n";
294	2				2	print "\n";
295
296	2	50	33		5	if ($input_confidence_counts{low} > 0 \|\| $input_confidence_counts{medium} > 0) {
297	0				0	print "RECOMMENDATION:\n",
298						"Review the generated schemas in $extractor_opts{output_dir}/\n",
299						"Focus on methods with medium/low confidence ratings.\n\n";
300						}
301
302						# Fuzz bug detail
303	2	50			2	if (%fuzz_results) {
304	0				0	my $total_bugs = 0;
305	0				0	$total_bugs += $_->{bugs_found} for values %fuzz_results;
306
307	0	0			0	if ($total_bugs) {
308	0				0	print '=' x 70, "\n",
309						"FUZZING BUGS FOUND ($total_bugs total)\n",
310						'=' x 70, "\n\n";
311
312	0				0	foreach my $method (sort keys %fuzz_results) {
313	0				0	my $r = $fuzz_results{$method};
314	0	0			0	next unless $r->{bugs_found};
315	0				0	print " $method:\n";
316	0 0				0 0	for my $i (0 .. $#{ $r->{bugs} }) {
317	0				0	my $bug = $r->{bugs}[$i];
318	0	0			0	my $inp = defined($bug->{input}) ? qq("$bug->{input}") : 'undef';
319						printf " Bug %d: input=%-30s error=%s\n",
320	0				0	$i + 1, $inp, $bug->{error};
321						}
322	0				0	print "\n";
323						}
324	0				0	print "Corpora saved to: $corpus_dir/\n\n";
325						} else {
326	0				0	print "Fuzzing complete: no bugs found across ",
327						scalar(keys %fuzz_results), " methods.\n\n";
328						}
329						}
330
331	2	100			3	if ($extractor_opts{verbose}) {
332	1				2	print "Schemas:\n\t", Dumper($schemas);
333						}
334
335	2				255	print "Schema files written to: $extractor_opts{output_dir}/\n";
336
337						# ---------------------------------------------------------------------------
338						# Helper: load the target module so methods become callable
339						# ---------------------------------------------------------------------------
340
341						sub _load_target_module {
342	0			0		my ($input_file, $schemas) = @_;
343
344						# Derive the package name from the first schema entry that has 'module' set
345	0					my ($package) = map { $schemas->{$_}{module} }
346	0 0					grep { $schemas->{$_}{module} }
347						keys %$schemas;
348
349	0	0				die 'Could not determine package name from extracted schemas' unless $package;
350
351						# Add the module's containing lib dir to @INC
352						# Walks up from the file looking for a 'lib' directory
353	0					my $abs = File::Spec->rel2abs($input_file);
354	0					my @dirs = File::Spec->splitdir( (File::Spec->splitpath($abs))[1] );
355
356	0					while (@dirs) {
357	0					my $candidate = File::Spec->catdir(@dirs, 'lib');
358	0	0				if (-d $candidate) {
359	0					lib->import($candidate);
360	0					last;
361						}
362	0					pop @dirs;
363						}
364
365	0	0				eval "require $package"
366						or die "Could not load $package for fuzzing: $@";
367
368	0					return $package;
369						}
370
371						# Try to construct a default instance of the target package for method calls.
372						# Attempts new() with progressively more forgiving argument lists.
373						# Returns the instance on success, undef if nothing works.
374						sub _try_construct {
375	0			0		my ($package) = @_;
376
377	0					for my $args ([], [{}], [undef]) {
378	0 0					my $obj = eval { $package->new(@$args) };
379	0	0				next if $@;
380	0	0	0			next unless defined $obj && ref $obj;
381	0					return $obj;
382						}
383
384	0					return undef;
385						}
386