| File: | bin/extract-schemas |
| Coverage: | 45.9% |
| line | stmt | bran | cond | sub | time | code |
|---|---|---|---|---|---|---|
| 1 | #!/usr/bin/env perl | |||||
| 2 | ||||||
| 3 | 5 5 5 | 9519 4 103 | use strict; | |||
| 4 | 5 5 5 | 10 3 96 | use warnings; | |||
| 5 | ||||||
| 6 | 5 5 5 | 1113 16374 161 | use Data::Dumper; | |||
| 7 | 5 5 5 | 14 6 129 | use File::Path qw(make_path); | |||
| 8 | 5 5 5 | 9 4 56 | use File::Spec; | |||
| 9 | 5 5 5 | 1543 22760 10 | use Getopt::Long; | |||
| 10 | 5 5 5 | 1067 2417 139 | use FindBin; | |||
| 11 | 5 5 5 | 828 1397 18 | use lib "$FindBin::Bin/../lib"; | |||
| 12 | 5 5 5 | 1326 129065 233 | use Pod::Usage; | |||
| 13 | ||||||
| 14 | 5 5 5 | 3200 9 7850 | use App::Test::Generator::SchemaExtractor; | |||
| 15 | ||||||
| 16 - 114 | =head1 NAME extract-schemas - Extract test schemas from Perl modules =head1 SYNOPSIS extract-schemas [options] <module.pm> Options: --output-dir DIR Output directory for schema files (default: schemas/) --strict-pod=off|warn|fatal --verbose Show detailed analysis --fuzz Run coverage-guided fuzzing on extracted schemas --fuzz-iters N Iterations per method when fuzzing (default: 100) (no short form, to avoid conflict with --fuzz/-f) --fuzz-all Fuzz all methods, including those with no input schema --corpus-dir DIR Directory to persist fuzz corpora (default: schemas/corpus/) --help Show this help message --man Show full documentation Examples: extract-schemas lib/MyModule.pm extract-schemas --output-dir my_schemas --verbose lib/MyModule.pm extract-schemas --fuzz lib/MyModule.pm extract-schemas --fuzz --fuzz-iters 300 --corpus-dir t/corpus lib/MyModule.pm extract-schemas --fuzz --fuzz-all lib/MyModule.pm =head1 QUICK START Run C<extract-schemas --strict-pod=warn -v --fuzz lib/MyModule.pm> to analyse your module and automatically probe each method with hundreds of fuzzed inputs, looking for crashes caused by inputs that should be valid. Anything suspicious is saved to C<schemas/corpus/>. If genuine bugs are found, run C<fuzz-harness-generator --replay-corpus schemas/corpus/ -o t/fuzz_replay.t> to turn them into regression tests that will fail until you fix the underlying code and pass forever after. Run C<extract-schemas --fuzz> regularly - each run builds on the last, probing deeper into your code each time. Otherwise, for each of the functions in MyModule.pm, C<fuzz-harness-generator -r schemas/function.yml> =head1 DESCRIPTION This tool analyzes a Perl module and generates YAML schema files for each method, suitable for use with L<App::Test::Generator> using the C<fuzz-harness-generator> program which will create the C<.t> file to run through C<prove>. The extractor uses three sources of information: =over 4 =item 1. POD Documentation Parses parameter descriptions from POD to extract types and constraints. =item 2. Code Analysis Analyzes validation patterns in the code (ref checks, length checks, etc.) =item 3. Method Signatures Extracts parameter names from method signatures. =back The tool assigns a confidence level (high/medium/low) to each schema based on how much information it could infer. =head1 FUZZING When C<--fuzz> is specified, the tool will additionally run C<App::Test::Generator::CoverageGuidedFuzzer> against each method after schema extraction. By default all methods with at least one known input parameter are fuzzed, regardless of confidence level. Use C<--fuzz-all> to also attempt fuzzing methods with no input schema (these will use purely random generation). The fuzzer will: =over 4 =item * Load and C<require> the target module at runtime =item * Run coverage-guided fuzzing using the extracted schema as input spec =item * Report any crashes or unexpected errors found =item * Persist a corpus to C<--corpus-dir> for incremental improvement across runs =back Corpus files are named C<< <corpus-dir>/<method>.json >> and are automatically loaded on subsequent runs, so each run builds on the last. =cut | |||||
| 115 | ||||||
| 116 | # --------------------------------------------------------------------------- | |||||
| 117 | # Option parsing | |||||
| 118 | # --------------------------------------------------------------------------- | |||||
| 119 | ||||||
| 120 | 5 | 383228 | my %cli_opts = ( | |||
| 121 | help => 0, | |||||
| 122 | man => 0, | |||||
| 123 | ); | |||||
| 124 | ||||||
| 125 | 5 | 10 | my %extractor_opts = ( | |||
| 126 | output_dir => 'schemas', | |||||
| 127 | strict_pod => 'warn', | |||||
| 128 | verbose => 0, | |||||
| 129 | ); | |||||
| 130 | ||||||
| 131 | 5 | 4 | my $fuzz = 0; | |||
| 132 | 5 | 3 | my $fuzz_all = 0; | |||
| 133 | 5 | 6 | my $fuzz_iters = 100; | |||
| 134 | 5 | 5 | my $corpus_dir; # default set after output_dir is known | |||
| 135 | ||||||
| 136 | GetOptions( | |||||
| 137 | 'output-dir|o=s' => \$extractor_opts{output_dir}, | |||||
| 138 | 'strict-pod|s=s' => \$extractor_opts{strict_pod}, | |||||
| 139 | 'verbose|v' => \$extractor_opts{verbose}, | |||||
| 140 | 'fuzz|f' => \$fuzz, | |||||
| 141 | 'fuzz-all' => \$fuzz_all, | |||||
| 142 | 'fuzz-iters=i' => \$fuzz_iters, | |||||
| 143 | 'corpus-dir|c=s' => \$corpus_dir, | |||||
| 144 | 'help|h' => \$cli_opts{help}, | |||||
| 145 | 'man|m' => \$cli_opts{man}, | |||||
| 146 | 5 | 18 | ) or pod2usage(2); | |||
| 147 | ||||||
| 148 | 5 | 2460 | pod2usage(-exitval => 0, -verbose => 1) if $cli_opts{help}; | |||
| 149 | 4 | 6 | pod2usage(-exitval => 0, -verbose => 2) if $cli_opts{man}; | |||
| 150 | ||||||
| 151 | 4 | 9 | if ($extractor_opts{strict_pod} !~ /^(off|warn|fatal)$/) { | |||
| 152 | 1 | 8 | die "Invalid --strict-pod value '$extractor_opts{strict_pod}'. Expected off, warn, or fatal"; | |||
| 153 | } | |||||
| 154 | ||||||
| 155 | 3 | 6 | my $input_file = shift @ARGV or pod2usage('Error: No input file specified'); | |||
| 156 | 2 | 14 | die "Error: File not found: $input_file" unless -f $input_file; | |||
| 157 | ||||||
| 158 | # Default corpus dir sits under the output dir | |||||
| 159 | 2 | 17 | $corpus_dir //= File::Spec->catdir($extractor_opts{output_dir}, 'corpus'); | |||
| 160 | ||||||
| 161 | # --------------------------------------------------------------------------- | |||||
| 162 | # Schema extraction | |||||
| 163 | # --------------------------------------------------------------------------- | |||||
| 164 | ||||||
| 165 | 2 | 8 | print "Extracting schemas from: $input_file\n"; | |||
| 166 | 2 | 2 | print "Output directory: $extractor_opts{output_dir}\n\n"; | |||
| 167 | ||||||
| 168 | 2 | 146 | make_path($extractor_opts{output_dir}) unless -d $extractor_opts{output_dir}; | |||
| 169 | ||||||
| 170 | 2 | 7 | my $extractor = App::Test::Generator::SchemaExtractor->new( | |||
| 171 | input_file => $input_file, | |||||
| 172 | %extractor_opts, | |||||
| 173 | ); | |||||
| 174 | ||||||
| 175 | 2 | 4 | my $schemas = $extractor->extract_all(); | |||
| 176 | ||||||
| 177 | # --------------------------------------------------------------------------- | |||||
| 178 | # Optional: coverage-guided fuzzing | |||||
| 179 | # --------------------------------------------------------------------------- | |||||
| 180 | ||||||
| 181 | 2 | 2 | my %fuzz_results; # method_name => report hashref | |||
| 182 | ||||||
| 183 | 2 | 2 | if ($fuzz) { | |||
| 184 | 0 | 0 | require App::Test::Generator::CoverageGuidedFuzzer; | |||
| 185 | 0 | 0 | make_path($corpus_dir) unless -d $corpus_dir; | |||
| 186 | ||||||
| 187 | # Load the target module once so all methods are callable | |||||
| 188 | 0 | 0 | my $package = _load_target_module($input_file, $schemas); | |||
| 189 | ||||||
| 190 | # Try to build a default instance for object method calls. | |||||
| 191 | # Most OO modules need a $self as the first argument. | |||||
| 192 | # We try new() with no args, then new({}), then give up and fuzz as functions. | |||||
| 193 | 0 | 0 | my $instance = _try_construct($package); | |||
| 194 | 0 | 0 | if ($instance) { | |||
| 195 | 0 | 0 | print "Constructed $package instance for method calls.\n"; | |||
| 196 | } else { | |||||
| 197 | 0 | 0 | print "Could not construct $package instance; fuzzing as functions.\n"; | |||
| 198 | } | |||||
| 199 | ||||||
| 200 | 0 | 0 | print "Fuzzing with $fuzz_iters iterations per method", | |||
| 201 | ($fuzz_all ? ' (all methods)' : ' (methods with known inputs)'), | |||||
| 202 | "...\n\n"; | |||||
| 203 | ||||||
| 204 | 0 | 0 | foreach my $method (sort keys %$schemas) { | |||
| 205 | 0 | 0 | my $schema = $schemas->{$method}; | |||
| 206 | 0 | 0 | my $iconf = $schema->{_confidence}{input}{level} // 'low'; | |||
| 207 | ||||||
| 208 | 0 | 0 | unless ($fuzz_all) { | |||
| 209 | # Skip methods with no input schema at all â there is nothing to fuzz | |||||
| 210 | 0 0 | 0 0 | next if $iconf eq 'none' && !%{ $schema->{input} // {} }; | |||
| 211 | } | |||||
| 212 | ||||||
| 213 | 0 | 0 | my $sub_ref = $package->can($method); | |||
| 214 | 0 | 0 | unless ($sub_ref) { | |||
| 215 | 0 | 0 | warn " Skipping $method: not callable in $package\n"; | |||
| 216 | 0 | 0 | next; | |||
| 217 | } | |||||
| 218 | ||||||
| 219 | # Skip constructors and AUTOLOAD â not suitable for direct fuzzing | |||||
| 220 | 0 | 0 | if ($method =~ /^(new|AUTOLOAD|DESTROY|import)$/) { | |||
| 221 | print " Skipping $method (constructor/special method)\n" | |||||
| 222 | 0 | 0 | if $extractor_opts{verbose}; | |||
| 223 | 0 | 0 | next; | |||
| 224 | } | |||||
| 225 | ||||||
| 226 | 0 | 0 | my $corpus_file = File::Spec->catfile($corpus_dir, "$method.json"); | |||
| 227 | ||||||
| 228 | 0 | 0 | print " Fuzzing $method ($iconf confidence)... "; | |||
| 229 | ||||||
| 230 | 0 | 0 | my $fuzzer = App::Test::Generator::CoverageGuidedFuzzer->new( | |||
| 231 | schema => $schema, | |||||
| 232 | target_sub => $sub_ref, | |||||
| 233 | instance => $instance, | |||||
| 234 | iterations => $fuzz_iters, | |||||
| 235 | ); | |||||
| 236 | ||||||
| 237 | 0 | 0 | $fuzzer->load_corpus($corpus_file) if -f $corpus_file; | |||
| 238 | ||||||
| 239 | 0 | 0 | my $report = $fuzzer->run(); | |||
| 240 | 0 | 0 | $fuzzer->save_corpus($corpus_file); | |||
| 241 | ||||||
| 242 | 0 | 0 | $fuzz_results{$method} = $report; | |||
| 243 | ||||||
| 244 | printf "%d bugs, %d branches covered\n", | |||||
| 245 | $report->{bugs_found}, | |||||
| 246 | 0 | 0 | $report->{branches_covered}; | |||
| 247 | } | |||||
| 248 | ||||||
| 249 | 0 | 0 | print "\n"; | |||
| 250 | } | |||||
| 251 | ||||||
| 252 | # --------------------------------------------------------------------------- | |||||
| 253 | # Summary report | |||||
| 254 | # --------------------------------------------------------------------------- | |||||
| 255 | ||||||
| 256 | 2 | 3 | print '=' x 70, "\n", | |||
| 257 | "EXTRACTION SUMMARY\n", | |||||
| 258 | '=' x 70, "\n\n"; | |||||
| 259 | ||||||
| 260 | 2 | 5 | my %input_confidence_counts = (high => 0, medium => 0, low => 0, none => 0); | |||
| 261 | 2 | 4 | my %output_confidence_counts = (high => 0, medium => 0, low => 0, none => 0); | |||
| 262 | ||||||
| 263 | 2 | 4 | foreach my $method (sort keys %$schemas) { | |||
| 264 | 2 | 2 | my $schema = $schemas->{$method}; | |||
| 265 | 2 | 4 | my $iconf = $schema->{_confidence}{input}{level} // 'low'; | |||
| 266 | 2 | 2 | my $oconf = $schema->{_confidence}{output}{level} // 'low'; | |||
| 267 | 2 | 2 | $input_confidence_counts{$iconf}++; | |||
| 268 | 2 | 1 | $output_confidence_counts{$oconf}++; | |||
| 269 | ||||||
| 270 | 2 4 2 | 2 5 4 | my $param_count = scalar grep { $_ !~ /^_/ } keys %{ $schema->{input} }; | |||
| 271 | ||||||
| 272 | 2 | 2 | my $fuzz_col = ''; | |||
| 273 | 2 | 2 | if (exists $fuzz_results{$method}) { | |||
| 274 | 0 | 0 | my $r = $fuzz_results{$method}; | |||
| 275 | $fuzz_col = $r->{bugs_found} | |||||
| 276 | ? sprintf(' BUGS: %d', $r->{bugs_found}) | |||||
| 277 | 0 | 0 | : ' fuzz: ok'; | |||
| 278 | } | |||||
| 279 | ||||||
| 280 | 2 | 8 | printf "%-30s %d params [%s input confidence] [%s output confidence]%s\n", | |||
| 281 | $method, $param_count, uc($iconf), uc($oconf), $fuzz_col; | |||||
| 282 | } | |||||
| 283 | ||||||
| 284 | 2 | 3 | print "\n"; | |||
| 285 | 2 | 6 | print 'Total methods: ', (scalar keys %$schemas), "\n"; | |||
| 286 | 2 | 2 | print " Input:\n"; | |||
| 287 | 2 | 3 | print " High confidence: $input_confidence_counts{high}\n"; | |||
| 288 | 2 | 3 | print " Medium confidence: $input_confidence_counts{medium}\n"; | |||
| 289 | 2 | 2 | print " Low confidence: $input_confidence_counts{low}\n"; | |||
| 290 | 2 | 2 | print " Output:\n"; | |||
| 291 | 2 | 2 | print " High confidence: $output_confidence_counts{high}\n"; | |||
| 292 | 2 | 2 | print " Medium confidence: $output_confidence_counts{medium}\n"; | |||
| 293 | 2 | 2 | print " Low confidence: $output_confidence_counts{low}\n"; | |||
| 294 | 2 | 2 | print "\n"; | |||
| 295 | ||||||
| 296 | 2 | 5 | if ($input_confidence_counts{low} > 0 || $input_confidence_counts{medium} > 0) { | |||
| 297 | 0 | 0 | print "RECOMMENDATION:\n", | |||
| 298 | "Review the generated schemas in $extractor_opts{output_dir}/\n", | |||||
| 299 | "Focus on methods with medium/low confidence ratings.\n\n"; | |||||
| 300 | } | |||||
| 301 | ||||||
| 302 | # Fuzz bug detail | |||||
| 303 | 2 | 2 | if (%fuzz_results) { | |||
| 304 | 0 | 0 | my $total_bugs = 0; | |||
| 305 | 0 | 0 | $total_bugs += $_->{bugs_found} for values %fuzz_results; | |||
| 306 | ||||||
| 307 | 0 | 0 | if ($total_bugs) { | |||
| 308 | 0 | 0 | print '=' x 70, "\n", | |||
| 309 | "FUZZING BUGS FOUND ($total_bugs total)\n", | |||||
| 310 | '=' x 70, "\n\n"; | |||||
| 311 | ||||||
| 312 | 0 | 0 | foreach my $method (sort keys %fuzz_results) { | |||
| 313 | 0 | 0 | my $r = $fuzz_results{$method}; | |||
| 314 | 0 | 0 | next unless $r->{bugs_found}; | |||
| 315 | 0 | 0 | print " $method:\n"; | |||
| 316 | 0 0 | 0 0 | for my $i (0 .. $#{ $r->{bugs} }) { | |||
| 317 | 0 | 0 | my $bug = $r->{bugs}[$i]; | |||
| 318 | 0 | 0 | my $inp = defined($bug->{input}) ? qq("$bug->{input}") : 'undef'; | |||
| 319 | printf " Bug %d: input=%-30s error=%s\n", | |||||
| 320 | 0 | 0 | $i + 1, $inp, $bug->{error}; | |||
| 321 | } | |||||
| 322 | 0 | 0 | print "\n"; | |||
| 323 | } | |||||
| 324 | 0 | 0 | print "Corpora saved to: $corpus_dir/\n\n"; | |||
| 325 | } else { | |||||
| 326 | 0 | 0 | print "Fuzzing complete: no bugs found across ", | |||
| 327 | scalar(keys %fuzz_results), " methods.\n\n"; | |||||
| 328 | } | |||||
| 329 | } | |||||
| 330 | ||||||
| 331 | 2 | 3 | if ($extractor_opts{verbose}) { | |||
| 332 | 1 | 2 | print "Schemas:\n\t", Dumper($schemas); | |||
| 333 | } | |||||
| 334 | ||||||
| 335 | 2 | 255 | print "Schema files written to: $extractor_opts{output_dir}/\n"; | |||
| 336 | ||||||
| 337 | # --------------------------------------------------------------------------- | |||||
| 338 | # Helper: load the target module so methods become callable | |||||
| 339 | # --------------------------------------------------------------------------- | |||||
| 340 | ||||||
| 341 | sub _load_target_module { | |||||
| 342 | 0 | my ($input_file, $schemas) = @_; | ||||
| 343 | ||||||
| 344 | # Derive the package name from the first schema entry that has 'module' set | |||||
| 345 | 0 | my ($package) = map { $schemas->{$_}{module} } | ||||
| 346 | 0 0 | grep { $schemas->{$_}{module} } | ||||
| 347 | keys %$schemas; | |||||
| 348 | ||||||
| 349 | 0 | die 'Could not determine package name from extracted schemas' unless $package; | ||||
| 350 | ||||||
| 351 | # Add the module's containing lib dir to @INC | |||||
| 352 | # Walks up from the file looking for a 'lib' directory | |||||
| 353 | 0 | my $abs = File::Spec->rel2abs($input_file); | ||||
| 354 | 0 | my @dirs = File::Spec->splitdir( (File::Spec->splitpath($abs))[1] ); | ||||
| 355 | ||||||
| 356 | 0 | while (@dirs) { | ||||
| 357 | 0 | my $candidate = File::Spec->catdir(@dirs, 'lib'); | ||||
| 358 | 0 | if (-d $candidate) { | ||||
| 359 | 0 | lib->import($candidate); | ||||
| 360 | 0 | last; | ||||
| 361 | } | |||||
| 362 | 0 | pop @dirs; | ||||
| 363 | } | |||||
| 364 | ||||||
| 365 | 0 | eval "require $package" | ||||
| 366 | or die "Could not load $package for fuzzing: $@"; | |||||
| 367 | ||||||
| 368 | 0 | return $package; | ||||
| 369 | } | |||||
| 370 | ||||||
| 371 | # Try to construct a default instance of the target package for method calls. | |||||
| 372 | # Attempts new() with progressively more forgiving argument lists. | |||||
| 373 | # Returns the instance on success, undef if nothing works. | |||||
| 374 | sub _try_construct { | |||||
| 375 | 0 | my ($package) = @_; | ||||
| 376 | ||||||
| 377 | 0 | for my $args ([], [{}], [undef]) { | ||||
| 378 | 0 0 | my $obj = eval { $package->new(@$args) }; | ||||
| 379 | 0 | next if $@; | ||||
| 380 | 0 | next unless defined $obj && ref $obj; | ||||
| 381 | 0 | return $obj; | ||||
| 382 | } | |||||
| 383 | ||||||
| 384 | 0 | return undef; | ||||
| 385 | } | |||||
| 386 | ||||||