| File: | bin/fuzz-harness-generator |
| Coverage: | 38.6% |
| line | stmt | bran | cond | sub | time | code |
|---|---|---|---|---|---|---|
| 1 | #!/usr/bin/env perl | |||||
| 2 | ||||||
| 3 | 6 6 6 | 11727 6 96 | use strict; | |||
| 4 | 6 6 6 | 8 4 116 | use warnings; | |||
| 5 | 6 6 6 | 961 48194 9 | use autodie qw(:all); | |||
| 6 | ||||||
| 7 | 6 6 6 | 59214 9 130 | use App::Test::Generator; | |||
| 8 | 6 6 6 | 15 3 49 | use File::Spec; | |||
| 9 | 6 6 6 | 7 5 155 | use File::Temp; | |||
| 10 | 6 6 6 | 1886 27450 13 | use Getopt::Long qw(GetOptions); | |||
| 11 | 6 6 6 | 1579 110509 299 | use Pod::Usage; | |||
| 12 | 6 6 6 | 17 4 5004 | use YAML::XS qw(LoadFile); | |||
| 13 | ||||||
| 14 - 92 | =head1 NAME fuzz-harness-generator - Generate fuzzing + corpus-based test harnesses from test schemas =head1 SYNOPSIS fuzz-harness-generator [-r] [-o output_file] input.yaml fuzz-harness-generator --dry-run input.yaml fuzz-harness-generator --replay-corpus schemas/corpus/ -o t/fuzz_replay.t fuzz-harness-generator --replay-corpus schemas/corpus/translate.json -o t/fuzz_replay.t =head1 DESCRIPTION This tool generates a test file that fuzzes and validates a target module's function or method, using both randomized fuzz cases and a static corpus cases (Perl or YAML). It can also generate regression test files from corpus JSON files previously written by C<extract-schemas --fuzz>, using C<--replay-corpus>. A starter C<input.yaml> can be created using C<extract-schemas> which is also in this package. =head1 OPTIONS =over 4 =item B<--help> Show this help. =item B<--input> The input configuration file =item B<--output> The (optional) output file. =item B<--dry-run> Validate the input configuration and schema extraction without writing any output files or running tests. =item B<--run> Call C<prove> on the output file. C<fuzz-harness-generator -r t/conf/data_text_append.conf> will, therefore, dynamically create and run tests on the C<append> method of L<Data::Text> =item B<--replay-corpus> PATH Instead of generating a fuzz harness, generate a regression test file from one or more corpus JSON files previously written by C<extract-schemas --fuzz>. PATH may be either: =over 4 =item * A single corpus file, e.g. C<schemas/corpus/translate.json> =item * A directory, e.g. C<schemas/corpus/> â all C<*.json> files in that directory will be included =back The generated test file contains one failing test per bug recorded in the corpus. Each test calls the target method with the exact input that previously caused a crash and expects it B<not> to die. Tests will be red until the underlying bug is fixed, at which point they go green and stay green â acting as permanent regression tests. Only corpus entries with recorded bugs are included. Clean corpus entries (inputs that did not cause a bug) are ignored. =item B<--version> Prints the version of L<App::Test::Generator> =back =cut | |||||
| 93 | ||||||
| 94 | 6 | 430539 | my $infile; | |||
| 95 | my $outfile; | |||||
| 96 | 6 | 0 | my $help; | |||
| 97 | 6 | 0 | my $run; | |||
| 98 | 6 | 0 | my $verbose; | |||
| 99 | 6 | 0 | my $version; | |||
| 100 | 6 | 0 | my $dry_run; | |||
| 101 | 6 | 0 | my $replay_corpus; | |||
| 102 | ||||||
| 103 | 6 | 13 | Getopt::Long::Configure('bundling'); | |||
| 104 | ||||||
| 105 | 6 | 108 | GetOptions( | |||
| 106 | 'help|h' => \$help, | |||||
| 107 | 'input|i=s' => \$infile, | |||||
| 108 | 'dry-run|n' => \$dry_run, | |||||
| 109 | 'output|o=s' => \$outfile, | |||||
| 110 | 'run|r' => \$run, | |||||
| 111 | 'verbose|v' => \$verbose, | |||||
| 112 | 'version|V' => \$version, | |||||
| 113 | 'replay-corpus|R=s' => \$replay_corpus, | |||||
| 114 | ) or pod2usage(2); | |||||
| 115 | ||||||
| 116 | 6 | 3007 | pod2usage(-exitval => 0, -verbose => 1) if($help); | |||
| 117 | ||||||
| 118 | 5 | 6 | if($version) { | |||
| 119 | 1 | 5 | print $App::Test::Generator::VERSION, "\n"; | |||
| 120 | 1 | 28 | exit 0; | |||
| 121 | } | |||||
| 122 | ||||||
| 123 | # --------------------------------------------------------------------------- | |||||
| 124 | # --replay-corpus mode: generate a regression .t from corpus bug entries | |||||
| 125 | # --------------------------------------------------------------------------- | |||||
| 126 | ||||||
| 127 | 4 | 4 | if($replay_corpus) { | |||
| 128 | 0 | 0 | pod2usage('--replay-corpus cannot be combined with --dry-run') if $dry_run; | |||
| 129 | 0 | 0 | pod2usage('--replay-corpus cannot be combined with --input') if $infile; | |||
| 130 | ||||||
| 131 | 0 | 0 | my @corpus_files = _collect_corpus_files($replay_corpus); | |||
| 132 | 0 | 0 | die "No corpus JSON files found at: $replay_corpus\n" unless @corpus_files; | |||
| 133 | ||||||
| 134 | 0 | 0 | my $tap = _generate_replay_tap(@corpus_files); | |||
| 135 | ||||||
| 136 | 0 | 0 | if($outfile) { | |||
| 137 | 0 | 0 | open(my $fh, '>', $outfile) | |||
| 138 | or die "Cannot write to $outfile: $!"; | |||||
| 139 | 0 | 0 | print $fh $tap; | |||
| 140 | 0 | 0 | close $fh; | |||
| 141 | 0 | 0 | chmod 0755, $outfile; | |||
| 142 | 0 | 0 | print "Replay test written to: $outfile\n"; | |||
| 143 | 0 | 0 | if($run) { | |||
| 144 | 0 | 0 | exit system('prove', '-l', $outfile) >> 8; | |||
| 145 | } | |||||
| 146 | } else { | |||||
| 147 | 0 | 0 | print $tap; | |||
| 148 | } | |||||
| 149 | 0 | 0 | exit 0; | |||
| 150 | } | |||||
| 151 | ||||||
| 152 | 4 | 12 | if($infile && @ARGV) { | |||
| 153 | 0 | 0 | pod2usage('Specify input file either as argument or via --input, not both'); | |||
| 154 | } | |||||
| 155 | ||||||
| 156 | 4 | 4 | if($infile) { | |||
| 157 | 4 4 | 4 8 | my $schema = eval { LoadFile($infile) }; | |||
| 158 | 4 | 512 | if($@) { | |||
| 159 | 1 | 3 | die "Cannot parse '$infile' as YAML: $@"; | |||
| 160 | } | |||||
| 161 | 3 | 5 | unless(ref($schema) eq 'HASH') { | |||
| 162 | 0 | 0 | die "Input file '$infile' does not contain a YAML hash"; | |||
| 163 | } | |||||
| 164 | 3 | 13 | unless($schema->{function}) { | |||
| 165 | 0 | 0 | die "Input file '$infile' is missing required 'function' key"; | |||
| 166 | } | |||||
| 167 | } | |||||
| 168 | ||||||
| 169 | 3 | 6 | $infile ||= shift @ARGV or pod2usage('No config file given'); | |||
| 170 | ||||||
| 171 | 3 | 5 | if($dry_run && $run) { | |||
| 172 | 0 | 0 | pod2usage('--dry-run cannot be used with --run'); | |||
| 173 | } | |||||
| 174 | ||||||
| 175 | 3 | 5 | if($dry_run && $outfile) { | |||
| 176 | 1 | 7 | warn '--dry-run specified; --output will be ignored'; | |||
| 177 | } | |||||
| 178 | ||||||
| 179 | 3 | 166 | if($verbose) { | |||
| 180 | 0 | 0 | $ENV{'TEST_VERBOSE'} = 1; | |||
| 181 | } | |||||
| 182 | ||||||
| 183 | 3 | 3 | if($run && !$outfile) { | |||
| 184 | 0 | 0 | my ($fh, $tmp) = File::Temp::tempfile(); | |||
| 185 | 0 | 0 | close $fh; | |||
| 186 | ||||||
| 187 | 0 | 0 | App::Test::Generator->generate($infile, $tmp); | |||
| 188 | ||||||
| 189 | 0 | 0 | exit system('prove', '-l', $tmp) >> 8; | |||
| 190 | } | |||||
| 191 | ||||||
| 192 | 3 | 10 | if($dry_run) { | |||
| 193 | 2 | 4 | my ($fh, $tmp) = File::Temp::tempfile(); | |||
| 194 | 2 | 496 | close $fh; | |||
| 195 | ||||||
| 196 | eval { | |||||
| 197 | 2 | 9 | App::Test::Generator->generate($infile, $tmp); | |||
| 198 | 2 | 84 | 1; | |||
| 199 | 2 | 806 | } or do { | |||
| 200 | 0 | 0 | die "Dry-run failed for $infile: $@"; | |||
| 201 | }; | |||||
| 202 | ||||||
| 203 | 2 | 4 | unlink $tmp; | |||
| 204 | 2 | 684 | print "Dry-run OK: $infile parsed and validated successfully\n"; | |||
| 205 | 2 | 66 | exit 0; | |||
| 206 | } elsif($outfile && -e $outfile && !$run) { | |||||
| 207 | 0 | 0 | warn "Overwriting existing file: $outfile"; | |||
| 208 | } | |||||
| 209 | ||||||
| 210 | 1 | 4 | App::Test::Generator->generate($infile, $outfile); | |||
| 211 | ||||||
| 212 | 1 | 38 | if($outfile) { | |||
| 213 | 1 | 1 | chmod 0755, $outfile if($outfile =~ /\.(pl|cgi)$/); | |||
| 214 | 1 | 1 | if($run) { | |||
| 215 | # Use list form to avoid shell interpolation of $outfile | |||||
| 216 | 0 | 0 | system('prove', '-l', $outfile); | |||
| 217 | } | |||||
| 218 | } | |||||
| 219 | ||||||
| 220 | 1 | 25 | exit 0; | |||
| 221 | ||||||
| 222 | # --------------------------------------------------------------------------- | |||||
| 223 | # Helpers for --replay-corpus | |||||
| 224 | # --------------------------------------------------------------------------- | |||||
| 225 | ||||||
| 226 | # -------------------------------------------------- | |||||
| 227 | # _collect_corpus_files | |||||
| 228 | # | |||||
| 229 | # Purpose: Collect the list of corpus JSON files | |||||
| 230 | # to process for --replay-corpus mode. | |||||
| 231 | # Accepts either a single file path or | |||||
| 232 | # a directory, returning all *.json files | |||||
| 233 | # found in the directory case. | |||||
| 234 | # | |||||
| 235 | # Entry: $path - filesystem path to either a | |||||
| 236 | # single .json file or a directory | |||||
| 237 | # containing .json files. | |||||
| 238 | # | |||||
| 239 | # Exit: Returns a sorted list of file paths. | |||||
| 240 | # Returns an empty list if the path does | |||||
| 241 | # not exist or contains no .json files. | |||||
| 242 | # | |||||
| 243 | # Side effects: None. | |||||
| 244 | # | |||||
| 245 | # Notes: Directory globbing matches only *.json | |||||
| 246 | # files at the top level of the directory; | |||||
| 247 | # subdirectories are not recursed into. | |||||
| 248 | # -------------------------------------------------- | |||||
| 249 | sub _collect_corpus_files { | |||||
| 250 | 0 | my ($path) = @_; | ||||
| 251 | ||||||
| 252 | 0 | if(-f $path) { | ||||
| 253 | 0 | return ($path); | ||||
| 254 | } elsif(-d $path) { | |||||
| 255 | 0 | my @files = glob(File::Spec->catfile($path, '*.json')); | ||||
| 256 | 0 | return sort @files; | ||||
| 257 | } | |||||
| 258 | ||||||
| 259 | 0 | return (); | ||||
| 260 | } | |||||
| 261 | ||||||
| 262 | # -------------------------------------------------- | |||||
| 263 | # _generate_replay_tap | |||||
| 264 | # | |||||
| 265 | # Purpose: Read one or more corpus JSON files and | |||||
| 266 | # produce a complete .t file as a string. | |||||
| 267 | # Each bug entry in the corpus becomes | |||||
| 268 | # one lives_ok test that calls the target | |||||
| 269 | # method with the exact input that | |||||
| 270 | # previously caused a crash, asserting | |||||
| 271 | # that it no longer dies. | |||||
| 272 | # | |||||
| 273 | # Entry: @corpus_files - list of paths to corpus | |||||
| 274 | # JSON files as returned | |||||
| 275 | # by _collect_corpus_files. | |||||
| 276 | # | |||||
| 277 | # Exit: Returns the complete .t file content as | |||||
| 278 | # a string. Never returns undef. | |||||
| 279 | # Returns a skip_all plan if no bugs are | |||||
| 280 | # found across all corpus files. | |||||
| 281 | # | |||||
| 282 | # Side effects: Reads corpus JSON files from disk. | |||||
| 283 | # Attempts to load JSON::MaybeXS or | |||||
| 284 | # JSON via block eval. | |||||
| 285 | # | |||||
| 286 | # Notes: Corpus files that cannot be parsed are | |||||
| 287 | # skipped with a warning rather than | |||||
| 288 | # aborting the entire run. | |||||
| 289 | # Clean corpus entries (those without | |||||
| 290 | # recorded bugs) are silently ignored â | |||||
| 291 | # only entries with a 'bugs' array are | |||||
| 292 | # processed. | |||||
| 293 | # The module name for each test is | |||||
| 294 | # inferred from the YAML schema file | |||||
| 295 | # alongside the corpus file via | |||||
| 296 | # _infer_module_from_schema. Falls back | |||||
| 297 | # to 'UNKNOWN::Module' if not found. | |||||
| 298 | # -------------------------------------------------- | |||||
| 299 | sub _generate_replay_tap { | |||||
| 300 | 0 | my (@corpus_files) = @_; | ||||
| 301 | ||||||
| 302 | # Prefer JSON::MaybeXS for correctness; fall back to JSON | |||||
| 303 | 0 | my $json_module; | ||||
| 304 | 0 | for my $mod (qw(JSON::MaybeXS JSON)) { | ||||
| 305 | 0 0 0 | eval { require $mod; 1 } and $json_module = $mod and last; | ||||
| 306 | } | |||||
| 307 | 0 | die "No JSON module available; install JSON or JSON::MaybeXS\n" | ||||
| 308 | unless $json_module; | |||||
| 309 | ||||||
| 310 | # Collect all bugs across all corpus files into a flat list | |||||
| 311 | 0 | my @tests; | ||||
| 312 | ||||||
| 313 | 0 | for my $file (@corpus_files) { | ||||
| 314 | 0 | open(my $fh, '<', $file) | ||||
| 315 | or die "Cannot read $file: $!"; | |||||
| 316 | 0 | my $data = eval { | ||||
| 317 | 0 0 0 | $json_module->new->decode(do { local $/; <$fh> }) | ||||
| 318 | }; | |||||
| 319 | 0 | close $fh; | ||||
| 320 | ||||||
| 321 | 0 | if($@) { | ||||
| 322 | 0 | warn "Skipping $file: could not parse JSON: $@\n"; | ||||
| 323 | 0 | next; | ||||
| 324 | } | |||||
| 325 | ||||||
| 326 | 0 | my $bugs = $data->{'bugs'} // []; | ||||
| 327 | 0 0 | next unless @{$bugs}; | ||||
| 328 | ||||||
| 329 | # Derive method name from filename: translate.json -> translate | |||||
| 330 | 0 | my (undef, undef, $fname) = File::Spec->splitpath($file); | ||||
| 331 | 0 | (my $method = $fname) =~ s/\.json$//; | ||||
| 332 | ||||||
| 333 | # Look up the module name from the companion schema file; | |||||
| 334 | # fall back to a placeholder if the schema cannot be found | |||||
| 335 | 0 | my $module = _infer_module_from_schema($file, $method) | ||||
| 336 | // 'UNKNOWN::Module'; | |||||
| 337 | ||||||
| 338 | 0 0 | for my $bug (@{$bugs}) { | ||||
| 339 | push @tests, { | |||||
| 340 | module => $module, | |||||
| 341 | method => $method, | |||||
| 342 | input => $bug->{'input'}, | |||||
| 343 | 0 | error => $bug->{'error'}, | ||||
| 344 | file => $file, | |||||
| 345 | }; | |||||
| 346 | } | |||||
| 347 | } | |||||
| 348 | ||||||
| 349 | # Build the .t header â include Test::Exception up front since | |||||
| 350 | # lives_ok is always needed when there are tests to emit | |||||
| 351 | 0 | my $t = <<'HEADER'; | ||||
| 352 | #!/usr/bin/env perl | |||||
| 353 | # Auto-generated by fuzz-harness-generator --replay-corpus | |||||
| 354 | # DO NOT EDIT - regenerate from corpus files instead | |||||
| 355 | use strict; | |||||
| 356 | use warnings; | |||||
| 357 | use Test::More; | |||||
| 358 | use Test::Exception; | |||||
| 359 | HEADER | |||||
| 360 | ||||||
| 361 | 0 | my $test_count = scalar @tests; | ||||
| 362 | ||||||
| 363 | 0 | if($test_count == 0) { | ||||
| 364 | 0 | $t .= "\nplan skip_all => 'No bugs recorded in corpus files';\n"; | ||||
| 365 | 0 | return $t; | ||||
| 366 | } | |||||
| 367 | ||||||
| 368 | # Emit one use statement per unique module (excluding the placeholder) | |||||
| 369 | 0 0 | my %modules = map { $_->{'module'} => 1 } @tests; | ||||
| 370 | 0 | for my $mod (sort keys %modules) { | ||||
| 371 | 0 | next if $mod eq 'UNKNOWN::Module'; | ||||
| 372 | 0 | $t .= "use $mod;\n"; | ||||
| 373 | } | |||||
| 374 | ||||||
| 375 | 0 | $t .= "\nplan tests => $test_count;\n\n"; | ||||
| 376 | ||||||
| 377 | 0 | for my $i (0 .. $#tests) { | ||||
| 378 | 0 | my $test = $tests[$i]; | ||||
| 379 | 0 | my $n = $i + 1; | ||||
| 380 | 0 | my $input = _format_input($test->{'input'}); | ||||
| 381 | 0 | my $label = "$test->{'method'} does not die on input from $test->{'file'}"; | ||||
| 382 | ||||||
| 383 | # Flatten and escape the original error for use as a comment | |||||
| 384 | 0 | (my $orig_error = $test->{'error'} // '') =~ s/\n/ /g; | ||||
| 385 | 0 | $orig_error =~ s/'/\\'/g; | ||||
| 386 | ||||||
| 387 | 0 | $t .= "# Corpus bug: $orig_error\n"; | ||||
| 388 | 0 | $t .= "lives_ok { $test->{'module'}\->$test->{'method'}($input) }\n"; | ||||
| 389 | 0 | $t .= " '$label';\n\n"; | ||||
| 390 | } | |||||
| 391 | ||||||
| 392 | 0 | return $t; | ||||
| 393 | } | |||||
| 394 | ||||||
| 395 | # -------------------------------------------------- | |||||
| 396 | # _format_input | |||||
| 397 | # | |||||
| 398 | # Purpose: Format a scalar input value as a Perl | |||||
| 399 | # literal string suitable for embedding | |||||
| 400 | # directly in generated test source code. | |||||
| 401 | # | |||||
| 402 | # Entry: $input - the input value to format. | |||||
| 403 | # May be undef, a numeric string, | |||||
| 404 | # or an arbitrary string. | |||||
| 405 | # | |||||
| 406 | # Exit: Returns a Perl literal string: | |||||
| 407 | # 'undef' if $input is undef | |||||
| 408 | # bare number if $input looks numeric | |||||
| 409 | # single-quoted string otherwise, with | |||||
| 410 | # backslashes and single quotes escaped. | |||||
| 411 | # | |||||
| 412 | # Side effects: None. | |||||
| 413 | # | |||||
| 414 | # Notes: Only scalar inputs are handled â corpus | |||||
| 415 | # entries with arrayref or hashref inputs | |||||
| 416 | # are not currently supported and will be | |||||
| 417 | # formatted as a single-quoted string of | |||||
| 418 | # the stringified reference, which will | |||||
| 419 | # not reproduce the original input. | |||||
| 420 | # -------------------------------------------------- | |||||
| 421 | sub _format_input { | |||||
| 422 | 0 | my ($input) = @_; | ||||
| 423 | ||||||
| 424 | 0 | return 'undef' unless defined $input; | ||||
| 425 | ||||||
| 426 | # Emit bare numeric literals without quoting | |||||
| 427 | 0 | return $input if $input =~ /^-?(?:\d+\.?\d*|\.\d+)$/; | ||||
| 428 | ||||||
| 429 | # Escape backslashes first, then single quotes, to avoid | |||||
| 430 | # double-escaping when both appear in the same string | |||||
| 431 | 0 | (my $escaped = $input) =~ s/\\/\\\\/g; | ||||
| 432 | 0 | $escaped =~ s/'/\\'/g; | ||||
| 433 | ||||||
| 434 | 0 | return "'$escaped'"; | ||||
| 435 | } | |||||
| 436 | ||||||
| 437 | # -------------------------------------------------- | |||||
| 438 | # _infer_module_from_schema | |||||
| 439 | # | |||||
| 440 | # Purpose: Attempt to determine the Perl module | |||||
| 441 | # name for a given corpus method by | |||||
| 442 | # locating and reading the companion YAML | |||||
| 443 | # schema file that sits alongside the | |||||
| 444 | # corpus directory. | |||||
| 445 | # | |||||
| 446 | # Entry: $corpus_file - path to the corpus JSON | |||||
| 447 | # file, e.g. | |||||
| 448 | # schemas/corpus/translate.json | |||||
| 449 | # $method - the method name derived | |||||
| 450 | # from the corpus filename, | |||||
| 451 | # e.g. 'translate' | |||||
| 452 | # | |||||
| 453 | # Exit: Returns the module name string if found, | |||||
| 454 | # or undef if no companion schema file | |||||
| 455 | # exists or the schema contains no | |||||
| 456 | # 'module:' line. | |||||
| 457 | # | |||||
| 458 | # Side effects: Reads schema files from disk. | |||||
| 459 | # | |||||
| 460 | # Notes: The corpus is expected to live one | |||||
| 461 | # directory below the schemas directory, | |||||
| 462 | # e.g. schemas/corpus/ alongside | |||||
| 463 | # schemas/translate.yaml. This function | |||||
| 464 | # walks up one level from the corpus | |||||
| 465 | # directory to find the schema. | |||||
| 466 | # Both .yaml and .yml extensions are | |||||
| 467 | # tried, in that order. | |||||
| 468 | # -------------------------------------------------- | |||||
| 469 | sub _infer_module_from_schema { | |||||
| 470 | 0 | my ($corpus_file, $method) = @_; | ||||
| 471 | ||||||
| 472 | 0 | my (undef, $corpus_dir) = File::Spec->splitpath($corpus_file); | ||||
| 473 | ||||||
| 474 | # Walk up one directory from corpus/ to reach the schemas/ dir | |||||
| 475 | 0 | my $schema_dir = File::Spec->catdir($corpus_dir, File::Spec->updir()); | ||||
| 476 | ||||||
| 477 | 0 | for my $ext (qw(yaml yml)) { | ||||
| 478 | 0 | my $schema_file = File::Spec->catfile($schema_dir, "$method.$ext"); | ||||
| 479 | 0 | next unless -f $schema_file; | ||||
| 480 | ||||||
| 481 | 0 | open(my $fh, '<', $schema_file) or next; | ||||
| 482 | 0 | while(<$fh>) { | ||||
| 483 | 0 | if(/^module:\s*(\S+)/) { | ||||
| 484 | 0 | close $fh; | ||||
| 485 | 0 | return $1; | ||||
| 486 | } | |||||
| 487 | } | |||||
| 488 | 0 | close $fh; | ||||
| 489 | } | |||||
| 490 | ||||||
| 491 | 0 | return undef; | ||||
| 492 | } | |||||
| 493 | ||||||