File Coverage

File:bin/extract-schemas
Coverage:45.9%

linestmtbrancondsubtimecode
1#!/usr/bin/env perl
2
3
5
5
5
9519
4
103
use strict;
4
5
5
5
10
3
96
use warnings;
5
6
5
5
5
1113
16374
161
use Data::Dumper;
7
5
5
5
14
6
129
use File::Path qw(make_path);
8
5
5
5
9
4
56
use File::Spec;
9
5
5
5
1543
22760
10
use Getopt::Long;
10
5
5
5
1067
2417
139
use FindBin;
11
5
5
5
828
1397
18
use lib "$FindBin::Bin/../lib";
12
5
5
5
1326
129065
233
use Pod::Usage;
13
14
5
5
5
3200
9
7850
use App::Test::Generator::SchemaExtractor;
15
16 - 114
=head1 NAME

extract-schemas - Extract test schemas from Perl modules

=head1 SYNOPSIS

    extract-schemas [options] <module.pm>

    Options:
      --output-dir DIR    Output directory for schema files (default: schemas/)
      --strict-pod=off|warn|fatal
      --verbose           Show detailed analysis
      --fuzz              Run coverage-guided fuzzing on extracted schemas
      --fuzz-iters N      Iterations per method when fuzzing (default: 100)
                          (no short form, to avoid conflict with --fuzz/-f)
      --fuzz-all          Fuzz all methods, including those with no input schema
      --corpus-dir DIR    Directory to persist fuzz corpora (default: schemas/corpus/)
      --help              Show this help message
      --man               Show full documentation

    Examples:
      extract-schemas lib/MyModule.pm
      extract-schemas --output-dir my_schemas --verbose lib/MyModule.pm
      extract-schemas --fuzz lib/MyModule.pm
      extract-schemas --fuzz --fuzz-iters 300 --corpus-dir t/corpus lib/MyModule.pm
      extract-schemas --fuzz --fuzz-all lib/MyModule.pm

=head1 QUICK START

Run C<extract-schemas --strict-pod=warn -v --fuzz lib/MyModule.pm> to analyse your module and
automatically probe each method with hundreds of fuzzed inputs,
looking for
crashes caused by inputs that should be valid.
Anything suspicious is saved to C<schemas/corpus/>.

If genuine bugs are found,
run C<fuzz-harness-generator --replay-corpus schemas/corpus/ -o t/fuzz_replay.t>
to turn them into regression tests that will fail until you fix the underlying code and pass forever after.
Run C<extract-schemas --fuzz> regularly - each
run builds on the last, probing deeper into your code each time.

Otherwise, for each of the functions in MyModule.pm,
C<fuzz-harness-generator -r schemas/function.yml>

=head1 DESCRIPTION

This tool analyzes a Perl module and generates YAML schema files for each
method, suitable for use with L<App::Test::Generator>
using the C<fuzz-harness-generator> program which will create the C<.t> file to run through C<prove>.

The extractor uses three sources of information:

=over 4

=item 1. POD Documentation

Parses parameter descriptions from POD to extract types and constraints.

=item 2. Code Analysis

Analyzes validation patterns in the code (ref checks, length checks, etc.)

=item 3. Method Signatures

Extracts parameter names from method signatures.

=back

The tool assigns a confidence level (high/medium/low) to each schema based
on how much information it could infer.

=head1 FUZZING

When C<--fuzz> is specified, the tool will additionally run
C<App::Test::Generator::CoverageGuidedFuzzer> against each method after
schema extraction.

By default all methods with at least one known input parameter are fuzzed,
regardless of confidence level. Use C<--fuzz-all> to also attempt fuzzing
methods with no input schema (these will use purely random generation).

The fuzzer will:

=over 4

=item * Load and C<require> the target module at runtime

=item * Run coverage-guided fuzzing using the extracted schema as input spec

=item * Report any crashes or unexpected errors found

=item * Persist a corpus to C<--corpus-dir> for incremental improvement across runs

=back

Corpus files are named C<< <corpus-dir>/<method>.json >> and are automatically
loaded on subsequent runs, so each run builds on the last.

=cut
115
116# ---------------------------------------------------------------------------
117# Option parsing
118# ---------------------------------------------------------------------------
119
120
5
383228
my %cli_opts = (
121    help => 0,
122    man  => 0,
123);
124
125
5
10
my %extractor_opts = (
126    output_dir => 'schemas',
127    strict_pod => 'warn',
128    verbose    => 0,
129);
130
131
5
4
my $fuzz       = 0;
132
5
3
my $fuzz_all   = 0;
133
5
6
my $fuzz_iters = 100;
134
5
5
my $corpus_dir;   # default set after output_dir is known
135
136GetOptions(
137    'output-dir|o=s'  => \$extractor_opts{output_dir},
138    'strict-pod|s=s'  => \$extractor_opts{strict_pod},
139    'verbose|v'       => \$extractor_opts{verbose},
140    'fuzz|f'          => \$fuzz,
141    'fuzz-all'        => \$fuzz_all,
142    'fuzz-iters=i'    => \$fuzz_iters,
143    'corpus-dir|c=s'  => \$corpus_dir,
144    'help|h'          => \$cli_opts{help},
145    'man|m'           => \$cli_opts{man},
146
5
18
) or pod2usage(2);
147
148
5
2460
pod2usage(-exitval => 0, -verbose => 1) if $cli_opts{help};
149
4
6
pod2usage(-exitval => 0, -verbose => 2) if $cli_opts{man};
150
151
4
9
if ($extractor_opts{strict_pod} !~ /^(off|warn|fatal)$/) {
152
1
8
        die "Invalid --strict-pod value '$extractor_opts{strict_pod}'. Expected off, warn, or fatal";
153}
154
155
3
6
my $input_file = shift @ARGV or pod2usage('Error: No input file specified');
156
2
14
die "Error: File not found: $input_file" unless -f $input_file;
157
158# Default corpus dir sits under the output dir
159
2
17
$corpus_dir //= File::Spec->catdir($extractor_opts{output_dir}, 'corpus');
160
161# ---------------------------------------------------------------------------
162# Schema extraction
163# ---------------------------------------------------------------------------
164
165
2
8
print "Extracting schemas from: $input_file\n";
166
2
2
print "Output directory: $extractor_opts{output_dir}\n\n";
167
168
2
146
make_path($extractor_opts{output_dir}) unless -d $extractor_opts{output_dir};
169
170
2
7
my $extractor = App::Test::Generator::SchemaExtractor->new(
171    input_file => $input_file,
172    %extractor_opts,
173);
174
175
2
4
my $schemas = $extractor->extract_all();
176
177# ---------------------------------------------------------------------------
178# Optional: coverage-guided fuzzing
179# ---------------------------------------------------------------------------
180
181
2
2
my %fuzz_results;   # method_name => report hashref
182
183
2
2
if ($fuzz) {
184
0
0
    require App::Test::Generator::CoverageGuidedFuzzer;
185
0
0
    make_path($corpus_dir) unless -d $corpus_dir;
186
187    # Load the target module once so all methods are callable
188
0
0
    my $package = _load_target_module($input_file, $schemas);
189
190    # Try to build a default instance for object method calls.
191    # Most OO modules need a $self as the first argument.
192    # We try new() with no args, then new({}), then give up and fuzz as functions.
193
0
0
    my $instance = _try_construct($package);
194
0
0
    if ($instance) {
195
0
0
        print "Constructed $package instance for method calls.\n";
196    } else {
197
0
0
        print "Could not construct $package instance; fuzzing as functions.\n";
198    }
199
200
0
0
    print "Fuzzing with $fuzz_iters iterations per method",
201          ($fuzz_all ? ' (all methods)' : ' (methods with known inputs)'),
202          "...\n\n";
203
204
0
0
    foreach my $method (sort keys %$schemas) {
205
0
0
        my $schema = $schemas->{$method};
206
0
0
        my $iconf  = $schema->{_confidence}{input}{level} // 'low';
207
208
0
0
        unless ($fuzz_all) {
209            # Skip methods with no input schema at all — there is nothing to fuzz
210
0
0
0
0
            next if $iconf eq 'none' && !%{ $schema->{input} // {} };
211        }
212
213
0
0
        my $sub_ref = $package->can($method);
214
0
0
        unless ($sub_ref) {
215
0
0
            warn "  Skipping $method: not callable in $package\n";
216
0
0
            next;
217        }
218
219        # Skip constructors and AUTOLOAD — not suitable for direct fuzzing
220
0
0
        if ($method =~ /^(new|AUTOLOAD|DESTROY|import)$/) {
221            print "  Skipping $method (constructor/special method)\n"
222
0
0
                if $extractor_opts{verbose};
223
0
0
            next;
224        }
225
226
0
0
        my $corpus_file = File::Spec->catfile($corpus_dir, "$method.json");
227
228
0
0
        print "  Fuzzing $method ($iconf confidence)... ";
229
230
0
0
        my $fuzzer = App::Test::Generator::CoverageGuidedFuzzer->new(
231            schema      => $schema,
232            target_sub  => $sub_ref,
233            instance    => $instance,
234            iterations  => $fuzz_iters,
235        );
236
237
0
0
        $fuzzer->load_corpus($corpus_file) if -f $corpus_file;
238
239
0
0
        my $report = $fuzzer->run();
240
0
0
        $fuzzer->save_corpus($corpus_file);
241
242
0
0
        $fuzz_results{$method} = $report;
243
244        printf "%d bugs, %d branches covered\n",
245            $report->{bugs_found},
246
0
0
            $report->{branches_covered};
247    }
248
249
0
0
    print "\n";
250}
251
252# ---------------------------------------------------------------------------
253# Summary report
254# ---------------------------------------------------------------------------
255
256
2
3
print '=' x 70, "\n",
257      "EXTRACTION SUMMARY\n",
258      '=' x 70, "\n\n";
259
260
2
5
my %input_confidence_counts  = (high => 0, medium => 0, low => 0, none => 0);
261
2
4
my %output_confidence_counts = (high => 0, medium => 0, low => 0, none => 0);
262
263
2
4
foreach my $method (sort keys %$schemas) {
264
2
2
    my $schema = $schemas->{$method};
265
2
4
    my $iconf  = $schema->{_confidence}{input}{level}  // 'low';
266
2
2
    my $oconf  = $schema->{_confidence}{output}{level} // 'low';
267
2
2
    $input_confidence_counts{$iconf}++;
268
2
1
    $output_confidence_counts{$oconf}++;
269
270
2
4
2
2
5
4
    my $param_count = scalar grep { $_ !~ /^_/ } keys %{ $schema->{input} };
271
272
2
2
    my $fuzz_col = '';
273
2
2
    if (exists $fuzz_results{$method}) {
274
0
0
        my $r = $fuzz_results{$method};
275        $fuzz_col = $r->{bugs_found}
276            ? sprintf('  BUGS: %d', $r->{bugs_found})
277
0
0
            : '  fuzz: ok';
278    }
279
280
2
8
    printf "%-30s %d params  [%s input confidence] [%s output confidence]%s\n",
281        $method, $param_count, uc($iconf), uc($oconf), $fuzz_col;
282}
283
284
2
3
print "\n";
285
2
6
print 'Total methods: ', (scalar keys %$schemas), "\n";
286
2
2
print "  Input:\n";
287
2
3
print "    High confidence:   $input_confidence_counts{high}\n";
288
2
3
print "    Medium confidence: $input_confidence_counts{medium}\n";
289
2
2
print "    Low confidence:    $input_confidence_counts{low}\n";
290
2
2
print "  Output:\n";
291
2
2
print "    High confidence:   $output_confidence_counts{high}\n";
292
2
2
print "    Medium confidence: $output_confidence_counts{medium}\n";
293
2
2
print "    Low confidence:    $output_confidence_counts{low}\n";
294
2
2
print "\n";
295
296
2
5
if ($input_confidence_counts{low} > 0 || $input_confidence_counts{medium} > 0) {
297
0
0
    print "RECOMMENDATION:\n",
298          "Review the generated schemas in $extractor_opts{output_dir}/\n",
299          "Focus on methods with medium/low confidence ratings.\n\n";
300}
301
302# Fuzz bug detail
303
2
2
if (%fuzz_results) {
304
0
0
    my $total_bugs = 0;
305
0
0
    $total_bugs += $_->{bugs_found} for values %fuzz_results;
306
307
0
0
    if ($total_bugs) {
308
0
0
        print '=' x 70, "\n",
309              "FUZZING BUGS FOUND ($total_bugs total)\n",
310              '=' x 70, "\n\n";
311
312
0
0
        foreach my $method (sort keys %fuzz_results) {
313
0
0
            my $r = $fuzz_results{$method};
314
0
0
            next unless $r->{bugs_found};
315
0
0
            print "  $method:\n";
316
0
0
0
0
            for my $i (0 .. $#{ $r->{bugs} }) {
317
0
0
                my $bug = $r->{bugs}[$i];
318
0
0
                my $inp = defined($bug->{input}) ? qq("$bug->{input}") : 'undef';
319                printf "    Bug %d: input=%-30s error=%s\n",
320
0
0
                    $i + 1, $inp, $bug->{error};
321            }
322
0
0
            print "\n";
323        }
324
0
0
        print "Corpora saved to: $corpus_dir/\n\n";
325    } else {
326
0
0
        print "Fuzzing complete: no bugs found across ",
327              scalar(keys %fuzz_results), " methods.\n\n";
328    }
329}
330
331
2
3
if ($extractor_opts{verbose}) {
332
1
2
    print "Schemas:\n\t", Dumper($schemas);
333}
334
335
2
255
print "Schema files written to: $extractor_opts{output_dir}/\n";
336
337# ---------------------------------------------------------------------------
338# Helper: load the target module so methods become callable
339# ---------------------------------------------------------------------------
340
341sub _load_target_module {
342
0
    my ($input_file, $schemas) = @_;
343
344    # Derive the package name from the first schema entry that has 'module' set
345
0
    my ($package) = map  { $schemas->{$_}{module} }
346
0
0
                    grep { $schemas->{$_}{module} }
347                    keys %$schemas;
348
349
0
    die 'Could not determine package name from extracted schemas' unless $package;
350
351    # Add the module's containing lib dir to @INC
352    # Walks up from the file looking for a 'lib' directory
353
0
    my $abs = File::Spec->rel2abs($input_file);
354
0
    my @dirs = File::Spec->splitdir( (File::Spec->splitpath($abs))[1] );
355
356
0
    while (@dirs) {
357
0
        my $candidate = File::Spec->catdir(@dirs, 'lib');
358
0
        if (-d $candidate) {
359
0
            lib->import($candidate);
360
0
            last;
361        }
362
0
        pop @dirs;
363    }
364
365
0
    eval "require $package"
366        or die "Could not load $package for fuzzing: $@";
367
368
0
    return $package;
369}
370
371# Try to construct a default instance of the target package for method calls.
372# Attempts new() with progressively more forgiving argument lists.
373# Returns the instance on success, undef if nothing works.
374sub _try_construct {
375
0
    my ($package) = @_;
376
377
0
    for my $args ([], [{}], [undef]) {
378
0
0
        my $obj = eval { $package->new(@$args) };
379
0
        next if $@;
380
0
        next unless defined $obj && ref $obj;
381
0
        return $obj;
382    }
383
384
0
    return undef;
385}
386