File Coverage

File:bin/extract-schemas
Coverage:76.2%

linestmtbrancondsubtimecode
1#!/usr/bin/env perl
2
3
6
6
6
11066
5
93
use strict;
4
6
6
6
10
3
105
use warnings;
5
6
6
6
6
1453
19198
171
use Data::Dumper;
7
6
6
6
16
5
142
use File::Path qw(make_path);
8
6
6
6
8
6
42
use File::Spec;
9
6
6
6
1811
26945
16
use Getopt::Long;
10
6
6
6
1239
2700
159
use FindBin;
11
6
6
6
1078
1599
17
use lib "$FindBin::Bin/../lib";
12
6
6
6
1453
151625
283
use Pod::Usage;
13
14
6
6
6
3890
11
9630
use App::Test::Generator::SchemaExtractor;
15
16 - 114
=head1 NAME

extract-schemas - Extract test schemas from Perl modules

=head1 SYNOPSIS

    extract-schemas [options] <module.pm>

    Options:
      --output-dir DIR    Output directory for schema files (default: schemas/)
      --strict-pod=off|warn|fatal
      --verbose           Show detailed analysis
      --fuzz              Run coverage-guided fuzzing on extracted schemas
      --fuzz-iters N      Iterations per method when fuzzing (default: 100)
                          (no short form, to avoid conflict with --fuzz/-f)
      --fuzz-all          Fuzz all methods, including those with no input schema
      --corpus-dir DIR    Directory to persist fuzz corpora (default: schemas/corpus/)
      --help              Show this help message
      --man               Show full documentation

    Examples:
      extract-schemas lib/MyModule.pm
      extract-schemas --output-dir my_schemas --verbose lib/MyModule.pm
      extract-schemas --fuzz lib/MyModule.pm
      extract-schemas --fuzz --fuzz-iters 300 --corpus-dir t/corpus lib/MyModule.pm
      extract-schemas --fuzz --fuzz-all lib/MyModule.pm

=head1 QUICK START

Run C<extract-schemas --strict-pod=warn -v --fuzz lib/MyModule.pm> to analyse your module and
automatically probe each method with hundreds of fuzzed inputs,
looking for
crashes caused by inputs that should be valid.
Anything suspicious is saved to C<schemas/corpus/>.

If genuine bugs are found,
run C<fuzz-harness-generator --replay-corpus schemas/corpus/ -o t/fuzz_replay.t>
to turn them into regression tests that will fail until you fix the underlying code and pass forever after.
Run C<extract-schemas --fuzz> regularly - each
run builds on the last, probing deeper into your code each time.

Otherwise, for each of the functions in MyModule.pm,
C<fuzz-harness-generator -r schemas/function.yml>

=head1 DESCRIPTION

This tool analyzes a Perl module and generates YAML schema files for each
method, suitable for use with L<App::Test::Generator>
using the C<fuzz-harness-generator> program which will create the C<.t> file to run through C<prove>.

The extractor uses three sources of information:

=over 4

=item 1. POD Documentation

Parses parameter descriptions from POD to extract types and constraints.

=item 2. Code Analysis

Analyzes validation patterns in the code (ref checks, length checks, etc.)

=item 3. Method Signatures

Extracts parameter names from method signatures.

=back

The tool assigns a confidence level (high/medium/low) to each schema based
on how much information it could infer.

=head1 FUZZING

When C<--fuzz> is specified, the tool will additionally run
C<App::Test::Generator::CoverageGuidedFuzzer> against each method after
schema extraction.

By default all methods with at least one known input parameter are fuzzed,
regardless of confidence level. Use C<--fuzz-all> to also attempt fuzzing
methods with no input schema (these will use purely random generation).

The fuzzer will:

=over 4

=item * Load and C<require> the target module at runtime

=item * Run coverage-guided fuzzing using the extracted schema as input spec

=item * Report any crashes or unexpected errors found

=item * Persist a corpus to C<--corpus-dir> for incremental improvement across runs

=back

Corpus files are named C<< <corpus-dir>/<method>.json >> and are automatically
loaded on subsequent runs, so each run builds on the last.

=cut
115
116# ---------------------------------------------------------------------------
117# Option parsing
118# ---------------------------------------------------------------------------
119
120
6
460055
my %cli_opts = (
121    help => 0,
122    man  => 0,
123);
124
125
6
15
my %extractor_opts = (
126    output_dir => 'schemas',
127    strict_pod => 'warn',
128    verbose    => 0,
129);
130
131
6
7
my $fuzz       = 0;
132
6
5
my $fuzz_all   = 0;
133
6
6
my $fuzz_iters = 100;
134
6
6
my $corpus_dir;   # default set after output_dir is known
135
136GetOptions(
137        'output-dir|o=s'  => \$extractor_opts{output_dir},
138        'strict-pod|s=s'  => \$extractor_opts{strict_pod},
139        'verbose|v'       => \$extractor_opts{verbose},
140        'fuzz|f'          => \$fuzz,
141        'fuzz-all'        => \$fuzz_all,
142        'fuzz-iters=i'    => \$fuzz_iters,
143        'corpus-dir|c=s'  => \$corpus_dir,
144        'help|h'          => \$cli_opts{help},
145        'man|m'           => \$cli_opts{man},
146
6
20
) or pod2usage(2);
147
148
6
3074
pod2usage(-exitval => 0, -verbose => 1) if $cli_opts{help};
149
5
8
pod2usage(-exitval => 0, -verbose => 2) if $cli_opts{man};
150
151
5
15
if ($extractor_opts{strict_pod} !~ /^(off|warn|fatal)$/) {
152
1
8
        die "Invalid --strict-pod value '$extractor_opts{strict_pod}'. Expected off, warn, or fatal";
153}
154
155
4
8
my $input_file = shift @ARGV or pod2usage('Error: No input file specified');
156
3
19
die "Error: File not found: $input_file" unless -f $input_file;
157
158# Default corpus dir sits under the output dir
159
3
20
$corpus_dir //= File::Spec->catdir($extractor_opts{output_dir}, 'corpus');
160
161# ---------------------------------------------------------------------------
162# Schema extraction
163# ---------------------------------------------------------------------------
164
165
3
11
print "Extracting schemas from: $input_file\n";
166
3
5
print "Output directory: $extractor_opts{output_dir}\n\n";
167
168
3
289
make_path($extractor_opts{output_dir}) unless -d $extractor_opts{output_dir};
169
170
3
35
my $extractor = App::Test::Generator::SchemaExtractor->new(
171        input_file => $input_file,
172        %extractor_opts,
173);
174
175
3
5
my $schemas = $extractor->extract_all();
176
177# ---------------------------------------------------------------------------
178# Optional: coverage-guided fuzzing
179# ---------------------------------------------------------------------------
180
181
3
3
my %fuzz_results;   # method_name => report hashref
182
183
3
5
if ($fuzz) {
184
1
197
        require App::Test::Generator::CoverageGuidedFuzzer;
185
1
104
        make_path($corpus_dir) unless -d $corpus_dir;
186
187    # Load the target module once so all methods are callable
188
1
2
    my $package = _load_target_module($input_file, $schemas);
189
190    # Try to build a default instance for object method calls.
191    # Most OO modules need a $self as the first argument.
192    # We try new() with no args, then new({}), then give up and fuzz as functions.
193
1
2
    my $instance = _try_construct($package);
194
1
1
    if ($instance) {
195
0
0
        print "Constructed $package instance for method calls.\n";
196    } else {
197
1
2
        print "Could not construct $package instance; fuzzing as functions.\n";
198    }
199
200
1
1
    print "Fuzzing with $fuzz_iters iterations per method",
201          ($fuzz_all ? ' (all methods)' : ' (methods with known inputs)'),
202          "...\n\n";
203
204
1
2
    foreach my $method (sort keys %$schemas) {
205
1
1
        my $schema = $schemas->{$method};
206
1
2
        my $iconf  = $schema->{_confidence}{input}{level} // 'low';
207
208
1
1
        unless ($fuzz_all) {
209            # Skip methods with no input schema at all — there is nothing to fuzz
210
1
0
2
0
            next if $iconf eq 'none' && !%{ $schema->{input} // {} };
211        }
212
213
1
3
        my $sub_ref = $package->can($method);
214
1
1
        unless ($sub_ref) {
215
0
0
            warn "  Skipping $method: not callable in $package\n";
216
0
0
            next;
217        }
218
219        # Skip constructors and AUTOLOAD — not suitable for direct fuzzing
220
1
2
        if ($method =~ /^(new|AUTOLOAD|DESTROY|import)$/) {
221            print "  Skipping $method (constructor/special method)\n"
222
0
0
                if $extractor_opts{verbose};
223
0
0
            next;
224        }
225
226
1
6
        my $corpus_file = File::Spec->catfile($corpus_dir, "$method.json");
227
228
1
1
        print "  Fuzzing $method ($iconf confidence)... ";
229
230
1
3
        my $fuzzer = App::Test::Generator::CoverageGuidedFuzzer->new(
231            schema      => $schema,
232            target_sub  => $sub_ref,
233            instance    => $instance,
234            iterations  => $fuzz_iters,
235        );
236
237
1
8
        $fuzzer->load_corpus($corpus_file) if -f $corpus_file;
238
239
1
1
        my $report = $fuzzer->run();
240
1
2
        $fuzzer->save_corpus($corpus_file);
241
242
1
2
        $fuzz_results{$method} = $report;
243
244        printf "%d bugs, %d branches covered\n",
245            $report->{bugs_found},
246
1
7
            $report->{branches_covered};
247    }
248
249
1
1
    print "\n";
250}
251
252# ---------------------------------------------------------------------------
253# Summary report
254# ---------------------------------------------------------------------------
255
256
3
4
print '=' x 70, "\n",
257      "EXTRACTION SUMMARY\n",
258      '=' x 70, "\n\n";
259
260
3
7
my %input_confidence_counts  = (high => 0, medium => 0, low => 0, none => 0);
261
3
5
my %output_confidence_counts = (high => 0, medium => 0, low => 0, none => 0);
262
263
3
7
foreach my $method (sort keys %$schemas) {
264
3
2
    my $schema = $schemas->{$method};
265
3
5
    my $iconf  = $schema->{_confidence}{input}{level}  // 'low';
266
3
4
    my $oconf  = $schema->{_confidence}{output}{level} // 'low';
267
3
3
    $input_confidence_counts{$iconf}++;
268
3
3
    $output_confidence_counts{$oconf}++;
269
270
3
6
3
3
9
4
    my $param_count = scalar grep { $_ !~ /^_/ } keys %{ $schema->{input} };
271
272
3
3
    my $fuzz_col = '';
273
3
4
    if (exists $fuzz_results{$method}) {
274
1
0
        my $r = $fuzz_results{$method};
275        $fuzz_col = $r->{bugs_found}
276            ? sprintf('  BUGS: %d', $r->{bugs_found})
277
1
2
            : '  fuzz: ok';
278    }
279
280
3
10
    printf "%-30s %d params  [%s input confidence] [%s output confidence]%s\n",
281        $method, $param_count, uc($iconf), uc($oconf), $fuzz_col;
282}
283
284
3
3
print "\n";
285
3
9
print 'Total methods: ', (scalar keys %$schemas), "\n";
286
3
3
print "  Input:\n";
287
3
4
print "    High confidence:   $input_confidence_counts{high}\n";
288
3
4
print "    Medium confidence: $input_confidence_counts{medium}\n";
289
3
4
print "    Low confidence:    $input_confidence_counts{low}\n";
290
3
2
print "  Output:\n";
291
3
2
print "    High confidence:   $output_confidence_counts{high}\n";
292
3
4
print "    Medium confidence: $output_confidence_counts{medium}\n";
293
3
3
print "    Low confidence:    $output_confidence_counts{low}\n";
294
3
3
print "\n";
295
296
3
7
if ($input_confidence_counts{low} > 0 || $input_confidence_counts{medium} > 0) {
297
0
0
    print "RECOMMENDATION:\n",
298          "Review the generated schemas in $extractor_opts{output_dir}/\n",
299          "Focus on methods with medium/low confidence ratings.\n\n";
300}
301
302# Fuzz bug detail
303
3
5
if (%fuzz_results) {
304
1
1
    my $total_bugs = 0;
305
1
2
    $total_bugs += $_->{bugs_found} for values %fuzz_results;
306
307
1
1
    if ($total_bugs) {
308
0
0
        print '=' x 70, "\n",
309              "FUZZING BUGS FOUND ($total_bugs total)\n",
310              '=' x 70, "\n\n";
311
312
0
0
        foreach my $method (sort keys %fuzz_results) {
313
0
0
            my $r = $fuzz_results{$method};
314
0
0
            next unless $r->{bugs_found};
315
0
0
            print "  $method:\n";
316
0
0
0
0
            for my $i (0 .. $#{ $r->{bugs} }) {
317
0
0
                my $bug = $r->{bugs}[$i];
318
0
0
                my $inp = defined($bug->{input}) ? qq("$bug->{input}") : 'undef';
319                printf "    Bug %d: input=%-30s error=%s\n",
320
0
0
                    $i + 1, $inp, $bug->{error};
321            }
322
0
0
            print "\n";
323        }
324
0
0
        print "Corpora saved to: $corpus_dir/\n\n";
325    } else {
326
1
1
        print "Fuzzing complete: no bugs found across ",
327              scalar(keys %fuzz_results), " methods.\n\n";
328    }
329}
330
331
3
5
if ($extractor_opts{verbose}) {
332
1
1
    print "Schemas:\n\t", Dumper($schemas);
333}
334
335
3
261
print "Schema files written to: $extractor_opts{output_dir}/\n";
336
337# ---------------------------------------------------------------------------
338# Helper: load the target module so methods become callable
339# ---------------------------------------------------------------------------
340
341sub _load_target_module {
342
1
1
    my ($input_file, $schemas) = @_;
343
344    # Derive the package name from the first schema entry that has 'module' set
345
1
2
    my ($package) = map  { $schemas->{$_}{module} }
346
1
1
2
1
                    grep { $schemas->{$_}{module} }
347                    keys %$schemas;
348
349
1
1
    die 'Could not determine package name from extracted schemas' unless $package;
350
351    # Reject anything that is not a syntactically valid Perl package name
352    # before it is used to build a require path — guards against this
353    # becoming a code-injection vector if $package is ever sourced from
354    # something less constrained than a PPI-parsed 'package' statement.
355
1
4
    die "Invalid package name: $package"
356        unless $package =~ /^[A-Za-z_]\w*(?:::[A-Za-z_]\w*)*\z/;
357
358    # Add the module's containing lib dir to @INC
359    # Walks up from the file looking for a 'lib' directory
360
1
10
    my $abs = File::Spec->rel2abs($input_file);
361
1
8
    my ($volume, $directory) = File::Spec->splitpath($abs);
362
1
5
    my @dirs = File::Spec->splitdir($directory);
363
364
1
1
    while (@dirs) {
365        # catpath() (not catdir()) keeps $volume attached, so this still
366        # resolves on Windows when the temp dir and the checkout live on
367        # different drive letters (a bare catdir() result is interpreted
368        # as relative to the *current* drive, not $volume's).
369
3
13
        my $candidate = File::Spec->catpath($volume, File::Spec->catdir(@dirs, 'lib'), '');
370
3
13
        if (-d $candidate) {
371
1
3
            lib->import($candidate);
372
1
44
            last;
373        }
374
2
2
        pop @dirs;
375    }
376
377    # require() the module by file path rather than via string eval, so
378    # $package is never compiled as Perl source.
379
1
1
    (my $module_file = $package) =~ s{::}{/}g;
380
1
1
1
192
    eval { require "$module_file.pm" }
381        or die "Could not load $package for fuzzing: $@";
382
383
1
2
    return $package;
384}
385
386# Try to construct a default instance of the target package for method calls.
387# Attempts new() with progressively more forgiving argument lists.
388# Returns the instance on success, undef if nothing works.
389sub _try_construct {
390
1
1
    my ($package) = @_;
391
392
1
1
    for my $args ([], [{}], [undef]) {
393
3
3
2
22
        my $obj = eval { $package->new(@$args) };
394
3
402
        next if $@;
395
0
0
        next unless defined $obj && ref $obj;
396
0
0
        return $obj;
397    }
398
399
1
2
    return undef;
400}
401