xref: /aosp_15_r20/external/AFLplusplus/afl-cmin.bash (revision 08b48e0b10e97b33e7b60c5b6e2243bd915777f2)
1#!/usr/bin/env bash
2#
3# american fuzzy lop++ - corpus minimization tool
4# ---------------------------------------------
5#
6# Originally written by Michal Zalewski
7#
8# Copyright 2014, 2015 Google Inc. All rights reserved.
9#
10# Copyright 2019-2024 AFLplusplus
11#
12# Licensed under the Apache License, Version 2.0 (the "License");
13# you may not use this file except in compliance with the License.
14# You may obtain a copy of the License at:
15#
16#   https://www.apache.org/licenses/LICENSE-2.0
17#
18# This tool tries to find the smallest subset of files in the input directory
19# that still trigger the full range of instrumentation data points seen in
20# the starting corpus. This has two uses:
21#
22#   - Screening large corpora of input files before using them as a seed for
23#     afl-fuzz. The tool will remove functionally redundant files and likely
24#     leave you with a much smaller set.
25#
26#     (In this case, you probably also want to consider running afl-tmin on
27#     the individual files later on to reduce their size.)
28#
29#   - Minimizing the corpus generated organically by afl-fuzz, perhaps when
30#     planning to feed it to more resource-intensive tools. The tool achieves
31#     this by removing all entries that used to trigger unique behaviors in the
32#     past, but have been made obsolete by later finds.
33#
34# Note that the tool doesn't modify the files themselves. For that, you want
35# afl-tmin.
36#
37# This script must use bash because other shells may have hardcoded limits on
38# array sizes.
39#
40
echo "corpus minimization tool for afl-fuzz"
echo

#########
# SETUP #
#########

# Parse command-line options. Defaults: no memory limit, 5000 ms timeout.

MEM_LIMIT=none
TIMEOUT=5000

unset IN_DIR OUT_DIR STDIN_FILE EXTRA_PAR MEM_LIMIT_GIVEN F_ARG \
  AFL_CMIN_CRASHES_ONLY AFL_CMIN_ALLOW_ANY QEMU_MODE UNICORN_MODE T_ARG

export AFL_QUIET=1

while getopts "+i:o:f:m:t:T:eOQUAChXY" option; do

  case "$option" in

    "i")     IN_DIR="$OPTARG" ;;
    "o")     OUT_DIR="$OPTARG" ;;
    "f")     STDIN_FILE="$OPTARG"; F_ARG=1 ;;
    "m")     MEM_LIMIT="$OPTARG"; MEM_LIMIT_GIVEN=1 ;;
    "t")     TIMEOUT="$OPTARG" ;;
    "T")     T_ARG="$OPTARG" ;;
    "e")     EXTRA_PAR="$EXTRA_PAR -e" ;;
    "A")     export AFL_CMIN_ALLOW_ANY=1 ;;
    "C")     export AFL_CMIN_CRASHES_ONLY=1 ;;
    "O")     EXTRA_PAR="$EXTRA_PAR -O"; FRIDA_MODE=1 ;;
    "Q")     EXTRA_PAR="$EXTRA_PAR -Q"; QEMU_MODE=1 ;;
    "U")     EXTRA_PAR="$EXTRA_PAR -U"; UNICORN_MODE=1 ;;
    "X"|"Y") EXTRA_PAR="$EXTRA_PAR -X"; NYX_MODE=1 ;;
    "h")     ;;  # no-op: usage text is printed below when -i/-o are missing
    "?")     exit 1 ;;

  esac

done
122
shift $((OPTIND-1))

TARGET_BIN="$1"

# Without a target binary plus both -i and -o, print usage and bail out.
if [ "$TARGET_BIN" = "" -o "$IN_DIR" = "" -o "$OUT_DIR" = "" ]; then

  cat 1>&2 <<_EOF_
Usage: $0 [ options ] -- /path/to/target_app [ ... ]

Required parameters:

  -i dir        - input directory with the starting corpus
  -o dir        - output directory for minimized files

Execution control settings:

  -T tasks      - how many parallel processes to create (default=1, "all"=nproc)
  -f file       - location read by the fuzzed program (default: stdin)
  -m megs       - memory limit for child process (default=$MEM_LIMIT MB)
  -t msec       - run time limit for child process (default: 5000ms)
  -O            - use binary-only instrumentation (FRIDA mode)
  -Q            - use binary-only instrumentation (QEMU mode)
  -U            - use unicorn-based instrumentation (Unicorn mode)
  -X            - use Nyx mode

Minimization settings:

  -A            - allow crashing and timeout inputs
  -C            - keep crashing inputs, reject everything else
  -e            - solve for edge coverage only, ignore hit counts

For additional tips, please consult README.md.

Environment variables used:
AFL_KEEP_TRACES: leave the temporary <out_dir>/.traces directory
AFL_NO_FORKSRV: run target via execve instead of using the forkserver
AFL_PATH: last resort location to find the afl-showmap binary
AFL_SKIP_BIN_CHECK: skip check for target binary
AFL_CUSTOM_MUTATOR_LIBRARY: custom mutator library (post_process and send)
AFL_PYTHON_MODULE: custom mutator library (post_process and send)
_EOF_
  exit 1
fi
166
# Do a sanity check to discourage the use of /tmp, since we can't really
# handle this safely from a shell script (other local users could race us
# on predictable file names there).

if [ "$AFL_ALLOW_TMP" = "" ]; then

  # Warn once if any path we will touch lives under /tmp or /var/tmp.
  for location in "$IN_DIR" "$TARGET_BIN" "$OUT_DIR" "$STDIN_FILE" "$PWD"; do

    if echo "$location" | grep -qE '^(/var)?/tmp/'; then
      echo "[-] Warning: do not use this script in /tmp or /var/tmp for security reasons." 1>&2
      break
    fi

  done

fi
192
# If @@ is specified, but there's no -f, let's come up with a temporary input
# file name.

TRACE_DIR="$OUT_DIR/.traces"

if [ "$STDIN_FILE" = "" ]; then

  # Target takes its input as a file argument (@@ in the command line):
  # fabricate a scratch file inside the trace directory.
  if echo "$*" | grep -qF '@@'; then
    STDIN_FILE="$TRACE_DIR/.cur_input"
  fi

fi

# Check for obvious errors.

# -T (parallel tasks) and -f (fixed input file) cannot be combined: every
# worker would clobber the same file. Nyx mode manages its inputs itself.
if [ ! "$T_ARG" = "" -a -n "$F_ARG" -a ! "$NYX_MODE" == 1 ]; then
  echo "[-] Error: -T and -f can not be used together." 1>&2
  exit 1
fi

# Reject a memory limit so low that every execution would fail.
if [ ! "$MEM_LIMIT" = "none" ]; then

  if [ "$MEM_LIMIT" -lt "5" ]; then
    echo "[-] Error: dangerously low memory limit." 1>&2
    exit 1
  fi

fi

# Likewise for the execution timeout (milliseconds).
if [ ! "$TIMEOUT" = "none" ]; then

  if [ "$TIMEOUT" -lt "10" ]; then
    echo "[-] Error: dangerously low timeout." 1>&2
    exit 1
  fi

fi
230
# Resolve the target binary: if the given path is not an executable file,
# fall back to a $PATH lookup. Skipped for Nyx, where the "binary" is a
# share directory.
if [ "$NYX_MODE" = "" ]; then
  if [ ! -f "$TARGET_BIN" -o ! -x "$TARGET_BIN" ]; then

    TNEW="$(command -v "$TARGET_BIN" 2>/dev/null)"

    if [ ! -f "$TNEW" -o ! -x "$TNEW" ]; then
      echo "[-] Error: binary '$TARGET_BIN' not found or not executable." 1>&2
      exit 1
    fi

    TARGET_BIN="$TNEW"

  fi

fi

# If the target embeds the AFL_DUMP_MAP_SIZE handler, ask it for its map
# size and export AFL_MAP_SIZE so afl-showmap allocates a matching bitmap.
grep -aq AFL_DUMP_MAP_SIZE "$TARGET_BIN" && {
  echo "[!] Trying to obtain the map size of the target ..."
  # Bug fix: only prefix "./" for a bare file name; blindly prepending it
  # turned an absolute path /p/bin into the relative .//p/bin.
  case "$TARGET_BIN" in
    */*) TARGET_RUN="$TARGET_BIN" ;;
    *)   TARGET_RUN="./$TARGET_BIN" ;;
  esac
  MAPSIZE=$(AFL_DUMP_MAP_SIZE=1 "$TARGET_RUN" 2>/dev/null)
  test -n "$MAPSIZE" && {
    export AFL_MAP_SIZE=$MAPSIZE
    echo "[+] Setting AFL_MAP_SIZE=$MAPSIZE"
  }
}
255
# Unless the check is explicitly skipped or a binary-only instrumentation
# mode is in use, verify the target was built with afl instrumentation
# (instrumented binaries embed the literal string __AFL_SHM_ID).
if [ "$AFL_SKIP_BIN_CHECK" = "" -a "$QEMU_MODE" = "" -a "$FRIDA_MODE" = "" -a "$UNICORN_MODE" = "" -a "$NYX_MODE" = "" ]; then

  if ! grep -qF "__AFL_SHM_ID" "$TARGET_BIN"; then
    echo "[-] Error: binary '$TARGET_BIN' doesn't appear to be instrumented." 1>&2
    exit 1
  fi

fi
264
if [ ! -d "$IN_DIR" ]; then
  echo "[-] Error: directory '$IN_DIR' not found." 1>&2
  exit 1
fi

# Accept an afl-fuzz output directory directly: descend into the default
# instance and/or its queue/ subdirectory when present.
test -d "$IN_DIR/default" && IN_DIR="$IN_DIR/default"
test -d "$IN_DIR/queue" && IN_DIR="$IN_DIR/queue"

# Remove stale results from a previous run. Note: -maxdepth must precede
# other predicates on the find command line (GNU find warns otherwise;
# the warning was previously masked by the stderr redirect).
find "$OUT_DIR" -maxdepth 1 -name 'id[:_]*' -exec rm -- {} \; 2>/dev/null
rm -rf "$TRACE_DIR" 2>/dev/null

# Succeeds only when the output directory is empty (or absent).
rmdir "$OUT_DIR" 2>/dev/null

if [ -d "$OUT_DIR" ]; then
  echo "[-] Error: directory '$OUT_DIR' exists and is not empty - delete it first." 1>&2
  exit 1
fi

mkdir -m 700 -p "$TRACE_DIR" || exit 1

# When a fixed input file is used, make sure we can create it.
if [ ! "$STDIN_FILE" = "" ]; then
  rm -f "$STDIN_FILE" || exit 1
  touch "$STDIN_FILE" || exit 1
fi
289
# Locate afl-showmap: prefer one on $PATH, then one next to this script,
# then one in the current directory, and only as a last resort - as the
# usage text documents - one under $AFL_PATH.
SHOWMAP=$(command -v afl-showmap 2>/dev/null)

if [ -z "$SHOWMAP" ]; then
  TMP="${0%/afl-cmin.bash}/afl-showmap"
  if [ -x "$TMP" ]; then
    SHOWMAP=$TMP
  fi
fi

if [ -z "$SHOWMAP" ]; then
  if [ -x "./afl-showmap" ]; then
    SHOWMAP="./afl-showmap"
  elif [ -n "$AFL_PATH" ]; then
    # Bug fix: previously a set AFL_PATH overrode an afl-showmap that had
    # already been found, contradicting its documented "last resort" role.
    SHOWMAP="$AFL_PATH/afl-showmap"
  fi
fi

if [ ! -x "$SHOWMAP" ]; then
  echo "[-] Error: can't find 'afl-showmap' - please set AFL_PATH." 1>&2
  rm -rf "$TRACE_DIR"
  exit 1
fi
312
# Validate -T: "all" means one task per CPU; otherwise accept 2..nproc.
THREADS=
if [ ! "$T_ARG" = "" ]; then
  if [ "$T_ARG" = "all" ]; then
    THREADS=$(nproc)
  else
    if [ "$T_ARG" -gt 1 ] && [ "$T_ARG" -le "$(nproc)" ]; then
      THREADS=$T_ARG
    else
      # Bug fix: message typo ("must between") and the script used to keep
      # running single-threaded after reporting this error; it is fatal now.
      echo "[-] Error: -T parameter must be between 2 and $(nproc) or \"all\"." 1>&2
      exit 1
    fi
  fi
else
  if [ -z "$F_ARG" ]; then
    echo "[*] Are you aware of the '-T all' parallelize option that massively improves the speed?"
  fi
fi
329
# Count the corpus entries; nothing to minimize if the directory is empty.
IN_COUNT=$((`ls -- "$IN_DIR" 2>/dev/null | wc -l`))

if [ "$IN_COUNT" = "0" ]; then
  echo "[-] Hmm, no inputs in the target directory. Nothing to be done."
  rm -rf "$TRACE_DIR"
  exit 1
fi

echo "[*] Are you aware that afl-cmin is faster than this afl-cmin.bash script?"
echo "[+] Found $IN_COUNT files for minimizing."

# Spawning more workers than inputs would only add overhead; clamp it.
if [ -n "$THREADS" ]; then
  if [ "$IN_COUNT" -lt "$THREADS" ]; then
    THREADS=$IN_COUNT
    echo "[!] WARNING: less inputs than threads, reducing threads to $THREADS and likely the overhead of threading makes things slower..."
  fi
fi
347
# Grab an arbitrary (first) corpus entry for the sanity checks below.
FIRST_FILE=`ls "$IN_DIR" | head -1`

# Make sure that we're not dealing with a directory.

if [ -d "$IN_DIR/$FIRST_FILE" ]; then
  echo "[-] Error: The target directory contains subdirectories - please fix." 1>&2
  rm -rf "$TRACE_DIR"
  exit 1
fi

# Check for the more efficient way to copy files...
# Hard links are nearly free when the corpus and the output directory live
# on the same filesystem; fall back to a plain copy otherwise.
if ln "$IN_DIR/$FIRST_FILE" "$TRACE_DIR/.link_test" 2>/dev/null; then
  CP_TOOL=ln
else
  CP_TOOL=cp
fi
365
# Make sure that we can actually get anything out of afl-showmap before we
# waste too much time.

echo "[*] Testing the target binary..."

if [ "$STDIN_FILE" = "" ]; then

  # Target reads from stdin: feed the first corpus file directly.
  # AFL_CMIN_ALLOW_ANY=1 keeps afl-showmap from discarding the trace even
  # if this particular input happens to crash or time out.
  AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$FIRST_FILE"

else

  # Target reads from a file (-f / @@): stage the input there and tell
  # afl-showmap about it via -H.
  cp "$IN_DIR/$FIRST_FILE" "$STDIN_FILE"
  AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -H "$STDIN_FILE" -- "$@" </dev/null

fi

# Every non-empty line of afl-showmap output is one coverage tuple; an
# empty trace means no instrumentation was observed at all.
FIRST_COUNT=$((`grep -c . "$TRACE_DIR/.run_test"`))

if [ "$FIRST_COUNT" -gt "0" ]; then

  echo "[+] OK, $FIRST_COUNT tuples recorded."

else

  echo "[-] Error: no instrumentation output detected (perhaps crash or timeout)." 1>&2
  test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
  exit 1

fi
395
# For parallel operation, write the input file list to a temp file and cut
# it into one roughly equal-sized chunk per worker task.
# (All expansions are quoted so an OUT_DIR containing spaces works.)
TMPFILE="$OUT_DIR/.list.$$"
if [ ! "$THREADS" = "" ]; then
  ls -- "$IN_DIR" > "$TMPFILE" 2>/dev/null
  IN_COUNT=$(wc -l < "$TMPFILE")
  SPLIT=$((IN_COUNT / THREADS))
  # Round up so THREADS chunks really cover every input.
  if [ "$((IN_COUNT % THREADS))" -gt 0 ]; then
    SPLIT=$((SPLIT + 1))
  fi
  echo "[+] Splitting workload into $THREADS tasks with $SPLIT items on average each."
  split -l "$SPLIT" "$TMPFILE" "$TMPFILE."
fi
407
# Let's roll!

#############################
# STEP 1: COLLECTING TRACES #
#############################

echo "[*] Obtaining traces for input files in '$IN_DIR'..."

if [ "$THREADS" = "" ]; then
(

  # Sequential mode: run afl-showmap once per input, producing one trace
  # file per input in $TRACE_DIR. Zero-byte inputs are skipped.

  CUR=0

  if [ "$STDIN_FILE" = "" ]; then

    ls "$IN_DIR" | while read -r fn; do

      if [ -s "$IN_DIR/$fn" ]; then

        CUR=$((CUR+1))
        printf "\\r    Processing file $CUR/$IN_COUNT... "

        "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$fn"

      fi

    done

  else

    ls "$IN_DIR" | while read -r fn; do

      if [ -s "$IN_DIR/$fn" ]; then

        CUR=$((CUR+1))
        printf "\\r    Processing file $CUR/$IN_COUNT... "

        cp "$IN_DIR/$fn" "$STDIN_FILE"
        "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -H "$STDIN_FILE" -- "$@" </dev/null

      fi

    done

  fi

  echo

)

else

  # Parallel mode: one background subshell per chunk produced by split.
  # Iterate over the chunk files with a glob instead of parsing ls output.
  PIDS=
  for inputs in "$TMPFILE".*; do

(

  if [ "$STDIN_FILE" = "" ]; then

    while read -r fn; do

      if [ -s "$IN_DIR/$fn" ]; then

        "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$fn"

      fi

    done < "$inputs"

  else

    # Bug fix: the [ -s "$IN_DIR/$fn" ] guard used to sit *outside* this
    # loop and referenced $fn before it was ever read (so it tested the
    # input directory itself); it now skips empty files per input, like
    # the sequential branch. Each worker gets its own private stdin file
    # so the tasks do not clobber each other (the reassignment is local
    # to this subshell).
    STDIN_FILE="$inputs.$$"

    while read -r fn; do

      if [ -s "$IN_DIR/$fn" ]; then

        cp "$IN_DIR/$fn" "$STDIN_FILE"
        "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -H "$STDIN_FILE" -- "$@" </dev/null

      fi

    done < "$inputs"

  fi

) &

  PIDS="$PIDS $!"
  done

  echo "[+] Waiting for running tasks IDs:$PIDS"
  wait
  echo "[+] all $THREADS running tasks completed."
  rm -f "$TMPFILE"*

  #echo trace dir files: $(ls $TRACE_DIR/*|wc -l)

fi
506
507
##########################
# STEP 2: SORTING TUPLES #
##########################

# With this out of the way, we sort all tuples by popularity across all
# datasets. The reasoning here is that we won't be able to avoid the files
# that trigger unique tuples anyway, so we will want to start with them and
# see what's left.

echo "[*] Sorting trace sets (this may take a while)..."

# Concatenate every per-input trace file (file names are NUL-delimited for
# xargs so unusual characters survive), then count how often each tuple
# occurs and sort ascending by that count: .all_uniq ends up holding
# "count tuple" lines with the rarest tuples first.
ls "$IN_DIR" | sed "s#^#$TRACE_DIR/#" | tr '\n' '\0' | xargs -0 -n 1 cat | \
  sort | uniq -c | sort -k 1,1 -n >"$TRACE_DIR/.all_uniq"

TUPLE_COUNT=$((`grep -c . "$TRACE_DIR/.all_uniq"`))

echo "[+] Found $TUPLE_COUNT unique tuples across $IN_COUNT files."
525
#####################################
# STEP 3: SELECTING CANDIDATE FILES #
#####################################

# The next step is to find the best candidate for each tuple. The "best"
# part is understood simply as the smallest input that includes a particular
# tuple in its trace. Empirical evidence suggests that this produces smaller
# datasets than more involved algorithms that could be still pulled off in
# a shell script.

echo "[*] Finding best candidates for each tuple..."

CUR=0

# ls -rS lists files smallest-first; step 4 relies on this ordering.
ls -rS "$IN_DIR" | while read -r fn; do

  CUR=$((CUR+1))
  printf "\\r    Processing file $CUR/$IN_COUNT... "

  # Append "tuple filename" lines; because inputs are processed smallest
  # first, the earliest line for a tuple names the smallest file with it.
  sed "s#\$# $fn#" "$TRACE_DIR/$fn" >>"$TRACE_DIR/.candidate_list"

  # An empty trace means afl-showmap recorded nothing for this input
  # (e.g. it crashed the target), so it contributes no candidates.
  test -s "$TRACE_DIR/$fn" || echo Warning: $fn is ignored because of crashing the target

done

echo
552
##############################
# STEP 4: LOADING CANDIDATES #
##############################

# At this point, we have a file of tuple-file pairs, sorted by file size
# in ascending order (as a consequence of ls -rS). By doing sort keyed
# only by tuple (-k 1,1) and configured to output only the first line for
# every key (-s -u), we end up with the smallest file for each tuple.

echo "[*] Sorting candidate list (be patient)..."

# The sed program rewrites each surviving "tuple fname" pair into a bash
# array assignment of the form: BEST_FILE[tuple]="fname"
sort -k1,1 -s -u "$TRACE_DIR/.candidate_list" | \
  sed 's/^/BEST_FILE[/;s/ /]="/;s/$/"/' >"$TRACE_DIR/.candidate_script"

if [ ! -s "$TRACE_DIR/.candidate_script" ]; then
  echo "[-] Error: no traces obtained from test cases, check syntax!" 1>&2
  test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
  exit 1
fi

# The sed command converted the sorted list to a shell script that populates
# BEST_FILE[tuple]="fname". Let's load that!

. "$TRACE_DIR/.candidate_script"
577
##########################
# STEP 5: WRITING OUTPUT #
##########################

# The final trick is to grab the top pick for each tuple, unless said tuple is
# already set due to the inclusion of an earlier candidate; and then put all
# tuples associated with the newly-added file to the "already have" list. The
# loop works from least popular tuples and toward the most common ones.

echo "[*] Processing candidates and writing output files..."

CUR=0

touch "$TRACE_DIR/.already_have"

# .all_uniq lines are "count tuple", rarest first (see step 2).
while read -r cnt tuple; do

  CUR=$((CUR+1))
  printf "\\r    Processing tuple $CUR/$TUPLE_COUNT with count $cnt... "

  # If we already have this tuple, skip it.

  grep -q "^$tuple\$" "$TRACE_DIR/.already_have" && continue

  # Smallest input covering this tuple, as computed in step 4.
  FN=${BEST_FILE[tuple]}

#  echo "tuple nr $CUR ($tuple cnt=$cnt) -> $FN" >> "$TRACE_DIR/.log"
  # Copy (or hard-link, see CP_TOOL) the winner into the output corpus.
  $CP_TOOL "$IN_DIR/$FN" "$OUT_DIR/$FN"

  # Merge this file's tuples into the "already have" set. A full sort -u
  # on every iteration would be costly, so deduplicate only every fifth
  # pick and cheaply append in between.
  if [ "$((CUR % 5))" = "0" ]; then
    sort -u "$TRACE_DIR/$FN" "$TRACE_DIR/.already_have" >"$TRACE_DIR/.tmp"
    mv -f "$TRACE_DIR/.tmp" "$TRACE_DIR/.already_have"
  else
    cat "$TRACE_DIR/$FN" >>"$TRACE_DIR/.already_have"
  fi

done <"$TRACE_DIR/.all_uniq"

echo
617
# Summarize the result. A single surviving file almost always means every
# input produced the identical trace - typically a sign of a broken setup.
OUT_COUNT=$(ls -- "$OUT_DIR" | wc -l)

[ "$OUT_COUNT" = "1" ] && echo "[!] WARNING: All test cases had the same traces, check syntax!"

echo "[+] Narrowed down to $OUT_COUNT files, saved in '$OUT_DIR'."
echo

# Keep the scratch directory only when the user asked for it.
if [ -z "$AFL_KEEP_TRACES" ]; then
  rm -rf "$TRACE_DIR"
fi

exit 0
630