#!/usr/bin/env bash
#
# american fuzzy lop++ - corpus minimization tool
# ---------------------------------------------
#
# Originally written by Michal Zalewski
#
# Copyright 2014, 2015 Google Inc. All rights reserved.
#
# Copyright 2019-2024 AFLplusplus
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
#
#   https://www.apache.org/licenses/LICENSE-2.0
#
# This tool tries to find the smallest subset of files in the input directory
# that still trigger the full range of instrumentation data points seen in
# the starting corpus. This has two uses:
#
#   - Screening large corpora of input files before using them as a seed for
#     afl-fuzz. The tool will remove functionally redundant files and likely
#     leave you with a much smaller set.
#
#     (In this case, you probably also want to consider running afl-tmin on
#     the individual files later on to reduce their size.)
#
#   - Minimizing the corpus generated organically by afl-fuzz, perhaps when
#     planning to feed it to more resource-intensive tools. The tool achieves
#     this by removing all entries that used to trigger unique behaviors in the
#     past, but have been made obsolete by later finds.
#
# Note that the tool doesn't modify the files themselves. For that, you want
# afl-tmin.
#
# This script must use bash because other shells may have hardcoded limits on
# array sizes.
#

# Print the banner followed by a blank line.
printf '%s\n\n' "corpus minimization tool for afl-fuzz"

#########
# SETUP #
#########

# Process command-line options...
MEM_LIMIT=none
TIMEOUT=5000

# Start from a clean slate: FRIDA_MODE and NYX_MODE are included here because
# they are tested with = "" below, and values inherited from the caller's
# environment would otherwise silently change the mode selection.
unset IN_DIR OUT_DIR STDIN_FILE EXTRA_PAR MEM_LIMIT_GIVEN F_ARG \
  AFL_CMIN_CRASHES_ONLY AFL_CMIN_ALLOW_ANY QEMU_MODE UNICORN_MODE T_ARG \
  FRIDA_MODE NYX_MODE

export AFL_QUIET=1

while getopts "+i:o:f:m:t:T:eOQUAChXY" opt; do

  case "$opt" in

    "h")
         ;;

    "i")
         IN_DIR="$OPTARG"
         ;;

    "o")
         OUT_DIR="$OPTARG"
         ;;
    "f")
         STDIN_FILE="$OPTARG"
         F_ARG=1
         ;;
    "m")
         MEM_LIMIT="$OPTARG"
         MEM_LIMIT_GIVEN=1
         ;;
    "t")
         TIMEOUT="$OPTARG"
         ;;
    "e")
         EXTRA_PAR="$EXTRA_PAR -e"
         ;;
    "A")
         export AFL_CMIN_ALLOW_ANY=1
         ;;
    "C")
         export AFL_CMIN_CRASHES_ONLY=1
         ;;
    "O")
         EXTRA_PAR="$EXTRA_PAR -O"
         FRIDA_MODE=1
         ;;
    "Q")
         EXTRA_PAR="$EXTRA_PAR -Q"
         QEMU_MODE=1
         ;;
    "Y")
         # -Y is an alias for -X (Nyx mode).
         EXTRA_PAR="$EXTRA_PAR -X"
         NYX_MODE=1
         ;;
    "X")
         EXTRA_PAR="$EXTRA_PAR -X"
         NYX_MODE=1
         ;;
    "U")
         EXTRA_PAR="$EXTRA_PAR -U"
         UNICORN_MODE=1
         ;;
    "T")
         T_ARG="$OPTARG"
         ;;
    "?")
         exit 1
         ;;

  esac

done

shift $((OPTIND-1))

TARGET_BIN="$1"

if [ "$TARGET_BIN" = "" -o "$IN_DIR" = "" -o "$OUT_DIR" = "" ]; then

  cat 1>&2 <<_EOF_
Usage: $0 [ options ] -- /path/to/target_app [ ... ]

Required parameters:

  -i dir        - input directory with the starting corpus
  -o dir        - output directory for minimized files

Execution control settings:

  -T tasks      - how many parallel processes to create (default=1, "all"=nproc)
  -f file       - location read by the fuzzed program (default: stdin)
  -m megs       - memory limit for child process (default=$MEM_LIMIT MB)
  -t msec       - run time limit for child process (default: 5000ms)
  -O            - use binary-only instrumentation (FRIDA mode)
  -Q            - use binary-only instrumentation (QEMU mode)
  -U            - use unicorn-based instrumentation (Unicorn mode)
  -X            - use Nyx mode

Minimization settings:

  -A            - allow crashing and timeout inputs
  -C            - keep crashing inputs, reject everything else
  -e            - solve for edge coverage only, ignore hit counts

For additional tips, please consult README.md.

Environment variables used:
AFL_KEEP_TRACES: leave the temporary <out_dir>\.traces directory
AFL_NO_FORKSRV: run target via execve instead of using the forkserver
AFL_PATH: last resort location to find the afl-showmap binary
AFL_SKIP_BIN_CHECK: skip check for target binary
AFL_CUSTOM_MUTATOR_LIBRARY: custom mutator library (post_process and send)
AFL_PYTHON_MODULE: custom mutator library (post_process and send)
_EOF_
  exit 1
fi

# Do a sanity check to discourage the use of /tmp, since we can't really
# handle this safely from a shell script.

if [ "$AFL_ALLOW_TMP" = "" ]; then

  echo "$IN_DIR" | grep -qE '^(/var)?/tmp/'
  T1="$?"

  echo "$TARGET_BIN" | grep -qE '^(/var)?/tmp/'
  T2="$?"

  echo "$OUT_DIR" | grep -qE '^(/var)?/tmp/'
  T3="$?"

  echo "$STDIN_FILE" | grep -qE '^(/var)?/tmp/'
  T4="$?"

  echo "$PWD" | grep -qE '^(/var)?/tmp/'
  T5="$?"

  if [ "$T1" = "0" -o "$T2" = "0" -o "$T3" = "0" -o "$T4" = "0" -o "$T5" = "0" ]; then
    echo "[-] Warning: do not use this script in /tmp or /var/tmp for security reasons." 1>&2
  fi

fi

# If @@ is specified, but there's no -f, let's come up with a temporary input
# file name.

TRACE_DIR="$OUT_DIR/.traces"

if [ "$STDIN_FILE" = "" ]; then

  if echo "$*" | grep -qF '@@'; then
    STDIN_FILE="$TRACE_DIR/.cur_input"
  fi

fi

# Check for obvious errors.

if [ ! "$T_ARG" = "" -a -n "$F_ARG" -a ! "$NYX_MODE" == 1 ]; then
  echo "[-] Error: -T and -f can not be used together." 1>&2
  exit 1
fi

if [ ! "$MEM_LIMIT" = "none" ]; then

  if [ "$MEM_LIMIT" -lt "5" ]; then
    echo "[-] Error: dangerously low memory limit." 1>&2
    exit 1
  fi

fi

if [ ! "$TIMEOUT" = "none" ]; then

  if [ "$TIMEOUT" -lt "10" ]; then
    echo "[-] Error: dangerously low timeout." 1>&2
    exit 1
  fi

fi

# Resolve the target binary via PATH if it is not directly executable
# (not needed in Nyx mode, where the target is a share directory).

if [ "$NYX_MODE" = "" ]; then
  if [ ! -f "$TARGET_BIN" -o ! -x "$TARGET_BIN" ]; then

    TNEW="`command -v "$TARGET_BIN" 2>/dev/null`"

    if [ ! -f "$TNEW" -o ! -x "$TNEW" ]; then
      echo "[-] Error: binary '$TARGET_BIN' not found or not executable." 1>&2
      exit 1
    fi

    TARGET_BIN="$TNEW"

  fi

fi

# If the target supports AFL_DUMP_MAP_SIZE, query it so afl-showmap uses a
# matching map size.

grep -aq AFL_DUMP_MAP_SIZE "$TARGET_BIN" && {
  echo "[!] Trying to obtain the map size of the target ..."
  # Only prefix "./" for a bare file name in the current directory; prefixing
  # a path that already contains a slash (e.g. an absolute path resolved via
  # command -v above) would corrupt it.
  case "$TARGET_BIN" in
    */*) MAPSIZE=`AFL_DUMP_MAP_SIZE=1 "$TARGET_BIN" 2>/dev/null` ;;
    *)   MAPSIZE=`AFL_DUMP_MAP_SIZE=1 "./$TARGET_BIN" 2>/dev/null` ;;
  esac
  test -n "$MAPSIZE" && {
    export AFL_MAP_SIZE=$MAPSIZE
    echo "[+] Setting AFL_MAP_SIZE=$MAPSIZE"
  }
}

# Unless instructed otherwise, make sure the target is actually instrumented.

if [ "$AFL_SKIP_BIN_CHECK" = "" -a "$QEMU_MODE" = "" -a "$FRIDA_MODE" = "" -a "$UNICORN_MODE" = "" -a "$NYX_MODE" = "" ]; then

  if ! grep -qF "__AFL_SHM_ID" "$TARGET_BIN"; then
    echo "[-] Error: binary '$TARGET_BIN' doesn't appear to be instrumented." 1>&2
    exit 1
  fi

fi

if [ ! -d "$IN_DIR" ]; then
  echo "[-] Error: directory '$IN_DIR' not found." 1>&2
  exit 1
fi

# Accept an afl-fuzz output directory directly.

test -d "$IN_DIR/default" && IN_DIR="$IN_DIR/default"
test -d "$IN_DIR/queue" && IN_DIR="$IN_DIR/queue"

# -maxdepth is a global option and must precede tests like -name, otherwise
# GNU find emits a warning (and POSIX find rejects it).
find "$OUT_DIR" -maxdepth 1 -name 'id[:_]*' -exec rm -- {} \; 2>/dev/null
rm -rf "$TRACE_DIR" 2>/dev/null

rmdir "$OUT_DIR" 2>/dev/null

if [ -d "$OUT_DIR" ]; then
  echo "[-] Error: directory '$OUT_DIR' exists and is not empty - delete it first." 1>&2
  exit 1
fi

mkdir -m 700 -p "$TRACE_DIR" || exit 1

if [ ! "$STDIN_FILE" = "" ]; then
  rm -f "$STDIN_FILE" || exit 1
  touch "$STDIN_FILE" || exit 1
fi

# Locate afl-showmap: PATH first, then next to this script, then the current
# directory, and only as a last resort AFL_PATH (as documented in the usage
# text above). The previous logic let AFL_PATH override an already-found
# binary.

SHOWMAP=`command -v afl-showmap 2>/dev/null`

if [ -z "$SHOWMAP" ]; then
  TMP="${0%/afl-cmin.bash}/afl-showmap"
  if [ -x "$TMP" ]; then
    SHOWMAP=$TMP
  fi
fi

if [ -z "$SHOWMAP" -a -x "./afl-showmap" ]; then
  SHOWMAP="./afl-showmap"
fi

if [ -z "$SHOWMAP" -a -n "$AFL_PATH" ]; then
  SHOWMAP="$AFL_PATH/afl-showmap"
fi

if [ ! -x "$SHOWMAP" ]; then
  echo "[-] Error: can't find 'afl-showmap' - please set AFL_PATH." 1>&2
  rm -rf "$TRACE_DIR"
  exit 1
fi

# Parse -T: "all" means one task per CPU; otherwise the value must lie in
# [2, nproc]. On an out-of-range value we report the error and fall back to
# single-task operation (THREADS stays empty).

THREADS=
if [ ! "$T_ARG" = "" ]; then
  if [ "$T_ARG" = "all" ]; then
    THREADS=$(nproc)
  else
    if [ "$T_ARG" -gt 1 -a "$T_ARG" -le "$(nproc)" ]; then
      THREADS=$T_ARG
    else
      echo "[-] Error: -T parameter must be between 2 and $(nproc) or \"all\"." 1>&2
    fi
  fi
else
  if [ -z "$F_ARG" ]; then
    echo "[*] Are you aware of the '-T all' parallelize option that massively improves the speed?"
  fi
fi

IN_COUNT=$((`ls -- "$IN_DIR" 2>/dev/null | wc -l`))

if [ "$IN_COUNT" = "0" ]; then
  echo "[-] Hmm, no inputs in the target directory. Nothing to be done."
  rm -rf "$TRACE_DIR"
  exit 1
fi

echo "[*] Are you aware that afl-cmin is faster than this afl-cmin.bash script?"
echo "[+] Found $IN_COUNT files for minimizing."
# Reduce parallelism if there are fewer inputs than requested tasks.

if [ -n "$THREADS" ]; then
  if [ "$IN_COUNT" -lt "$THREADS" ]; then
    THREADS=$IN_COUNT
    echo "[!] WARNING: less inputs than threads, reducing threads to $THREADS and likely the overhead of threading makes things slower..."
  fi
fi

FIRST_FILE=`ls "$IN_DIR" | head -1`

# Make sure that we're not dealing with a directory.

if [ -d "$IN_DIR/$FIRST_FILE" ]; then
  echo "[-] Error: The target directory contains subdirectories - please fix." 1>&2
  rm -rf "$TRACE_DIR"
  exit 1
fi

# Check for the more efficient way to copy files...

if ln "$IN_DIR/$FIRST_FILE" "$TRACE_DIR/.link_test" 2>/dev/null; then
  CP_TOOL=ln
else
  CP_TOOL=cp
fi

# Make sure that we can actually get anything out of afl-showmap before we
# waste too much time.

echo "[*] Testing the target binary..."

if [ "$STDIN_FILE" = "" ]; then

  AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$FIRST_FILE"

else

  cp "$IN_DIR/$FIRST_FILE" "$STDIN_FILE"
  AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -H "$STDIN_FILE" -- "$@" </dev/null

fi

FIRST_COUNT=$((`grep -c . "$TRACE_DIR/.run_test"`))

if [ "$FIRST_COUNT" -gt "0" ]; then

  echo "[+] OK, $FIRST_COUNT tuples recorded."

else

  echo "[-] Error: no instrumentation output detected (perhaps crash or timeout)." 1>&2
  test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
  exit 1

fi

# In parallel mode, split the input file list into one chunk per task.

TMPFILE=$OUT_DIR/.list.$$
if [ ! "$THREADS" = "" ]; then
  ls -- "$IN_DIR" > $TMPFILE 2>/dev/null
  IN_COUNT=$(cat $TMPFILE | wc -l)
  SPLIT=$(($IN_COUNT / $THREADS))
  if [ "$(($IN_COUNT % $THREADS))" -gt 0 ]; then
    SPLIT=$(($SPLIT + 1))
  fi
  echo "[+] Splitting workload into $THREADS tasks with $SPLIT items on average each."
  split -l $SPLIT $TMPFILE $TMPFILE.
fi

# Let's roll!

#############################
# STEP 1: COLLECTING TRACES #
#############################

echo "[*] Obtaining traces for input files in '$IN_DIR'..."

if [ "$THREADS" = "" ]; then
(

  CUR=0

  if [ "$STDIN_FILE" = "" ]; then

    ls "$IN_DIR" | while read -r fn; do

      # Skip empty files - they cannot contribute coverage.
      if [ -s "$IN_DIR/$fn" ]; then

        CUR=$((CUR+1))
        printf "\\r    Processing file $CUR/$IN_COUNT... "

        "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$fn"

      fi

    done

  else

    ls "$IN_DIR" | while read -r fn; do

      if [ -s "$IN_DIR/$fn" ]; then

        CUR=$((CUR+1))
        printf "\\r    Processing file $CUR/$IN_COUNT... "

        cp "$IN_DIR/$fn" "$STDIN_FILE"
        "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -H "$STDIN_FILE" -- "$@" </dev/null

      fi

    done

  fi

  echo

)

else

  PIDS=
  for inputs in $(ls ${TMPFILE}.*); do

(

    if [ "$STDIN_FILE" = "" ]; then

      cat $inputs | while read -r fn; do

        if [ -s "$IN_DIR/$fn" ]; then

          "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$fn"

        fi

      done

    else

      # Give each parallel task a private stdin file; the subshell keeps the
      # reassignment local to this task, and the "$inputs.$$" name is swept up
      # by the rm -f ${TMPFILE}* below. NOTE: the [ -s ] per-file guard used
      # to sit OUTSIDE this loop, testing an unset $fn - it belongs inside so
      # empty inputs are skipped here just like everywhere else.
      STDIN_FILE="$inputs.$$"
      cat $inputs | while read -r fn; do

        if [ -s "$IN_DIR/$fn" ]; then

          cp "$IN_DIR/$fn" "$STDIN_FILE"
          "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -H "$STDIN_FILE" -- "$@" </dev/null

        fi

      done

    fi

) &

    PIDS="$PIDS $!"
  done

  echo "[+] Waiting for running tasks IDs:$PIDS"
  wait
  echo "[+] all $THREADS running tasks completed."
  rm -f ${TMPFILE}*

  #echo trace dir files: $(ls $TRACE_DIR/*|wc -l)

fi


##########################
# STEP 2: SORTING TUPLES #
##########################

# With this out of the way, we sort all tuples by popularity across all
# datasets. The reasoning here is that we won't be able to avoid the files
# that trigger unique tuples anyway, so we will want to start with them and
# see what's left.

echo "[*] Sorting trace sets (this may take a while)..."

ls "$IN_DIR" | sed "s#^#$TRACE_DIR/#" | tr '\n' '\0' | xargs -0 -n 1 cat | \
  sort | uniq -c | sort -k 1,1 -n >"$TRACE_DIR/.all_uniq"

TUPLE_COUNT=$((`grep -c . "$TRACE_DIR/.all_uniq"`))

echo "[+] Found $TUPLE_COUNT unique tuples across $IN_COUNT files."

#####################################
# STEP 3: SELECTING CANDIDATE FILES #
#####################################

# The next step is to find the best candidate for each tuple. The "best"
# part is understood simply as the smallest input that includes a particular
# tuple in its trace. Empirical evidence suggests that this produces smaller
# datasets than more involved algorithms that could be still pulled off in
# a shell script.

echo "[*] Finding best candidates for each tuple..."

CUR=0

ls -rS "$IN_DIR" | while read -r fn; do

  CUR=$((CUR+1))
  printf "\\r    Processing file $CUR/$IN_COUNT... "

  # Append " <filename>" to every tuple of this trace.
  sed "s#\$# $fn#" "$TRACE_DIR/$fn" >>"$TRACE_DIR/.candidate_list"

  test -s "$TRACE_DIR/$fn" || echo Warning: $fn is ignored because of crashing the target

done

echo

##############################
# STEP 4: LOADING CANDIDATES #
##############################

# At this point, we have a file of tuple-file pairs, sorted by file size
# in ascending order (as a consequence of ls -rS). By doing sort keyed
# only by tuple (-k 1,1) and configured to output only the first line for
# every key (-s -u), we end up with the smallest file for each tuple.

echo "[*] Sorting candidate list (be patient)..."

sort -k1,1 -s -u "$TRACE_DIR/.candidate_list" | \
  sed 's/^/BEST_FILE[/;s/ /]="/;s/$/"/' >"$TRACE_DIR/.candidate_script"

if [ ! -s "$TRACE_DIR/.candidate_script" ]; then
  echo "[-] Error: no traces obtained from test cases, check syntax!" 1>&2
  test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"
  exit 1
fi

# The sed command converted the sorted list to a shell script that populates
# BEST_FILE[tuple]="fname". Let's load that!

. "$TRACE_DIR/.candidate_script"

##########################
# STEP 5: WRITING OUTPUT #
##########################

# The final trick is to grab the top pick for each tuple, unless said tuple is
# already set due to the inclusion of an earlier candidate; and then put all
# tuples associated with the newly-added file to the "already have" list. The
# loop works from least popular tuples and toward the most common ones.

echo "[*] Processing candidates and writing output files..."

CUR=0

touch "$TRACE_DIR/.already_have"

while read -r cnt tuple; do

  CUR=$((CUR+1))
  printf "\\r    Processing tuple $CUR/$TUPLE_COUNT with count $cnt... "

  # If we already have this tuple, skip it.

  grep -q "^$tuple\$" "$TRACE_DIR/.already_have" && continue

  FN=${BEST_FILE[tuple]}

#  echo "tuple nr $CUR ($tuple cnt=$cnt) -> $FN" >> "$TRACE_DIR/.log"
  $CP_TOOL "$IN_DIR/$FN" "$OUT_DIR/$FN"

  # Every fifth pick, deduplicate the "already have" list to keep the grep
  # above cheap; otherwise just append.
  if [ "$((CUR % 5))" = "0" ]; then
    sort -u "$TRACE_DIR/$FN" "$TRACE_DIR/.already_have" >"$TRACE_DIR/.tmp"
    mv -f "$TRACE_DIR/.tmp" "$TRACE_DIR/.already_have"
  else
    cat "$TRACE_DIR/$FN" >>"$TRACE_DIR/.already_have"
  fi

done <"$TRACE_DIR/.all_uniq"

echo

OUT_COUNT=`ls -- "$OUT_DIR" | wc -l`

if [ "$OUT_COUNT" = "1" ]; then
  echo "[!] WARNING: All test cases had the same traces, check syntax!"
fi

echo "[+] Narrowed down to $OUT_COUNT files, saved in '$OUT_DIR'."
echo

test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR"

exit 0