| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- This experimental program is designed to extract a subset of interesting test
- case snippets from the trezor-crypto test directory and output them as a
- standard fuzzer dictionary file.
- The program is built on quick-and-dirty regex matching that is known to be
- incorrect for parsing code files, but is considered "good enough" for this
- specific purpose.
- Note that there are target-specific configurations and internal filter settings.
- """
- import argparse
- import binascii
- import glob
- import re
- # re2 is considered for future use
- # it requires a system installation and the google-re2 python package
- # import re2
- # Expected target format for strings in code:
- # Most strings are defined in the general form "example"
- # There are a few test vectors in crypto/tests/wycheproof/javascript/EcUtil.js
- # with 'example' style string definitions, these are ignored for now
# relative path of the directory tree that is searched for test files
TARGET_DIR = "../tests"

# module-wide switch read by print_verbose(); overwritten from the command
# line arguments when the file is run as a script
verbose = False

# File types that may contain strings spanning several source lines.
# Intentionally excluded file types that currently do not provide enough
# value: *.js, *.md, *.sh, *.html and others from the wycheproof subdirectory
targeted_filetypes_multiline_classA = ("*.c", "*.h", "*.py")
# Java files have different multiline strings that are handled differently
targeted_filetypes_multiline_classB = ("*.java",)
# union of both classes, class A patterns first
targeted_filetypes_multiline = (
    *targeted_filetypes_multiline_classA,
    *targeted_filetypes_multiline_classB,
)

# Files without multiline string content.
# Note: consider switching to actual JSON parsing?
# Note: the wycheproof repository has a number of test cases for other
# cryptography such as DSA and RSA which are currently less interesting for
# the fuzzer dictionary and therefore excluded
targeted_filetypes_singleline = (
    "aes*.json",
    "ecdh*.json",
    "ecdsa*.json",
    "x25519*.json",
    "chacha20*.json",
    "kw*.json",
)
- # patterns to extract
- # singleline:
- # "4a1e76f133afb"
- # 0xAF8BBDFE8CDD5 and 0x0488b21e
- # m/0'/2147483647'/1'/2147483646'/2' in test_check.c via m/[\d'/]+
- #
- # multiline:
- # "fffc" \n "99"
- # "dpubZ9169K" \n "bTYbcY"
- # "\x65\xf9" \\n "\xa0\x6a"
- # { 0x086d8bd5, 0x1018f82f, \n 0xc55ece} , see rg "0x([a-zA-Z0-9])+"
- # patterns to ignore
- # lines with print statements
- # lines with exceptions
- # comments and other metadata in the testvector JSON files
- # filenames
- # import statements and other package names
- # patterns to investigate further
- # public keys with the form BEGIN PUBLIC KEY
- # TODO "abc" + "def" string concatenation on the same line without newline
- # strings in comments
# dictionary text export file format
# general description:
# https://github.com/AFLplusplus/AFLplusplus/blob/stable/dictionaries/README.md
#
# the exported file is primarily designed for use with a recent libFuzzer version
# and is known to be partially incompatible with other fuzzers that impose
# other limitations
#
# known incompatibilities:
# * honggfuzz only reads a limited number of dictionary entries (8192 with version 2.5)
# * afl++ only reads line content with up to 128 bytes
# --- generic quoted strings -------------------------------------------------

# a double-quoted string with at least one character between the quotes and
# no quote character inside
regex_string_general_definition = r"\"[^\"]+\""
regex_string_general = re.compile(regex_string_general_definition)

# a quoted string followed (after optional whitespace) by "," or ")"; the
# capturing group holds only the quoted string, so prefix and suffix outside
# of the quotes are dropped
# Note that this is prone to matching the last line of a C-style multiline
# string, which is addressed via extra state handling during the file
# processing
regex_oneline_string = re.compile(
    r"(" + regex_string_general_definition + r")\s*[\,\)]+"
)

# lines whose first non-whitespace content is a "+" followed by a string,
# i.e. continuation lines of a Java string concatenation
regex_oneline_string_java_ignore1 = re.compile(r"^\s*\+\s*\"")

# --- hex content ------------------------------------------------------------

regex_hex_character_segment_inner_definition = "[0-9a-fA-F]+"
# the whole input is one quoted run of hex digits
regex_hex_character_input_complete = re.compile(
    '^"' + regex_hex_character_segment_inner_definition + '"$'
)
# just the run of hex digits, without any quotes
regex_hex_character_input_inner = re.compile(
    regex_hex_character_segment_inner_definition
)

# 0x... constants; most are preceded by a space, but some have a "(" "[" or
# "{" before them
regex_hex_constant_singleline = re.compile(r"(?<=\(|\[|\{| )0x[a-fA-F0-9]+")

# --- strings spanning several lines -----------------------------------------

# C style: adjacent string literals separated only by whitespace and a newline
regex_c_style_multiline = re.compile(r"(?:\".+\"\s*\n\s*)+(?:\".+\")", re.MULTILINE)
# the closing-quote / newline / opening-quote joint between two C segments
regex_c_intermediary_content = re.compile(r"\"\s*\n\s*\"", re.MULTILINE)

# Java style: string literals joined by a "+" at the start of the next line
# TODO how to prevent matching in the middle of a multi-line string concatenation?
# negative lookbehind for "+" is not possible generically and
# (?<!\+ ) and similar patterns are too static
regex_java_style_multiline = re.compile(
    r"(?:\".+\"\s*\n\s*\+\s*)+(?:\".+\")", re.MULTILINE
)
# the closing-quote / newline / "+" / opening-quote joint between two segments
regex_java_intermediary_content = re.compile(r"\"\s*\n\s*\+\s*\"", re.MULTILINE)

# --- miscellaneous ----------------------------------------------------------

# a literal backslash followed by "n" (the two-character text, not a newline)
regex_text_newline = re.compile(r"\\n")

# primitive regex that catches most filenames in the data set
regex_filename_heuristic = re.compile(r"\.[a-zA-Z]+")
# JSON lines are only considered when they contain one of these quoted keys
# TODO add '"curve"' to capture algorithm names?
allowlist_keywords_json = (
    '"uncompressed"',
    '"wx"',
    '"wy"',
    '"msg"',
    '"sig"',
    '"key"',
    '"iv"',
    '"ct"',
    '"aad"',
    '"tag"',
    '"public"',
    '"private"',
    '"shared"',
    '"padding"',
    '"x"',
    '"d"',
)

# source lines containing any of these substrings are skipped
# TODO the "keyPem" entry is only a workaround for an encoding issue
ignore_keywords_java = (
    "println(",
    "Exception(",
    '"keyPem"',
)
ignore_keywords_c = ("printf(",)

# statistics updated by detect_and_convert_hex(): number of entries converted
# to hex escapes, and number of odd-length hex strings that had to be padded
counter_hex_content = 0
counter_wycheproof_hex_reconstruction = 0
def ignore_single_line_json(data):
    """
    Return True if the input should be ignored.

    A line is kept only when it contains at least one entry of
    allowlist_keywords_json as a substring; everything else is ignored.
    """
    return not any(keyword in data for keyword in allowlist_keywords_json)
def ignore_single_line_java(data):
    """
    Return True if the input should be ignored.

    A line is ignored as soon as it contains any entry of
    ignore_keywords_java as a substring.
    """
    return any(keyword in data for keyword in ignore_keywords_java)
def ignore_single_line_c(data):
    """
    Return True if the input should be ignored.

    A line is ignored as soon as it contains any entry of ignore_keywords_c
    as a substring.
    """
    return any(keyword in data for keyword in ignore_keywords_c)
def ignore_general(data):
    """
    Return True if the input should be ignored.

    An entry is ignored when regex_filename_heuristic matches anywhere in
    it, i.e. when it contains a "." directly followed by a letter.
    """
    return regex_filename_heuristic.search(data) is not None
def encode_strings_for_dictionary(data):
    """
    Apply the dictionary-specific encoding steps to one entry.

    Assumes that the input is already wrapped in string quotes.
    Returns the entry with every backslash-n text sequence replaced by the
    hex escape for a newline byte.
    """
    # libfuzzer does not like "\n" string patterns in dictionary files, so
    # they are swapped for an encoded newline
    return regex_text_newline.sub("\\\\x0a", data)
def detect_and_convert_hex(data):
    """
    Convert a quoted hex string into a quoted sequence of hex byte escapes.

    Input that is not a quoted run of hex digits is passed through unchanged.
    Updates the module-level counters counter_hex_content (one per converted
    entry) and counter_wycheproof_hex_reconstruction (one per odd-length
    input that had to be padded).
    """
    global counter_hex_content
    global counter_wycheproof_hex_reconstruction

    quoted_hex = regex_hex_character_input_complete.search(data)
    if not quoted_hex:
        # not a pure hex string, nothing to convert
        return data

    # isolate the digits between the quotes
    hex_digits = regex_hex_character_input_inner.search(quoted_hex.string).group(0)

    if len(hex_digits) % 2 == 1:
        # Note: the test cases in the wycheproof testvector JSON files have
        # a custom binary hex format to represent keys
        # among other things, this results in hex strings with an uneven
        # number of characters
        # see tests/wycheproof/java/com/google/security/wycheproof/JsonUtil.java
        # specifically the asBigInteger() function for more information
        padding = "0" if "0" <= hex_digits[0] <= "7" else "f"
        hex_digits = padding + hex_digits
        counter_wycheproof_hex_reconstruction += 1

    try:
        # test error-free conversion to binary
        binascii.unhexlify(hex_digits)
    # TODO binascii.Incomplete exception also relevant?
    except binascii.Error:
        # default to the original input
        return data

    # emit one C-style escape per pair of hex digits
    hex_with_c_style_formatting = "".join(
        "\\x" + hex_digits[pos : pos + 2] for pos in range(0, len(hex_digits) - 1, 2)
    )
    counter_hex_content += 1
    return '"%s"' % hex_with_c_style_formatting
def search_files_recursively(directory, filetype_glob):
    """
    Return the glob search results for all given patterns.

    directory is searched recursively once per pattern in filetype_glob and
    the matches are concatenated in pattern order into one list.
    """
    print_verbose("searching in %s" % directory)
    matches = []
    for filetype in filetype_glob:
        print_verbose("searching for %s" % filetype)
        matches += glob.glob(f"{directory}/**/{filetype}", recursive=True)
    return matches
def print_verbose(text):
    """Print text to stdout, but only while the module-level verbose flag is set."""
    if not verbose:
        return
    print(text)
def _extract_json_strings(directory):
    """Collect single-line strings from allowlisted lines of the JSON test vectors."""
    found = set()
    for filepath in search_files_recursively(directory, targeted_filetypes_singleline):
        per_file_result_counter = 0
        with open(filepath) as _file:
            print_verbose("processing %s" % filepath)
            # iterate lazily instead of loading the whole file into a list
            for line in _file:
                if ignore_single_line_json(line):
                    continue
                results = regex_oneline_string.findall(line)
                found.update(results)
                # counts every hit, including duplicates the set swallows
                per_file_result_counter += len(results)
        if per_file_result_counter > 0:
            print_verbose("results: %d" % per_file_result_counter)
    return found


def _extract_code_single_line_strings(directory):
    """Collect single-line strings and 0x constants from the code files."""
    # NOTE(review): the statement nesting below was reconstructed from a
    # listing without indentation - confirm against the upstream file.
    found = set()
    for filepath in search_files_recursively(directory, targeted_filetypes_multiline):
        per_file_result_counter = 0
        with open(filepath) as _file:
            # True while the previous line looked like an unfinished part of
            # a string that spans several lines
            last_line_was_multiline_string = False
            print_verbose("processing %s for single-line strings" % filepath)
            for line in _file:
                if ignore_single_line_java(line) or ignore_single_line_c(line):
                    last_line_was_multiline_string = False
                    continue

                if regex_oneline_string_java_ignore1.search(line):
                    # Java continuation line ('+ "..."'): never harvested.
                    # If it also looks like a complete one-line string, the
                    # Java multiline string apparently ends on this line.
                    last_line_was_multiline_string = (
                        regex_oneline_string.search(line) is None
                    )
                    continue

                if regex_string_general.search(line):
                    # at least one general string is matched, see if it is
                    # a single-line string
                    results = regex_oneline_string.findall(line)
                    for result in results:
                        # the first hit after a multi-line string is that
                        # string's last segment and is skipped; any further
                        # hit on the same line is taken
                        if not last_line_was_multiline_string:
                            found.add(result)
                            per_file_result_counter += 1
                        last_line_was_multiline_string = False
                    if not results:
                        # a string without "," or ")" after it: treated as
                        # part of a multi-line string
                        last_line_was_multiline_string = True
                else:
                    last_line_was_multiline_string = False

                # TODO split this into a separate loop?
                for result in regex_hex_constant_singleline.findall(line):
                    # remove the "0x" prefix, add quotes
                    found.add('"%s"' % result[2:])
                    per_file_result_counter += 1
        if per_file_result_counter > 0:
            print_verbose("results: %d" % per_file_result_counter)
    return found


def _extract_multiline_strings(
    directory, filetype_glob, style_name, regex_multiline, regex_intermediary
):
    """Collect multi-line strings of one style, joined into single quoted strings."""
    found = set()
    for filepath in search_files_recursively(directory, filetype_glob):
        with open(filepath) as _file:
            print_verbose(
                "processing %s for %s-style multi-line strings" % (filepath, style_name)
            )
            filecontent = _file.read()
        multiline_results = regex_multiline.findall(filecontent)
        if multiline_results:
            print_verbose("results: %d" % len(multiline_results))
        for result in multiline_results:
            # drop the quote/newline joints so that the segments form one string
            found.add(regex_intermediary.sub("", result))
    return found


def recursive_dictionary_extraction(directory):
    """
    Handle the central extraction logic.

    Searches directory recursively and runs four passes: single-line strings
    from the JSON test vectors, single-line strings plus 0x constants from
    the code files, C-style multi-line strings, and Java-style multi-line
    strings.

    Returns a set of candidate entries; the set de-duplicates results
    automatically. String entries keep their surrounding quotes.
    """
    candidate_lines = _extract_json_strings(directory)
    print_verbose("number of candidate entries: %d" % len(candidate_lines))

    candidate_lines |= _extract_code_single_line_strings(directory)
    candidate_lines |= _extract_multiline_strings(
        directory,
        targeted_filetypes_multiline_classA,
        "C",
        regex_c_style_multiline,
        regex_c_intermediary_content,
    )
    candidate_lines |= _extract_multiline_strings(
        directory,
        targeted_filetypes_multiline_classB,
        "Java",
        regex_java_style_multiline,
        regex_java_intermediary_content,
    )
    return candidate_lines
if __name__ == "__main__":
    # command line: one positional output path plus an optional --verbose flag
    parser = argparse.ArgumentParser()
    parser.add_argument("dictionary_output_file", help="output file", type=str)
    parser.add_argument("--verbose", action="store_true", help="verbose stdout output")
    args = parser.parse_args()

    # module-level flag consulted by print_verbose()
    verbose = args.verbose

    # sort the candidates so that the output file has a stable order
    sorted_candidate_lines = sorted(recursive_dictionary_extraction(TARGET_DIR))

    # drop entries caught by the general filter, then convert hex content
    # and apply the dictionary encoding to the remaining ones
    result_lines = [
        encode_strings_for_dictionary(detect_and_convert_hex(candidate_line))
        for candidate_line in sorted_candidate_lines
        if not ignore_general(candidate_line)
    ]

    print_verbose("counter_hex_content: %d" % counter_hex_content)
    print_verbose(
        "counter_wycheproof_hex_reconstruction: %d"
        % counter_wycheproof_hex_reconstruction
    )
    print_verbose("overall deduplicated entries: %d" % len(sorted_candidate_lines))

    # one dictionary entry per line
    with open(args.dictionary_output_file, "w") as _file:
        _file.writelines("%s\n" % result_line for result_line in result_lines)
|