#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
This experimental program is designed to extract a subset of interesting test
case snippets from the trezor-crypto test directory and output them as a
standard fuzzer dictionary file.
The program is built on quick-and-dirty regex matching that is known to be
incorrect for parsing code files, but is considered "good enough" for this
specific purpose.
Note that there are target-specific configurations and internal filter settings.
"""
import argparse
import binascii
import glob
import re

# re2 is considered for future use
# it requires a system installation and the google-re2 python package
# import re2
# Expected target format for strings in code:
# Most strings are defined in the general form "example"
# There are a few test vectors in crypto/tests/wycheproof/javascript/EcUtil.js
# with 'example'-style string definitions; these are ignored for now
TARGET_DIR = "../tests"

# intentionally excluded file types that currently do not provide enough value:
# *.js, *.md, *.sh, *.html and others from the wycheproof subdirectory
targeted_filetypes_multiline_classA = ("*.c", "*.h", "*.py")
# Java files use a different multiline string style that requires separate handling
targeted_filetypes_multiline_classB = ("*.java",)
targeted_filetypes_multiline = (
    targeted_filetypes_multiline_classA + targeted_filetypes_multiline_classB
)

# files without multiline string content
# Note: consider switching to actual JSON parsing?
# Note: the wycheproof repository has a number of test cases for other
# cryptography such as DSA and RSA which are currently less interesting for the
# fuzzer dictionary and therefore excluded
targeted_filetypes_singleline = (
    "aes*.json",
    "ecdh*.json",
    "ecdsa*.json",
    "x25519*.json",
    "chacha20*.json",
    "kw*.json",
)
verbose = False

# patterns to extract
# singleline:
# "4a1e76f133afb"
# 0xAF8BBDFE8CDD5 and 0x0488b21e
# m/0'/2147483647'/1'/2147483646'/2' in test_check.c via m/[\d'/]+
#
# multiline:
# "fffc" \n "99"
# "dpubZ9169K" \n "bTYbcY"
# "\x65\xf9" \\n "\xa0\x6a"
# { 0x086d8bd5, 0x1018f82f, \n 0xc55ece} , see rg "0x([a-zA-Z0-9])+"

# patterns to ignore
# lines with print statements
# lines with exceptions
# comments and other metadata in the testvector JSON files
# filenames
# import statements and other package names

# patterns to investigate further
# public keys with the form BEGIN PUBLIC KEY
# TODO "abc" + "def" string concatenation on the same line without newline
# strings in comments
# dictionary text export file format
# general description:
# https://github.com/AFLplusplus/AFLplusplus/blob/stable/dictionaries/README.md
#
# the exported file is primarily designed for use with a recent libFuzzer
# version and is known to be partially incompatible with other fuzzers that
# impose additional limitations
#
# known incompatibilities:
# * honggfuzz only reads a limited number of dictionary entries (8192 with version 2.5)
# * afl++ only reads line content of up to 128 bytes
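# Illustrative excerpt of the resulting dictionary file: one quoted entry per
# line, with hex content exported as \xNN escape sequences (see
# detect_and_convert_hex below), e.g.
#   "abc"
#   "\x4a\x1e\x76\xf1"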
# match everything in quotes that doesn't have an internal quote character and
# at least one internal character
regex_string_general_definition = r"\"[^\"]+\""
regex_string_general = re.compile(regex_string_general_definition)
# the capturing group ignores prefix and suffix outside of the quotes
# Note that this is prone to matching the last line of a C-style multiline string,
# which is addressed via extra state handling during the file processing
regex_oneline_string = re.compile(
    r"(" + regex_string_general_definition + r")\s*[\,\)]+"
)
# ignore lines that have a "+" character preceding a string
regex_oneline_string_java_ignore1 = re.compile(r"^\s*\+\s*\"")
regex_hex_character_segment_inner_definition = "[0-9a-fA-F]+"
regex_hex_character_input_complete = re.compile(
    '^"' + regex_hex_character_segment_inner_definition + '"$'
)
regex_hex_character_input_inner = re.compile(
    regex_hex_character_segment_inner_definition
)
# most constants are preceded by a space, but some have a "(", "[" or "{" before them
regex_hex_constant_singleline = re.compile(r"(?<=\(|\[|\{| )0x[a-fA-F0-9]+")
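# Illustrative match behavior for regex_hex_constant_singleline: a line such as
#   { 0x086d8bd5, 0x1018f82f,
# yields the candidates "0x086d8bd5" and "0x1018f82f"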
regex_c_style_multiline = re.compile(r"(?:\".+\"\s*\n\s*)+(?:\".+\")", re.MULTILINE)
regex_c_intermediary_content = re.compile(r"\"\s*\n\s*\"", re.MULTILINE)
# TODO how to prevent matching in the middle of a multi-line string concatenation?
# negative lookbehind for "+" is not possible generically and
# (?<!\+ ) and similar patterns are too static
regex_java_style_multiline = re.compile(
    r"(?:\".+\"\s*\n\s*\+\s*)+(?:\".+\")", re.MULTILINE
)
regex_java_intermediary_content = re.compile(r"\"\s*\n\s*\+\s*\"", re.MULTILINE)
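# Illustrative inputs for the two multiline patterns, mirroring the example
# section above: regex_c_style_multiline targets C-style concatenations like
#   "fffc"
#   "99"
# and regex_java_style_multiline targets Java-style concatenations like
#   "dpubZ9169K"
#   + "bTYbcY"
# the *_intermediary_content regexes then remove the quote-newline-quote
# junctions so that each concatenation collapses into a single entry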
regex_text_newline = re.compile(r"\\n")
# primitive regex that catches most filenames in the data set
regex_filename_heuristic = re.compile(r"\.[a-zA-Z]+")
counter_hex_content = 0
counter_wycheproof_hex_reconstruction = 0

# TODO add '"curve"' to capture algorithm names?
allowlist_keywords_json = (
    '"uncompressed"',
    '"wx"',
    '"wy"',
    '"msg"',
    '"sig"',
    '"key"',
    '"iv"',
    '"ct"',
    '"aad"',
    '"tag"',
    '"public"',
    '"private"',
    '"shared"',
    '"padding"',
    '"x"',
    '"d"',
)
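# Illustrative (hypothetical) wycheproof-style testvector line that passes the
# allowlist filter; regex_oneline_string later extracts only the quoted value:
#   "msg" : "48656c6c6f",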
# TODO the "keyPem" entry is only a workaround for an encoding issue
ignore_keywords_java = (
    "println(",
    "Exception(",
    '"keyPem"',
)
ignore_keywords_c = ("printf(",)
def ignore_single_line_json(data):
    """return True if the input should be ignored"""
    # ignore everything that is not matched by the allowlist
    for keyword in allowlist_keywords_json:
        if data.find(keyword) > -1:
            return False
    return True


def ignore_single_line_java(data):
    """return True if the input should be ignored"""
    for keyword in ignore_keywords_java:
        if data.find(keyword) > -1:
            return True
    return False


def ignore_single_line_c(data):
    """return True if the input should be ignored"""
    for keyword in ignore_keywords_c:
        if data.find(keyword) > -1:
            return True
    return False


def ignore_general(data):
    """return True if the input should be ignored"""
    if regex_filename_heuristic.search(data):
        return True
    return False
def encode_strings_for_dictionary(data):
    """
    Assumes that inputs are already in string quotes
    Handles dictionary-specific encoding steps
    """
    # libfuzzer does not like literal "\n" string patterns in dictionary files,
    # so replace them with an encoded newline
    data = regex_text_newline.sub("\\\\x0a", data)
    return data
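# Illustrative example: a candidate entry containing a literal backslash-n
# escape, such as "abc\ndef", is rewritten to "abc\x0adef" before export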
def detect_and_convert_hex(data):
    """
    Convert hex strings
    Directly pass through non-hex content
    """
    global counter_hex_content
    global counter_wycheproof_hex_reconstruction
    match_result1 = regex_hex_character_input_complete.search(data)
    if match_result1:
        match_result2 = regex_hex_character_input_inner.search(match_result1.string)
        isolated_substring = match_result2.group(0)
        if len(isolated_substring) % 2 == 1:
            # Note: the test cases in the wycheproof testvector JSON files have
            # a custom binary hex format to represent keys
            # among other things, this results in hex strings with an uneven
            # number of characters
            # see tests/wycheproof/java/com/google/security/wycheproof/JsonUtil.java
            # specifically the asBigInteger() function for more information
            if isolated_substring[0] >= "0" and isolated_substring[0] <= "7":
                isolated_substring = "0" + isolated_substring
            else:
                isolated_substring = "f" + isolated_substring
            counter_wycheproof_hex_reconstruction += 1
        converted_result = ""
        try:
            # test error-free conversion to binary
            binascii.unhexlify(isolated_substring)
            hex_with_c_style_formatting = ""
            pos = 0
            while pos < len(isolated_substring) - 1:
                hex_with_c_style_formatting += "\\x" + isolated_substring[pos : pos + 2]
                pos += 2
            converted_result = '"%s"' % hex_with_c_style_formatting
        # TODO binascii.Incomplete exception also relevant?
        except binascii.Error:
            # default to the original input
            return data
        counter_hex_content += 1
        return converted_result
    return data
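# Illustrative behavior of detect_and_convert_hex:
#   '"4a1e"'  -> '"\\x4a\\x1e"'  (hex content rewritten as \xNN escapes)
#   '"00f"'   -> '"\\x00\\x0f"'  (odd length, leading "0" reconstructed)
#   '"hello"' -> '"hello"'       (not pure hex, passed through unchanged)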
def search_files_recursively(directory, filetype_glob):
    """returns glob search results"""
    target_files = []
    print_verbose("searching in %s" % directory)
    for filetype in filetype_glob:
        print_verbose("searching for %s" % filetype)
        target_files.extend(glob.glob(f"{directory}/**/{filetype}", recursive=True))
    return target_files


def print_verbose(text):
    """print wrapper"""
    if verbose:
        print(text)
def recursive_dictionary_extraction(directory):
    """handle the central extraction logic"""
    # TODO split this function up into subfunctions
    global counter_hex_content
    # handle as a set structure to de-duplicate results automatically
    candidate_lines = set()

    target_files = search_files_recursively(directory, targeted_filetypes_singleline)
    for filepath in target_files:
        per_file_result_counter = 0
        with open(filepath) as _file:
            print_verbose("processing %s" % filepath)
            for _, line in enumerate(_file.readlines()):
                if ignore_single_line_json(line):
                    continue
                results = regex_oneline_string.findall(line)
                for result in results:
                    candidate_lines.add(result)
                    per_file_result_counter += 1
        if per_file_result_counter > 0:
            print_verbose("results: %d" % per_file_result_counter)
    print_verbose("number of candidate entries: %d" % len(candidate_lines))

    target_files = search_files_recursively(directory, targeted_filetypes_multiline)
    for filepath in target_files:
        per_file_result_counter = 0
        with open(filepath) as _file:
            last_line_was_multiline_string = False
            print_verbose("processing %s for single-line strings" % filepath)
            for _, line in enumerate(_file.readlines()):
                if ignore_single_line_java(line):
                    last_line_was_multiline_string = False
                    continue
                if ignore_single_line_c(line):
                    last_line_was_multiline_string = False
                    continue
                if regex_oneline_string_java_ignore1.search(line):
                    last_line_was_multiline_string = True
                    if regex_oneline_string.search(line):
                        # the Java multiline string apparently ends on this line
                        last_line_was_multiline_string = False
                    continue
                result_general_string = regex_string_general.search(line)
                if result_general_string:
                    # at least one general string is matched, see if it is
                    # a single-line string
                    results = regex_oneline_string.findall(line)
                    for result in results:
                        if not last_line_was_multiline_string:
                            candidate_lines.add(result)
                            per_file_result_counter += 1
                        last_line_was_multiline_string = False
                    if len(results) == 0:
                        last_line_was_multiline_string = True
                else:
                    last_line_was_multiline_string = False

                # TODO split this into a separate loop?
                results = regex_hex_constant_singleline.findall(line)
                for result in results:
                    # remove the "0x" prefix, add quotes
                    candidate_lines.add('"%s"' % result[2:])
                    per_file_result_counter += 1
        if per_file_result_counter > 0:
            print_verbose("results: %d" % per_file_result_counter)

    target_files = search_files_recursively(
        directory, targeted_filetypes_multiline_classA
    )
    for filepath in target_files:
        with open(filepath) as _file:
            print_verbose("processing %s for C-style multi-line strings" % filepath)
            filecontent = _file.read()
            multiline_results = regex_c_style_multiline.findall(filecontent)
            if len(multiline_results) > 0:
                print_verbose("results: %d" % len(multiline_results))
            for result in multiline_results:
                cleanup = regex_c_intermediary_content.sub("", result)
                candidate_lines.add(cleanup)

    target_files = search_files_recursively(
        directory, targeted_filetypes_multiline_classB
    )
    for filepath in target_files:
        with open(filepath) as _file:
            print_verbose("processing %s for Java-style multi-line strings" % filepath)
            filecontent = _file.read()
            multiline_results = regex_java_style_multiline.findall(filecontent)
            if len(multiline_results) > 0:
                print_verbose("results: %d" % len(multiline_results))
            for result in multiline_results:
                cleanup = regex_java_intermediary_content.sub("", result)
                candidate_lines.add(cleanup)

    return candidate_lines
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("dictionary_output_file", help="output file", type=str)
    parser.add_argument("--verbose", action="store_true", help="verbose stdout output")
    args = parser.parse_args()
    verbose = args.verbose

    collected_candidate_lines = recursive_dictionary_extraction(TARGET_DIR)
    sorted_candidate_lines = sorted(collected_candidate_lines)
    result_lines = []
    for candidate_line in sorted_candidate_lines:
        if ignore_general(candidate_line):
            continue
        result_lines.append(
            encode_strings_for_dictionary(detect_and_convert_hex(candidate_line))
        )

    print_verbose("counter_hex_content: %d" % counter_hex_content)
    print_verbose(
        "counter_wycheproof_hex_reconstruction: %d"
        % counter_wycheproof_hex_reconstruction
    )
    print_verbose("overall deduplicated entries: %d" % len(sorted_candidate_lines))

    with open(args.dictionary_output_file, "w") as _file:
        for result_line in result_lines:
            _file.write("%s\n" % result_line)