x2d.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586
  1. #!/usr/bin/env python
  2. "Makes working with XML feel like you are working with JSON"
  3. try:
  4. from defusedexpat import pyexpat as expat
  5. except ImportError:
  6. from xml.parsers import expat
  7. from xml.sax.saxutils import XMLGenerator
  8. from xml.sax.xmlreader import AttributesImpl
  9. try: # pragma no cover
  10. from cStringIO import StringIO
  11. except ImportError: # pragma no cover
  12. try:
  13. from StringIO import StringIO
  14. except ImportError:
  15. from io import StringIO
  16. from inspect import isgenerator
  class ObjectDict(dict):
      """A ``dict`` whose keys can also be read as attributes.

      ``obj.key`` returns ``obj["key"]`` when ``key`` is present in the
      mapping; otherwise ``AttributeError`` is raised.
      """

      def __getattr__(self, name):
          # Python calls __getattr__ only after regular attribute lookup fails,
          # so real dict attributes (``items``, ``keys``, ...) are never shadowed.
          if name not in self:
              raise AttributeError("No such attribute: " + name)
          return self[name]
  23. try: # pragma no cover
  24. _basestring = basestring
  25. except NameError: # pragma no cover
  26. _basestring = str
  27. try: # pragma no cover
  28. _unicode = unicode
  29. except NameError: # pragma no cover
  30. _unicode = str
  31. __author__ = "Martin Blech"
  32. __version__ = "0.12.0"
  33. __license__ = "MIT"
  class ParsingInterrupted(Exception):
      """Raised to stop parsing when the item callback returns a false value."""
  class _DictSAXHandler(object):
      """Expat callback handler that assembles XML events into nested mappings.

      The bound methods are installed as expat handlers (start/end element,
      character data, comments, namespace declarations).  While parsing, the
      handler keeps:

      * ``path``  -- ``(name, attrs or None)`` pairs from the root to the
        element currently open;
      * ``stack`` -- saved ``(item, data)`` pairs of the enclosing elements;
      * ``item``  -- the mapping (or ``None``) built so far for the current
        element;
      * ``data``  -- the character-data chunks seen for the current element.

      Constructor arguments:

      * ``item_depth`` / ``item_callback`` -- when an element at depth
        ``item_depth`` closes, ``item_callback(path, item)`` is called; a
        false return value raises :class:`ParsingInterrupted`.
      * ``xml_attribs`` -- include attributes (keys prefixed with
        ``attr_prefix``) in the result when true, drop them otherwise.
      * ``cdata_key``, ``force_cdata``, ``cdata_separator`` -- key used for
        character data, whether text-only elements are still wrapped in a
        mapping, and the string used to join text chunks.
      * ``postprocessor`` -- optional ``f(path, key, value)`` returning a new
        ``(key, value)`` pair, or a false value / ``None`` to drop the entry.
      * ``dict_constructor`` -- factory for every mapping the handler creates,
        including the pending namespace declarations.
      * ``strip_whitespace`` -- strip text and comment data.
      * ``namespace_separator`` / ``namespaces`` -- separator between namespace
        and local name, and an optional namespace -> short-prefix mapping
        applied to element and attribute names.
      * ``force_list`` -- ``True``, a container of keys, or a callable
        ``f(path, key, value)`` deciding whether a first value is stored as a
        one-element list.
      * ``comment_key`` -- key under which comments are stored.
      """

      def __init__(
          self,
          item_depth=0,
          item_callback=lambda *args: True,
          xml_attribs=True,
          attr_prefix="@",
          cdata_key="#text",
          force_cdata=False,
          cdata_separator="",
          postprocessor=None,
          dict_constructor=ObjectDict,
          strip_whitespace=True,
          namespace_separator=":",
          namespaces=None,
          force_list=None,
          comment_key="#comment",
      ):
          self.path = []
          self.stack = []
          self.data = []
          self.item = None
          self.item_depth = item_depth
          self.xml_attribs = xml_attribs
          self.item_callback = item_callback
          self.attr_prefix = attr_prefix
          self.cdata_key = cdata_key
          self.force_cdata = force_cdata
          self.cdata_separator = cdata_separator
          self.postprocessor = postprocessor
          self.dict_constructor = dict_constructor
          self.strip_whitespace = strip_whitespace
          self.namespace_separator = namespace_separator
          self.namespaces = namespaces
          # Built with the caller's factory so that the ``xmlns`` mapping has
          # the same type as every other mapping in the result.
          self.namespace_declarations = dict_constructor()
          self.force_list = force_list
          self.comment_key = comment_key

      def _build_name(self, full_name):
          """Return ``full_name`` with its namespace replaced by the short prefix.

          Names are returned unchanged when no ``namespaces`` mapping was given
          or when the name contains no separator.  A namespace missing from the
          mapping is kept as is; one mapped to an empty value is dropped.
          """
          if self.namespaces is None:
              return full_name
          i = full_name.rfind(self.namespace_separator)
          if i == -1:
              return full_name
          namespace, name = full_name[:i], full_name[i + 1 :]
          try:
              short_namespace = self.namespaces[namespace]
          except KeyError:
              short_namespace = namespace
          if not short_namespace:
              return name
          return self.namespace_separator.join((short_namespace, name))

      def _attrs_to_dict(self, attrs):
          """Convert a flat ``[name, value, name, value, ...]`` list to a mapping.

          A ``dict`` argument is returned as is.
          """
          if isinstance(attrs, dict):
              return attrs
          return self.dict_constructor(zip(attrs[0::2], attrs[1::2]))

      def startNamespaceDecl(self, prefix, uri):
          """Record a namespace declaration; a missing prefix is stored as ``""``."""
          self.namespace_declarations[prefix or ""] = uri

      def startElement(self, full_name, attrs):
          """Open an element: extend ``path`` and start collecting its content."""
          name = self._build_name(full_name)
          attrs = self._attrs_to_dict(attrs)
          # NOTE: pending declarations are attached only to an element that has
          # at least one attribute; otherwise they stay pending for a later one.
          if attrs and self.namespace_declarations:
              attrs["xmlns"] = self.namespace_declarations
              self.namespace_declarations = self.dict_constructor()
          self.path.append((name, attrs or None))
          if len(self.path) > self.item_depth:
              # Save the parent's state; it is restored in endElement().
              self.stack.append((self.item, self.data))
              if self.xml_attribs:
                  attr_entries = []
                  for key, value in attrs.items():
                      key = self.attr_prefix + self._build_name(key)
                      if self.postprocessor:
                          entry = self.postprocessor(self.path, key, value)
                      else:
                          entry = (key, value)
                      # A false entry means the postprocessor dropped it.
                      if entry:
                          attr_entries.append(entry)
                  attrs = self.dict_constructor(attr_entries)
              else:
                  attrs = None
              self.item = attrs or None
              self.data = []

      def endElement(self, full_name):
          """Close an element: fire the callback or merge it into its parent.

          Raises :class:`ParsingInterrupted` when the element is at
          ``item_depth`` and ``item_callback`` returns a false value.
          """
          name = self._build_name(full_name)
          if len(self.path) == self.item_depth:
              item = self.item
              if item is None:
                  item = None if not self.data else self.cdata_separator.join(self.data)
              should_continue = self.item_callback(self.path, item)
              if not should_continue:
                  raise ParsingInterrupted()
          if self.stack:
              data = None if not self.data else self.cdata_separator.join(self.data)
              item = self.item
              # Switch back to the parent's state before attaching this element.
              self.item, self.data = self.stack.pop()
              if self.strip_whitespace and data:
                  data = data.strip() or None
              if data and self.force_cdata and item is None:
                  item = self.dict_constructor()
              if item is not None:
                  if data:
                      self.push_data(item, self.cdata_key, data)
                  self.item = self.push_data(self.item, name, item)
              else:
                  self.item = self.push_data(self.item, name, data)
          else:
              self.item = None
              self.data = []
          self.path.pop()

      def characters(self, data):
          """Collect a chunk of character data for the current element."""
          if not self.data:
              self.data = [data]
          else:
              self.data.append(data)

      def comments(self, data):
          """Store a comment on the current item under ``comment_key``."""
          if self.strip_whitespace:
              data = data.strip()
          self.item = self.push_data(self.item, self.comment_key, data)

      def push_data(self, item, key, data):
          """Add ``data`` under ``key`` to ``item`` and return the mapping.

          ``item`` is created with ``dict_constructor`` when it is ``None``.
          A repeated key is turned into (or appended to) a list.  When a
          postprocessor is set it may rewrite ``(key, data)`` or return
          ``None`` to skip the entry, in which case ``item`` is returned as is.
          """
          if self.postprocessor is not None:
              result = self.postprocessor(self.path, key, data)
              if result is None:
                  return item
              key, data = result
          if item is None:
              item = self.dict_constructor()
          try:
              value = item[key]
              if isinstance(value, list):
                  value.append(data)
              else:
                  item[key] = [value, data]
          except KeyError:
              if self._should_force_list(key, data):
                  item[key] = [data]
              else:
                  item[key] = data
          return item

      def _should_force_list(self, key, value):
          """Tell whether the first value stored under ``key`` must be a list."""
          if not self.force_list:
              return False
          if isinstance(self.force_list, bool):
              return self.force_list
          try:
              return key in self.force_list
          except TypeError:
              # Not a container: treat force_list as a callable predicate.
              return self.force_list(self.path[:-1], key, value)
  def parse(
      xml_input,
      encoding=None,
      expat=expat,
      process_namespaces=False,
      namespace_separator=":",
      disable_entities=True,
      process_comments=False,
      **kwargs
  ):
      """Parse the given XML input and convert it into a dictionary.

      `xml_input` can either be a `string`, a file-like object, or a generator
      of strings.

      `encoding` is passed to the expat parser; text input is encoded with it
      before parsing (`utf-8` when not given).

      If `process_namespaces` is `True`, the parser is created with
      `namespace_separator`, so names are reported with their namespace;
      otherwise names are reported as written in the document.

      If `disable_entities` is `True` (the default), entities are not
      expanded.

      If `xml_attribs` is `True`, element attributes are put in the dictionary
      among regular child elements, using `@` as a prefix to avoid collisions.
      If set to `False`, they are just ignored.

      Any other keyword argument is forwarded to the handler that builds the
      result (`item_depth`, `item_callback`, `attr_prefix`, `cdata_key`,
      `force_cdata`, `cdata_separator`, `postprocessor`, `dict_constructor`,
      `strip_whitespace`, `namespaces`, `force_list`, `comment_key`).

      Simple example::

          >>> import xmltodict
          >>> doc = xmltodict.parse(\"\"\"
          ... <a prop="x">
          ...   <b>1</b>
          ...   <b>2</b>
          ... </a>
          ... \"\"\")
          >>> doc['a']['@prop']
          u'x'
          >>> doc['a']['b']
          [u'1', u'2']

      If `item_depth` is `0`, the function returns a dictionary for the root
      element (default behavior). Otherwise, it calls `item_callback` every
      time an item at the specified depth is found and returns `None` in the
      end (streaming mode).

      The callback function receives two parameters: the `path` from the
      document root to the item (name-attribs pairs), and the `item` (dict).
      If the callback's return value is false-ish, parsing will be stopped
      with the :class:`ParsingInterrupted` exception.

      Streaming example::

          >>> def handle(path, item):
          ...     print('path:%s item:%s' % (path, item))
          ...     return True
          ...
          >>> xmltodict.parse(\"\"\"
          ... <a prop="x">
          ...   <b>1</b>
          ...   <b>2</b>
          ... </a>\"\"\", item_depth=2, item_callback=handle)
          path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:1
          path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:2

      The optional argument `postprocessor` is a function that takes `path`,
      `key` and `value` as positional arguments and returns a new
      `(key, value)` pair where both `key` and `value` may have changed.
      Usage example::

          >>> def postprocessor(path, key, value):
          ...     try:
          ...         return key + ':int', int(value)
          ...     except (ValueError, TypeError):
          ...         return key, value
          >>> xmltodict.parse('<a><b>1</b><b>2</b><b>x</b></a>',
          ...                 postprocessor=postprocessor)
          ObjectDict([(u'a', ObjectDict([(u'b:int', [1, 2]), (u'b', u'x')]))])

      You can pass an alternate version of `expat` (such as `defusedexpat`) by
      using the `expat` parameter. E.g:

          >>> import defusedexpat
          >>> xmltodict.parse('<a>hello</a>', expat=defusedexpat.pyexpat)
          ObjectDict([(u'a', u'hello')])

      You can use the force_list argument to force lists to be created even
      when there is only a single child of a given level of hierarchy. The
      force_list argument is a tuple of keys. If the key for a given level
      of hierarchy is in the force_list argument, that level of hierarchy
      will have a list as a child (even if there is only one sub-element).
      This is applied after any user-supplied postprocessor has already run.

      For example, given this input:

          <servers>
            <server>
              <name>host1</name>
              <os>Linux</os>
              <interfaces>
                <interface>
                  <name>em0</name>
                  <ip_address>10.0.0.1</ip_address>
                </interface>
              </interfaces>
            </server>
          </servers>

      If called with force_list=('interface',), it will produce
      this dictionary:

          {'servers':
            {'server':
              {'name': 'host1',
               'os': 'Linux',
               'interfaces':
                {'interface':
                  [ {'name': 'em0', 'ip_address': '10.0.0.1' } ] } } } }

      `force_list` can also be a callable that receives `path`, `key` and
      `value`. This is helpful in cases where the logic that decides whether
      a list should be forced is more complex.

      If `process_comments` is `True` then comments will be added with
      `comment_key` (default=`'#comment'`) to the tag which contains the
      comment.

      For example, given this input:

          <a>
            <b>
              <!-- b comment -->
              <c>
                <!-- c comment -->
                1
              </c>
              <d>2</d>
            </b>
          </a>

      If called with process_comments=True, it will produce
      this dictionary:

          'a': {
              'b': {
                  '#comment': 'b comment',
                  'c': {
                      '#comment': 'c comment',
                      '#text': '1',
                  },
                  'd': '2',
              },
          }
      """
      handler = _DictSAXHandler(namespace_separator=namespace_separator, **kwargs)
      if isinstance(xml_input, _unicode):
          # expat is fed bytes; encode text input first.
          if not encoding:
              encoding = "utf-8"
          xml_input = xml_input.encode(encoding)
      if not process_namespaces:
          # No separator means expat does no namespace processing.
          namespace_separator = None
      parser = expat.ParserCreate(encoding, namespace_separator)
      try:
          parser.ordered_attributes = True
      except AttributeError:
          # Jython's expat does not support ordered_attributes
          pass
      parser.StartNamespaceDeclHandler = handler.startNamespaceDecl
      parser.StartElementHandler = handler.startElement
      parser.EndElementHandler = handler.endElement
      parser.CharacterDataHandler = handler.characters
      if process_comments:
          parser.CommentHandler = handler.comments
      parser.buffer_text = True
      if disable_entities:
          try:
              # Attempt to disable DTD in Jython's expat parser (Xerces-J).
              feature = "http://apache.org/xml/features/disallow-doctype-decl"
              parser._reader.setFeature(feature, True)
          except AttributeError:
              # For CPython / expat parser.
              # Anything not handled ends up here and entities aren't expanded.
              parser.DefaultHandler = lambda x: None
              # Expects an integer return; zero means failure -> expat.ExpatError.
              parser.ExternalEntityRefHandler = lambda *x: 1
      if hasattr(xml_input, "read"):
          parser.ParseFile(xml_input)
      elif isgenerator(xml_input):
          for chunk in xml_input:
              parser.Parse(chunk, False)
          # Signal the end of the document after the last chunk.
          parser.Parse(b"", True)
      else:
          parser.Parse(xml_input, True)
      return handler.item
  344. def _process_namespace(name, namespaces, ns_sep=":", attr_prefix="@"):
  345. if not namespaces:
  346. return name
  347. try:
  348. ns, name = name.rsplit(ns_sep, 1)
  349. except ValueError:
  350. pass
  351. else:
  352. ns_res = namespaces.get(ns.strip(attr_prefix))
  353. name = (
  354. "{}{}{}{}".format(
  355. attr_prefix if ns.startswith(attr_prefix) else "", ns_res, ns_sep, name
  356. )
  357. if ns_res
  358. else name
  359. )
  360. return name
  def _emit(
      key,
      value,
      content_handler,
      attr_prefix="@",
      cdata_key="#text",
      depth=0,
      preprocessor=None,
      pretty=False,
      newl="\n",
      indent="\t",
      namespace_separator=":",
      namespaces=None,
      full_document=True,
      expand_iter=None,
  ):
      """Write ``key``/``value`` as XML element(s) to ``content_handler``.

      ``value`` may be a scalar, a mapping, or an iterable of those; each
      entry of an iterable becomes its own ``key`` element.  Inside a
      mapping, keys starting with ``attr_prefix`` become attributes, the
      ``cdata_key`` entry becomes the element text, and every other entry is
      emitted recursively as a child element.

      * ``preprocessor`` -- optional ``f(key, value)`` returning a new
        ``(key, value)`` pair, or ``None`` to skip the element.
      * ``pretty``, ``newl``, ``indent`` -- emit indentation and newlines.
      * ``namespaces``, ``namespace_separator`` -- namespace -> prefix mapping
        applied to element and attribute names.
      * ``expand_iter`` -- when set, a nested non-string iterable is wrapped
        as ``{expand_iter: iterable}`` instead of being converted to text.

      Raises ``ValueError`` if ``full_document`` is true and more than one
      root element would be written at depth 0.
      """
      key = _process_namespace(key, namespaces, namespace_separator, attr_prefix)
      if preprocessor is not None:
          result = preprocessor(key, value)
          if result is None:
              return
          key, value = result
      if (
          not hasattr(value, "__iter__")
          or isinstance(value, _basestring)
          or isinstance(value, dict)
      ):
          value = [value]
      # Key under which parse() stores namespace declarations.
      xmlns_key = attr_prefix + "xmlns"
      for index, v in enumerate(value):
          if full_document and depth == 0 and index > 0:
              raise ValueError("document with multiple roots")
          if v is None:
              v = ObjectDict()
          elif isinstance(v, bool):
              if v:
                  v = _unicode("true")
              else:
                  v = _unicode("false")
          elif not isinstance(v, dict):
              if (
                  expand_iter
                  and hasattr(v, "__iter__")
                  and not isinstance(v, _basestring)
              ):
                  v = ObjectDict(((expand_iter, v),))
              else:
                  v = _unicode(v)
          if isinstance(v, _basestring):
              v = ObjectDict(((cdata_key, v),))
          cdata = None
          attrs = ObjectDict()
          children = []
          for ik, iv in v.items():
              if ik == cdata_key:
                  # XMLGenerator.characters() only accepts text/bytes and
                  # skips false values, so stringify anything else here.
                  if iv is not None and not isinstance(iv, (_basestring, bytes)):
                      iv = _unicode(iv)
                  cdata = iv
                  continue
              if ik.startswith(attr_prefix):
                  ik = _process_namespace(
                      ik, namespaces, namespace_separator, attr_prefix
                  )
                  if ik == xmlns_key and isinstance(iv, dict):
                      for ns_prefix, ns_uri in iv.items():
                          attr = "xmlns{}".format(
                              ":{}".format(ns_prefix) if ns_prefix else ""
                          )
                          attrs[attr] = _unicode(ns_uri)
                      continue
                  if not isinstance(iv, _unicode):
                      iv = _unicode(iv)
                  # Namespace processing may already have removed the prefix;
                  # slice it off only when it is still there.
                  if ik.startswith(attr_prefix):
                      attr_name = ik[len(attr_prefix):]
                  else:
                      attr_name = ik
                  attrs[attr_name] = iv
                  continue
              children.append((ik, iv))
          if pretty:
              content_handler.ignorableWhitespace(depth * indent)
          content_handler.startElement(key, AttributesImpl(attrs))
          if pretty and children:
              content_handler.ignorableWhitespace(newl)
          for child_key, child_value in children:
              _emit(
                  child_key,
                  child_value,
                  content_handler,
                  attr_prefix,
                  cdata_key,
                  depth + 1,
                  preprocessor,
                  pretty,
                  newl,
                  indent,
                  namespaces=namespaces,
                  namespace_separator=namespace_separator,
                  expand_iter=expand_iter,
              )
          if cdata is not None:
              content_handler.characters(cdata)
          if pretty and children:
              content_handler.ignorableWhitespace(depth * indent)
          content_handler.endElement(key)
          if pretty and depth:
              content_handler.ignorableWhitespace(newl)
  def unparse(
      input_dict,
      output=None,
      encoding="utf-8",
      full_document=True,
      short_empty_elements=False,
      **kwargs
  ):
      """Serialize `input_dict` as XML (the inverse of `parse`).

      When `output` (a file-like object) is given, the document is written to
      it and nothing is returned; otherwise the document is returned as a
      string.

      Keys starting with `attr_prefix` (default=`'@'`) become attributes of
      the enclosing element, and the `cdata_key` entry (default=`'#text'`)
      becomes its character data.

      With `full_document` true, `input_dict` must contain exactly one root
      and the XML declaration is written; otherwise `ValueError` is raised
      for any other number of roots.

      Pass `pretty=True` for pretty-printed output; lines then end with
      `'\n'` and are indented with `'\t'` unless `newl` / `indent` say
      otherwise.
      """
      if full_document and len(input_dict) != 1:
          raise ValueError("Document must have exactly one root.")

      # Without a caller-supplied stream, collect the text and return it.
      return_text = output is None
      if return_text:
          output = StringIO()

      # The third positional argument is only passed when requested.
      generator_args = (output, encoding)
      if short_empty_elements:
          generator_args += (True,)
      content_handler = XMLGenerator(*generator_args)

      if full_document:
          content_handler.startDocument()
      for root_key, root_value in input_dict.items():
          _emit(
              root_key,
              root_value,
              content_handler,
              full_document=full_document,
              **kwargs
          )
      if full_document:
          content_handler.endDocument()

      if not return_text:
          return None
      text = output.getvalue()
      try:  # pragma no cover
          text = text.decode(encoding)
      except AttributeError:  # pragma no cover
          # Already text: nothing to decode.
          pass
      return text
  if __name__ == "__main__":  # pragma: no cover
      # Command-line mode: read XML from stdin and marshal every item found at
      # the depth given as the single argument to stdout.
      import marshal
      import sys

      # Use the ``buffer`` attribute of both streams when available,
      # otherwise the streams themselves.
      try:
          stdin = sys.stdin.buffer
          stdout = sys.stdout.buffer
      except AttributeError:
          stdin = sys.stdin
          stdout = sys.stdout

      # Exactly one argument is expected; anything else fails the unpacking.
      (depth_arg,) = sys.argv[1:]
      item_depth = int(depth_arg)

      def handle_item(path, item):
          """Dump one ``(path, item)`` pair and ask the parser to continue."""
          marshal.dump((path, item), stdout)
          return True

      try:
          root = parse(
              stdin,
              item_depth=item_depth,
              item_callback=handle_item,
              dict_constructor=dict,
          )
          # At depth 0 the callback does not emit the root; dump it here.
          if item_depth == 0:
              handle_item([], root)
      except KeyboardInterrupt:
          pass