xer_support.c 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227
  1. /*
  2. * Copyright (c) 2003, 2004 X/IO Labs, xiolabs.com.
  3. * Copyright (c) 2003, 2004, 2005 Lev Walkin <vlm@lionet.info>.
  4. * All rights reserved.
  5. * Redistribution and modifications are permitted subject to BSD license.
  6. */
  7. #include <asn_system.h>
  8. #include <xer_support.h>
  9. /* Parser states */
  10. typedef enum {
  11. ST_TEXT,
  12. ST_TAG_START,
  13. ST_TAG_BODY,
  14. ST_TAG_QUOTE_WAIT,
  15. ST_TAG_QUOTED_STRING,
  16. ST_TAG_UNQUOTED_STRING,
  17. ST_COMMENT_WAIT_DASH1, /* "<!--"[1] */
  18. ST_COMMENT_WAIT_DASH2, /* "<!--"[2] */
  19. ST_COMMENT,
  20. ST_COMMENT_CLO_DASH2, /* "-->"[0] */
  21. ST_COMMENT_CLO_RT /* "-->"[1] */
  22. } pstate_e;
  23. static const int
  24. _charclass[256] = {
  25. 0,0,0,0,0,0,0,0, 0,1,1,0,1,1,0,0,
  26. 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  27. 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  28. 2,2,2,2,2,2,2,2, 2,2,0,0,0,0,0,0, /* 01234567 89 */
  29. 0,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, /* ABCDEFG HIJKLMNO */
  30. 3,3,3,3,3,3,3,3, 3,3,3,0,0,0,0,0, /* PQRSTUVW XYZ */
  31. 0,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, /* abcdefg hijklmno */
  32. 3,3,3,3,3,3,3,3, 3,3,3,0,0,0,0,0 /* pqrstuvw xyz */
  33. };
  34. #define WHITESPACE(c) (_charclass[(unsigned char)(c)] == 1)
  35. #define ALNUM(c) (_charclass[(unsigned char)(c)] >= 2)
  36. #define ALPHA(c) (_charclass[(unsigned char)(c)] == 3)
  37. /* Aliases for characters, ASCII/UTF-8 */
  38. #define EXCLAM 0x21 /* '!' */
  39. #define CQUOTE 0x22 /* '"' */
  40. #define CDASH 0x2d /* '-' */
  41. #define CSLASH 0x2f /* '/' */
  42. #define LANGLE 0x3c /* '<' */
  43. #define CEQUAL 0x3d /* '=' */
  44. #define RANGLE 0x3e /* '>' */
  45. #define CQUEST 0x3f /* '?' */
  46. /* Invoke token callback */
  47. #define TOKEN_CB_CALL(type, _ns, _current_too, _final) do { \
  48. int _ret; \
  49. pstate_e ns = _ns; \
  50. ssize_t _sz = (p - chunk_start) + _current_too; \
  51. if (!_sz) { \
  52. /* Shortcut */ \
  53. state = _ns; \
  54. break; \
  55. } \
  56. _ret = cb(type, chunk_start, _sz, key); \
  57. if(_ret < _sz) { \
  58. if(_current_too && _ret == -1) \
  59. state = ns; \
  60. goto finish; \
  61. } \
  62. chunk_start = p + _current_too; \
  63. state = ns; \
  64. } while(0)
  65. #define TOKEN_CB(_type, _ns, _current_too) \
  66. TOKEN_CB_CALL(_type, _ns, _current_too, 0)
  67. #define PXML_TAG_FINAL_CHUNK_TYPE PXML_TAG_END
  68. #define PXML_COMMENT_FINAL_CHUNK_TYPE PXML_COMMENT_END
  69. #define TOKEN_CB_FINAL(_type, _ns, _current_too) \
  70. TOKEN_CB_CALL( _type ## _FINAL_CHUNK_TYPE , _ns, _current_too, 1)
  71. /*
  72. * Parser itself
  73. */
  74. ssize_t pxml_parse(int *stateContext, const void *xmlbuf, size_t size, pxml_callback_f *cb, void *key) {
  75. pstate_e state = (pstate_e)*stateContext;
  76. const char *chunk_start = (const char *)xmlbuf;
  77. const char *p = chunk_start;
  78. const char *end = p + size;
  79. for(; p < end; p++) {
  80. int C = *(const unsigned char *)p;
  81. switch(state) {
  82. case ST_TEXT:
  83. /*
  84. * Initial state: we're in the middle of some text,
  85. * or just have started.
  86. */
  87. if (C == LANGLE)
  88. /* We're now in the tag, probably */
  89. TOKEN_CB(PXML_TEXT, ST_TAG_START, 0);
  90. break;
  91. case ST_TAG_START:
  92. if (ALPHA(C) || (C == CSLASH))
  93. state = ST_TAG_BODY;
  94. else if (C == EXCLAM)
  95. state = ST_COMMENT_WAIT_DASH1;
  96. else
  97. /*
  98. * Not characters and not whitespace.
  99. * Must be something like "3 < 4".
  100. */
  101. TOKEN_CB(PXML_TEXT, ST_TEXT, 1);/* Flush as data */
  102. break;
  103. case ST_TAG_BODY:
  104. switch(C) {
  105. case RANGLE:
  106. /* End of the tag */
  107. TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
  108. break;
  109. case LANGLE:
  110. /*
  111. * The previous tag wasn't completed, but still
  112. * recognized as valid. (Mozilla-compatible)
  113. */
  114. TOKEN_CB_FINAL(PXML_TAG, ST_TAG_START, 0);
  115. break;
  116. case CEQUAL:
  117. state = ST_TAG_QUOTE_WAIT;
  118. break;
  119. }
  120. break;
  121. case ST_TAG_QUOTE_WAIT:
  122. /*
  123. * State after the equal sign ("=") in the tag.
  124. */
  125. switch(C) {
  126. case CQUOTE:
  127. state = ST_TAG_QUOTED_STRING;
  128. break;
  129. case RANGLE:
  130. /* End of the tag */
  131. TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
  132. break;
  133. default:
  134. if(!WHITESPACE(C))
  135. /* Unquoted string value */
  136. state = ST_TAG_UNQUOTED_STRING;
  137. }
  138. break;
  139. case ST_TAG_QUOTED_STRING:
  140. /*
  141. * Tag attribute's string value in quotes.
  142. */
  143. if(C == CQUOTE) {
  144. /* Return back to the tag state */
  145. state = ST_TAG_BODY;
  146. }
  147. break;
  148. case ST_TAG_UNQUOTED_STRING:
  149. if(C == RANGLE) {
  150. /* End of the tag */
  151. TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
  152. } else if(WHITESPACE(C)) {
  153. /* Return back to the tag state */
  154. state = ST_TAG_BODY;
  155. }
  156. break;
  157. case ST_COMMENT_WAIT_DASH1:
  158. if(C == CDASH) {
  159. state = ST_COMMENT_WAIT_DASH2;
  160. } else {
  161. /* Some ordinary tag. */
  162. state = ST_TAG_BODY;
  163. }
  164. break;
  165. case ST_COMMENT_WAIT_DASH2:
  166. if(C == CDASH) {
  167. /* Seen "<--" */
  168. state = ST_COMMENT;
  169. } else {
  170. /* Some ordinary tag */
  171. state = ST_TAG_BODY;
  172. }
  173. break;
  174. case ST_COMMENT:
  175. if(C == CDASH) {
  176. state = ST_COMMENT_CLO_DASH2;
  177. }
  178. break;
  179. case ST_COMMENT_CLO_DASH2:
  180. if(C == CDASH) {
  181. state = ST_COMMENT_CLO_RT;
  182. } else {
  183. /* This is not an end of a comment */
  184. state = ST_COMMENT;
  185. }
  186. break;
  187. case ST_COMMENT_CLO_RT:
  188. if(C == RANGLE) {
  189. TOKEN_CB_FINAL(PXML_COMMENT, ST_TEXT, 1);
  190. } else if(C == CDASH) {
  191. /* Maintain current state, still waiting for '>' */
  192. } else {
  193. state = ST_COMMENT;
  194. }
  195. break;
  196. } /* switch(*ptr) */
  197. } /* for() */
  198. /*
  199. * Flush the partially processed chunk, state permitting.
  200. */
  201. if(p - chunk_start) {
  202. switch (state) {
  203. case ST_COMMENT:
  204. TOKEN_CB(PXML_COMMENT, state, 0);
  205. break;
  206. case ST_TEXT:
  207. TOKEN_CB(PXML_TEXT, state, 0);
  208. break;
  209. default: break; /* a no-op */
  210. }
  211. }
  212. finish:
  213. *stateContext = (int)state;
  214. return chunk_start - (const char *)xmlbuf;
  215. }