unicode.c 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211
  1. /*
  2. * This file is part of the MicroPython project, http://micropython.org/
  3. *
  4. * The MIT License (MIT)
  5. *
  6. * Copyright (c) 2013, 2014 Damien P. George
  7. *
  8. * Permission is hereby granted, free of charge, to any person obtaining a copy
  9. * of this software and associated documentation files (the "Software"), to deal
  10. * in the Software without restriction, including without limitation the rights
  11. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12. * copies of the Software, and to permit persons to whom the Software is
  13. * furnished to do so, subject to the following conditions:
  14. *
  15. * The above copyright notice and this permission notice shall be included in
  16. * all copies or substantial portions of the Software.
  17. *
  18. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  24. * THE SOFTWARE.
  25. */
  26. #include <stdint.h>
  27. #include "py/unicode.h"
  // Attribute flags: each character class occupies its own bit so that a
  // table entry can carry several classes at once.
  #define FL_PRINT (1 << 0)
  #define FL_SPACE (1 << 1)
  #define FL_DIGIT (1 << 2)
  #define FL_ALPHA (1 << 3)
  #define FL_UPPER (1 << 4)
  #define FL_LOWER (1 << 5)
  #define FL_XDIGIT (1 << 6)

  // Shorthand flag combinations used as entries of the attribute table.
  #define AT_PR (FL_PRINT)                                    // printable only
  #define AT_SP (FL_PRINT | FL_SPACE)                         // whitespace
  #define AT_DI (FL_PRINT | FL_DIGIT | FL_XDIGIT)             // decimal digit
  #define AT_AL (FL_PRINT | FL_ALPHA)                         // caseless letter
  #define AT_UP (FL_PRINT | FL_ALPHA | FL_UPPER)              // upper-case letter
  #define AT_LO (FL_PRINT | FL_ALPHA | FL_LOWER)              // lower-case letter
  #define AT_UX (FL_PRINT | FL_ALPHA | FL_UPPER | FL_XDIGIT)  // upper-case hex letter
  #define AT_LX (FL_PRINT | FL_ALPHA | FL_LOWER | FL_XDIGIT)  // lower-case hex letter
  45. // table of attributes for ascii characters
  46. STATIC const uint8_t attr[] = {
  47. 0, 0, 0, 0, 0, 0, 0, 0,
  48. 0, AT_SP, AT_SP, AT_SP, AT_SP, AT_SP, 0, 0,
  49. 0, 0, 0, 0, 0, 0, 0, 0,
  50. 0, 0, 0, 0, 0, 0, 0, 0,
  51. AT_SP, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR,
  52. AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR,
  53. AT_DI, AT_DI, AT_DI, AT_DI, AT_DI, AT_DI, AT_DI, AT_DI,
  54. AT_DI, AT_DI, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR,
  55. AT_PR, AT_UX, AT_UX, AT_UX, AT_UX, AT_UX, AT_UX, AT_UP,
  56. AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP,
  57. AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP,
  58. AT_UP, AT_UP, AT_UP, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR,
  59. AT_PR, AT_LX, AT_LX, AT_LX, AT_LX, AT_LX, AT_LX, AT_LO,
  60. AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO,
  61. AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO,
  62. AT_LO, AT_LO, AT_LO, AT_PR, AT_PR, AT_PR, AT_PR, 0
  63. };
  64. #if MICROPY_PY_BUILTINS_STR_UNICODE
  65. unichar utf8_get_char(const byte *s) {
  66. unichar ord = *s++;
  67. if (!UTF8_IS_NONASCII(ord)) {
  68. return ord;
  69. }
  70. ord &= 0x7F;
  71. for (unichar mask = 0x40; ord & mask; mask >>= 1) {
  72. ord &= ~mask;
  73. }
  74. while (UTF8_IS_CONT(*s)) {
  75. ord = (ord << 6) | (*s++ & 0x3F);
  76. }
  77. return ord;
  78. }
  79. const byte *utf8_next_char(const byte *s) {
  80. ++s;
  81. while (UTF8_IS_CONT(*s)) {
  82. ++s;
  83. }
  84. return s;
  85. }
  86. mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr) {
  87. mp_uint_t i = 0;
  88. while (ptr > s) {
  89. if (!UTF8_IS_CONT(*--ptr)) {
  90. i++;
  91. }
  92. }
  93. return i;
  94. }
  95. size_t utf8_charlen(const byte *str, size_t len) {
  96. size_t charlen = 0;
  97. for (const byte *top = str + len; str < top; ++str) {
  98. if (!UTF8_IS_CONT(*str)) {
  99. ++charlen;
  100. }
  101. }
  102. return charlen;
  103. }
  104. #endif
  105. // Be aware: These unichar_is* functions are actually ASCII-only!
  106. bool unichar_isspace(unichar c) {
  107. return c < 128 && (attr[c] & FL_SPACE) != 0;
  108. }
  109. bool unichar_isalpha(unichar c) {
  110. return c < 128 && (attr[c] & FL_ALPHA) != 0;
  111. }
  112. /* unused
  113. bool unichar_isprint(unichar c) {
  114. return c < 128 && (attr[c] & FL_PRINT) != 0;
  115. }
  116. */
  117. bool unichar_isdigit(unichar c) {
  118. return c < 128 && (attr[c] & FL_DIGIT) != 0;
  119. }
  120. bool unichar_isxdigit(unichar c) {
  121. return c < 128 && (attr[c] & FL_XDIGIT) != 0;
  122. }
  123. bool unichar_isident(unichar c) {
  124. return c < 128 && ((attr[c] & (FL_ALPHA | FL_DIGIT)) != 0 || c == '_');
  125. }
  126. bool unichar_isalnum(unichar c) {
  127. return c < 128 && ((attr[c] & (FL_ALPHA | FL_DIGIT)) != 0);
  128. }
  129. bool unichar_isupper(unichar c) {
  130. return c < 128 && (attr[c] & FL_UPPER) != 0;
  131. }
  132. bool unichar_islower(unichar c) {
  133. return c < 128 && (attr[c] & FL_LOWER) != 0;
  134. }
  135. unichar unichar_tolower(unichar c) {
  136. if (unichar_isupper(c)) {
  137. return c + 0x20;
  138. }
  139. return c;
  140. }
  141. unichar unichar_toupper(unichar c) {
  142. if (unichar_islower(c)) {
  143. return c - 0x20;
  144. }
  145. return c;
  146. }
  147. mp_uint_t unichar_xdigit_value(unichar c) {
  148. // c is assumed to be hex digit
  149. mp_uint_t n = c - '0';
  150. if (n > 9) {
  151. n &= ~('a' - 'A');
  152. n -= ('A' - ('9' + 1));
  153. }
  154. return n;
  155. }
  156. #if MICROPY_PY_BUILTINS_STR_UNICODE
  157. bool utf8_check(const byte *p, size_t len) {
  158. uint8_t need = 0;
  159. const byte *end = p + len;
  160. for (; p < end; p++) {
  161. byte c = *p;
  162. if (need) {
  163. if (UTF8_IS_CONT(c)) {
  164. need--;
  165. } else {
  166. // mismatch
  167. return 0;
  168. }
  169. } else {
  170. if (c >= 0xc0) {
  171. if (c >= 0xf8) {
  172. // mismatch
  173. return 0;
  174. }
  175. need = (0xe5 >> ((c >> 3) & 0x6)) & 3;
  176. } else if (c >= 0x80) {
  177. // mismatch
  178. return 0;
  179. }
  180. }
  181. }
  182. return need == 0; // no pending fragments allowed
  183. }
  184. #endif