objstrunicode.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280
  1. /*
  2. * This file is part of the MicroPython project, http://micropython.org/
  3. *
  4. * The MIT License (MIT)
  5. *
  6. * Copyright (c) 2013, 2014 Damien P. George
  7. * Copyright (c) 2014-2016 Paul Sokolovsky
  8. *
  9. * Permission is hereby granted, free of charge, to any person obtaining a copy
  10. * of this software and associated documentation files (the "Software"), to deal
  11. * in the Software without restriction, including without limitation the rights
  12. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  13. * copies of the Software, and to permit persons to whom the Software is
  14. * furnished to do so, subject to the following conditions:
  15. *
  16. * The above copyright notice and this permission notice shall be included in
  17. * all copies or substantial portions of the Software.
  18. *
  19. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  20. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  21. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  22. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  23. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  24. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  25. * THE SOFTWARE.
  26. */
  27. #include <string.h>
  28. #include <assert.h>
  29. #include "py/objstr.h"
  30. #include "py/objlist.h"
  31. #include "py/runtime.h"
  32. #if MICROPY_PY_BUILTINS_STR_UNICODE
  33. static mp_obj_t mp_obj_new_str_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf);
  34. /******************************************************************************/
  35. /* str */
  36. static void uni_print_quoted(const mp_print_t *print, const byte *str_data, uint str_len) {
  37. // this escapes characters, but it will be very slow to print (calling print many times)
  38. bool has_single_quote = false;
  39. bool has_double_quote = false;
  40. for (const byte *s = str_data, *top = str_data + str_len; !has_double_quote && s < top; s++) {
  41. if (*s == '\'') {
  42. has_single_quote = true;
  43. } else if (*s == '"') {
  44. has_double_quote = true;
  45. }
  46. }
  47. unichar quote_char = '\'';
  48. if (has_single_quote && !has_double_quote) {
  49. quote_char = '"';
  50. }
  51. mp_printf(print, "%c", quote_char);
  52. const byte *s = str_data, *top = str_data + str_len;
  53. while (s < top) {
  54. unichar ch;
  55. ch = utf8_get_char(s);
  56. s = utf8_next_char(s);
  57. if (ch == quote_char) {
  58. mp_printf(print, "\\%c", quote_char);
  59. } else if (ch == '\\') {
  60. mp_print_str(print, "\\\\");
  61. } else if (32 <= ch && ch <= 126) {
  62. mp_printf(print, "%c", ch);
  63. } else if (ch == '\n') {
  64. mp_print_str(print, "\\n");
  65. } else if (ch == '\r') {
  66. mp_print_str(print, "\\r");
  67. } else if (ch == '\t') {
  68. mp_print_str(print, "\\t");
  69. } else if (ch < 0x100) {
  70. mp_printf(print, "\\x%02x", ch);
  71. } else if (ch < 0x10000) {
  72. mp_printf(print, "\\u%04x", ch);
  73. } else {
  74. mp_printf(print, "\\U%08x", ch);
  75. }
  76. }
  77. mp_printf(print, "%c", quote_char);
  78. }
  79. static void uni_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t kind) {
  80. GET_STR_DATA_LEN(self_in, str_data, str_len);
  81. #if MICROPY_PY_JSON
  82. if (kind == PRINT_JSON) {
  83. mp_str_print_json(print, str_data, str_len);
  84. return;
  85. }
  86. #endif
  87. if (kind == PRINT_STR) {
  88. print->print_strn(print->data, (const char *)str_data, str_len);
  89. } else {
  90. uni_print_quoted(print, str_data, str_len);
  91. }
  92. }
  93. static mp_obj_t uni_unary_op(mp_unary_op_t op, mp_obj_t self_in) {
  94. GET_STR_DATA_LEN(self_in, str_data, str_len);
  95. switch (op) {
  96. case MP_UNARY_OP_BOOL:
  97. return mp_obj_new_bool(str_len != 0);
  98. case MP_UNARY_OP_LEN:
  99. return MP_OBJ_NEW_SMALL_INT(utf8_charlen(str_data, str_len));
  100. default:
  101. return MP_OBJ_NULL; // op not supported
  102. }
  103. }
  104. // Convert an index into a pointer to its lead byte. Out of bounds indexing will raise IndexError or
  105. // be capped to the first/last character of the string, depending on is_slice.
  106. const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, size_t self_len,
  107. mp_obj_t index, bool is_slice) {
  108. // All str functions also handle bytes objects, and they call str_index_to_ptr(),
  109. // so it must handle bytes.
  110. if (type == &mp_type_bytes
  111. #if MICROPY_PY_BUILTINS_BYTEARRAY
  112. || type == &mp_type_bytearray
  113. #endif
  114. ) {
  115. // Taken from objstr.c:str_index_to_ptr()
  116. size_t index_val = mp_get_index(type, self_len, index, is_slice);
  117. return self_data + index_val;
  118. }
  119. mp_int_t i;
  120. // Copied from mp_get_index; I don't want bounds checking, just give me
  121. // the integer as-is. (I can't bounds-check without scanning the whole
  122. // string; an out-of-bounds index will be caught in the loops below.)
  123. if (mp_obj_is_small_int(index)) {
  124. i = MP_OBJ_SMALL_INT_VALUE(index);
  125. } else if (!mp_obj_get_int_maybe(index, &i)) {
  126. mp_raise_msg_varg(&mp_type_TypeError, MP_ERROR_TEXT("string indices must be integers, not %s"), mp_obj_get_type_str(index));
  127. }
  128. const byte *s, *top = self_data + self_len;
  129. if (i < 0) {
  130. // Negative indexing is performed by counting from the end of the string.
  131. for (s = top - 1; i; --s) {
  132. if (s < self_data) {
  133. if (is_slice) {
  134. return self_data;
  135. }
  136. mp_raise_msg(&mp_type_IndexError, MP_ERROR_TEXT("string index out of range"));
  137. }
  138. if (!UTF8_IS_CONT(*s)) {
  139. ++i;
  140. }
  141. }
  142. ++s;
  143. } else {
  144. // Positive indexing, correspondingly, counts from the start of the string.
  145. // It's assumed that negative indexing will generally be used with small
  146. // absolute values (eg str[-1], not str[-1000000]), which means it'll be
  147. // more efficient this way.
  148. s = self_data;
  149. while (1) {
  150. // First check out-of-bounds
  151. if (s >= top) {
  152. if (is_slice) {
  153. return top;
  154. }
  155. mp_raise_msg(&mp_type_IndexError, MP_ERROR_TEXT("string index out of range"));
  156. }
  157. // Then check completion
  158. if (i-- == 0) {
  159. break;
  160. }
  161. // Then skip UTF-8 char
  162. ++s;
  163. while (UTF8_IS_CONT(*s)) {
  164. ++s;
  165. }
  166. }
  167. }
  168. return s;
  169. }
  170. static mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
  171. const mp_obj_type_t *type = mp_obj_get_type(self_in);
  172. assert(type == &mp_type_str);
  173. GET_STR_DATA_LEN(self_in, self_data, self_len);
  174. if (value == MP_OBJ_SENTINEL) {
  175. // load
  176. #if MICROPY_PY_BUILTINS_SLICE
  177. if (mp_obj_is_type(index, &mp_type_slice)) {
  178. mp_obj_t ostart, ostop, ostep;
  179. mp_obj_slice_t *slice = MP_OBJ_TO_PTR(index);
  180. ostart = slice->start;
  181. ostop = slice->stop;
  182. ostep = slice->step;
  183. if (ostep != mp_const_none && ostep != MP_OBJ_NEW_SMALL_INT(1)) {
  184. mp_raise_NotImplementedError(MP_ERROR_TEXT("only slices with step=1 (aka None) are supported"));
  185. }
  186. const byte *pstart, *pstop;
  187. if (ostart != mp_const_none) {
  188. pstart = str_index_to_ptr(type, self_data, self_len, ostart, true);
  189. } else {
  190. pstart = self_data;
  191. }
  192. if (ostop != mp_const_none) {
  193. // pstop will point just after the stop character. This depends on
  194. // the \0 at the end of the string.
  195. pstop = str_index_to_ptr(type, self_data, self_len, ostop, true);
  196. } else {
  197. pstop = self_data + self_len;
  198. }
  199. if (pstop < pstart) {
  200. return MP_OBJ_NEW_QSTR(MP_QSTR_);
  201. }
  202. return mp_obj_new_str_of_type(type, (const byte *)pstart, pstop - pstart);
  203. }
  204. #endif
  205. const byte *s = str_index_to_ptr(type, self_data, self_len, index, false);
  206. int len = 1;
  207. if (UTF8_IS_NONASCII(*s)) {
  208. // Count the number of 1 bits (after the first)
  209. for (char mask = 0x40; *s & mask; mask >>= 1) {
  210. ++len;
  211. }
  212. }
  213. return mp_obj_new_str_via_qstr((const char *)s, len); // This will create a one-character string
  214. } else {
  215. return MP_OBJ_NULL; // op not supported
  216. }
  217. }
  218. MP_DEFINE_CONST_OBJ_TYPE(
  219. mp_type_str,
  220. MP_QSTR_str,
  221. MP_TYPE_FLAG_ITER_IS_GETITER,
  222. make_new, mp_obj_str_make_new,
  223. print, uni_print,
  224. unary_op, uni_unary_op,
  225. binary_op, mp_obj_str_binary_op,
  226. subscr, str_subscr,
  227. iter, mp_obj_new_str_iterator,
  228. buffer, mp_obj_str_get_buffer,
  229. locals_dict, &mp_obj_str_locals_dict
  230. );
  231. /******************************************************************************/
  232. /* str iterator */
  233. typedef struct _mp_obj_str_it_t {
  234. mp_obj_base_t base;
  235. mp_fun_1_t iternext;
  236. mp_obj_t str;
  237. size_t cur;
  238. } mp_obj_str_it_t;
  239. static mp_obj_t str_it_iternext(mp_obj_t self_in) {
  240. mp_obj_str_it_t *self = MP_OBJ_TO_PTR(self_in);
  241. GET_STR_DATA_LEN(self->str, str, len);
  242. if (self->cur < len) {
  243. const byte *cur = str + self->cur;
  244. const byte *end = utf8_next_char(str + self->cur);
  245. mp_obj_t o_out = mp_obj_new_str_via_qstr((const char *)cur, end - cur);
  246. self->cur += end - cur;
  247. return o_out;
  248. } else {
  249. return MP_OBJ_STOP_ITERATION;
  250. }
  251. }
  252. static mp_obj_t mp_obj_new_str_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf) {
  253. assert(sizeof(mp_obj_str_it_t) <= sizeof(mp_obj_iter_buf_t));
  254. mp_obj_str_it_t *o = (mp_obj_str_it_t *)iter_buf;
  255. o->base.type = &mp_type_polymorph_iter;
  256. o->iternext = str_it_iternext;
  257. o->str = str;
  258. o->cur = 0;
  259. return MP_OBJ_FROM_PTR(o);
  260. }
  261. #endif // MICROPY_PY_BUILTINS_STR_UNICODE