py_str.c 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531
  1. #include "pocketpy/common/str.h"
  2. #include "pocketpy/pocketpy.h"
  3. #include "pocketpy/common/utils.h"
  4. #include "pocketpy/objects/object.h"
  5. #include "pocketpy/interpreter/vm.h"
  6. #include "pocketpy/common/sstream.h"
  7. void py_newstr(py_Ref out, const char* data) { return py_newstrn(out, data, strlen(data)); }
  8. void py_newstrn(py_Ref out, const char* data, int size) {
  9. pk_ManagedHeap* heap = &pk_current_vm->heap;
  10. int total_size = sizeof(c11_string) + size + 1;
  11. PyObject* obj = pk_ManagedHeap__gcnew(heap, tp_str, 0, total_size);
  12. c11_string* ud = PyObject__userdata(obj);
  13. c11_string__ctor2(ud, data, size);
  14. out->type = tp_str;
  15. out->is_ptr = true;
  16. out->_obj = obj;
  17. }
  18. unsigned char* py_newbytes(py_Ref out, int size) {
  19. pk_ManagedHeap* heap = &pk_current_vm->heap;
  20. // 4 bytes size + data
  21. PyObject* obj = pk_ManagedHeap__gcnew(heap, tp_bytes, 0, sizeof(c11_bytes) + size);
  22. c11_bytes* ud = PyObject__userdata(obj);
  23. ud->size = size;
  24. out->type = tp_bytes;
  25. out->is_ptr = true;
  26. out->_obj = obj;
  27. return ud->data;
  28. }
  29. const char* py_tostr(const py_Ref self) {
  30. assert(self->type == tp_str);
  31. c11_string* ud = PyObject__userdata(self->_obj);
  32. return ud->data;
  33. }
  34. const char* py_tostrn(const py_Ref self, int* size) {
  35. assert(self->type == tp_str);
  36. c11_string* ud = PyObject__userdata(self->_obj);
  37. *size = ud->size;
  38. return ud->data;
  39. }
  40. unsigned char* py_tobytes(const py_Ref self, int* size) {
  41. assert(self->type == tp_bytes);
  42. c11_bytes* ud = PyObject__userdata(self->_obj);
  43. *size = ud->size;
  44. return ud->data;
  45. }
  46. ////////////////////////////////
  47. static bool _py_str__new__(int argc, py_Ref argv) {
  48. assert(argc >= 1);
  49. if(argc == 1) {
  50. py_newstr(py_retval(), "");
  51. return true;
  52. }
  53. if(argc > 2) return TypeError("str() takes at most 1 argument");
  54. return py_str(py_arg(1));
  55. }
  56. static bool _py_str__hash__(int argc, py_Ref argv) {
  57. PY_CHECK_ARGC(1);
  58. int size;
  59. const char* data = py_tostrn(&argv[0], &size);
  60. py_i64 res = 0;
  61. for(int i = 0; i < size; i++) {
  62. res = res * 31 + data[i];
  63. }
  64. py_newint(py_retval(), res);
  65. return true;
  66. }
  67. static bool _py_str__len__(int argc, py_Ref argv) {
  68. PY_CHECK_ARGC(1);
  69. c11_string* self = py_touserdata(&argv[0]);
  70. py_newint(py_retval(), self->size);
  71. return true;
  72. }
  73. static bool _py_str__add__(int argc, py_Ref argv) {
  74. PY_CHECK_ARGC(2);
  75. c11_string* self = py_touserdata(&argv[0]);
  76. if(py_arg(1)->type != tp_str) {
  77. py_newnotimplemented(py_retval());
  78. } else {
  79. c11_string* other = py_touserdata(&argv[1]);
  80. int total_size = sizeof(c11_string) + self->size + other->size + 1;
  81. c11_string* res = py_newobject(py_retval(), tp_str, 0, total_size);
  82. res->size = self->size + other->size;
  83. char* p = (char*)res->data;
  84. memcpy(p, self->data, self->size);
  85. memcpy(p + self->size, other->data, other->size);
  86. p[res->size] = '\0';
  87. }
  88. return true;
  89. }
  90. static bool _py_str__mul__(int argc, py_Ref argv) {
  91. PY_CHECK_ARGC(2);
  92. c11_string* self = py_touserdata(&argv[0]);
  93. if(py_arg(1)->type != tp_int) {
  94. py_newnotimplemented(py_retval());
  95. } else {
  96. py_i64 n = py_toint(py_arg(1));
  97. if(n <= 0) {
  98. py_newstr(py_retval(), "");
  99. } else {
  100. int total_size = sizeof(c11_string) + self->size * n + 1;
  101. c11_string* res = py_newobject(py_retval(), tp_str, 0, total_size);
  102. res->size = self->size * n;
  103. char* p = (char*)res->data;
  104. for(int i = 0; i < n; i++) {
  105. memcpy(p + i * self->size, self->data, self->size);
  106. }
  107. p[res->size] = '\0';
  108. }
  109. }
  110. return true;
  111. }
  112. static bool _py_str__rmul__(int argc, py_Ref argv) { return _py_str__mul__(argc, argv); }
  113. static bool _py_str__contains__(int argc, py_Ref argv) {
  114. PY_CHECK_ARGC(2);
  115. c11_string* self = py_touserdata(&argv[0]);
  116. if(py_arg(1)->type != tp_str) {
  117. py_newnotimplemented(py_retval());
  118. } else {
  119. c11_string* other = py_touserdata(&argv[1]);
  120. const char* p = strstr(self->data, other->data);
  121. py_newbool(py_retval(), p != NULL);
  122. }
  123. return true;
  124. }
  125. static bool _py_str__str__(int argc, py_Ref argv) {
  126. PY_CHECK_ARGC(1);
  127. *py_retval() = argv[0];
  128. return true;
  129. }
  130. static bool _py_str__repr__(int argc, py_Ref argv) {
  131. PY_CHECK_ARGC(1);
  132. c11_sbuf buf;
  133. c11_sbuf__ctor(&buf);
  134. int size;
  135. const char* data = py_tostrn(&argv[0], &size);
  136. c11_sbuf__write_quoted(&buf, (c11_sv){data, size}, '\'');
  137. c11_string* res = c11_sbuf__submit(&buf);
  138. py_newstrn(py_retval(), res->data, res->size);
  139. c11_string__delete(res);
  140. return true;
  141. }
  142. static bool _py_str__iter__(int argc, py_Ref argv) {
  143. PY_CHECK_ARGC(1);
  144. return py_tpcall(tp_str_iterator, 1, argv);
  145. }
  146. static bool _py_str__getitem__(int argc, py_Ref argv) {
  147. PY_CHECK_ARGC(2);
  148. c11_sv self = c11_string__sv(py_touserdata(&argv[0]));
  149. py_Ref _1 = py_arg(1);
  150. if(_1->type == tp_int) {
  151. int index = py_toint(py_arg(1));
  152. pk__normalize_index(&index, self.size);
  153. c11_sv res = c11_sv__u8_getitem(self, index);
  154. py_newstrn(py_retval(), res.data, res.size);
  155. } else if(_1->type == tp_slice) {
  156. int start, stop, step;
  157. bool ok = pk__parse_int_slice(_1, c11_sv__u8_length(self), &start, &stop, &step);
  158. if(!ok) return false;
  159. c11_string* res = c11_sv__u8_slice(self, start, stop, step);
  160. py_newstrn(py_retval(), res->data, res->size);
  161. c11_string__delete(res);
  162. return true;
  163. } else {
  164. return TypeError("str indices must be integers");
  165. }
  166. return true;
  167. }
  168. #define DEF_STR_CMP_OP(op, __f, __cond) \
  169. static bool _py_str##op(int argc, py_Ref argv) { \
  170. PY_CHECK_ARGC(2); \
  171. c11_string* self = py_touserdata(&argv[0]); \
  172. if(py_arg(1)->type != tp_str) { \
  173. py_newnotimplemented(py_retval()); \
  174. } else { \
  175. c11_string* other = py_touserdata(&argv[1]); \
  176. int res = __f(c11_string__sv(self), c11_string__sv(other)); \
  177. py_newbool(py_retval(), __cond); \
  178. } \
  179. return true; \
  180. }
  181. DEF_STR_CMP_OP(__eq__, c11__sveq, res)
  182. DEF_STR_CMP_OP(__ne__, c11__sveq, !res)
  183. DEF_STR_CMP_OP(__lt__, c11_sv__cmp, res < 0)
  184. DEF_STR_CMP_OP(__le__, c11_sv__cmp, res <= 0)
  185. DEF_STR_CMP_OP(__gt__, c11_sv__cmp, res > 0)
  186. DEF_STR_CMP_OP(__ge__, c11_sv__cmp, res >= 0)
  187. #undef DEF_STR_CMP_OP
  188. static bool _py_str__lower(int argc, py_Ref argv) {
  189. PY_CHECK_ARGC(1);
  190. c11_string* self = py_touserdata(&argv[0]);
  191. int total_size = sizeof(c11_string) + self->size + 1;
  192. c11_string* res = py_newobject(py_retval(), tp_str, 0, total_size);
  193. res->size = self->size;
  194. char* p = (char*)res->data;
  195. for(int i = 0; i < self->size; i++) {
  196. char c = self->data[i];
  197. p[i] = c >= 'A' && c <= 'Z' ? c + 32 : c;
  198. }
  199. p[res->size] = '\0';
  200. return true;
  201. }
  202. static bool _py_str__upper(int argc, py_Ref argv) {
  203. PY_CHECK_ARGC(1);
  204. c11_string* self = py_touserdata(&argv[0]);
  205. int total_size = sizeof(c11_string) + self->size + 1;
  206. c11_string* res = py_newobject(py_retval(), tp_str, 0, total_size);
  207. res->size = self->size;
  208. char* p = (char*)res->data;
  209. for(int i = 0; i < self->size; i++) {
  210. char c = self->data[i];
  211. p[i] = c >= 'a' && c <= 'z' ? c - 32 : c;
  212. }
  213. p[res->size] = '\0';
  214. return true;
  215. }
  216. static bool _py_str__startswith(int argc, py_Ref argv) {
  217. PY_CHECK_ARGC(2);
  218. c11_string* self = py_touserdata(&argv[0]);
  219. PY_CHECK_ARG_TYPE(1, tp_str);
  220. c11_string* other = py_touserdata(&argv[1]);
  221. c11_sv _0 = c11_sv__slice2(c11_string__sv(self), 0, other->size);
  222. c11_sv _1 = c11_string__sv(other);
  223. py_newbool(py_retval(), c11__sveq(_0, _1));
  224. return true;
  225. }
  226. static bool _py_str__endswith(int argc, py_Ref argv) {
  227. PY_CHECK_ARGC(2);
  228. c11_string* self = py_touserdata(&argv[0]);
  229. PY_CHECK_ARG_TYPE(1, tp_str);
  230. c11_string* other = py_touserdata(&argv[1]);
  231. c11_sv _0 = c11_sv__slice2(c11_string__sv(self), self->size - other->size, self->size);
  232. c11_sv _1 = c11_string__sv(other);
  233. py_newbool(py_retval(), c11__sveq(_0, _1));
  234. return true;
  235. }
  236. static bool _py_str__join(int argc, py_Ref argv) {
  237. PY_CHECK_ARGC(2);
  238. c11_sv self = c11_string__sv(py_touserdata(&argv[0]));
  239. py_Ref _1 = py_arg(1);
  240. // join a list or tuple
  241. py_TValue* p;
  242. int length;
  243. if(py_istype(_1, tp_list)) {
  244. p = py_list__getitem(_1, 0);
  245. length = py_list__len(_1);
  246. } else if(py_istype(_1, tp_tuple)) {
  247. p = py_tuple__getitem(_1, 0);
  248. length = py_tuple__len(_1);
  249. } else {
  250. return TypeError("join() argument must be a list or tuple");
  251. }
  252. c11_sbuf buf;
  253. c11_sbuf__ctor(&buf);
  254. for(int i = 0; i < length; i++) {
  255. if(i > 0) c11_sbuf__write_sv(&buf, self);
  256. if(!py_checkstr(&p[i])) {
  257. c11_sbuf__dtor(&buf);
  258. return false;
  259. }
  260. c11_string* item = py_touserdata(&p[i]);
  261. c11_sbuf__write_cstrn(&buf, item->data, item->size);
  262. }
  263. c11_string* res = c11_sbuf__submit(&buf);
  264. py_newstrn(py_retval(), res->data, res->size);
  265. c11_string__delete(res);
  266. return true;
  267. }
  268. static bool _py_str__replace(int argc, py_Ref argv) {
  269. PY_CHECK_ARGC(3);
  270. c11_string* self = py_touserdata(&argv[0]);
  271. PY_CHECK_ARG_TYPE(1, tp_str);
  272. PY_CHECK_ARG_TYPE(2, tp_str);
  273. c11_string* old = py_touserdata(&argv[1]);
  274. c11_string* new_ = py_touserdata(&argv[2]);
  275. c11_string* res =
  276. c11_sv__replace2(c11_string__sv(self), c11_string__sv(old), c11_string__sv(new_));
  277. py_newstrn(py_retval(), res->data, res->size);
  278. c11_string__delete(res);
  279. return true;
  280. }
  281. static bool _py_str__split(int argc, py_Ref argv) {
  282. c11_sv self = c11_string__sv(py_touserdata(&argv[0]));
  283. c11_vector res;
  284. if(argc > 2) return TypeError("split() takes at most 2 arguments");
  285. if(argc == 1) {
  286. // sep = ' '
  287. res = c11_sv__split(self, ' ');
  288. }
  289. if(argc == 2) {
  290. // sep = argv[1]
  291. if(!py_checkstr(&argv[1])) return false;
  292. c11_sv sep = c11_string__sv(py_touserdata(&argv[1]));
  293. res = c11_sv__split2(self, sep);
  294. }
  295. py_newlistn(py_retval(), res.count);
  296. for(int i = 0; i < res.count; i++) {
  297. c11_sv item = c11__getitem(c11_sv, &res, i);
  298. py_newstrn(py_list__getitem(py_retval(), i), item.data, item.size);
  299. }
  300. c11_vector__dtor(&res);
  301. return true;
  302. }
  303. static bool _py_str__count(int argc, py_Ref argv) {
  304. PY_CHECK_ARGC(2);
  305. c11_string* self = py_touserdata(&argv[0]);
  306. PY_CHECK_ARG_TYPE(1, tp_str);
  307. c11_string* sub = py_touserdata(&argv[1]);
  308. int res = c11_sv__count(c11_string__sv(self), c11_string__sv(sub));
  309. py_newint(py_retval(), res);
  310. return true;
  311. }
  312. static bool _py_str__strip_impl(bool left, bool right, int argc, py_Ref argv) {
  313. c11_sv self = c11_string__sv(py_touserdata(&argv[0]));
  314. c11_sv chars;
  315. if(argc == 1) {
  316. chars = (c11_sv){" \t\n\r", 4};
  317. } else if(argc == 2) {
  318. if(!py_checkstr(&argv[1])) return false;
  319. chars = c11_string__sv(py_touserdata(&argv[1]));
  320. } else {
  321. return TypeError("strip() takes at most 2 arguments");
  322. }
  323. c11_sv res = c11_sv__strip(self, chars, left, right);
  324. py_newstrn(py_retval(), res.data, res.size);
  325. return true;
  326. }
  327. static bool _py_str__strip(int argc, py_Ref argv) {
  328. return _py_str__strip_impl(true, true, argc, argv);
  329. }
  330. static bool _py_str__lstrip(int argc, py_Ref argv) {
  331. return _py_str__strip_impl(true, false, argc, argv);
  332. }
  333. static bool _py_str__rstrip(int argc, py_Ref argv) {
  334. return _py_str__strip_impl(false, true, argc, argv);
  335. }
  336. static bool _py_str__zfill(int argc, py_Ref argv) {
  337. PY_CHECK_ARGC(2);
  338. c11_sv self = c11_string__sv(py_touserdata(&argv[0]));
  339. PY_CHECK_ARG_TYPE(1, tp_int);
  340. int width = py_toint(py_arg(1));
  341. int delta = width - c11_sv__u8_length(self);
  342. if(delta <= 0) {
  343. *py_retval() = argv[0];
  344. return true;
  345. }
  346. c11_sbuf buf;
  347. c11_sbuf__ctor(&buf);
  348. for(int i = 0; i < delta; i++) {
  349. c11_sbuf__write_char(&buf, '0');
  350. }
  351. c11_sbuf__write_sv(&buf, self);
  352. c11_string* res = c11_sbuf__submit(&buf);
  353. py_newstrn(py_retval(), res->data, res->size);
  354. c11_string__delete(res);
  355. return true;
  356. }
  357. static bool _py_str__widthjust_impl(bool left, int argc, py_Ref argv) {
  358. if(argc > 1 + 2) return TypeError("expected at most 2 arguments");
  359. char pad;
  360. if(argc == 1 + 1) {
  361. pad = ' ';
  362. } else {
  363. if(!py_checkstr(&argv[2])) return false;
  364. c11_string* padstr = py_touserdata(&argv[2]);
  365. if(padstr->size != 1)
  366. return TypeError("The fill character must be exactly one character long");
  367. pad = padstr->data[0];
  368. }
  369. c11_sv self = c11_string__sv(py_touserdata(&argv[0]));
  370. PY_CHECK_ARG_TYPE(1, tp_int);
  371. int width = py_toint(py_arg(1));
  372. if(width <= self.size) {
  373. *py_retval() = argv[0];
  374. return true;
  375. }
  376. c11_sbuf buf;
  377. c11_sbuf__ctor(&buf);
  378. if(left) {
  379. c11_sbuf__write_sv(&buf, self);
  380. for(int i = 0; i < width - self.size; i++) {
  381. c11_sbuf__write_char(&buf, pad);
  382. }
  383. } else {
  384. for(int i = 0; i < width - self.size; i++) {
  385. c11_sbuf__write_char(&buf, pad);
  386. }
  387. c11_sbuf__write_sv(&buf, self);
  388. }
  389. c11_string* res = c11_sbuf__submit(&buf);
  390. py_newstrn(py_retval(), res->data, res->size);
  391. c11_string__delete(res);
  392. return true;
  393. }
  394. static bool _py_str__ljust(int argc, py_Ref argv) {
  395. return _py_str__widthjust_impl(true, argc, argv);
  396. }
  397. static bool _py_str__rjust(int argc, py_Ref argv) {
  398. return _py_str__widthjust_impl(false, argc, argv);
  399. }
  400. py_Type pk_str__register() {
  401. pk_VM* vm = pk_current_vm;
  402. py_Type type = pk_VM__new_type(vm, "str", tp_object, NULL, false);
  403. // no need to dtor because the memory is controlled by the object
  404. py_bindmagic(tp_str, __new__, _py_str__new__);
  405. py_bindmagic(tp_str, __hash__, _py_str__hash__);
  406. py_bindmagic(tp_str, __len__, _py_str__len__);
  407. py_bindmagic(tp_str, __add__, _py_str__add__);
  408. py_bindmagic(tp_str, __mul__, _py_str__mul__);
  409. py_bindmagic(tp_str, __rmul__, _py_str__rmul__);
  410. py_bindmagic(tp_str, __contains__, _py_str__contains__);
  411. py_bindmagic(tp_str, __str__, _py_str__str__);
  412. py_bindmagic(tp_str, __repr__, _py_str__repr__);
  413. py_bindmagic(tp_str, __iter__, _py_str__iter__);
  414. py_bindmagic(tp_str, __getitem__, _py_str__getitem__);
  415. py_bindmagic(tp_str, __eq__, _py_str__eq__);
  416. py_bindmagic(tp_str, __ne__, _py_str__ne__);
  417. py_bindmagic(tp_str, __lt__, _py_str__lt__);
  418. py_bindmagic(tp_str, __le__, _py_str__le__);
  419. py_bindmagic(tp_str, __gt__, _py_str__gt__);
  420. py_bindmagic(tp_str, __ge__, _py_str__ge__);
  421. py_bindmethod(tp_str, "lower", _py_str__lower);
  422. py_bindmethod(tp_str, "upper", _py_str__upper);
  423. py_bindmethod(tp_str, "startswith", _py_str__startswith);
  424. py_bindmethod(tp_str, "endswith", _py_str__endswith);
  425. py_bindmethod(tp_str, "join", _py_str__join);
  426. py_bindmethod(tp_str, "replace", _py_str__replace);
  427. py_bindmethod(tp_str, "split", _py_str__split);
  428. py_bindmethod(tp_str, "count", _py_str__count);
  429. py_bindmethod(tp_str, "strip", _py_str__strip);
  430. py_bindmethod(tp_str, "lstrip", _py_str__lstrip);
  431. py_bindmethod(tp_str, "rstrip", _py_str__rstrip);
  432. py_bindmethod(tp_str, "zfill", _py_str__zfill);
  433. py_bindmethod(tp_str, "ljust", _py_str__ljust);
  434. py_bindmethod(tp_str, "rjust", _py_str__rjust);
  435. return type;
  436. }
  437. static bool _py_str_iterator__new__(int argc, py_Ref argv) {
  438. PY_CHECK_ARGC(2);
  439. PY_CHECK_ARG_TYPE(1, tp_str);
  440. int* ud = py_newobject(py_retval(), tp_str_iterator, 1, sizeof(int));
  441. *ud = 0;
  442. py_setslot(py_retval(), 0, &argv[1]);
  443. return true;
  444. }
  445. static bool _py_str_iterator__iter__(int argc, py_Ref argv) {
  446. PY_CHECK_ARGC(1);
  447. *py_retval() = argv[0];
  448. return true;
  449. }
  450. static bool _py_str_iterator__next__(int argc, py_Ref argv) {
  451. PY_CHECK_ARGC(1);
  452. int* ud = py_touserdata(&argv[0]);
  453. int size;
  454. const char* data = py_tostrn(py_getslot(argv, 0), &size);
  455. if(*ud == size) {
  456. *py_retval() = pk_current_vm->StopIteration;
  457. return true;
  458. }
  459. int start = *ud;
  460. int len = c11__u8_header(data[*ud], false);
  461. *ud += len;
  462. py_newstrn(py_retval(), data + start, len);
  463. return true;
  464. }
  465. py_Type pk_str_iterator__register() {
  466. pk_VM* vm = pk_current_vm;
  467. py_Type type = pk_VM__new_type(vm, "str_iterator", tp_object, NULL, false);
  468. py_bindmagic(type, __new__, _py_str_iterator__new__);
  469. py_bindmagic(type, __iter__, _py_str_iterator__iter__);
  470. py_bindmagic(type, __next__, _py_str_iterator__next__);
  471. return type;
  472. }
  473. py_Type pk_bytes__register() {
  474. pk_VM* vm = pk_current_vm;
  475. py_Type type = pk_VM__new_type(vm, "bytes", tp_object, NULL, false);
  476. // no need to dtor because the memory is controlled by the object
  477. return type;
  478. }
  479. bool py_str(py_Ref val) {
  480. py_Ref tmp = py_tpfindmagic(val->type, __str__);
  481. if(!tmp) return py_repr(val);
  482. return py_call(tmp, 1, val);
  483. }