str.cpp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575
  1. #include "pocketpy/str.h"
  2. namespace pkpy {
  3. int utf8len(unsigned char c, bool suppress){
  4. if((c & 0b10000000) == 0) return 1;
  5. if((c & 0b11100000) == 0b11000000) return 2;
  6. if((c & 0b11110000) == 0b11100000) return 3;
  7. if((c & 0b11111000) == 0b11110000) return 4;
  8. if((c & 0b11111100) == 0b11111000) return 5;
  9. if((c & 0b11111110) == 0b11111100) return 6;
  10. if(!suppress) throw std::runtime_error("invalid utf8 char: " + std::to_string(c));
  11. return 0;
  12. }
  13. #define PK_STR_ALLOCATE() \
  14. if(this->size < (int)sizeof(this->_inlined)){ \
  15. this->data = this->_inlined; \
  16. }else{ \
  17. this->data = (char*)pool64_alloc(this->size+1); \
  18. }
  19. #define PK_STR_COPY_INIT(__s) \
  20. for(int i=0; i<this->size; i++){ \
  21. this->data[i] = __s[i]; \
  22. if(!isascii(__s[i])) is_ascii = false; \
  23. } \
  24. this->data[this->size] = '\0';
  25. Str::Str(): size(0), is_ascii(true), data(_inlined) {
  26. _inlined[0] = '\0';
  27. }
  28. Str::Str(int size, bool is_ascii): size(size), is_ascii(is_ascii) {
  29. PK_STR_ALLOCATE()
  30. }
  31. Str::Str(const std::string& s): size(s.size()), is_ascii(true) {
  32. PK_STR_ALLOCATE()
  33. PK_STR_COPY_INIT(s)
  34. }
  35. Str::Str(std::string_view s): size(s.size()), is_ascii(true) {
  36. PK_STR_ALLOCATE()
  37. PK_STR_COPY_INIT(s)
  38. }
  39. Str::Str(const char* s): size(strlen(s)), is_ascii(true) {
  40. PK_STR_ALLOCATE()
  41. PK_STR_COPY_INIT(s)
  42. }
  43. Str::Str(const char* s, int len): size(len), is_ascii(true) {
  44. PK_STR_ALLOCATE()
  45. PK_STR_COPY_INIT(s)
  46. }
  47. Str::Str(std::pair<char *, int> detached): size(detached.second), is_ascii(true) {
  48. this->data = detached.first;
  49. for(int i=0; i<size; i++){
  50. if(!isascii(data[i])){ is_ascii = false; break; }
  51. }
  52. PK_ASSERT(data[size] == '\0');
  53. }
  54. Str::Str(const Str& other): size(other.size), is_ascii(other.is_ascii) {
  55. PK_STR_ALLOCATE()
  56. memcpy(data, other.data, size);
  57. data[size] = '\0';
  58. }
  59. Str::Str(Str&& other): size(other.size), is_ascii(other.is_ascii) {
  60. if(other.is_inlined()){
  61. data = _inlined;
  62. for(int i=0; i<size; i++) _inlined[i] = other._inlined[i];
  63. data[size] = '\0';
  64. }else{
  65. data = other.data;
  66. // zero out `other`
  67. other.data = other._inlined;
  68. other.data[0] = '\0';
  69. other.size = 0;
  70. }
  71. }
  72. Str operator+(const char* p, const Str& str){
  73. Str other(p);
  74. return other + str;
  75. }
  76. std::ostream& operator<<(std::ostream& os, const Str& str){
  77. return os << str.sv();
  78. }
  79. bool operator<(const std::string_view other, const Str& str){
  80. return other < str.sv();
  81. }
  82. Str& Str::operator=(const Str& other){
  83. if(!is_inlined()) pool64_dealloc(data);
  84. size = other.size;
  85. is_ascii = other.is_ascii;
  86. PK_STR_ALLOCATE()
  87. memcpy(data, other.data, size);
  88. data[size] = '\0';
  89. return *this;
  90. }
  91. Str Str::operator+(const Str& other) const {
  92. Str ret(size + other.size, is_ascii && other.is_ascii);
  93. memcpy(ret.data, data, size);
  94. memcpy(ret.data + size, other.data, other.size);
  95. ret.data[ret.size] = '\0';
  96. return ret;
  97. }
  98. Str Str::operator+(const char* p) const {
  99. Str other(p);
  100. return *this + other;
  101. }
  102. bool Str::operator==(const Str& other) const {
  103. if(size != other.size) return false;
  104. return memcmp(data, other.data, size) == 0;
  105. }
  106. bool Str::operator!=(const Str& other) const {
  107. if(size != other.size) return true;
  108. return memcmp(data, other.data, size) != 0;
  109. }
  110. bool Str::operator==(const std::string_view other) const {
  111. if(size != (int)other.size()) return false;
  112. return memcmp(data, other.data(), size) == 0;
  113. }
  114. bool Str::operator!=(const std::string_view other) const {
  115. if(size != (int)other.size()) return true;
  116. return memcmp(data, other.data(), size) != 0;
  117. }
  118. bool Str::operator==(const char* p) const {
  119. return *this == std::string_view(p);
  120. }
  121. bool Str::operator!=(const char* p) const {
  122. return *this != std::string_view(p);
  123. }
  124. bool Str::operator<(const Str& other) const {
  125. return this->sv() < other.sv();
  126. }
  127. bool Str::operator<(const std::string_view other) const {
  128. return this->sv() < other;
  129. }
  130. bool Str::operator>(const Str& other) const {
  131. return this->sv() > other.sv();
  132. }
  133. bool Str::operator<=(const Str& other) const {
  134. return this->sv() <= other.sv();
  135. }
  136. bool Str::operator>=(const Str& other) const {
  137. return this->sv() >= other.sv();
  138. }
  139. Str::~Str(){
  140. if(!is_inlined()) pool64_dealloc(data);
  141. }
  142. Str Str::substr(int start, int len) const {
  143. Str ret(len, is_ascii);
  144. memcpy(ret.data, data + start, len);
  145. ret.data[len] = '\0';
  146. return ret;
  147. }
  148. Str Str::substr(int start) const {
  149. return substr(start, size - start);
  150. }
  151. const char* Str::c_str() const{
  152. return data;
  153. }
  154. std::string_view Str::sv() const {
  155. return std::string_view(data, size);
  156. }
  157. std::string Str::str() const {
  158. return std::string(data, size);
  159. }
  160. Str Str::strip(bool left, bool right, const Str& chars) const {
  161. int L = 0;
  162. int R = u8_length();
  163. if(left){
  164. while(L < R && chars.index(u8_getitem(L)) != -1) L++;
  165. }
  166. if(right){
  167. while(L < R && chars.index(u8_getitem(R-1)) != -1) R--;
  168. }
  169. return u8_slice(L, R, 1);
  170. }
  171. Str Str::strip(bool left, bool right) const {
  172. if(is_ascii){
  173. int L = 0;
  174. int R = size;
  175. if(left){
  176. while(L < R && (data[L] == ' ' || data[L] == '\t' || data[L] == '\n' || data[L] == '\r')) L++;
  177. }
  178. if(right){
  179. while(L < R && (data[R-1] == ' ' || data[R-1] == '\t' || data[R-1] == '\n' || data[R-1] == '\r')) R--;
  180. }
  181. return substr(L, R - L);
  182. }else{
  183. return strip(left, right, " \t\n\r");
  184. }
  185. }
  186. Str Str::lower() const{
  187. std::string copy(data, size);
  188. std::transform(copy.begin(), copy.end(), copy.begin(), [](unsigned char c){
  189. if('A' <= c && c <= 'Z') return c + ('a' - 'A');
  190. return (int)c;
  191. });
  192. return Str(copy);
  193. }
  194. Str Str::upper() const{
  195. std::string copy(data, size);
  196. std::transform(copy.begin(), copy.end(), copy.begin(), [](unsigned char c){
  197. if('a' <= c && c <= 'z') return c - ('a' - 'A');
  198. return (int)c;
  199. });
  200. return Str(copy);
  201. }
  202. Str Str::escape(bool single_quote) const{
  203. SStream ss;
  204. escape_(ss, single_quote);
  205. return ss.str();
  206. }
  207. void Str::escape_(SStream& ss, bool single_quote) const {
  208. ss << (single_quote ? '\'' : '"');
  209. for (int i=0; i<length(); i++) {
  210. char c = this->operator[](i);
  211. switch (c) {
  212. case '"':
  213. if(!single_quote) ss << '\\';
  214. ss << '"';
  215. break;
  216. case '\'':
  217. if(single_quote) ss << '\\';
  218. ss << '\'';
  219. break;
  220. case '\\': ss << '\\' << '\\'; break;
  221. case '\n': ss << "\\n"; break;
  222. case '\r': ss << "\\r"; break;
  223. case '\t': ss << "\\t"; break;
  224. case '\b': ss << "\\b"; break;
  225. default:
  226. if ('\x00' <= c && c <= '\x1f') {
  227. ss << "\\x"; // << std::hex << std::setw(2) << std::setfill('0') << (int)c;
  228. ss << "0123456789abcdef"[c >> 4];
  229. ss << "0123456789abcdef"[c & 0xf];
  230. } else {
  231. ss << c;
  232. }
  233. }
  234. }
  235. ss << (single_quote ? '\'' : '"');
  236. }
  237. int Str::index(const Str& sub, int start) const {
  238. auto p = std::search(data + start, data + size, sub.data, sub.data + sub.size);
  239. if(p == data + size) return -1;
  240. return p - data;
  241. }
  242. Str Str::replace(char old, char new_) const{
  243. Str copied = *this;
  244. for(int i=0; i<copied.size; i++){
  245. if(copied.data[i] == old) copied.data[i] = new_;
  246. }
  247. return copied;
  248. }
  249. Str Str::replace(const Str& old, const Str& new_, int count) const {
  250. SStream ss;
  251. int start = 0;
  252. while(true){
  253. int i = index(old, start);
  254. if(i == -1) break;
  255. ss << substr(start, i - start);
  256. ss << new_;
  257. start = i + old.size;
  258. if(count != -1 && --count == 0) break;
  259. }
  260. ss << substr(start, size - start);
  261. return ss.str();
  262. }
  263. int Str::_unicode_index_to_byte(int i) const{
  264. if(is_ascii) return i;
  265. int j = 0;
  266. while(i > 0){
  267. j += utf8len(data[j]);
  268. i--;
  269. }
  270. return j;
  271. }
  272. int Str::_byte_index_to_unicode(int n) const{
  273. if(is_ascii) return n;
  274. int cnt = 0;
  275. for(int i=0; i<n; i++){
  276. if((data[i] & 0xC0) != 0x80) cnt++;
  277. }
  278. return cnt;
  279. }
  280. Str Str::u8_getitem(int i) const{
  281. i = _unicode_index_to_byte(i);
  282. return substr(i, utf8len(data[i]));
  283. }
  284. Str Str::u8_slice(int start, int stop, int step) const{
  285. SStream ss;
  286. if(is_ascii){
  287. PK_SLICE_LOOP(i, start, stop, step) ss << data[i];
  288. }else{
  289. PK_SLICE_LOOP(i, start, stop, step) ss << u8_getitem(i);
  290. }
  291. return ss.str();
  292. }
  293. int Str::u8_length() const {
  294. return _byte_index_to_unicode(size);
  295. }
  296. pod_vector<std::string_view> Str::split(const Str& sep) const{
  297. pod_vector<std::string_view> result;
  298. std::string_view tmp;
  299. int start = 0;
  300. while(true){
  301. int i = index(sep, start);
  302. if(i == -1) break;
  303. tmp = sv().substr(start, i - start);
  304. if(!tmp.empty()) result.push_back(tmp);
  305. start = i + sep.size;
  306. }
  307. tmp = sv().substr(start, size - start);
  308. if(!tmp.empty()) result.push_back(tmp);
  309. return result;
  310. }
  311. pod_vector<std::string_view> Str::split(char sep) const{
  312. pod_vector<std::string_view> result;
  313. int i = 0;
  314. for(int j = 0; j < size; j++){
  315. if(data[j] == sep){
  316. if(j > i) result.emplace_back(data+i, j-i);
  317. i = j + 1;
  318. continue;
  319. }
  320. }
  321. if(size > i) result.emplace_back(data+i, size-i);
  322. return result;
  323. }
  324. int Str::count(const Str& sub) const{
  325. if(sub.empty()) return size + 1;
  326. int cnt = 0;
  327. int start = 0;
  328. while(true){
  329. int i = index(sub, start);
  330. if(i == -1) break;
  331. cnt++;
  332. start = i + sub.size;
  333. }
  334. return cnt;
  335. }
  336. std::map<std::string, uint16_t, std::less<>>& StrName::_interned(){
  337. static std::map<std::string, uint16_t, std::less<>> interned;
  338. return interned;
  339. }
  340. std::map<uint16_t, std::string>& StrName::_r_interned(){
  341. static std::map<uint16_t, std::string> r_interned;
  342. return r_interned;
  343. }
  344. uint32_t StrName::_pesudo_random_index = 0;
  345. StrName StrName::get(std::string_view s){
  346. auto it = _interned().find(s);
  347. if(it != _interned().end()) return StrName(it->second);
  348. // generate new index
  349. // https://github.com/python/cpython/blob/3.12/Objects/dictobject.c#L175
  350. uint16_t index = ((_pesudo_random_index*5) + 1) & 65535;
  351. if(index == 0) throw std::runtime_error("StrName index overflow");
  352. _interned()[std::string(s)] = index;
  353. if(is_valid(index)) throw std::runtime_error("StrName index conflict");
  354. _r_interned()[index] = std::string(s);
  355. _pesudo_random_index = index;
  356. return StrName(index);
  357. }
  358. Str StrName::escape() const {
  359. return Str(sv()).escape();
  360. }
  361. bool StrName::is_valid(int index) {
  362. return _r_interned().find(index) != _r_interned().end();
  363. }
  364. StrName::StrName(): index(0) {}
  365. StrName::StrName(uint16_t index): index(index) {}
  366. StrName::StrName(const char* s): index(get(s).index) {}
  367. StrName::StrName(const Str& s){
  368. index = get(s.sv()).index;
  369. }
  370. std::string_view StrName::sv() const {
  371. const std::string& str = _r_interned()[index];
  372. return std::string_view(str);
  373. }
  374. const char* StrName::c_str() const{
  375. const std::string& str = _r_interned()[index];
  376. return str.c_str();
  377. }
  378. Str SStream::str(){
  379. // after this call, the buffer is no longer valid
  380. buffer.reserve(buffer.size() + 1); // allocate one more byte for '\0'
  381. buffer[buffer.size()] = '\0'; // set '\0'
  382. return Str(buffer.detach());
  383. }
  384. SStream& SStream::operator<<(const Str& s){
  385. buffer.extend(s.begin(), s.end());
  386. return *this;
  387. }
  388. SStream& SStream::operator<<(const char* s){
  389. buffer.extend(s, s + strlen(s));
  390. return *this;
  391. }
  392. SStream& SStream::operator<<(const std::string& s){
  393. buffer.extend(s.data(), s.data() + s.size());
  394. return *this;
  395. }
  396. SStream& SStream::operator<<(std::string_view s){
  397. buffer.extend(s.data(), s.data() + s.size());
  398. return *this;
  399. }
  400. SStream& SStream::operator<<(char c){
  401. buffer.push_back(c);
  402. return *this;
  403. }
  404. SStream& SStream::operator<<(StrName sn){
  405. return *this << sn.sv();
  406. }
  407. SStream& SStream::operator<<(size_t val){
  408. // size_t could be out of range of `i64`, use `std::to_string` instead
  409. return (*this) << std::to_string(val);
  410. }
  411. SStream& SStream::operator<<(int val){
  412. return (*this) << static_cast<i64>(val);
  413. }
  414. SStream& SStream::operator<<(i64 val){
  415. // str(-2**64).__len__() == 21
  416. buffer.reserve(buffer.size() + 24);
  417. if(val == 0){
  418. buffer.push_back('0');
  419. return *this;
  420. }
  421. if(val < 0){
  422. buffer.push_back('-');
  423. val = -val;
  424. }
  425. char* begin = buffer.end();
  426. while(val){
  427. buffer.push_back('0' + val % 10);
  428. val /= 10;
  429. }
  430. std::reverse(begin, buffer.end());
  431. return *this;
  432. }
  433. SStream& SStream::operator<<(f64 val){
  434. if(std::isinf(val)){
  435. return (*this) << (val > 0 ? "inf" : "-inf");
  436. }
  437. if(std::isnan(val)){
  438. return (*this) << "nan";
  439. }
  440. char b[32];
  441. if(_precision == -1){
  442. int prec = std::numeric_limits<f64>::max_digits10-1;
  443. snprintf(b, sizeof(b), "%.*g", prec, val);
  444. }else{
  445. int prec = _precision;
  446. snprintf(b, sizeof(b), "%.*f", prec, val);
  447. }
  448. (*this) << b;
  449. if(std::all_of(b+1, b+strlen(b), isdigit)) (*this) << ".0";
  450. return *this;
  451. }
  452. void SStream::write_hex(unsigned char c, bool non_zero){
  453. unsigned char high = c >> 4;
  454. unsigned char low = c & 0xf;
  455. if(non_zero){
  456. if(high) (*this) << "0123456789abcdef"[high];
  457. if(high || low) (*this) << "0123456789abcdef"[low];
  458. }else{
  459. (*this) << "0123456789abcdef"[high];
  460. (*this) << "0123456789abcdef"[low];
  461. }
  462. }
  463. void SStream::write_hex(void* p){
  464. if(p == nullptr){
  465. (*this) << "0x0";
  466. return;
  467. }
  468. (*this) << "0x";
  469. uintptr_t p_t = reinterpret_cast<uintptr_t>(p);
  470. bool non_zero = true;
  471. for(int i=sizeof(void*)-1; i>=0; i--){
  472. unsigned char cpnt = (p_t >> (i * 8)) & 0xff;
  473. write_hex(cpnt, non_zero);
  474. if(cpnt != 0) non_zero = false;
  475. }
  476. }
  477. void SStream::write_hex(i64 val){
  478. if(val == 0){
  479. (*this) << "0x0";
  480. return;
  481. }
  482. if(val < 0){
  483. (*this) << "-";
  484. val = -val;
  485. }
  486. (*this) << "0x";
  487. bool non_zero = true;
  488. for(int i=56; i>=0; i-=8){
  489. unsigned char cpnt = (val >> i) & 0xff;
  490. write_hex(cpnt, non_zero);
  491. if(cpnt != 0) non_zero = false;
  492. }
  493. }
  494. #undef PK_STR_ALLOCATE
  495. #undef PK_STR_COPY_INIT
  496. } // namespace pkpy