str.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491
  1. #include "pocketpy/str.h"
  2. namespace pkpy {
  3. int utf8len(unsigned char c, bool suppress){
  4. if((c & 0b10000000) == 0) return 1;
  5. if((c & 0b11100000) == 0b11000000) return 2;
  6. if((c & 0b11110000) == 0b11100000) return 3;
  7. if((c & 0b11111000) == 0b11110000) return 4;
  8. if((c & 0b11111100) == 0b11111000) return 5;
  9. if((c & 0b11111110) == 0b11111100) return 6;
  10. if(!suppress) throw std::runtime_error("invalid utf8 char: " + std::to_string(c));
  11. return 0;
  12. }
  13. Str::Str(int size, bool is_ascii): size(size), is_ascii(is_ascii) {
  14. _alloc();
  15. }
  16. #define STR_INIT() \
  17. _alloc(); \
  18. for(int i=0; i<size; i++){ \
  19. data[i] = s[i]; \
  20. if(!isascii(s[i])) is_ascii = false; \
  21. }
  22. Str::Str(const std::string& s): size(s.size()), is_ascii(true) {
  23. STR_INIT()
  24. }
  25. Str::Str(std::string_view s): size(s.size()), is_ascii(true) {
  26. STR_INIT()
  27. }
  28. Str::Str(const char* s): size(strlen(s)), is_ascii(true) {
  29. STR_INIT()
  30. }
  31. Str::Str(const char* s, int len): size(len), is_ascii(true) {
  32. STR_INIT()
  33. }
  34. #undef STR_INIT
  35. Str::Str(std::pair<char *, int> detached) {
  36. this->size = detached.second;
  37. this->data = detached.first;
  38. this->is_ascii = true;
  39. // check is_ascii
  40. for(int i=0; i<size; i++){
  41. if(!isascii(data[i])){
  42. is_ascii = false;
  43. break;
  44. }
  45. }
  46. }
  47. Str::Str(const Str& other): size(other.size), is_ascii(other.is_ascii) {
  48. _alloc();
  49. memcpy(data, other.data, size);
  50. }
  51. Str::Str(Str&& other): size(other.size), is_ascii(other.is_ascii) {
  52. if(other.is_inlined()){
  53. data = _inlined;
  54. for(int i=0; i<size; i++) _inlined[i] = other._inlined[i];
  55. }else{
  56. data = other.data;
  57. other.data = other._inlined;
  58. other.size = 0;
  59. }
  60. }
  61. Str operator+(const char* p, const Str& str){
  62. Str other(p);
  63. return other + str;
  64. }
  65. std::ostream& operator<<(std::ostream& os, const Str& str){
  66. return os << str.sv();
  67. }
  68. bool operator<(const std::string_view other, const Str& str){
  69. return other < str.sv();
  70. }
  71. void Str::_alloc(){
  72. if(size <= 16){
  73. this->data = _inlined;
  74. }else{
  75. this->data = (char*)pool64_alloc(size);
  76. }
  77. }
  78. Str& Str::operator=(const Str& other){
  79. if(!is_inlined()) pool64_dealloc(data);
  80. size = other.size;
  81. is_ascii = other.is_ascii;
  82. _cached_c_str = nullptr;
  83. _alloc();
  84. memcpy(data, other.data, size);
  85. return *this;
  86. }
  87. Str Str::operator+(const Str& other) const {
  88. Str ret(size + other.size, is_ascii && other.is_ascii);
  89. memcpy(ret.data, data, size);
  90. memcpy(ret.data + size, other.data, other.size);
  91. return ret;
  92. }
  93. Str Str::operator+(const char* p) const {
  94. Str other(p);
  95. return *this + other;
  96. }
  97. bool Str::operator==(const Str& other) const {
  98. if(size != other.size) return false;
  99. return memcmp(data, other.data, size) == 0;
  100. }
  101. bool Str::operator!=(const Str& other) const {
  102. if(size != other.size) return true;
  103. return memcmp(data, other.data, size) != 0;
  104. }
  105. bool Str::operator==(const std::string_view other) const {
  106. if(size != (int)other.size()) return false;
  107. return memcmp(data, other.data(), size) == 0;
  108. }
  109. bool Str::operator!=(const std::string_view other) const {
  110. if(size != (int)other.size()) return true;
  111. return memcmp(data, other.data(), size) != 0;
  112. }
  113. bool Str::operator==(const char* p) const {
  114. return *this == std::string_view(p);
  115. }
  116. bool Str::operator!=(const char* p) const {
  117. return *this != std::string_view(p);
  118. }
  119. bool Str::operator<(const Str& other) const {
  120. return this->sv() < other.sv();
  121. }
  122. bool Str::operator<(const std::string_view other) const {
  123. return this->sv() < other;
  124. }
  125. bool Str::operator>(const Str& other) const {
  126. return this->sv() > other.sv();
  127. }
  128. bool Str::operator<=(const Str& other) const {
  129. return this->sv() <= other.sv();
  130. }
  131. bool Str::operator>=(const Str& other) const {
  132. return this->sv() >= other.sv();
  133. }
  134. Str::~Str(){
  135. if(!is_inlined()) pool64_dealloc(data);
  136. if(_cached_c_str != nullptr) free((void*)_cached_c_str);
  137. }
  138. Str Str::substr(int start, int len) const {
  139. Str ret(len, is_ascii);
  140. memcpy(ret.data, data + start, len);
  141. return ret;
  142. }
  143. Str Str::substr(int start) const {
  144. return substr(start, size - start);
  145. }
  146. char* Str::c_str_dup() const {
  147. char* p = (char*)malloc(size + 1);
  148. memcpy(p, data, size);
  149. p[size] = 0;
  150. return p;
  151. }
  152. const char* Str::c_str() const{
  153. if(_cached_c_str == nullptr){
  154. _cached_c_str = c_str_dup();
  155. }
  156. return _cached_c_str;
  157. }
  158. std::string_view Str::sv() const {
  159. return std::string_view(data, size);
  160. }
  161. std::string Str::str() const {
  162. return std::string(data, size);
  163. }
  164. Str Str::lstrip() const {
  165. std::string copy(data, size);
  166. copy.erase(copy.begin(), std::find_if(copy.begin(), copy.end(), [](char c) {
  167. // std::isspace(c) does not working on windows (Debug)
  168. return c != ' ' && c != '\t' && c != '\r' && c != '\n';
  169. }));
  170. return Str(copy);
  171. }
  172. Str Str::strip() const {
  173. std::string copy(data, size);
  174. copy.erase(copy.begin(), std::find_if(copy.begin(), copy.end(), [](char c) {
  175. return c != ' ' && c != '\t' && c != '\r' && c != '\n';
  176. }));
  177. copy.erase(std::find_if(copy.rbegin(), copy.rend(), [](char c) {
  178. return c != ' ' && c != '\t' && c != '\r' && c != '\n';
  179. }).base(), copy.end());
  180. return Str(copy);
  181. }
  182. Str Str::lower() const{
  183. std::string copy(data, size);
  184. std::transform(copy.begin(), copy.end(), copy.begin(), [](unsigned char c){
  185. if('A' <= c && c <= 'Z') return c + ('a' - 'A');
  186. return (int)c;
  187. });
  188. return Str(copy);
  189. }
  190. Str Str::upper() const{
  191. std::string copy(data, size);
  192. std::transform(copy.begin(), copy.end(), copy.begin(), [](unsigned char c){
  193. if('a' <= c && c <= 'z') return c - ('a' - 'A');
  194. return (int)c;
  195. });
  196. return Str(copy);
  197. }
  198. Str Str::escape(bool single_quote) const{
  199. SStream ss;
  200. escape_(ss, single_quote);
  201. return ss.str();
  202. }
  203. void Str::escape_(SStream& ss, bool single_quote) const {
  204. ss << (single_quote ? '\'' : '"');
  205. for (int i=0; i<length(); i++) {
  206. char c = this->operator[](i);
  207. switch (c) {
  208. case '"':
  209. if(!single_quote) ss << '\\';
  210. ss << '"';
  211. break;
  212. case '\'':
  213. if(single_quote) ss << '\\';
  214. ss << '\'';
  215. break;
  216. case '\\': ss << '\\' << '\\'; break;
  217. case '\n': ss << "\\n"; break;
  218. case '\r': ss << "\\r"; break;
  219. case '\t': ss << "\\t"; break;
  220. case '\b': ss << "\\b"; break;
  221. default:
  222. if ('\x00' <= c && c <= '\x1f') {
  223. ss << "\\x"; // << std::hex << std::setw(2) << std::setfill('0') << (int)c;
  224. ss << "0123456789abcdef"[c >> 4];
  225. ss << "0123456789abcdef"[c & 0xf];
  226. } else {
  227. ss << c;
  228. }
  229. }
  230. }
  231. ss << (single_quote ? '\'' : '"');
  232. }
  233. int Str::index(const Str& sub, int start) const {
  234. auto p = std::search(data + start, data + size, sub.data, sub.data + sub.size);
  235. if(p == data + size) return -1;
  236. return p - data;
  237. }
  238. Str Str::replace(char old, char new_) const{
  239. Str copied = *this;
  240. for(int i=0; i<copied.size; i++){
  241. if(copied.data[i] == old) copied.data[i] = new_;
  242. }
  243. return copied;
  244. }
  245. Str Str::replace(const Str& old, const Str& new_, int count) const {
  246. SStream ss;
  247. int start = 0;
  248. while(true){
  249. int i = index(old, start);
  250. if(i == -1) break;
  251. ss << substr(start, i - start);
  252. ss << new_;
  253. start = i + old.size;
  254. if(count != -1 && --count == 0) break;
  255. }
  256. ss << substr(start, size - start);
  257. return ss.str();
  258. }
  259. int Str::_unicode_index_to_byte(int i) const{
  260. if(is_ascii) return i;
  261. int j = 0;
  262. while(i > 0){
  263. j += utf8len(data[j]);
  264. i--;
  265. }
  266. return j;
  267. }
  268. int Str::_byte_index_to_unicode(int n) const{
  269. if(is_ascii) return n;
  270. int cnt = 0;
  271. for(int i=0; i<n; i++){
  272. if((data[i] & 0xC0) != 0x80) cnt++;
  273. }
  274. return cnt;
  275. }
  276. Str Str::u8_getitem(int i) const{
  277. i = _unicode_index_to_byte(i);
  278. return substr(i, utf8len(data[i]));
  279. }
  280. Str Str::u8_slice(int start, int stop, int step) const{
  281. SStream ss;
  282. if(is_ascii){
  283. for(int i=start; step>0?i<stop:i>stop; i+=step) ss << data[i];
  284. }else{
  285. for(int i=start; step>0?i<stop:i>stop; i+=step) ss << u8_getitem(i);
  286. }
  287. return ss.str();
  288. }
  289. int Str::u8_length() const {
  290. return _byte_index_to_unicode(size);
  291. }
  292. std::vector<std::string_view> Str::split(const Str& sep) const{
  293. std::vector<std::string_view> result;
  294. std::string_view tmp;
  295. int start = 0;
  296. while(true){
  297. int i = index(sep, start);
  298. if(i == -1) break;
  299. tmp = sv().substr(start, i - start);
  300. if(!tmp.empty()) result.push_back(tmp);
  301. start = i + sep.size;
  302. }
  303. tmp = sv().substr(start, size - start);
  304. if(!tmp.empty()) result.push_back(tmp);
  305. return result;
  306. }
  307. std::vector<std::string_view> Str::split(char sep) const{
  308. std::vector<std::string_view> result;
  309. int i = 0;
  310. for(int j = 0; j < size; j++){
  311. if(data[j] == sep){
  312. if(j > i) result.emplace_back(data+i, j-i);
  313. i = j + 1;
  314. continue;
  315. }
  316. }
  317. if(size > i) result.emplace_back(data+i, size-i);
  318. return result;
  319. }
  320. int Str::count(const Str& sub) const{
  321. if(sub.empty()) return size + 1;
  322. int cnt = 0;
  323. int start = 0;
  324. while(true){
  325. int i = index(sub, start);
  326. if(i == -1) break;
  327. cnt++;
  328. start = i + sub.size;
  329. }
  330. return cnt;
  331. }
  332. std::ostream& operator<<(std::ostream& os, const StrName& sn){
  333. return os << sn.sv();
  334. }
  335. std::map<std::string, uint16_t, std::less<>>& StrName::_interned(){
  336. static std::map<std::string, uint16_t, std::less<>> interned;
  337. return interned;
  338. }
  339. std::map<uint16_t, std::string>& StrName::_r_interned(){
  340. static std::map<uint16_t, std::string> r_interned;
  341. return r_interned;
  342. }
  343. uint32_t StrName::_pesudo_random_index = 0;
  344. StrName StrName::get(std::string_view s){
  345. auto it = _interned().find(s);
  346. if(it != _interned().end()) return StrName(it->second);
  347. // generate new index
  348. // https://github.com/python/cpython/blob/3.12/Objects/dictobject.c#L175
  349. uint16_t index = ((_pesudo_random_index*5) + 1) & 65535;
  350. if(index == 0) throw std::runtime_error("StrName index overflow");
  351. _interned()[std::string(s)] = index;
  352. if(is_valid(index)) throw std::runtime_error("StrName index conflict");
  353. _r_interned()[index] = std::string(s);
  354. _pesudo_random_index = index;
  355. return StrName(index);
  356. }
  357. Str StrName::escape() const {
  358. return Str(sv()).escape();
  359. }
  360. bool StrName::is_valid(int index) {
  361. return _r_interned().find(index) != _r_interned().end();
  362. }
  363. StrName::StrName(): index(0) {}
  364. StrName::StrName(uint16_t index): index(index) {}
  365. StrName::StrName(const char* s): index(get(s).index) {}
  366. StrName::StrName(const Str& s){
  367. index = get(s.sv()).index;
  368. }
  369. std::string_view StrName::sv() const {
  370. const std::string& str = _r_interned()[index];
  371. return std::string_view(str);
  372. }
  373. Str SStream::str(){
  374. // after this call, the buffer is no longer valid
  375. return Str(buffer.detach());
  376. }
  377. SStream& SStream::operator<<(const Str& s){
  378. buffer.extend(s.begin(), s.end());
  379. return *this;
  380. }
  381. SStream& SStream::operator<<(const char* s){
  382. buffer.extend(s, s + strlen(s));
  383. return *this;
  384. }
  385. SStream& SStream::operator<<(const std::string& s){
  386. buffer.extend(s.data(), s.data() + s.size());
  387. return *this;
  388. }
  389. SStream& SStream::operator<<(std::string_view s){
  390. buffer.extend(s.data(), s.data() + s.size());
  391. return *this;
  392. }
  393. SStream& SStream::operator<<(char c){
  394. buffer.push_back(c);
  395. return *this;
  396. }
  397. SStream& SStream::operator<<(StrName sn){
  398. return *this << sn.sv();
  399. }
  400. SStream& SStream::operator<<(i64 val){
  401. // str(-2**64).__len__() == 21
  402. buffer.reserve(buffer.size() + 24);
  403. if(val == 0){
  404. buffer.push_back('0');
  405. return *this;
  406. }
  407. if(val < 0){
  408. buffer.push_back('-');
  409. val = -val;
  410. }
  411. char* begin = buffer.end();
  412. while(val){
  413. buffer.push_back('0' + val % 10);
  414. val /= 10;
  415. }
  416. std::reverse(begin, buffer.end());
  417. return *this;
  418. }
  419. } // namespace pkpy