str.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423
  1. #include "pocketpy/str.h"
  2. namespace pkpy {
  // Returns the byte length (1..6) of the UTF-8 sequence introduced by lead
  // byte `c`, derived from the number of leading 1 bits.
  // A byte that cannot start a sequence yields 0 when `suppress` is true and
  // throws std::runtime_error otherwise.
  int utf8len(unsigned char c, bool suppress){
      int leading_ones = 0;
      for(unsigned mask = 0x80; mask != 0 && (c & mask) != 0; mask >>= 1){
          ++leading_ones;
      }
      // 0xxxxxxx: plain single-byte character
      if(leading_ones == 0) return 1;
      // 110xxxxx .. 1111110x: lead byte of a 2..6 byte sequence
      if(leading_ones >= 2 && leading_ones <= 6) return leading_ones;
      // 10xxxxxx (continuation byte), 0xFE and 0xFF are not valid lead bytes
      if(suppress) return 0;
      throw std::runtime_error("invalid utf8 char: " + std::to_string(c));
  }
  13. Str::Str(int size, bool is_ascii): size(size), is_ascii(is_ascii) {
  14. _alloc();
  15. }
  16. #define STR_INIT() \
  17. _alloc(); \
  18. for(int i=0; i<size; i++){ \
  19. data[i] = s[i]; \
  20. if(!isascii(s[i])) is_ascii = false; \
  21. }
  22. Str::Str(const std::string& s): size(s.size()), is_ascii(true) {
  23. STR_INIT()
  24. }
  25. Str::Str(std::string_view s): size(s.size()), is_ascii(true) {
  26. STR_INIT()
  27. }
  28. Str::Str(const char* s): size(strlen(s)), is_ascii(true) {
  29. STR_INIT()
  30. }
  31. Str::Str(const char* s, int len): size(len), is_ascii(true) {
  32. STR_INIT()
  33. }
  34. #undef STR_INIT
  35. Str::Str(const Str& other): size(other.size), is_ascii(other.is_ascii) {
  36. _alloc();
  37. memcpy(data, other.data, size);
  38. }
  39. Str::Str(Str&& other): size(other.size), is_ascii(other.is_ascii) {
  40. if(other.is_inlined()){
  41. data = _inlined;
  42. for(int i=0; i<size; i++) _inlined[i] = other._inlined[i];
  43. }else{
  44. data = other.data;
  45. other.data = other._inlined;
  46. other.size = 0;
  47. }
  48. }
  49. Str operator+(const char* p, const Str& str){
  50. Str other(p);
  51. return other + str;
  52. }
  53. std::ostream& operator<<(std::ostream& os, const Str& str){
  54. return os << str.sv();
  55. }
  56. bool operator<(const std::string_view other, const Str& str){
  57. return other < str.sv();
  58. }
  59. void Str::_alloc(){
  60. if(size <= 16){
  61. this->data = _inlined;
  62. }else{
  63. this->data = (char*)pool64_alloc(size);
  64. }
  65. }
  66. Str& Str::operator=(const Str& other){
  67. if(!is_inlined()) pool64_dealloc(data);
  68. size = other.size;
  69. is_ascii = other.is_ascii;
  70. _cached_c_str = nullptr;
  71. _alloc();
  72. memcpy(data, other.data, size);
  73. return *this;
  74. }
  75. Str Str::operator+(const Str& other) const {
  76. Str ret(size + other.size, is_ascii && other.is_ascii);
  77. memcpy(ret.data, data, size);
  78. memcpy(ret.data + size, other.data, other.size);
  79. return ret;
  80. }
  81. Str Str::operator+(const char* p) const {
  82. Str other(p);
  83. return *this + other;
  84. }
  85. bool Str::operator==(const Str& other) const {
  86. if(size != other.size) return false;
  87. return memcmp(data, other.data, size) == 0;
  88. }
  89. bool Str::operator!=(const Str& other) const {
  90. if(size != other.size) return true;
  91. return memcmp(data, other.data, size) != 0;
  92. }
  93. bool Str::operator==(const std::string_view other) const {
  94. if(size != (int)other.size()) return false;
  95. return memcmp(data, other.data(), size) == 0;
  96. }
  97. bool Str::operator!=(const std::string_view other) const {
  98. if(size != (int)other.size()) return true;
  99. return memcmp(data, other.data(), size) != 0;
  100. }
  101. bool Str::operator==(const char* p) const {
  102. return *this == std::string_view(p);
  103. }
  104. bool Str::operator!=(const char* p) const {
  105. return *this != std::string_view(p);
  106. }
  107. bool Str::operator<(const Str& other) const {
  108. return this->sv() < other.sv();
  109. }
  110. bool Str::operator<(const std::string_view other) const {
  111. return this->sv() < other;
  112. }
  113. bool Str::operator>(const Str& other) const {
  114. return this->sv() > other.sv();
  115. }
  116. bool Str::operator<=(const Str& other) const {
  117. return this->sv() <= other.sv();
  118. }
  119. bool Str::operator>=(const Str& other) const {
  120. return this->sv() >= other.sv();
  121. }
  122. Str::~Str(){
  123. if(!is_inlined()) pool64_dealloc(data);
  124. if(_cached_c_str != nullptr) free((void*)_cached_c_str);
  125. }
  126. Str Str::substr(int start, int len) const {
  127. Str ret(len, is_ascii);
  128. memcpy(ret.data, data + start, len);
  129. return ret;
  130. }
  131. Str Str::substr(int start) const {
  132. return substr(start, size - start);
  133. }
  134. char* Str::c_str_dup() const {
  135. char* p = (char*)malloc(size + 1);
  136. memcpy(p, data, size);
  137. p[size] = 0;
  138. return p;
  139. }
  140. const char* Str::c_str() const{
  141. if(_cached_c_str == nullptr){
  142. _cached_c_str = c_str_dup();
  143. }
  144. return _cached_c_str;
  145. }
  146. std::string_view Str::sv() const {
  147. return std::string_view(data, size);
  148. }
  149. std::string Str::str() const {
  150. return std::string(data, size);
  151. }
  152. Str Str::lstrip() const {
  153. std::string copy(data, size);
  154. copy.erase(copy.begin(), std::find_if(copy.begin(), copy.end(), [](char c) {
  155. // std::isspace(c) does not working on windows (Debug)
  156. return c != ' ' && c != '\t' && c != '\r' && c != '\n';
  157. }));
  158. return Str(copy);
  159. }
  160. Str Str::strip() const {
  161. std::string copy(data, size);
  162. copy.erase(copy.begin(), std::find_if(copy.begin(), copy.end(), [](char c) {
  163. return c != ' ' && c != '\t' && c != '\r' && c != '\n';
  164. }));
  165. copy.erase(std::find_if(copy.rbegin(), copy.rend(), [](char c) {
  166. return c != ' ' && c != '\t' && c != '\r' && c != '\n';
  167. }).base(), copy.end());
  168. return Str(copy);
  169. }
  170. Str Str::lower() const{
  171. std::string copy(data, size);
  172. std::transform(copy.begin(), copy.end(), copy.begin(), [](unsigned char c){
  173. if('A' <= c && c <= 'Z') return c + ('a' - 'A');
  174. return (int)c;
  175. });
  176. return Str(copy);
  177. }
  178. Str Str::upper() const{
  179. std::string copy(data, size);
  180. std::transform(copy.begin(), copy.end(), copy.begin(), [](unsigned char c){
  181. if('a' <= c && c <= 'z') return c - ('a' - 'A');
  182. return (int)c;
  183. });
  184. return Str(copy);
  185. }
  186. Str Str::escape(bool single_quote) const{
  187. SStream ss;
  188. escape_(ss, single_quote);
  189. return ss.str();
  190. }
  191. void Str::escape_(SStream& ss, bool single_quote) const {
  192. ss << (single_quote ? '\'' : '"');
  193. for (int i=0; i<length(); i++) {
  194. char c = this->operator[](i);
  195. switch (c) {
  196. case '"':
  197. if(!single_quote) ss << '\\';
  198. ss << '"';
  199. break;
  200. case '\'':
  201. if(single_quote) ss << '\\';
  202. ss << '\'';
  203. break;
  204. case '\\': ss << '\\' << '\\'; break;
  205. case '\n': ss << "\\n"; break;
  206. case '\r': ss << "\\r"; break;
  207. case '\t': ss << "\\t"; break;
  208. default:
  209. if ('\x00' <= c && c <= '\x1f') {
  210. ss << "\\x"; // << std::hex << std::setw(2) << std::setfill('0') << (int)c;
  211. ss << "0123456789abcdef"[c >> 4];
  212. ss << "0123456789abcdef"[c & 0xf];
  213. } else {
  214. ss << c;
  215. }
  216. }
  217. }
  218. ss << (single_quote ? '\'' : '"');
  219. }
  220. int Str::index(const Str& sub, int start) const {
  221. auto p = std::search(data + start, data + size, sub.data, sub.data + sub.size);
  222. if(p == data + size) return -1;
  223. return p - data;
  224. }
  225. Str Str::replace(char old, char new_) const{
  226. Str copied = *this;
  227. for(int i=0; i<copied.size; i++){
  228. if(copied.data[i] == old) copied.data[i] = new_;
  229. }
  230. return copied;
  231. }
  232. Str Str::replace(const Str& old, const Str& new_, int count) const {
  233. SStream ss;
  234. int start = 0;
  235. while(true){
  236. int i = index(old, start);
  237. if(i == -1) break;
  238. ss << substr(start, i - start);
  239. ss << new_;
  240. start = i + old.size;
  241. if(count != -1 && --count == 0) break;
  242. }
  243. ss << substr(start, size - start);
  244. return ss.str();
  245. }
  246. int Str::_unicode_index_to_byte(int i) const{
  247. if(is_ascii) return i;
  248. int j = 0;
  249. while(i > 0){
  250. j += utf8len(data[j]);
  251. i--;
  252. }
  253. return j;
  254. }
  255. int Str::_byte_index_to_unicode(int n) const{
  256. if(is_ascii) return n;
  257. int cnt = 0;
  258. for(int i=0; i<n; i++){
  259. if((data[i] & 0xC0) != 0x80) cnt++;
  260. }
  261. return cnt;
  262. }
  263. Str Str::u8_getitem(int i) const{
  264. i = _unicode_index_to_byte(i);
  265. return substr(i, utf8len(data[i]));
  266. }
  267. Str Str::u8_slice(int start, int stop, int step) const{
  268. SStream ss;
  269. if(is_ascii){
  270. for(int i=start; step>0?i<stop:i>stop; i+=step) ss << data[i];
  271. }else{
  272. for(int i=start; step>0?i<stop:i>stop; i+=step) ss << u8_getitem(i);
  273. }
  274. return ss.str();
  275. }
  276. int Str::u8_length() const {
  277. return _byte_index_to_unicode(size);
  278. }
  279. std::vector<std::string_view> Str::split(const Str& sep) const{
  280. std::vector<std::string_view> result;
  281. std::string_view tmp;
  282. int start = 0;
  283. while(true){
  284. int i = index(sep, start);
  285. if(i == -1) break;
  286. tmp = sv().substr(start, i - start);
  287. if(!tmp.empty()) result.push_back(tmp);
  288. start = i + sep.size;
  289. }
  290. tmp = sv().substr(start, size - start);
  291. if(!tmp.empty()) result.push_back(tmp);
  292. return result;
  293. }
  294. std::vector<std::string_view> Str::split(char sep) const{
  295. std::vector<std::string_view> result;
  296. int i = 0;
  297. for(int j = 0; j < size; j++){
  298. if(data[j] == sep){
  299. if(j > i) result.emplace_back(data+i, j-i);
  300. i = j + 1;
  301. continue;
  302. }
  303. }
  304. if(size > i) result.emplace_back(data+i, size-i);
  305. return result;
  306. }
  307. int Str::count(const Str& sub) const{
  308. if(sub.empty()) return size + 1;
  309. int cnt = 0;
  310. int start = 0;
  311. while(true){
  312. int i = index(sub, start);
  313. if(i == -1) break;
  314. cnt++;
  315. start = i + sub.size;
  316. }
  317. return cnt;
  318. }
  319. std::ostream& operator<<(std::ostream& os, const StrName& sn){
  320. return os << sn.sv();
  321. }
  322. std::map<std::string, uint16_t, std::less<>>& StrName::_interned(){
  323. static std::map<std::string, uint16_t, std::less<>> interned;
  324. return interned;
  325. }
  326. std::map<uint16_t, std::string>& StrName::_r_interned(){
  327. static std::map<uint16_t, std::string> r_interned;
  328. return r_interned;
  329. }
  330. uint32_t StrName::_pesudo_random_index = 0;
  331. StrName StrName::get(std::string_view s){
  332. auto it = _interned().find(s);
  333. if(it != _interned().end()) return StrName(it->second);
  334. // generate new index
  335. // https://github.com/python/cpython/blob/3.12/Objects/dictobject.c#L175
  336. uint16_t index = ((_pesudo_random_index*5) + 1) & 65535;
  337. if(index == 0) throw std::runtime_error("StrName index overflow");
  338. _interned()[std::string(s)] = index;
  339. if(is_valid(index)) throw std::runtime_error("StrName index conflict");
  340. _r_interned()[index] = std::string(s);
  341. _pesudo_random_index = index;
  342. return StrName(index);
  343. }
  344. Str StrName::escape() const {
  345. return Str(sv()).escape();
  346. }
  347. bool StrName::is_valid(int index) {
  348. return _r_interned().find(index) != _r_interned().end();
  349. }
  350. StrName::StrName(): index(0) {}
  351. StrName::StrName(uint16_t index): index(index) {}
  352. StrName::StrName(const char* s): index(get(s).index) {}
  353. StrName::StrName(const Str& s){
  354. index = get(s.sv()).index;
  355. }
  356. std::string_view StrName::sv() const {
  357. const std::string& str = _r_interned()[index];
  358. return std::string_view(str);
  359. }
  360. } // namespace pkpy