// lexer.cpp (24 KB)
  1. #include "pocketpy/compiler/lexer.hpp"
  2. #include "pocketpy/common/config.h"
  3. #include "pocketpy/common/str.h"
  4. #include "pocketpy/common/smallmap.h"
  5. #include "pocketpy/compiler/lexer.h"
  6. #include <cstdarg>
  7. namespace pkpy {
  8. static bool is_possible_number_char(char c) noexcept{
  9. switch(c) {
  10. // clang-format off
  11. case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
  12. case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
  13. case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
  14. case '.': case 'L': case 'x': case 'o': case 'j':
  15. return true;
  16. default: return false;
  17. // clang-format on
  18. }
  19. }
  20. bool Lexer::match_n_chars(int n, char c0) noexcept{
  21. const char* c = curr_char;
  22. for(int i = 0; i < n; i++) {
  23. if(*c == '\0') return false;
  24. if(*c != c0) return false;
  25. c++;
  26. }
  27. for(int i = 0; i < n; i++)
  28. eatchar_include_newline();
  29. return true;
  30. }
  31. bool Lexer::match_string(const char* s) noexcept{
  32. int s_len = strlen(s);
  33. bool ok = strncmp(curr_char, s, s_len) == 0;
  34. if(ok)
  35. for(int i = 0; i < s_len; i++)
  36. eatchar_include_newline();
  37. return ok;
  38. }
  39. int Lexer::eat_spaces() noexcept{
  40. int count = 0;
  41. while(true) {
  42. switch(peekchar()) {
  43. case ' ': count += 1; break;
  44. case '\t': count += 4; break;
  45. default: return count;
  46. }
  47. eatchar();
  48. }
  49. }
  50. bool Lexer::eat_indentation() noexcept{
  51. if(brackets_level > 0) return true;
  52. int spaces = eat_spaces();
  53. if(peekchar() == '#') skip_line_comment();
  54. if(peekchar() == '\0' || peekchar() == '\n') return true;
  55. // https://docs.python.org/3/reference/lexical_analysis.html#indentation
  56. if(spaces > indents.back()) {
  57. indents.push_back(spaces);
  58. nexts.push_back(Token{TK("@indent"), token_start, 0, current_line, brackets_level, {}});
  59. } else if(spaces < indents.back()) {
  60. while(spaces < indents.back()) {
  61. indents.pop_back();
  62. nexts.push_back(Token{TK("@dedent"), token_start, 0, current_line, brackets_level, {}});
  63. }
  64. if(spaces != indents.back()) { return false; }
  65. }
  66. return true;
  67. }
  68. char Lexer::eatchar() noexcept{
  69. char c = peekchar();
  70. assert(c != '\n'); // eatchar() cannot consume a newline
  71. curr_char++;
  72. return c;
  73. }
  74. char Lexer::eatchar_include_newline() noexcept{
  75. char c = peekchar();
  76. curr_char++;
  77. if(c == '\n') {
  78. current_line++;
  79. c11_vector__push(const char*, &src->line_starts, curr_char);
  80. }
  81. return c;
  82. }
  83. Error* Lexer::eat_name() noexcept{
  84. curr_char--;
  85. while(true) {
  86. unsigned char c = peekchar();
  87. int u8bytes = c11__u8_header(c, true);
  88. if(u8bytes == 0) return SyntaxError("invalid char: %c", c);
  89. if(u8bytes == 1) {
  90. if(isalpha(c) || c == '_' || isdigit(c)) {
  91. curr_char++;
  92. continue;
  93. } else {
  94. break;
  95. }
  96. }
  97. // handle multibyte char
  98. Str u8str(curr_char, u8bytes);
  99. if(u8str.size != u8bytes) return SyntaxError("invalid utf8 sequence: %s", u8str.c_str());
  100. uint32_t value = 0;
  101. for(int k = 0; k < u8bytes; k++) {
  102. uint8_t b = u8str[k];
  103. if(k == 0) {
  104. if(u8bytes == 2)
  105. value = (b & 0b00011111) << 6;
  106. else if(u8bytes == 3)
  107. value = (b & 0b00001111) << 12;
  108. else if(u8bytes == 4)
  109. value = (b & 0b00000111) << 18;
  110. } else {
  111. value |= (b & 0b00111111) << (6 * (u8bytes - k - 1));
  112. }
  113. }
  114. if(c11__is_unicode_Lo_char(value))
  115. curr_char += u8bytes;
  116. else
  117. break;
  118. }
  119. int length = (int)(curr_char - token_start);
  120. if(length == 0) return SyntaxError("@id contains invalid char");
  121. std::string_view name(token_start, length);
  122. if(src->mode == JSON_MODE) {
  123. if(name == "true") {
  124. add_token(TK("True"));
  125. } else if(name == "false") {
  126. add_token(TK("False"));
  127. } else if(name == "null") {
  128. add_token(TK("None"));
  129. } else {
  130. return SyntaxError("invalid JSON token");
  131. }
  132. return NULL;
  133. }
  134. const auto KW_BEGIN = kTokens + TK("False");
  135. const auto KW_END = kTokens + kTokenCount;
  136. auto it = lower_bound(KW_BEGIN, KW_END, name);
  137. if(it != KW_END && *it == name) {
  138. add_token(it - kTokens);
  139. } else {
  140. add_token(TK("@id"));
  141. }
  142. return NULL;
  143. }
  144. void Lexer::skip_line_comment() noexcept{
  145. char c;
  146. while((c = peekchar()) != '\0') {
  147. if(c == '\n') return;
  148. eatchar();
  149. }
  150. }
  151. bool Lexer::matchchar(char c) noexcept{
  152. if(peekchar() != c) return false;
  153. eatchar_include_newline();
  154. return true;
  155. }
  156. void Lexer::add_token(TokenIndex type, TokenValue value) noexcept{
  157. switch(type) {
  158. case TK("{"):
  159. case TK("["):
  160. case TK("("): brackets_level++; break;
  161. case TK(")"):
  162. case TK("]"):
  163. case TK("}"): brackets_level--; break;
  164. }
  165. auto token = Token{type,
  166. token_start,
  167. (int)(curr_char - token_start),
  168. current_line - ((type == TK("@eol")) ? 1 : 0),
  169. brackets_level,
  170. value};
  171. // handle "not in", "is not", "yield from"
  172. if(!nexts.empty()) {
  173. auto& back = nexts.back();
  174. if(back.type == TK("not") && type == TK("in")) {
  175. back.type = TK("not in");
  176. return;
  177. }
  178. if(back.type == TK("is") && type == TK("not")) {
  179. back.type = TK("is not");
  180. return;
  181. }
  182. if(back.type == TK("yield") && type == TK("from")) {
  183. back.type = TK("yield from");
  184. return;
  185. }
  186. nexts.push_back(token);
  187. }
  188. }
  189. void Lexer::add_token_2(char c, TokenIndex one, TokenIndex two) noexcept{
  190. if(matchchar(c))
  191. add_token(two);
  192. else
  193. add_token(one);
  194. }
  195. Error* Lexer::eat_string_until(char quote, bool raw, Str* out) noexcept{
  196. bool quote3 = match_n_chars(2, quote);
  197. small_vector_2<char, 32> buff;
  198. while(true) {
  199. char c = eatchar_include_newline();
  200. if(c == quote) {
  201. if(quote3 && !match_n_chars(2, quote)) {
  202. buff.push_back(c);
  203. continue;
  204. }
  205. break;
  206. }
  207. if(c == '\0') {
  208. if(quote3 && src->mode == REPL_MODE) return NeedMoreLines();
  209. return SyntaxError("EOL while scanning string literal");
  210. }
  211. if(c == '\n') {
  212. if(!quote3)
  213. return SyntaxError("EOL while scanning string literal");
  214. else {
  215. buff.push_back(c);
  216. continue;
  217. }
  218. }
  219. if(!raw && c == '\\') {
  220. switch(eatchar_include_newline()) {
  221. case '"': buff.push_back('"'); break;
  222. case '\'': buff.push_back('\''); break;
  223. case '\\': buff.push_back('\\'); break;
  224. case 'n': buff.push_back('\n'); break;
  225. case 'r': buff.push_back('\r'); break;
  226. case 't': buff.push_back('\t'); break;
  227. case 'b': buff.push_back('\b'); break;
  228. case 'x': {
  229. char hex[3] = {eatchar(), eatchar(), '\0'};
  230. size_t parsed;
  231. char code;
  232. try {
  233. code = (char)std::stoi(hex, &parsed, 16);
  234. } catch(...) {
  235. return SyntaxError("invalid hex char");
  236. }
  237. if(parsed != 2) return SyntaxError("invalid hex char");
  238. buff.push_back(code);
  239. } break;
  240. default: return SyntaxError("invalid escape char");
  241. }
  242. } else {
  243. buff.push_back(c);
  244. }
  245. }
  246. *out = Str(buff.data(), buff.size());
  247. return nullptr;
  248. }
  249. Error* Lexer::eat_string(char quote, StringType type) noexcept{
  250. Str s;
  251. Error* err = eat_string_until(quote, type == StringType::RAW_STRING, &s);
  252. if(err) return err;
  253. if(type == StringType::F_STRING) {
  254. add_token(TK("@fstr"), s);
  255. }else if(type == StringType::NORMAL_BYTES) {
  256. add_token(TK("@bytes"), s);
  257. }else{
  258. add_token(TK("@str"), s);
  259. }
  260. return NULL;
  261. }
  262. Error* Lexer::eat_number() noexcept{
  263. const char* i = token_start;
  264. while(is_possible_number_char(*i))
  265. i++;
  266. bool is_scientific_notation = false;
  267. if(*(i - 1) == 'e' && (*i == '+' || *i == '-')) {
  268. i++;
  269. while(isdigit(*i) || *i == 'j')
  270. i++;
  271. is_scientific_notation = true;
  272. }
  273. std::string_view text(token_start, i - token_start);
  274. this->curr_char = i;
  275. if(text[0] != '.' && !is_scientific_notation) {
  276. // try long
  277. if(i[-1] == 'L') {
  278. add_token(TK("@long"));
  279. return NULL;
  280. }
  281. // try integer
  282. i64 int_out;
  283. switch(parse_uint(text, &int_out, -1)) {
  284. case IntParsingResult::Success: add_token(TK("@num"), int_out); return NULL;
  285. case IntParsingResult::Overflow: return SyntaxError("int literal is too large");
  286. case IntParsingResult::Failure: break; // do nothing
  287. }
  288. }
  289. // try float
  290. double float_out;
  291. char* p_end;
  292. try {
  293. float_out = std::strtod(text.data(), &p_end);
  294. } catch(...) {
  295. return SyntaxError("invalid number literal");
  296. }
  297. if(p_end == text.data() + text.size()) {
  298. add_token(TK("@num"), (f64)float_out);
  299. return NULL;
  300. }
  301. if(i[-1] == 'j' && p_end == text.data() + text.size() - 1) {
  302. add_token(TK("@imag"), (f64)float_out);
  303. return NULL;
  304. }
  305. return SyntaxError("invalid number literal");
  306. }
  307. Error* Lexer::lex_one_token(bool* eof) noexcept{
  308. *eof = false;
  309. while(peekchar() != '\0') {
  310. token_start = curr_char;
  311. char c = eatchar_include_newline();
  312. switch(c) {
  313. case '\'':
  314. case '"': {
  315. Error* err = eat_string(c, StringType::NORMAL_STRING);
  316. if(err) return err;
  317. return NULL;
  318. }
  319. case '#': skip_line_comment(); break;
  320. case '~': add_token(TK("~")); return NULL;
  321. case '{': add_token(TK("{")); return NULL;
  322. case '}': add_token(TK("}")); return NULL;
  323. case ',': add_token(TK(",")); return NULL;
  324. case ':': add_token(TK(":")); return NULL;
  325. case ';': add_token(TK(";")); return NULL;
  326. case '(': add_token(TK("(")); return NULL;
  327. case ')': add_token(TK(")")); return NULL;
  328. case '[': add_token(TK("[")); return NULL;
  329. case ']': add_token(TK("]")); return NULL;
  330. case '@': add_token(TK("@")); return NULL;
  331. case '\\': {
  332. // line continuation character
  333. char c = eatchar_include_newline();
  334. if(c != '\n') {
  335. if(src->mode == REPL_MODE && c == '\0') return NeedMoreLines();
  336. return SyntaxError("expected newline after line continuation character");
  337. }
  338. eat_spaces();
  339. return NULL;
  340. }
  341. case '%': add_token_2('=', TK("%"), TK("%=")); return NULL;
  342. case '&': add_token_2('=', TK("&"), TK("&=")); return NULL;
  343. case '|': add_token_2('=', TK("|"), TK("|=")); return NULL;
  344. case '^': add_token_2('=', TK("^"), TK("^=")); return NULL;
  345. case '.': {
  346. if(matchchar('.')) {
  347. if(matchchar('.')) {
  348. add_token(TK("..."));
  349. } else {
  350. add_token(TK(".."));
  351. }
  352. } else {
  353. char next_char = peekchar();
  354. if(next_char >= '0' && next_char <= '9') {
  355. Error* err = eat_number();
  356. if(err) return err;
  357. } else {
  358. add_token(TK("."));
  359. }
  360. }
  361. return NULL;
  362. }
  363. case '=': add_token_2('=', TK("="), TK("==")); return NULL;
  364. case '+': add_token_2('=', TK("+"), TK("+=")); return NULL;
  365. case '>': {
  366. if(matchchar('='))
  367. add_token(TK(">="));
  368. else if(matchchar('>'))
  369. add_token_2('=', TK(">>"), TK(">>="));
  370. else
  371. add_token(TK(">"));
  372. return NULL;
  373. }
  374. case '<': {
  375. if(matchchar('='))
  376. add_token(TK("<="));
  377. else if(matchchar('<'))
  378. add_token_2('=', TK("<<"), TK("<<="));
  379. else
  380. add_token(TK("<"));
  381. return NULL;
  382. }
  383. case '-': {
  384. if(matchchar('='))
  385. add_token(TK("-="));
  386. else if(matchchar('>'))
  387. add_token(TK("->"));
  388. else
  389. add_token(TK("-"));
  390. return NULL;
  391. }
  392. case '!':
  393. if(matchchar('=')){
  394. add_token(TK("!="));
  395. }else{
  396. Error* err = SyntaxError("expected '=' after '!'");
  397. if(err) return err;
  398. }
  399. break;
  400. case '*':
  401. if(matchchar('*')) {
  402. add_token(TK("**")); // '**'
  403. } else {
  404. add_token_2('=', TK("*"), TK("*="));
  405. }
  406. return NULL;
  407. case '/':
  408. if(matchchar('/')) {
  409. add_token_2('=', TK("//"), TK("//="));
  410. } else {
  411. add_token_2('=', TK("/"), TK("/="));
  412. }
  413. return NULL;
  414. case ' ':
  415. case '\t': eat_spaces(); break;
  416. case '\n': {
  417. add_token(TK("@eol"));
  418. if(!eat_indentation()){
  419. return IndentationError("unindent does not match any outer indentation level");
  420. }
  421. return NULL;
  422. }
  423. default: {
  424. if(c == 'f') {
  425. if(matchchar('\'')) return eat_string('\'', StringType::F_STRING);
  426. if(matchchar('"')) return eat_string('"', StringType::F_STRING);
  427. } else if(c == 'r') {
  428. if(matchchar('\'')) return eat_string('\'', StringType::RAW_STRING);
  429. if(matchchar('"')) return eat_string('"', StringType::RAW_STRING);
  430. } else if(c == 'b') {
  431. if(matchchar('\'')) return eat_string('\'', StringType::NORMAL_BYTES);
  432. if(matchchar('"')) return eat_string('"', StringType::NORMAL_BYTES);
  433. }
  434. if(c >= '0' && c <= '9') return eat_number();
  435. return eat_name();
  436. }
  437. }
  438. }
  439. token_start = curr_char;
  440. while(indents.size() > 1) {
  441. indents.pop_back();
  442. add_token(TK("@dedent"));
  443. return NULL;
  444. }
  445. add_token(TK("@eof"));
  446. *eof = true;
  447. return NULL;
  448. }
  449. Error* Lexer::_error(bool lexer_err, const char* type, const char* msg, va_list* args, i64 userdata) noexcept{
  450. Error* err = (Error*)malloc(sizeof(Error));
  451. err->type = type;
  452. err->src = src;
  453. PK_INCREF(src);
  454. if(lexer_err){
  455. err->lineno = current_line;
  456. err->cursor = curr_char;
  457. if(*curr_char == '\n') {
  458. err->lineno--;
  459. err->cursor--;
  460. }
  461. }else{
  462. err->lineno = -1;
  463. err->cursor = NULL;
  464. }
  465. if(args){
  466. vsnprintf(err->msg, sizeof(err->msg), msg, *args);
  467. }else{
  468. std::strncpy(err->msg, msg, sizeof(err->msg));
  469. }
  470. err->userdata = userdata;
  471. return err;
  472. }
  473. Error* Lexer::SyntaxError(const char* fmt, ...) noexcept{
  474. va_list args;
  475. va_start(args, fmt);
  476. Error* err = _error(true, "SyntaxError", fmt, &args);
  477. va_end(args);
  478. return err;
  479. }
  480. Error* Lexer::run() noexcept{
  481. assert(!this->used);
  482. this->used = true;
  483. if(src->is_precompiled) {
  484. return from_precompiled();
  485. }
  486. // push initial tokens
  487. this->nexts.push_back(Token{TK("@sof"), token_start, 0, current_line, brackets_level, {}});
  488. this->indents.push_back(0);
  489. bool eof = false;
  490. while(!eof) {
  491. Error* err = lex_one_token(&eof);
  492. if(err) return err;
  493. }
  494. return NULL;
  495. }
  496. Error* Lexer::from_precompiled() noexcept{
  497. pkpy_TokenDeserializer deserializer;
  498. pkpy_TokenDeserializer__ctor(&deserializer, pkpy_Str__data(&src->source));
  499. deserializer.curr += 5; // skip "pkpy:"
  500. c11_string version = pkpy_TokenDeserializer__read_string(&deserializer, '\n');
  501. if(c11_string__cmp3(version, PK_VERSION) != 0) {
  502. return SyntaxError("precompiled version mismatch");
  503. }
  504. if(pkpy_TokenDeserializer__read_uint(&deserializer, '\n') != (i64)src->mode){
  505. return SyntaxError("precompiled mode mismatch");
  506. }
  507. int count = pkpy_TokenDeserializer__read_count(&deserializer);
  508. c11_vector* precompiled_tokens = &src->_precompiled_tokens;
  509. for(int i = 0; i < count; i++) {
  510. c11_string item = pkpy_TokenDeserializer__read_string(&deserializer, '\n');
  511. pkpy_Str copied_item;
  512. pkpy_Str__ctor2(&copied_item, item.data, item.size);
  513. c11_vector__push(pkpy_Str, precompiled_tokens, copied_item);
  514. }
  515. count = pkpy_TokenDeserializer__read_count(&deserializer);
  516. for(int i = 0; i < count; i++) {
  517. Token t;
  518. t.type = (unsigned char)pkpy_TokenDeserializer__read_uint(&deserializer, ',');
  519. if(is_raw_string_used(t.type)) {
  520. i64 index = pkpy_TokenDeserializer__read_uint(&deserializer, ',');
  521. pkpy_Str* p = c11__at(pkpy_Str, precompiled_tokens, index);
  522. t.start = pkpy_Str__data(p);
  523. t.length = c11__getitem(pkpy_Str, precompiled_tokens, index).size;
  524. } else {
  525. t.start = NULL;
  526. t.length = 0;
  527. }
  528. if(pkpy_TokenDeserializer__match_char(&deserializer, ',')) {
  529. t.line = nexts.back().line;
  530. } else {
  531. t.line = (int)pkpy_TokenDeserializer__read_uint(&deserializer, ',');
  532. }
  533. if(pkpy_TokenDeserializer__match_char(&deserializer, ',')) {
  534. t.brackets_level = nexts.back().brackets_level;
  535. } else {
  536. t.brackets_level = (int)pkpy_TokenDeserializer__read_uint(&deserializer, ',');
  537. }
  538. char type = (*deserializer.curr++); // read_char
  539. switch(type) {
  540. case 'I':
  541. t.value = pkpy_TokenDeserializer__read_uint(&deserializer, '\n');
  542. break;
  543. case 'F':
  544. t.value = pkpy_TokenDeserializer__read_float(&deserializer, '\n');
  545. break;
  546. case 'S': {
  547. pkpy_Str res = pkpy_TokenDeserializer__read_string_from_hex(&deserializer, '\n');
  548. t.value = Str(std::move(res));
  549. } break;
  550. default:
  551. t.value = {};
  552. break;
  553. }
  554. nexts.push_back(t);
  555. }
  556. return NULL;
  557. }
  558. Error* Lexer::precompile(Str* out) noexcept{
  559. assert(!src->is_precompiled);
  560. Error* err = run();
  561. if(err) return err;
  562. SStream ss;
  563. ss << "pkpy:" PK_VERSION << '\n'; // L1: version string
  564. ss << (int)src->mode << '\n'; // L2: mode
  565. c11_smallmap_s2n token_indices;
  566. c11_smallmap_s2n__ctor(&token_indices);
  567. for(auto token: nexts) {
  568. if(is_raw_string_used(token.type)) {
  569. c11_string token_sv = {token.start, token.length};
  570. if(!c11_smallmap_s2n__contains(&token_indices, token_sv)) {
  571. c11_smallmap_s2n__set(&token_indices, token_sv, 0);
  572. // assert no '\n' in token.sv()
  573. for(char c: token.sv())
  574. assert(c != '\n');
  575. }
  576. }
  577. }
  578. ss << "=" << (int)token_indices.count << '\n'; // L3: raw string count
  579. uint16_t index = 0;
  580. for(int i=0; i<token_indices.count; i++){
  581. auto kv = c11__at(c11_smallmap_s2n_KV, &token_indices, i);
  582. ss << kv->key << '\n'; // L4: raw strings
  583. kv->value = index++;
  584. }
  585. ss << "=" << (int)nexts.size() << '\n'; // L5: token count
  586. for(int i = 0; i < nexts.size(); i++) {
  587. const Token& token = nexts[i];
  588. ss << (int)token.type << ',';
  589. if(is_raw_string_used(token.type)) {
  590. uint16_t *p = c11_smallmap_s2n__try_get(&token_indices, {token.start, token.length});
  591. assert(p != NULL);
  592. ss << (int)*p << ',';
  593. }
  594. if(i > 0 && nexts[i - 1].line == token.line)
  595. ss << ',';
  596. else
  597. ss << token.line << ',';
  598. if(i > 0 && nexts[i - 1].brackets_level == token.brackets_level)
  599. ss << ',';
  600. else
  601. ss << token.brackets_level << ',';
  602. // visit token value
  603. std::visit(
  604. [&ss](auto&& arg) {
  605. using T = std::decay_t<decltype(arg)>;
  606. if constexpr(std::is_same_v<T, i64>) {
  607. ss << 'I' << arg;
  608. } else if constexpr(std::is_same_v<T, f64>) {
  609. ss << 'F' << arg;
  610. } else if constexpr(std::is_same_v<T, Str>) {
  611. ss << 'S';
  612. for(char c: arg)
  613. ss.write_hex((unsigned char)c);
  614. }
  615. ss << '\n';
  616. },
  617. token.value);
  618. }
  619. *out = ss.str();
  620. c11_smallmap_s2n__dtor(&token_indices);
  621. return NULL;
  622. }
  623. IntParsingResult parse_uint(std::string_view text, i64* out, int base) noexcept{
  624. *out = 0;
  625. if(base == -1) {
  626. if(text.substr(0, 2) == "0b")
  627. base = 2;
  628. else if(text.substr(0, 2) == "0o")
  629. base = 8;
  630. else if(text.substr(0, 2) == "0x")
  631. base = 16;
  632. else
  633. base = 10;
  634. }
  635. if(base == 10) {
  636. // 10-base 12334
  637. if(text.length() == 0) return IntParsingResult::Failure;
  638. for(char c: text) {
  639. if(c >= '0' && c <= '9') {
  640. *out = (*out * 10) + (c - '0');
  641. } else {
  642. return IntParsingResult::Failure;
  643. }
  644. }
  645. const std::string_view INT64_MAX_S = "9223372036854775807";
  646. if(text.length() > INT64_MAX_S.length()) return IntParsingResult::Overflow;
  647. return IntParsingResult::Success;
  648. } else if(base == 2) {
  649. // 2-base 0b101010
  650. if(text.substr(0, 2) == "0b") text.remove_prefix(2);
  651. if(text.length() == 0) return IntParsingResult::Failure;
  652. for(char c: text) {
  653. if(c == '0' || c == '1') {
  654. *out = (*out << 1) | (c - '0');
  655. } else {
  656. return IntParsingResult::Failure;
  657. }
  658. }
  659. const std::string_view INT64_MAX_S = "111111111111111111111111111111111111111111111111111111111111111";
  660. if(text.length() > INT64_MAX_S.length()) return IntParsingResult::Overflow;
  661. return IntParsingResult::Success;
  662. } else if(base == 8) {
  663. // 8-base 0o123
  664. if(text.substr(0, 2) == "0o") text.remove_prefix(2);
  665. if(text.length() == 0) return IntParsingResult::Failure;
  666. for(char c: text) {
  667. if(c >= '0' && c <= '7') {
  668. *out = (*out << 3) | (c - '0');
  669. } else {
  670. return IntParsingResult::Failure;
  671. }
  672. }
  673. const std::string_view INT64_MAX_S = "777777777777777777777";
  674. if(text.length() > INT64_MAX_S.length()) return IntParsingResult::Overflow;
  675. return IntParsingResult::Success;
  676. } else if(base == 16) {
  677. // 16-base 0x123
  678. if(text.substr(0, 2) == "0x") text.remove_prefix(2);
  679. if(text.length() == 0) return IntParsingResult::Failure;
  680. for(char c: text) {
  681. if(c >= '0' && c <= '9') {
  682. *out = (*out << 4) | (c - '0');
  683. } else if(c >= 'a' && c <= 'f') {
  684. *out = (*out << 4) | (c - 'a' + 10);
  685. } else if(c >= 'A' && c <= 'F') {
  686. *out = (*out << 4) | (c - 'A' + 10);
  687. } else {
  688. return IntParsingResult::Failure;
  689. }
  690. }
  691. const std::string_view INT64_MAX_S = "7fffffffffffffff";
  692. if(text.length() > INT64_MAX_S.length()) return IntParsingResult::Overflow;
  693. return IntParsingResult::Success;
  694. }
  695. return IntParsingResult::Failure;
  696. }
  697. } // namespace pkpy