lexer.h 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560
  1. #pragma once
  2. #include "common.h"
  3. #include "error.h"
  4. #include "str.h"
  5. namespace pkpy{
  6. typedef uint8_t TokenIndex;
  7. constexpr const char* kTokens[] = {
  8. "is not", "not in", "yield from",
  9. "@eof", "@eol", "@sof",
  10. "@id", "@num", "@str", "@fstr", "@long",
  11. "@indent", "@dedent",
  12. /*****************************************/
  13. "+", "+=", "-", "-=", // (INPLACE_OP - 1) can get '=' removed
  14. "*", "*=", "/", "/=", "//", "//=", "%", "%=",
  15. "&", "&=", "|", "|=", "^", "^=",
  16. "<<", "<<=", ">>", ">>=",
  17. /*****************************************/
  18. ".", ",", ":", ";", "#", "(", ")", "[", "]", "{", "}",
  19. "**", "=", ">", "<", "...", "->", "?", "@", "==", "!=", ">=", "<=",
  20. "++", "--",
  21. /** SPEC_BEGIN **/
  22. "$goto", "$label",
  23. /** KW_BEGIN **/
  24. "class", "import", "as", "def", "lambda", "pass", "del", "from", "with", "yield",
  25. "None", "in", "is", "and", "or", "not", "True", "False", "global", "try", "except", "finally",
  26. "while", "for", "if", "elif", "else", "break", "continue", "return", "assert", "raise"
  27. };
  28. using TokenValue = std::variant<std::monostate, i64, f64, Str>;
  29. const TokenIndex kTokenCount = sizeof(kTokens) / sizeof(kTokens[0]);
  30. constexpr TokenIndex TK(const char token[]) {
  31. for(int k=0; k<kTokenCount; k++){
  32. const char* i = kTokens[k];
  33. const char* j = token;
  34. while(*i && *j && *i == *j) { i++; j++;}
  35. if(*i == *j) return k;
  36. }
  37. return 255;
  38. }
  39. #define TK_STR(t) kTokens[t]
  40. const std::map<std::string_view, TokenIndex> kTokenKwMap = [](){
  41. std::map<std::string_view, TokenIndex> map;
  42. for(int k=TK("class"); k<kTokenCount; k++) map[kTokens[k]] = k;
  43. return map;
  44. }();
  45. struct Token{
  46. TokenIndex type;
  47. const char* start;
  48. int length;
  49. int line;
  50. int brackets_level;
  51. TokenValue value;
  52. Str str() const { return Str(start, length);}
  53. std::string_view sv() const { return std::string_view(start, length);}
  54. std::string info() const {
  55. std::stringstream ss;
  56. ss << line << ": " << TK_STR(type) << " '" << (
  57. sv()=="\n" ? "\\n" : sv()
  58. ) << "'";
  59. return ss.str();
  60. }
  61. };
  62. // https://docs.python.org/3/reference/expressions.html#operator-precedence
  63. enum Precedence {
  64. PREC_NONE,
  65. PREC_TUPLE, // ,
  66. PREC_LAMBDA, // lambda
  67. PREC_TERNARY, // ?:
  68. PREC_LOGICAL_OR, // or
  69. PREC_LOGICAL_AND, // and
  70. PREC_LOGICAL_NOT, // not
  71. /* https://docs.python.org/3/reference/expressions.html#comparisons
  72. * Unlike C, all comparison operations in Python have the same priority,
  73. * which is lower than that of any arithmetic, shifting or bitwise operation.
  74. * Also unlike C, expressions like a < b < c have the interpretation that is conventional in mathematics.
  75. */
  76. PREC_COMPARISION, // < > <= >= != ==, in / is / is not / not in
  77. PREC_BITWISE_OR, // |
  78. PREC_BITWISE_XOR, // ^
  79. PREC_BITWISE_AND, // &
  80. PREC_BITWISE_SHIFT, // << >>
  81. PREC_TERM, // + -
  82. PREC_FACTOR, // * / % // @
  83. PREC_UNARY, // - not
  84. PREC_EXPONENT, // **
  85. PREC_CALL, // ()
  86. PREC_SUBSCRIPT, // []
  87. PREC_ATTRIB, // .index
  88. PREC_PRIMARY,
  89. };
  90. enum StringType { NORMAL_STRING, RAW_STRING, F_STRING };
  91. struct Lexer {
  92. shared_ptr<SourceData> src;
  93. const char* token_start;
  94. const char* curr_char;
  95. int current_line = 1;
  96. std::vector<Token> nexts;
  97. stack<int> indents;
  98. int brackets_level = 0;
  99. bool used = false;
  100. char peekchar() const{ return *curr_char; }
  101. bool match_n_chars(int n, char c0){
  102. const char* c = curr_char;
  103. for(int i=0; i<n; i++){
  104. if(*c == '\0') return false;
  105. if(*c != c0) return false;
  106. c++;
  107. }
  108. for(int i=0; i<n; i++) eatchar_include_newline();
  109. return true;
  110. }
  111. bool match_string(const char* s){
  112. int s_len = strlen(s);
  113. bool ok = strncmp(curr_char, s, s_len) == 0;
  114. if(ok) for(int i=0; i<s_len; i++) eatchar_include_newline();
  115. return ok;
  116. }
  117. int eat_spaces(){
  118. int count = 0;
  119. while (true) {
  120. switch (peekchar()) {
  121. case ' ' : count+=1; break;
  122. case '\t': count+=4; break;
  123. default: return count;
  124. }
  125. eatchar();
  126. }
  127. }
  128. bool eat_indentation(){
  129. if(brackets_level > 0) return true;
  130. int spaces = eat_spaces();
  131. if(peekchar() == '#') skip_line_comment();
  132. if(peekchar() == '\0' || peekchar() == '\n') return true;
  133. // https://docs.python.org/3/reference/lexical_analysis.html#indentation
  134. if(spaces > indents.top()){
  135. indents.push(spaces);
  136. nexts.push_back(Token{TK("@indent"), token_start, 0, current_line, brackets_level});
  137. } else if(spaces < indents.top()){
  138. while(spaces < indents.top()){
  139. indents.pop();
  140. nexts.push_back(Token{TK("@dedent"), token_start, 0, current_line, brackets_level});
  141. }
  142. if(spaces != indents.top()){
  143. return false;
  144. }
  145. }
  146. return true;
  147. }
  148. char eatchar() {
  149. char c = peekchar();
  150. if(c == '\n') throw std::runtime_error("eatchar() cannot consume a newline");
  151. curr_char++;
  152. return c;
  153. }
  154. char eatchar_include_newline() {
  155. char c = peekchar();
  156. curr_char++;
  157. if (c == '\n'){
  158. current_line++;
  159. src->line_starts.push_back(curr_char);
  160. }
  161. return c;
  162. }
  163. int eat_name() {
  164. curr_char--;
  165. while(true){
  166. unsigned char c = peekchar();
  167. int u8bytes = utf8len(c, true);
  168. if(u8bytes == 0) return 1;
  169. if(u8bytes == 1){
  170. if(isalpha(c) || c=='_' || isdigit(c)) {
  171. curr_char++;
  172. continue;
  173. }else{
  174. break;
  175. }
  176. }
  177. // handle multibyte char
  178. std::string u8str(curr_char, u8bytes);
  179. if(u8str.size() != u8bytes) return 2;
  180. uint32_t value = 0;
  181. for(int k=0; k < u8bytes; k++){
  182. uint8_t b = u8str[k];
  183. if(k==0){
  184. if(u8bytes == 2) value = (b & 0b00011111) << 6;
  185. else if(u8bytes == 3) value = (b & 0b00001111) << 12;
  186. else if(u8bytes == 4) value = (b & 0b00000111) << 18;
  187. }else{
  188. value |= (b & 0b00111111) << (6*(u8bytes-k-1));
  189. }
  190. }
  191. if(is_unicode_Lo_char(value)) curr_char += u8bytes;
  192. else break;
  193. }
  194. int length = (int)(curr_char - token_start);
  195. if(length == 0) return 3;
  196. std::string_view name(token_start, length);
  197. if(src->mode == JSON_MODE){
  198. if(name == "true"){
  199. add_token(TK("True"));
  200. } else if(name == "false"){
  201. add_token(TK("False"));
  202. } else if(name == "null"){
  203. add_token(TK("None"));
  204. } else {
  205. return 4;
  206. }
  207. return 0;
  208. }
  209. if(kTokenKwMap.count(name)){
  210. add_token(kTokenKwMap.at(name));
  211. } else {
  212. add_token(TK("@id"));
  213. }
  214. return 0;
  215. }
  216. void skip_line_comment() {
  217. char c;
  218. while ((c = peekchar()) != '\0') {
  219. if (c == '\n') return;
  220. eatchar();
  221. }
  222. }
  223. bool matchchar(char c) {
  224. if (peekchar() != c) return false;
  225. eatchar_include_newline();
  226. return true;
  227. }
  228. void add_token(TokenIndex type, TokenValue value={}) {
  229. switch(type){
  230. case TK("{"): case TK("["): case TK("("): brackets_level++; break;
  231. case TK(")"): case TK("]"): case TK("}"): brackets_level--; break;
  232. }
  233. auto token = Token{
  234. type,
  235. token_start,
  236. (int)(curr_char - token_start),
  237. current_line - ((type == TK("@eol")) ? 1 : 0),
  238. brackets_level,
  239. value
  240. };
  241. // handle "not in", "is not", "yield from"
  242. if(!nexts.empty()){
  243. auto& back = nexts.back();
  244. if(back.type == TK("not") && type == TK("in")){
  245. back.type = TK("not in");
  246. return;
  247. }
  248. if(back.type == TK("is") && type == TK("not")){
  249. back.type = TK("is not");
  250. return;
  251. }
  252. if(back.type == TK("yield") && type == TK("from")){
  253. back.type = TK("yield from");
  254. return;
  255. }
  256. nexts.push_back(token);
  257. }
  258. }
  259. void add_token_2(char c, TokenIndex one, TokenIndex two) {
  260. if (matchchar(c)) add_token(two);
  261. else add_token(one);
  262. }
  263. Str eat_string_until(char quote, bool raw) {
  264. bool quote3 = match_n_chars(2, quote);
  265. std::vector<char> buff;
  266. while (true) {
  267. char c = eatchar_include_newline();
  268. if (c == quote){
  269. if(quote3 && !match_n_chars(2, quote)){
  270. buff.push_back(c);
  271. continue;
  272. }
  273. break;
  274. }
  275. if (c == '\0'){
  276. if(quote3 && src->mode == REPL_MODE){
  277. throw NeedMoreLines(false);
  278. }
  279. SyntaxError("EOL while scanning string literal");
  280. }
  281. if (c == '\n'){
  282. if(!quote3) SyntaxError("EOL while scanning string literal");
  283. else{
  284. buff.push_back(c);
  285. continue;
  286. }
  287. }
  288. if (!raw && c == '\\') {
  289. switch (eatchar_include_newline()) {
  290. case '"': buff.push_back('"'); break;
  291. case '\'': buff.push_back('\''); break;
  292. case '\\': buff.push_back('\\'); break;
  293. case 'n': buff.push_back('\n'); break;
  294. case 'r': buff.push_back('\r'); break;
  295. case 't': buff.push_back('\t'); break;
  296. case 'x': {
  297. char hex[3] = {eatchar(), eatchar(), '\0'};
  298. size_t parsed;
  299. char code;
  300. try{
  301. code = (char)Number::stoi(hex, &parsed, 16);
  302. }catch(std::invalid_argument&){
  303. SyntaxError("invalid hex char");
  304. }
  305. if (parsed != 2) SyntaxError("invalid hex char");
  306. buff.push_back(code);
  307. } break;
  308. default: SyntaxError("invalid escape char");
  309. }
  310. } else {
  311. buff.push_back(c);
  312. }
  313. }
  314. return Str(buff.data(), buff.size());
  315. }
  316. void eat_string(char quote, StringType type) {
  317. Str s = eat_string_until(quote, type == RAW_STRING);
  318. if(type == F_STRING){
  319. add_token(TK("@fstr"), s);
  320. }else{
  321. add_token(TK("@str"), s);
  322. }
  323. }
  324. void eat_number() {
  325. static const std::regex pattern("^(0x)?[0-9a-fA-F]+(\\.[0-9]+)?(L)?");
  326. std::smatch m;
  327. const char* i = token_start;
  328. while(*i != '\n' && *i != '\0') i++;
  329. std::string s = std::string(token_start, i);
  330. bool ok = std::regex_search(s, m, pattern);
  331. PK_ASSERT(ok);
  332. // here is m.length()-1, since the first char was eaten by lex_token()
  333. for(int j=0; j<m.length()-1; j++) eatchar();
  334. if(m[3].matched){
  335. add_token(TK("@long"));
  336. return;
  337. }
  338. try{
  339. int base = 10;
  340. size_t size;
  341. if (m[1].matched) base = 16;
  342. if (m[2].matched) {
  343. if(base == 16) SyntaxError("hex literal should not contain a dot");
  344. add_token(TK("@num"), Number::stof(m[0], &size));
  345. } else {
  346. add_token(TK("@num"), Number::stoi(m[0], &size, base));
  347. }
  348. PK_ASSERT(size == m.length());
  349. }catch(std::exception& _){
  350. SyntaxError("invalid number literal");
  351. }
  352. }
  353. bool lex_one_token() {
  354. while (peekchar() != '\0') {
  355. token_start = curr_char;
  356. char c = eatchar_include_newline();
  357. switch (c) {
  358. case '\'': case '"': eat_string(c, NORMAL_STRING); return true;
  359. case '#': skip_line_comment(); break;
  360. case '{': add_token(TK("{")); return true;
  361. case '}': add_token(TK("}")); return true;
  362. case ',': add_token(TK(",")); return true;
  363. case ':': add_token(TK(":")); return true;
  364. case ';': add_token(TK(";")); return true;
  365. case '(': add_token(TK("(")); return true;
  366. case ')': add_token(TK(")")); return true;
  367. case '[': add_token(TK("[")); return true;
  368. case ']': add_token(TK("]")); return true;
  369. case '@': add_token(TK("@")); return true;
  370. case '$': {
  371. for(int i=TK("$goto"); i<=TK("$label"); i++){
  372. // +1 to skip the '$'
  373. if(match_string(TK_STR(i) + 1)){
  374. add_token((TokenIndex)i);
  375. return true;
  376. }
  377. }
  378. SyntaxError("invalid special token");
  379. } return false;
  380. case '%': add_token_2('=', TK("%"), TK("%=")); return true;
  381. case '&': add_token_2('=', TK("&"), TK("&=")); return true;
  382. case '|': add_token_2('=', TK("|"), TK("|=")); return true;
  383. case '^': add_token_2('=', TK("^"), TK("^=")); return true;
  384. case '?': add_token(TK("?")); return true;
  385. case '.': {
  386. if(matchchar('.')) {
  387. if(matchchar('.')) {
  388. add_token(TK("..."));
  389. } else {
  390. SyntaxError("invalid token '..'");
  391. }
  392. } else {
  393. add_token(TK("."));
  394. }
  395. return true;
  396. }
  397. case '=': add_token_2('=', TK("="), TK("==")); return true;
  398. case '+':
  399. if(matchchar('+')){
  400. add_token(TK("++"));
  401. }else{
  402. add_token_2('=', TK("+"), TK("+="));
  403. }
  404. return true;
  405. case '>': {
  406. if(matchchar('=')) add_token(TK(">="));
  407. else if(matchchar('>')) add_token_2('=', TK(">>"), TK(">>="));
  408. else add_token(TK(">"));
  409. return true;
  410. }
  411. case '<': {
  412. if(matchchar('=')) add_token(TK("<="));
  413. else if(matchchar('<')) add_token_2('=', TK("<<"), TK("<<="));
  414. else add_token(TK("<"));
  415. return true;
  416. }
  417. case '-': {
  418. if(matchchar('-')){
  419. add_token(TK("--"));
  420. }else{
  421. if(matchchar('=')) add_token(TK("-="));
  422. else if(matchchar('>')) add_token(TK("->"));
  423. else add_token(TK("-"));
  424. }
  425. return true;
  426. }
  427. case '!':
  428. if(matchchar('=')) add_token(TK("!="));
  429. else SyntaxError("expected '=' after '!'");
  430. break;
  431. case '*':
  432. if (matchchar('*')) {
  433. add_token(TK("**")); // '**'
  434. } else {
  435. add_token_2('=', TK("*"), TK("*="));
  436. }
  437. return true;
  438. case '/':
  439. if(matchchar('/')) {
  440. add_token_2('=', TK("//"), TK("//="));
  441. } else {
  442. add_token_2('=', TK("/"), TK("/="));
  443. }
  444. return true;
  445. case ' ': case '\t': eat_spaces(); break;
  446. case '\n': {
  447. add_token(TK("@eol"));
  448. if(!eat_indentation()) IndentationError("unindent does not match any outer indentation level");
  449. return true;
  450. }
  451. default: {
  452. if(c == 'f'){
  453. if(matchchar('\'')) {eat_string('\'', F_STRING); return true;}
  454. if(matchchar('"')) {eat_string('"', F_STRING); return true;}
  455. }else if(c == 'r'){
  456. if(matchchar('\'')) {eat_string('\'', RAW_STRING); return true;}
  457. if(matchchar('"')) {eat_string('"', RAW_STRING); return true;}
  458. }
  459. if (c >= '0' && c <= '9') {
  460. eat_number();
  461. return true;
  462. }
  463. switch (eat_name())
  464. {
  465. case 0: break;
  466. case 1: SyntaxError("invalid char: " + std::string(1, c)); break;
  467. case 2: SyntaxError("invalid utf8 sequence: " + std::string(1, c)); break;
  468. case 3: SyntaxError("@id contains invalid char"); break;
  469. case 4: SyntaxError("invalid JSON token"); break;
  470. default: FATAL_ERROR();
  471. }
  472. return true;
  473. }
  474. }
  475. }
  476. token_start = curr_char;
  477. while(indents.size() > 1){
  478. indents.pop();
  479. add_token(TK("@dedent"));
  480. return true;
  481. }
  482. add_token(TK("@eof"));
  483. return false;
  484. }
  485. /***** Error Reporter *****/
  486. void throw_err(Str type, Str msg){
  487. int lineno = current_line;
  488. const char* cursor = curr_char;
  489. if(peekchar() == '\n'){
  490. lineno--;
  491. cursor--;
  492. }
  493. throw_err(type, msg, lineno, cursor);
  494. }
  495. void throw_err(Str type, Str msg, int lineno, const char* cursor){
  496. auto e = Exception(type, msg);
  497. e.st_push(src->snapshot(lineno, cursor));
  498. throw e;
  499. }
  500. void SyntaxError(Str msg){ throw_err("SyntaxError", msg); }
  501. void SyntaxError(){ throw_err("SyntaxError", "invalid syntax"); }
  502. void IndentationError(Str msg){ throw_err("IndentationError", msg); }
  503. Lexer(shared_ptr<SourceData> src) {
  504. this->src = src;
  505. this->token_start = src->source.c_str();
  506. this->curr_char = src->source.c_str();
  507. this->nexts.push_back(Token{TK("@sof"), token_start, 0, current_line, brackets_level});
  508. this->indents.push(0);
  509. }
  510. std::vector<Token> run() {
  511. if(used) FATAL_ERROR();
  512. used = true;
  513. while (lex_one_token());
  514. return std::move(nexts);
  515. }
  516. };
  517. } // namespace pkpy