// lexer.h
  1. #pragma once
  2. #include "common.h"
  3. #include "error.h"
  4. #include "str.h"
  5. namespace pkpy{
  6. typedef uint8_t TokenIndex;
  7. constexpr const char* kTokens[] = {
  8. "is not", "not in", "yield from",
  9. "@eof", "@eol", "@sof",
  10. "@id", "@num", "@str", "@fstr",
  11. "@indent", "@dedent",
  12. /*****************************************/
  13. "+", "+=", "-", "-=", // (INPLACE_OP - 1) can get '=' removed
  14. "*", "*=", "/", "/=", "//", "//=", "%", "%=",
  15. "&", "&=", "|", "|=", "^", "^=",
  16. "<<", "<<=", ">>", ">>=",
  17. /*****************************************/
  18. ".", ",", ":", ";", "#", "(", ")", "[", "]", "{", "}",
  19. "**", "=", ">", "<", "...", "->", "?", "@", "==", "!=", ">=", "<=",
  20. /** SPEC_BEGIN **/
  21. "$goto", "$label",
  22. /** KW_BEGIN **/
  23. "class", "import", "as", "def", "lambda", "pass", "del", "from", "with", "yield",
  24. "None", "in", "is", "and", "or", "not", "True", "False", "global", "try", "except", "finally",
  25. "while", "for", "if", "elif", "else", "break", "continue", "return", "assert", "raise"
  26. };
  27. using TokenValue = std::variant<std::monostate, i64, f64, Str>;
  28. const TokenIndex kTokenCount = sizeof(kTokens) / sizeof(kTokens[0]);
  29. constexpr TokenIndex TK(const char token[]) {
  30. for(int k=0; k<kTokenCount; k++){
  31. const char* i = kTokens[k];
  32. const char* j = token;
  33. while(*i && *j && *i == *j) { i++; j++;}
  34. if(*i == *j) return k;
  35. }
  36. return 255;
  37. }
  38. #define TK_STR(t) kTokens[t]
  39. const std::map<std::string_view, TokenIndex> kTokenKwMap = [](){
  40. std::map<std::string_view, TokenIndex> map;
  41. for(int k=TK("class"); k<kTokenCount; k++) map[kTokens[k]] = k;
  42. return map;
  43. }();
// A single lexed token. `start` points into the SourceData buffer and is NOT
// owned by the token; it stays valid only while the source buffer lives.
struct Token{
    TokenIndex type;    // index into kTokens
    const char* start;  // first byte of the lexeme within the source buffer
    int length;         // lexeme length in bytes
    int line;           // 1-based line number where the token begins
    TokenValue value;   // literal payload (i64/f64/Str) when applicable

    // Owned copy of the lexeme.
    Str str() const { return Str(start, length);}
    // Zero-copy view of the lexeme (valid while the source buffer lives).
    std::string_view sv() const { return std::string_view(start, length);}
    // Debug string: "<line>: <token-name> '<lexeme>'" (newline shown as \n).
    std::string info() const {
        std::stringstream ss;
        ss << line << ": " << TK_STR(type) << " '" << (
            sv()=="\n" ? "\\n" : sv()
        ) << "'";
        return ss.str();
    }
};
// https://docs.python.org/3/reference/expressions.html#operator-precedence
// Binding strength for the Pratt parser: LOWEST first, HIGHEST last.
// The enumerator ORDER is the contract — do not reorder or insert in the middle.
enum Precedence {
    PREC_NONE,
    PREC_TUPLE,         // ,
    PREC_LAMBDA,        // lambda
    PREC_TERNARY,       // ?:
    PREC_LOGICAL_OR,    // or
    PREC_LOGICAL_AND,   // and
    PREC_LOGICAL_NOT,   // not
    PREC_EQUALITY,      // == !=
    PREC_TEST,          // in / is / is not / not in
    PREC_COMPARISION,   // < > <= >=   (sic: identifier spelling is part of the API)
    PREC_BITWISE_OR,    // |
    PREC_BITWISE_XOR,   // ^
    PREC_BITWISE_AND,   // &
    PREC_BITWISE_SHIFT, // << >>
    PREC_TERM,          // + -
    PREC_FACTOR,        // * / % // @
    PREC_UNARY,         // - not
    PREC_EXPONENT,      // **
    PREC_CALL,          // ()
    PREC_SUBSCRIPT,     // []
    PREC_ATTRIB,        // .index
    PREC_PRIMARY,
};
// How a quoted literal is scanned: NORMAL (escapes processed), RAW (r"...", no
// escapes), or F_STRING (f"...", emitted as @fstr for later interpolation).
enum StringType { NORMAL_STRING, RAW_STRING, F_STRING };
  86. struct Lexer {
  87. shared_ptr<SourceData> src;
  88. const char* token_start;
  89. const char* curr_char;
  90. int current_line = 1;
  91. std::vector<Token> nexts;
  92. stack<int> indents;
  93. int brackets_level = 0;
  94. bool used = false;
  95. char peekchar() const{ return *curr_char; }
  96. bool match_n_chars(int n, char c0){
  97. const char* c = curr_char;
  98. for(int i=0; i<n; i++){
  99. if(*c == '\0') return false;
  100. if(*c != c0) return false;
  101. c++;
  102. }
  103. for(int i=0; i<n; i++) eatchar_include_newline();
  104. return true;
  105. }
  106. bool match_string(const char* s){
  107. int s_len = strlen(s);
  108. bool ok = strncmp(curr_char, s, s_len) == 0;
  109. if(ok) for(int i=0; i<s_len; i++) eatchar_include_newline();
  110. return ok;
  111. }
  112. int eat_spaces(){
  113. int count = 0;
  114. while (true) {
  115. switch (peekchar()) {
  116. case ' ' : count+=1; break;
  117. case '\t': count+=4; break;
  118. default: return count;
  119. }
  120. eatchar();
  121. }
  122. }
  123. bool eat_indentation(){
  124. if(brackets_level > 0) return true;
  125. int spaces = eat_spaces();
  126. if(peekchar() == '#') skip_line_comment();
  127. if(peekchar() == '\0' || peekchar() == '\n') return true;
  128. // https://docs.python.org/3/reference/lexical_analysis.html#indentation
  129. if(spaces > indents.top()){
  130. indents.push(spaces);
  131. nexts.push_back(Token{TK("@indent"), token_start, 0, current_line});
  132. } else if(spaces < indents.top()){
  133. while(spaces < indents.top()){
  134. indents.pop();
  135. nexts.push_back(Token{TK("@dedent"), token_start, 0, current_line});
  136. }
  137. if(spaces != indents.top()){
  138. return false;
  139. }
  140. }
  141. return true;
  142. }
  143. char eatchar() {
  144. char c = peekchar();
  145. if(c == '\n') throw std::runtime_error("eatchar() cannot consume a newline");
  146. curr_char++;
  147. return c;
  148. }
  149. char eatchar_include_newline() {
  150. char c = peekchar();
  151. curr_char++;
  152. if (c == '\n'){
  153. current_line++;
  154. src->line_starts.push_back(curr_char);
  155. }
  156. return c;
  157. }
  158. int eat_name() {
  159. curr_char--;
  160. while(true){
  161. unsigned char c = peekchar();
  162. int u8bytes = utf8len(c, true);
  163. if(u8bytes == 0) return 1;
  164. if(u8bytes == 1){
  165. if(isalpha(c) || c=='_' || isdigit(c)) {
  166. curr_char++;
  167. continue;
  168. }else{
  169. break;
  170. }
  171. }
  172. // handle multibyte char
  173. std::string u8str(curr_char, u8bytes);
  174. if(u8str.size() != u8bytes) return 2;
  175. uint32_t value = 0;
  176. for(int k=0; k < u8bytes; k++){
  177. uint8_t b = u8str[k];
  178. if(k==0){
  179. if(u8bytes == 2) value = (b & 0b00011111) << 6;
  180. else if(u8bytes == 3) value = (b & 0b00001111) << 12;
  181. else if(u8bytes == 4) value = (b & 0b00000111) << 18;
  182. }else{
  183. value |= (b & 0b00111111) << (6*(u8bytes-k-1));
  184. }
  185. }
  186. if(is_unicode_Lo_char(value)) curr_char += u8bytes;
  187. else break;
  188. }
  189. int length = (int)(curr_char - token_start);
  190. if(length == 0) return 3;
  191. std::string_view name(token_start, length);
  192. if(src->mode == JSON_MODE){
  193. if(name == "true"){
  194. add_token(TK("True"));
  195. } else if(name == "false"){
  196. add_token(TK("False"));
  197. } else if(name == "null"){
  198. add_token(TK("None"));
  199. } else {
  200. return 4;
  201. }
  202. return 0;
  203. }
  204. if(kTokenKwMap.count(name)){
  205. if(name == "not"){
  206. if(strncmp(curr_char, " in", 3) == 0){
  207. curr_char += 3;
  208. add_token(TK("not in"));
  209. return 0;
  210. }
  211. }else if(name == "is"){
  212. if(strncmp(curr_char, " not", 4) == 0){
  213. curr_char += 4;
  214. add_token(TK("is not"));
  215. return 0;
  216. }
  217. }else if(name == "yield"){
  218. if(strncmp(curr_char, " from", 5) == 0){
  219. curr_char += 5;
  220. add_token(TK("yield from"));
  221. return 0;
  222. }
  223. }
  224. add_token(kTokenKwMap.at(name));
  225. } else {
  226. add_token(TK("@id"));
  227. }
  228. return 0;
  229. }
  230. void skip_line_comment() {
  231. char c;
  232. while ((c = peekchar()) != '\0') {
  233. if (c == '\n') return;
  234. eatchar();
  235. }
  236. }
  237. bool matchchar(char c) {
  238. if (peekchar() != c) return false;
  239. eatchar_include_newline();
  240. return true;
  241. }
  242. void add_token(TokenIndex type, TokenValue value={}) {
  243. switch(type){
  244. case TK("{"): case TK("["): case TK("("): brackets_level++; break;
  245. case TK(")"): case TK("]"): case TK("}"): brackets_level--; break;
  246. }
  247. nexts.push_back( Token{
  248. type,
  249. token_start,
  250. (int)(curr_char - token_start),
  251. current_line - ((type == TK("@eol")) ? 1 : 0),
  252. value
  253. });
  254. }
  255. void add_token_2(char c, TokenIndex one, TokenIndex two) {
  256. if (matchchar(c)) add_token(two);
  257. else add_token(one);
  258. }
  259. Str eat_string_until(char quote, bool raw) {
  260. bool quote3 = match_n_chars(2, quote);
  261. std::vector<char> buff;
  262. while (true) {
  263. char c = eatchar_include_newline();
  264. if (c == quote){
  265. if(quote3 && !match_n_chars(2, quote)){
  266. buff.push_back(c);
  267. continue;
  268. }
  269. break;
  270. }
  271. if (c == '\0'){
  272. if(quote3 && src->mode == REPL_MODE){
  273. throw NeedMoreLines(false);
  274. }
  275. SyntaxError("EOL while scanning string literal");
  276. }
  277. if (c == '\n'){
  278. if(!quote3) SyntaxError("EOL while scanning string literal");
  279. else{
  280. buff.push_back(c);
  281. continue;
  282. }
  283. }
  284. if (!raw && c == '\\') {
  285. switch (eatchar_include_newline()) {
  286. case '"': buff.push_back('"'); break;
  287. case '\'': buff.push_back('\''); break;
  288. case '\\': buff.push_back('\\'); break;
  289. case 'n': buff.push_back('\n'); break;
  290. case 'r': buff.push_back('\r'); break;
  291. case 't': buff.push_back('\t'); break;
  292. default: SyntaxError("invalid escape char");
  293. }
  294. } else {
  295. buff.push_back(c);
  296. }
  297. }
  298. return Str(buff.data(), buff.size());
  299. }
  300. void eat_string(char quote, StringType type) {
  301. Str s = eat_string_until(quote, type == RAW_STRING);
  302. if(type == F_STRING){
  303. add_token(TK("@fstr"), s);
  304. }else{
  305. add_token(TK("@str"), s);
  306. }
  307. }
  308. void eat_number() {
  309. static const std::regex pattern("^(0x)?[0-9a-fA-F]+(\\.[0-9]+)?");
  310. std::smatch m;
  311. const char* i = token_start;
  312. while(*i != '\n' && *i != '\0') i++;
  313. std::string s = std::string(token_start, i);
  314. try{
  315. if (std::regex_search(s, m, pattern)) {
  316. // here is m.length()-1, since the first char was eaten by lex_token()
  317. for(int j=0; j<m.length()-1; j++) eatchar();
  318. int base = 10;
  319. size_t size;
  320. if (m[1].matched) base = 16;
  321. if (m[2].matched) {
  322. if(base == 16) SyntaxError("hex literal should not contain a dot");
  323. add_token(TK("@num"), Number::stof(m[0], &size));
  324. } else {
  325. add_token(TK("@num"), Number::stoi(m[0], &size, base));
  326. }
  327. if (size != m.length()) FATAL_ERROR();
  328. }
  329. }catch(std::exception& _){
  330. SyntaxError("invalid number literal");
  331. }
  332. }
  333. bool lex_one_token() {
  334. while (peekchar() != '\0') {
  335. token_start = curr_char;
  336. char c = eatchar_include_newline();
  337. switch (c) {
  338. case '\'': case '"': eat_string(c, NORMAL_STRING); return true;
  339. case '#': skip_line_comment(); break;
  340. case '{': add_token(TK("{")); return true;
  341. case '}': add_token(TK("}")); return true;
  342. case ',': add_token(TK(",")); return true;
  343. case ':': add_token(TK(":")); return true;
  344. case ';': add_token(TK(";")); return true;
  345. case '(': add_token(TK("(")); return true;
  346. case ')': add_token(TK(")")); return true;
  347. case '[': add_token(TK("[")); return true;
  348. case ']': add_token(TK("]")); return true;
  349. case '@': add_token(TK("@")); return true;
  350. case '$': {
  351. for(int i=TK("$goto"); i<=TK("$label"); i++){
  352. // +1 to skip the '$'
  353. if(match_string(TK_STR(i) + 1)){
  354. add_token((TokenIndex)i);
  355. return true;
  356. }
  357. }
  358. SyntaxError("invalid special token");
  359. }
  360. case '%': add_token_2('=', TK("%"), TK("%=")); return true;
  361. case '&': add_token_2('=', TK("&"), TK("&=")); return true;
  362. case '|': add_token_2('=', TK("|"), TK("|=")); return true;
  363. case '^': add_token_2('=', TK("^"), TK("^=")); return true;
  364. case '?': add_token(TK("?")); return true;
  365. case '.': {
  366. if(matchchar('.')) {
  367. if(matchchar('.')) {
  368. add_token(TK("..."));
  369. } else {
  370. SyntaxError("invalid token '..'");
  371. }
  372. } else {
  373. add_token(TK("."));
  374. }
  375. return true;
  376. }
  377. case '=': add_token_2('=', TK("="), TK("==")); return true;
  378. case '+': add_token_2('=', TK("+"), TK("+=")); return true;
  379. case '>': {
  380. if(matchchar('=')) add_token(TK(">="));
  381. else if(matchchar('>')) add_token_2('=', TK(">>"), TK(">>="));
  382. else add_token(TK(">"));
  383. return true;
  384. }
  385. case '<': {
  386. if(matchchar('=')) add_token(TK("<="));
  387. else if(matchchar('<')) add_token_2('=', TK("<<"), TK("<<="));
  388. else add_token(TK("<"));
  389. return true;
  390. }
  391. case '-': {
  392. if(matchchar('=')) add_token(TK("-="));
  393. else if(matchchar('>')) add_token(TK("->"));
  394. else add_token(TK("-"));
  395. return true;
  396. }
  397. case '!':
  398. if(matchchar('=')) add_token(TK("!="));
  399. else SyntaxError("expected '=' after '!'");
  400. break;
  401. case '*':
  402. if (matchchar('*')) {
  403. add_token(TK("**")); // '**'
  404. } else {
  405. add_token_2('=', TK("*"), TK("*="));
  406. }
  407. return true;
  408. case '/':
  409. if(matchchar('/')) {
  410. add_token_2('=', TK("//"), TK("//="));
  411. } else {
  412. add_token_2('=', TK("/"), TK("/="));
  413. }
  414. return true;
  415. case ' ': case '\t': eat_spaces(); break;
  416. case '\n': {
  417. add_token(TK("@eol"));
  418. if(!eat_indentation()) IndentationError("unindent does not match any outer indentation level");
  419. return true;
  420. }
  421. default: {
  422. if(c == 'f'){
  423. if(matchchar('\'')) {eat_string('\'', F_STRING); return true;}
  424. if(matchchar('"')) {eat_string('"', F_STRING); return true;}
  425. }else if(c == 'r'){
  426. if(matchchar('\'')) {eat_string('\'', RAW_STRING); return true;}
  427. if(matchchar('"')) {eat_string('"', RAW_STRING); return true;}
  428. }
  429. if (c >= '0' && c <= '9') {
  430. eat_number();
  431. return true;
  432. }
  433. switch (eat_name())
  434. {
  435. case 0: break;
  436. case 1: SyntaxError("invalid char: " + std::string(1, c));
  437. case 2: SyntaxError("invalid utf8 sequence: " + std::string(1, c));
  438. case 3: SyntaxError("@id contains invalid char"); break;
  439. case 4: SyntaxError("invalid JSON token"); break;
  440. default: FATAL_ERROR();
  441. }
  442. return true;
  443. }
  444. }
  445. }
  446. token_start = curr_char;
  447. while(indents.size() > 1){
  448. indents.pop();
  449. add_token(TK("@dedent"));
  450. return true;
  451. }
  452. add_token(TK("@eof"));
  453. return false;
  454. }
  455. /***** Error Reporter *****/
  456. void throw_err(Str type, Str msg){
  457. int lineno = current_line;
  458. const char* cursor = curr_char;
  459. if(peekchar() == '\n'){
  460. lineno--;
  461. cursor--;
  462. }
  463. throw_err(type, msg, lineno, cursor);
  464. }
  465. void throw_err(Str type, Str msg, int lineno, const char* cursor){
  466. auto e = Exception("SyntaxError", msg);
  467. e.st_push(src->snapshot(lineno, cursor));
  468. throw e;
  469. }
  470. void SyntaxError(Str msg){ throw_err("SyntaxError", msg); }
  471. void SyntaxError(){ throw_err("SyntaxError", "invalid syntax"); }
  472. void IndentationError(Str msg){ throw_err("IndentationError", msg); }
  473. Lexer(shared_ptr<SourceData> src) {
  474. this->src = src;
  475. this->token_start = src->source.c_str();
  476. this->curr_char = src->source.c_str();
  477. this->nexts.push_back(Token{TK("@sof"), token_start, 0, current_line});
  478. this->indents.push(0);
  479. }
  480. std::vector<Token> run() {
  481. if(used) FATAL_ERROR();
  482. used = true;
  483. while (lex_one_token());
  484. return std::move(nexts);
  485. }
  486. };
  487. } // namespace pkpy