// lexer.h (~4.4 KB)
  1. #pragma once
  2. #include "common.h"
  3. #include "error.h"
  4. #include "str.h"
  5. #include "obj.h"
  6. namespace pkpy{
// Index type for entries in kTokens; 255 is reserved as the "not found"
// sentinel returned by TK() below.
typedef uint8_t TokenIndex;

// Master token table. NOTE: ordering is load-bearing — a token's position in
// this array IS its TokenIndex, so TK("...") lookups, the "(INPLACE_OP - 1)"
// trick noted below, and the keyword range beginning at TK("class") all
// depend on this exact order. Do not reorder or insert entries without
// auditing every use of TK().
constexpr const char* kTokens[] = {
// Multi-word operators matched before single keywords.
"is not", "not in", "yield from",
// Synthetic tokens emitted by the lexer ("@" prefix = not literal source text).
"@eof", "@eol", "@sof",
"@id", "@num", "@str", "@fstr", "@long", "@bytes", "@imag",
"@indent", "@dedent",
/*****************************************/
"+", "+=", "-", "-=", // (INPLACE_OP - 1) can get '=' removed
"*", "*=", "/", "/=", "//", "//=", "%", "%=",
"&", "&=", "|", "|=", "^", "^=",
"<<", "<<=", ">>", ">>=",
/*****************************************/
".", ",", ":", ";", "#", "(", ")", "[", "]", "{", "}",
"**", "=", ">", "<", "..", "...", "->", "@", "==", "!=", ">=", "<=",
"++", "--", "~",
/** KW_BEGIN **/
// Keywords occupy the contiguous tail range [TK("class"), kTokenCount);
// kTokenKwMap below is built from exactly this range.
"class", "import", "as", "def", "lambda", "pass", "del", "from", "with", "yield",
"None", "in", "is", "and", "or", "not", "True", "False", "global", "try", "except", "finally",
"while", "for", "if", "elif", "else", "break", "continue", "return", "assert", "raise"
};

// Literal payload a token may carry: nothing, integer, float, or string.
using TokenValue = std::variant<std::monostate, i64, f64, Str>;
// Entry count of kTokens; must stay <= 255 since 255 is TK()'s failure value
// and TokenIndex is uint8_t.
const TokenIndex kTokenCount = sizeof(kTokens) / sizeof(kTokens[0]);
  29. constexpr TokenIndex TK(const char token[]) {
  30. for(int k=0; k<kTokenCount; k++){
  31. const char* i = kTokens[k];
  32. const char* j = token;
  33. while(*i && *j && *i == *j) { i++; j++;}
  34. if(*i == *j) return k;
  35. }
  36. return 255;
  37. }
  38. inline constexpr bool is_raw_string_used(TokenIndex t){
  39. return t == TK("@id") || t == TK("@long");
  40. }
  41. #define TK_STR(t) kTokens[t]
  42. const std::map<std::string_view, TokenIndex> kTokenKwMap = [](){
  43. std::map<std::string_view, TokenIndex> map;
  44. for(int k=TK("class"); k<kTokenCount; k++) map[kTokens[k]] = k;
  45. return map;
  46. }();
// One lexed token. `start`/`length` form a non-owning view into the source
// buffer, so a Token must not outlive the SourceData it was lexed from.
struct Token{
    TokenIndex type;      // index into kTokens
    const char* start;    // first byte of the lexeme (not NUL-terminated)
    int length;           // lexeme length in bytes
    int line;             // source line number
    int brackets_level;   // bracket nesting depth recorded at this token
    TokenValue value;     // parsed literal payload, if any
    Str str() const { return Str(start, length);}                          // owning copy of the lexeme
    std::string_view sv() const { return std::string_view(start, length);} // non-owning view of the lexeme
};
// Binding power for expression parsing, ordered weakest to strongest.
// https://docs.python.org/3/reference/expressions.html#operator-precedence
enum Precedence {
    PREC_LOWEST,
    PREC_LAMBDA,        // lambda
    PREC_TERNARY,       // ?:
    PREC_LOGICAL_OR,    // or
    PREC_LOGICAL_AND,   // and
    PREC_LOGICAL_NOT,   // not
    /* https://docs.python.org/3/reference/expressions.html#comparisons
     * Unlike C, all comparison operations in Python have the same priority,
     * which is lower than that of any arithmetic, shifting or bitwise operation.
     * Also unlike C, expressions like a < b < c have the interpretation that is conventional in mathematics.
     */
    PREC_COMPARISION,   // < > <= >= != ==, in / is / is not / not in
    PREC_BITWISE_OR,    // |
    PREC_BITWISE_XOR,   // ^
    PREC_BITWISE_AND,   // &
    PREC_BITWISE_SHIFT, // << >>
    PREC_TERM,          // + -
    PREC_FACTOR,        // * / % // @
    PREC_UNARY,         // - not ~
    PREC_EXPONENT,      // **
    PREC_PRIMARY,       // f() x[] a.b 1:2
    PREC_HIGHEST,
};
// Flavor of quoted literal being lexed; passed to eat_string.
// (Unscoped on purpose — the enumerators are used unqualified elsewhere.)
enum StringType { NORMAL_STRING, RAW_STRING, F_STRING, NORMAL_BYTES };
// Hand-written tokenizer. Construct with a SourceData, then call run() to
// obtain the full token stream. All members here are declarations; the
// definitions live out of line.
struct Lexer {
    VM* vm;
    std::shared_ptr<SourceData> src;  // owns the text; Tokens view into it
    const char* token_start;          // start of the lexeme currently being scanned
    const char* curr_char;            // scan cursor into the source buffer
    int current_line = 1;             // line counter, starts at 1
    std::vector<Token> nexts;         // tokens produced so far
    // Indentation widths used to pair @indent/@dedent tokens.
    stack_no_copy<int, small_vector_no_copy_and_move<int, 8>> indents;
    int brackets_level = 0;           // current ()/[]/{} nesting depth
    // ---- character-level helpers ----
    char peekchar() const{ return *curr_char; }   // look at current char without consuming
    bool match_n_chars(int n, char c0);
    bool match_string(const char* s);
    int eat_spaces();
    bool eat_indentation();
    char eatchar();
    char eatchar_include_newline();
    int eat_name();
    void skip_line_comment();
    bool matchchar(char c);
    // ---- token emission ----
    void add_token(TokenIndex type, TokenValue value={});
    void add_token_2(char c, TokenIndex one, TokenIndex two);
    // ---- literal scanners ----
    Str eat_string_until(char quote, bool raw);
    void eat_string(char quote, StringType type);
    void eat_number();
    bool lex_one_token();
    /***** Error Reporter *****/
    void throw_err(StrName type, Str msg);
    void throw_err(StrName type, Str msg, int lineno, const char* cursor);
    void SyntaxError(Str msg){ throw_err("SyntaxError", msg); }
    void SyntaxError(){ throw_err("SyntaxError", "invalid syntax"); }
    void IndentationError(Str msg){ throw_err("IndentationError", msg); }
    Lexer(VM* vm, std::shared_ptr<SourceData> src);
    std::vector<Token> run();   // tokenize the entire input
};
// Outcome of parse_uint(): parsed OK, malformed text (Failure), or a value
// too large for the output (Overflow) — per the enumerator names; exact
// semantics live with the out-of-line definition.
enum class IntParsingResult{
    Success,
    Failure,
    Overflow,
};
// Parse an unsigned integer from `text` in the given `base` into *out.
// Defined elsewhere; see IntParsingResult for possible outcomes.
IntParsingResult parse_uint(std::string_view text, i64* out, int base);
  123. } // namespace pkpy