// lexer.h
  1. #pragma once
  2. #include "common.h"
  3. #include "error.h"
  4. #include "str.h"
  5. #include "obj.h"
  6. namespace pkpy{
  7. typedef uint8_t TokenIndex;
  8. constexpr const char* kTokens[] = {
  9. "is not", "not in", "yield from",
  10. "@eof", "@eol", "@sof",
  11. "@id", "@num", "@str", "@fstr", "@long", "@bytes", "@imag",
  12. "@indent", "@dedent",
  13. /*****************************************/
  14. "+", "+=", "-", "-=", // (INPLACE_OP - 1) can get '=' removed
  15. "*", "*=", "/", "/=", "//", "//=", "%", "%=",
  16. "&", "&=", "|", "|=", "^", "^=",
  17. "<<", "<<=", ">>", ">>=",
  18. /*****************************************/
  19. ".", ",", ":", ";", "#", "(", ")", "[", "]", "{", "}",
  20. "**", "=", ">", "<", "..", "...", "->", "@", "==", "!=", ">=", "<=",
  21. "++", "--", "~",
  22. /** KW_BEGIN **/
  23. "class", "import", "as", "def", "lambda", "pass", "del", "from", "with", "yield",
  24. "None", "in", "is", "and", "or", "not", "True", "False", "global", "try", "except", "finally",
  25. "while", "for", "if", "elif", "else", "break", "continue", "return", "assert", "raise"
  26. };
  27. using TokenValue = std::variant<std::monostate, i64, f64, Str>;
  28. const TokenIndex kTokenCount = sizeof(kTokens) / sizeof(kTokens[0]);
  29. constexpr TokenIndex TK(const char token[]) {
  30. for(int k=0; k<kTokenCount; k++){
  31. const char* i = kTokens[k];
  32. const char* j = token;
  33. while(*i && *j && *i == *j) { i++; j++;}
  34. if(*i == *j) return k;
  35. }
  36. return 255;
  37. }
  38. #define TK_STR(t) kTokens[t]
  39. const std::map<std::string_view, TokenIndex> kTokenKwMap = [](){
  40. std::map<std::string_view, TokenIndex> map;
  41. for(int k=TK("class"); k<kTokenCount; k++) map[kTokens[k]] = k;
  42. return map;
  43. }();
// A single lexed token: its kind, a non-owning view into the source text,
// and (for literal tokens) a decoded value.
struct Token{
    TokenIndex type;        // index into kTokens
    const char* start;      // points into the source buffer — not owned
    int length;             // number of bytes at `start`
    int line;               // source line of the token (lexer counts from 1)
    int brackets_level;     // ()[]{} nesting depth when the token was lexed
    TokenValue value;       // literal payload; std::monostate when absent

    Str str() const { return Str(start, length);}                           // owned copy of the lexeme
    std::string_view sv() const { return std::string_view(start, length);}  // non-owning view of the lexeme

    // Str info() const {
    //     SStream ss;
    //     ss << line << ": " << TK_STR(type) << " '" << (
    //         sv()=="\n" ? "\\n" : sv()
    //     ) << "'";
    //     return ss.str();
    // }
};
// https://docs.python.org/3/reference/expressions.html#operator-precedence
// Binding strength of operators, ordered loosest (PREC_LOWEST) to tightest
// (PREC_HIGHEST); the enumerator order itself encodes the precedence.
enum Precedence {
    PREC_LOWEST,
    PREC_LAMBDA,        // lambda
    PREC_TERNARY,       // ?:
    PREC_LOGICAL_OR,    // or
    PREC_LOGICAL_AND,   // and
    PREC_LOGICAL_NOT,   // not
    /* https://docs.python.org/3/reference/expressions.html#comparisons
     * Unlike C, all comparison operations in Python have the same priority,
     * which is lower than that of any arithmetic, shifting or bitwise operation.
     * Also unlike C, expressions like a < b < c have the interpretation that is conventional in mathematics.
     */
    PREC_COMPARISION,   // < > <= >= != ==, in / is / is not / not in
    PREC_BITWISE_OR,    // |
    PREC_BITWISE_XOR,   // ^
    PREC_BITWISE_AND,   // &
    PREC_BITWISE_SHIFT, // << >>
    PREC_TERM,          // + -
    PREC_FACTOR,        // * / % // @
    PREC_UNARY,         // - not ~
    PREC_EXPONENT,      // **
    PREC_PRIMARY,       // f() x[] a.b 1:2
    PREC_HIGHEST,
};
// Kind of string literal being lexed (see Lexer::eat_string).
enum StringType { NORMAL_STRING, RAW_STRING, F_STRING, NORMAL_BYTES };
// Hand-written lexer: scans SourceData and produces a flat token stream.
// Construct it, then call run(); lexing errors are raised via throw_err().
struct Lexer {
    VM* vm;
    std::shared_ptr<SourceData> src;    // source being lexed (shared so errors can reference it)
    const char* token_start;            // start of the token currently being scanned
    const char* curr_char;              // cursor into the source text
    int current_line = 1;               // line counter; starts at 1
    std::vector<Token> nexts;           // tokens produced so far
    stack_no_copy<int, small_vector_no_copy_and_move<int, 8>> indents;  // indentation stack (drives @indent/@dedent)
    int brackets_level = 0;             // ()[]{} nesting depth, recorded on each Token

    // Look at the current character without consuming it.
    char peekchar() const{ return *curr_char; }
    bool match_n_chars(int n, char c0);
    bool match_string(const char* s);
    int eat_spaces();
    bool eat_indentation();
    char eatchar();
    char eatchar_include_newline();
    int eat_name();
    void skip_line_comment();
    bool matchchar(char c);
    // Append a token of `type` spanning [token_start, curr_char) — presumably;
    // defined out-of-line, confirm against the .cpp.
    void add_token(TokenIndex type, TokenValue value={});
    // Emit `two` if the next char is `c`, otherwise `one` (e.g. '+' vs "+=").
    void add_token_2(char c, TokenIndex one, TokenIndex two);
    Str eat_string_until(char quote, bool raw);
    void eat_string(char quote, StringType type);
    void eat_number();
    bool lex_one_token();

    /***** Error Reporter *****/
    void throw_err(StrName type, Str msg);
    void throw_err(StrName type, Str msg, int lineno, const char* cursor);
    void SyntaxError(Str msg){ throw_err("SyntaxError", msg); }
    void SyntaxError(){ throw_err("SyntaxError", "invalid syntax"); }
    void IndentationError(Str msg){ throw_err("IndentationError", msg); }

    Lexer(VM* vm, std::shared_ptr<SourceData> src);
    // Lex the entire source and return the token stream.
    std::vector<Token> run();
};
// Outcome of parse_int().
enum class IntParsingResult{
    Success,    // *out holds the parsed value
    Failure,    // text is not a valid integer in the given base
    Overflow,   // value does not fit in i64
};

// Parses `text` as an integer in `base`, writing the result to *out.
IntParsingResult parse_int(std::string_view text, i64* out, int base);
  127. } // namespace pkpy