// lexer.hpp
#pragma once

#include "pocketpy/objects/sourcedata.hpp"
#include "pocketpy/objects/error.hpp"

#include <cstdint>
#include <map>
#include <memory>
#include <string_view>
#include <variant>
namespace pkpy{

// Index of a token kind inside kTokens; 255 is reserved as the
// "not a known token" sentinel returned by TK().
typedef uint8_t TokenIndex;
// Master token table. The array index of each entry IS its TokenIndex, so the
// ordering below is load-bearing: TK("...") lookups and any index arithmetic
// (e.g. the INPLACE_OP note below) depend on it. Do not reorder.
constexpr const char* kTokens[] = {
    // multi-word operator spellings
    "is not", "not in", "yield from",
    // control tokens: end-of-file, end-of-line, start-of-file
    "@eof", "@eol", "@sof",
    // literal/identifier tokens that carry a TokenValue payload
    "@id", "@num", "@str", "@fstr", "@long", "@bytes", "@imag",
    // indentation-change tokens
    "@indent", "@dedent",
    /*****************************************/
    "+", "+=", "-", "-=", // (INPLACE_OP - 1) can get '=' removed
    "*", "*=", "/", "/=", "//", "//=", "%", "%=",
    "&", "&=", "|", "|=", "^", "^=",
    "<<", "<<=", ">>", ">>=",
    /*****************************************/
    ".", ",", ":", ";", "#", "(", ")", "[", "]", "{", "}",
    "**", "=", ">", "<", "..", "...", "->", "@", "==", "!=", ">=", "<=",
    "++", "--", "~",
    /** KW_BEGIN **/
    // Keywords must stay contiguous and begin at "class":
    // kTokenKwMap is built from TK("class") through the end of the array.
    "class", "import", "as", "def", "lambda", "pass", "del", "from", "with", "yield",
    "None", "in", "is", "and", "or", "not", "True", "False", "global", "try", "except", "finally",
    "while", "for", "if", "elif", "else", "break", "continue", "return", "assert", "raise"
};

// Payload attached to a token: nothing, integer, float, or string.
using TokenValue = std::variant<std::monostate, i64, f64, Str>;

// Number of token kinds; small enough to fit a TokenIndex (255 is the TK() sentinel).
const int kTokenCount = sizeof(kTokens) / sizeof(kTokens[0]);
  28. constexpr TokenIndex TK(const char token[]) {
  29. for(int k=0; k<kTokenCount; k++){
  30. const char* i = kTokens[k];
  31. const char* j = token;
  32. while(*i && *j && *i == *j) { i++; j++;}
  33. if(*i == *j) return k;
  34. }
  35. return 255;
  36. }
  37. inline constexpr bool is_raw_string_used(TokenIndex t){
  38. return t == TK("@id") || t == TK("@long");
  39. }
// Spelling of a token kind, e.g. TK_STR(TK("class")) yields "class".
#define TK_STR(t) kTokens[t]
  41. const std::map<std::string_view, TokenIndex> kTokenKwMap = [](){
  42. std::map<std::string_view, TokenIndex> map;
  43. for(int k=TK("class"); k<kTokenCount; k++) map[kTokens[k]] = k;
  44. return map;
  45. }();
// A single lexed token: a non-owning slice into the source buffer plus metadata.
// Valid only while the SourceData that `start` points into stays alive.
struct Token{
    TokenIndex type;      // token kind; index into kTokens
    const char* start;    // first byte of the lexeme (not null-terminated, not owned)
    int length;           // byte length of the lexeme starting at `start`
    int line;             // source line of the token (lexer counts from 1)
    int brackets_level;   // ()/[]/{} nesting depth when the token was produced
    TokenValue value;     // parsed payload (int/float/string) or std::monostate

    // Owning copy of the lexeme.
    Str str() const { return Str(start, length);}
    // Non-owning view of the lexeme.
    std::string_view sv() const { return std::string_view(start, length);}
};
// https://docs.python.org/3/reference/expressions.html#operator-precedence
// Binding strength for expression parsing, ordered from loosest (LOWEST)
// to tightest (HIGHEST). Enumerator order is the contract — do not reorder.
enum Precedence {
    PREC_LOWEST,
    PREC_LAMBDA,      // lambda
    PREC_TERNARY,     // ?:
    PREC_LOGICAL_OR,  // or
    PREC_LOGICAL_AND, // and
    PREC_LOGICAL_NOT, // not
    /* https://docs.python.org/3/reference/expressions.html#comparisons
     * Unlike C, all comparison operations in Python have the same priority,
     * which is lower than that of any arithmetic, shifting or bitwise operation.
     * Also unlike C, expressions like a < b < c have the interpretation that is conventional in mathematics.
     */
    PREC_COMPARISION,   // < > <= >= != ==, in / is / is not / not in
    PREC_BITWISE_OR,    // |
    PREC_BITWISE_XOR,   // ^
    PREC_BITWISE_AND,   // &
    PREC_BITWISE_SHIFT, // << >>
    PREC_TERM,          // + -
    PREC_FACTOR,        // * / % // @
    PREC_UNARY,         // - not ~
    PREC_EXPONENT,      // **
    PREC_PRIMARY,       // f() x[] a.b 1:2
    PREC_HIGHEST,
};
// How a string literal's body is treated while lexing — presumably mirroring
// Python prefixes: plain "", raw r"", interpolated f"", and bytes b"".
enum StringType { NORMAL_STRING, RAW_STRING, F_STRING, NORMAL_BYTES };
// Hand-written lexer: scans a SourceData buffer into a flat vector<Token>.
// Produced tokens point back into `src`, so `src` must outlive them.
// Method bodies live in the corresponding .cpp unless defined inline below.
struct Lexer {
    VM* vm;                          // associated VM (non-owning) — presumably used when raising errors; see throw_err in .cpp
    std::shared_ptr<SourceData> src; // source buffer being lexed (shared so tokens stay valid)
    const char* token_start;         // start of the token currently being scanned
    const char* curr_char;           // cursor: next character to be consumed
    int current_line = 1;            // line number of the cursor, starting at 1
    vector<Token> nexts;             // tokens produced so far
    stack_no_copy<int, small_vector_2<int, 8>> indents; // indentation-width stack driving @indent/@dedent
    int brackets_level = 0;          // current ()/[]/{} nesting depth (recorded into each Token)

    // Peek at the character under the cursor without consuming it.
    char peekchar() const{ return *curr_char; }

    // --- scanning primitives (semantics in lexer.cpp) ---
    bool match_n_chars(int n, char c0);
    bool match_string(const char* s);
    int eat_spaces();
    bool eat_indentation();
    char eatchar();
    char eatchar_include_newline();
    int eat_name();
    void skip_line_comment();
    bool matchchar(char c);

    // Append a token of `type` (spanning token_start..curr_char — TODO confirm in .cpp).
    void add_token(TokenIndex type, TokenValue value={});
    // Emit `two` if the next char is `c`, else `one` (e.g. '=' selects "+=" over "+").
    void add_token_2(char c, TokenIndex one, TokenIndex two);

    // --- literal scanners ---
    Str eat_string_until(char quote, bool raw);
    void eat_string(char quote, StringType type);
    void eat_number();

    // Lex one token; return value semantics defined in .cpp (likely "more input remains").
    bool lex_one_token();

    /***** Error Reporter *****/
    [[noreturn]] void throw_err(StrName type, Str msg);
    [[noreturn]] void throw_err(StrName type, Str msg, int lineno, const char* cursor);
    [[noreturn]] void SyntaxError(Str msg){ throw_err("SyntaxError", msg); }
    [[noreturn]] void SyntaxError(){ throw_err("SyntaxError", "invalid syntax"); }
    [[noreturn]] void IndentationError(Str msg){ throw_err("IndentationError", msg); }

    Lexer(VM* vm, std::shared_ptr<SourceData> src);

    // Run the lexer over the whole source and return the token stream.
    vector<Token> run();
};
// Outcome of parse_uint().
enum class IntParsingResult{
    Success,
    Failure,   // malformed input for the given base
    Overflow,  // syntactically valid but does not fit the output type
};

// Parse `text` as an unsigned integer literal in `base`, storing the result
// through `out` on Success. Exact input conventions (prefixes, separators)
// are defined in the .cpp implementation.
IntParsingResult parse_uint(std::string_view text, i64* out, int base);
} // namespace pkpy