// lexer.h
#pragma once

#include <cstdint>
#include <map>
#include <sstream>
#include <string_view>
#include <variant>
#include <vector>

#include "common.h"
#include "error.h"
#include "str.h"
  5. namespace pkpy{
  6. typedef uint8_t TokenIndex;
  7. constexpr const char* kTokens[] = {
  8. "is not", "not in", "yield from",
  9. "@eof", "@eol", "@sof",
  10. "@id", "@num", "@str", "@fstr", "@long",
  11. "@indent", "@dedent",
  12. /*****************************************/
  13. "+", "+=", "-", "-=", // (INPLACE_OP - 1) can get '=' removed
  14. "*", "*=", "/", "/=", "//", "//=", "%", "%=",
  15. "&", "&=", "|", "|=", "^", "^=",
  16. "<<", "<<=", ">>", ">>=",
  17. /*****************************************/
  18. ".", ",", ":", ";", "#", "(", ")", "[", "]", "{", "}",
  19. "**", "=", ">", "<", "...", "->", "?", "@", "==", "!=", ">=", "<=",
  20. "++", "--",
  21. /** SPEC_BEGIN **/
  22. "$goto", "$label",
  23. /** KW_BEGIN **/
  24. "class", "import", "as", "def", "lambda", "pass", "del", "from", "with", "yield",
  25. "None", "in", "is", "and", "or", "not", "True", "False", "global", "try", "except", "finally",
  26. "while", "for", "if", "elif", "else", "break", "continue", "return", "assert", "raise"
  27. };
  28. using TokenValue = std::variant<std::monostate, i64, f64, Str>;
  29. const TokenIndex kTokenCount = sizeof(kTokens) / sizeof(kTokens[0]);
  30. constexpr TokenIndex TK(const char token[]) {
  31. for(int k=0; k<kTokenCount; k++){
  32. const char* i = kTokens[k];
  33. const char* j = token;
  34. while(*i && *j && *i == *j) { i++; j++;}
  35. if(*i == *j) return k;
  36. }
  37. return 255;
  38. }
  39. #define TK_STR(t) kTokens[t]
  40. const std::map<std::string_view, TokenIndex> kTokenKwMap = [](){
  41. std::map<std::string_view, TokenIndex> map;
  42. for(int k=TK("class"); k<kTokenCount; k++) map[kTokens[k]] = k;
  43. return map;
  44. }();
  45. struct Token{
  46. TokenIndex type;
  47. const char* start;
  48. int length;
  49. int line;
  50. int brackets_level;
  51. TokenValue value;
  52. Str str() const { return Str(start, length);}
  53. std::string_view sv() const { return std::string_view(start, length);}
  54. std::string info() const {
  55. std::stringstream ss;
  56. ss << line << ": " << TK_STR(type) << " '" << (
  57. sv()=="\n" ? "\\n" : sv()
  58. ) << "'";
  59. return ss.str();
  60. }
  61. };
  62. // https://docs.python.org/3/reference/expressions.html#operator-precedence
  63. enum Precedence {
  64. PREC_NONE,
  65. PREC_TUPLE, // ,
  66. PREC_LAMBDA, // lambda
  67. PREC_TERNARY, // ?:
  68. PREC_LOGICAL_OR, // or
  69. PREC_LOGICAL_AND, // and
  70. PREC_LOGICAL_NOT, // not
  71. /* https://docs.python.org/3/reference/expressions.html#comparisons
  72. * Unlike C, all comparison operations in Python have the same priority,
  73. * which is lower than that of any arithmetic, shifting or bitwise operation.
  74. * Also unlike C, expressions like a < b < c have the interpretation that is conventional in mathematics.
  75. */
  76. PREC_COMPARISION, // < > <= >= != ==, in / is / is not / not in
  77. PREC_BITWISE_OR, // |
  78. PREC_BITWISE_XOR, // ^
  79. PREC_BITWISE_AND, // &
  80. PREC_BITWISE_SHIFT, // << >>
  81. PREC_TERM, // + -
  82. PREC_FACTOR, // * / % // @
  83. PREC_UNARY, // - not
  84. PREC_EXPONENT, // **
  85. PREC_CALL, // ()
  86. PREC_SUBSCRIPT, // []
  87. PREC_ATTRIB, // .index
  88. PREC_PRIMARY,
  89. };
  90. enum StringType { NORMAL_STRING, RAW_STRING, F_STRING };
  91. struct Lexer {
  92. shared_ptr<SourceData> src;
  93. const char* token_start;
  94. const char* curr_char;
  95. int current_line = 1;
  96. std::vector<Token> nexts;
  97. stack<int> indents;
  98. int brackets_level = 0;
  99. bool used = false;
  100. char peekchar() const{ return *curr_char; }
  101. bool match_n_chars(int n, char c0);
  102. bool match_string(const char* s);
  103. int eat_spaces();
  104. bool eat_indentation();
  105. char eatchar();
  106. char eatchar_include_newline();
  107. int eat_name();
  108. void skip_line_comment();
  109. bool matchchar(char c);
  110. void add_token(TokenIndex type, TokenValue value={});
  111. void add_token_2(char c, TokenIndex one, TokenIndex two);
  112. Str eat_string_until(char quote, bool raw);
  113. void eat_string(char quote, StringType type);
  114. void eat_number();
  115. bool lex_one_token();
  116. /***** Error Reporter *****/
  117. void throw_err(Str type, Str msg);
  118. void throw_err(Str type, Str msg, int lineno, const char* cursor);
  119. void SyntaxError(Str msg){ throw_err("SyntaxError", msg); }
  120. void SyntaxError(){ throw_err("SyntaxError", "invalid syntax"); }
  121. void IndentationError(Str msg){ throw_err("IndentationError", msg); }
  122. Lexer(shared_ptr<SourceData> src);
  123. std::vector<Token> run();
  124. };
  125. } // namespace pkpy