parser.h 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295
  1. #pragma once
  2. #include "obj.h"
  3. typedef uint8_t TokenIndex;
  4. constexpr const char* kTokens[] = {
  5. "@error", "@eof", "@eol", "@sof",
  6. ".", ",", ":", ";", "#", "(", ")", "[", "]", "{", "}", "%",
  7. "+", "-", "*", "/", "//", "**", "=", ">", "<", "...", "->",
  8. "<<", ">>", "&", "|", "^", "?",
  9. "==", "!=", ">=", "<=",
  10. "+=", "-=", "*=", "/=", "//=", "%=", "&=", "|=", "^=",
  11. /** KW_BEGIN **/
  12. "class", "import", "as", "def", "lambda", "pass", "del", "from", "with", "yield",
  13. "None", "in", "is", "and", "or", "not", "True", "False", "global", "try", "except", "finally",
  14. "goto", "label", // extended keywords, not available in cpython
  15. "while", "for", "if", "elif", "else", "break", "continue", "return", "assert", "raise",
  16. /** KW_END **/
  17. "is not", "not in",
  18. "@id", "@num", "@str", "@fstr",
  19. "@indent", "@dedent"
  20. };
  21. const TokenIndex kTokenCount = sizeof(kTokens) / sizeof(kTokens[0]);
  22. constexpr TokenIndex TK(const char* const token) {
  23. for(int k=0; k<kTokenCount; k++){
  24. const char* i = kTokens[k];
  25. const char* j = token;
  26. while(*i && *j && *i == *j) { i++; j++;}
  27. if(*i == *j) return k;
  28. }
  29. UNREACHABLE();
  30. }
  31. #define TK_STR(t) kTokens[t]
  32. const TokenIndex kTokenKwBegin = TK("class");
  33. const TokenIndex kTokenKwEnd = TK("raise");
  34. const emhash8::HashMap<std::string_view, TokenIndex> kTokenKwMap = [](){
  35. emhash8::HashMap<std::string_view, TokenIndex> map;
  36. for(int k=kTokenKwBegin; k<=kTokenKwEnd; k++) map[kTokens[k]] = k;
  37. return map;
  38. }();
  39. struct Token{
  40. TokenIndex type;
  41. const char* start; //< Begining of the token in the source.
  42. int length; //< Number of chars of the token.
  43. int line; //< Line number of the token (1 based).
  44. PyVar value; //< Literal value of the token.
  45. const Str str() const { return Str(start, length);}
  46. const Str info() const {
  47. StrStream ss;
  48. Str raw = str();
  49. if (raw == Str("\n")) raw = "\\n";
  50. ss << line << ": " << TK_STR(type) << " '" << raw << "'";
  51. return ss.str();
  52. }
  53. };
  54. enum Precedence {
  55. PREC_NONE,
  56. PREC_ASSIGNMENT, // =
  57. PREC_COMMA, // ,
  58. PREC_TERNARY, // ?:
  59. PREC_LOGICAL_OR, // or
  60. PREC_LOGICAL_AND, // and
  61. PREC_EQUALITY, // == !=
  62. PREC_TEST, // in is
  63. PREC_COMPARISION, // < > <= >=
  64. PREC_BITWISE_OR, // |
  65. PREC_BITWISE_XOR, // ^
  66. PREC_BITWISE_AND, // &
  67. PREC_BITWISE_SHIFT, // << >>
  68. PREC_TERM, // + -
  69. PREC_FACTOR, // * / % //
  70. PREC_UNARY, // - not
  71. PREC_EXPONENT, // **
  72. PREC_CALL, // ()
  73. PREC_SUBSCRIPT, // []
  74. PREC_ATTRIB, // .index
  75. PREC_PRIMARY,
  76. };
  77. // The context of the parsing phase for the compiler.
  78. struct Parser {
  79. pkpy::shared_ptr<SourceData> src;
  80. const char* token_start;
  81. const char* curr_char;
  82. int current_line = 1;
  83. Token prev, curr;
  84. std::queue<Token> nexts;
  85. std::stack<int> indents;
  86. int brackets_level = 0;
  87. Token next_token(){
  88. if(nexts.empty()){
  89. return Token{TK("@error"), token_start, (int)(curr_char - token_start), current_line};
  90. }
  91. Token t = nexts.front();
  92. if(t.type == TK("@eof") && indents.size()>1){
  93. nexts.pop();
  94. indents.pop();
  95. return Token{TK("@dedent"), token_start, 0, current_line};
  96. }
  97. nexts.pop();
  98. return t;
  99. }
  100. inline char peekchar() const{ return *curr_char; }
  101. bool match_n_chars(int n, char c0){
  102. const char* c = curr_char;
  103. for(int i=0; i<n; i++){
  104. if(*c == '\0') return false;
  105. if(*c != c0) return false;
  106. c++;
  107. }
  108. for(int i=0; i<n; i++) eatchar_include_newline();
  109. return true;
  110. }
  111. int eat_spaces(){
  112. int count = 0;
  113. while (true) {
  114. switch (peekchar()) {
  115. case ' ' : count+=1; break;
  116. case '\t': count+=4; break;
  117. default: return count;
  118. }
  119. eatchar();
  120. }
  121. }
  122. bool eat_indentation(){
  123. if(brackets_level > 0) return true;
  124. int spaces = eat_spaces();
  125. if(peekchar() == '#') skip_line_comment();
  126. if(peekchar() == '\0' || peekchar() == '\n' || peekchar() == '\r') return true;
  127. // https://docs.python.org/3/reference/lexical_analysis.html#indentation
  128. if(spaces > indents.top()){
  129. indents.push(spaces);
  130. nexts.push(Token{TK("@indent"), token_start, 0, current_line});
  131. } else if(spaces < indents.top()){
  132. while(spaces < indents.top()){
  133. indents.pop();
  134. nexts.push(Token{TK("@dedent"), token_start, 0, current_line});
  135. }
  136. if(spaces != indents.top()){
  137. return false;
  138. }
  139. }
  140. return true;
  141. }
  142. char eatchar() {
  143. char c = peekchar();
  144. if(c == '\n') throw std::runtime_error("eatchar() cannot consume a newline");
  145. curr_char++;
  146. return c;
  147. }
  148. char eatchar_include_newline() {
  149. char c = peekchar();
  150. curr_char++;
  151. if (c == '\n'){
  152. current_line++;
  153. src->line_starts.push_back(curr_char);
  154. }
  155. return c;
  156. }
  157. int eat_name() {
  158. curr_char--;
  159. while(true){
  160. uint8_t c = peekchar();
  161. int u8bytes = 0;
  162. if((c & 0b10000000) == 0b00000000) u8bytes = 1;
  163. else if((c & 0b11100000) == 0b11000000) u8bytes = 2;
  164. else if((c & 0b11110000) == 0b11100000) u8bytes = 3;
  165. else if((c & 0b11111000) == 0b11110000) u8bytes = 4;
  166. else return 1;
  167. if(u8bytes == 1){
  168. if(isalpha(c) || c=='_' || isdigit(c)) {
  169. curr_char++;
  170. continue;
  171. }else{
  172. break;
  173. }
  174. }
  175. // handle multibyte char
  176. std::string u8str(curr_char, u8bytes);
  177. if(u8str.size() != u8bytes) return 2;
  178. uint32_t value = 0;
  179. for(int k=0; k < u8bytes; k++){
  180. uint8_t b = u8str[k];
  181. if(k==0){
  182. if(u8bytes == 2) value = (b & 0b00011111) << 6;
  183. else if(u8bytes == 3) value = (b & 0b00001111) << 12;
  184. else if(u8bytes == 4) value = (b & 0b00000111) << 18;
  185. }else{
  186. value |= (b & 0b00111111) << (6*(u8bytes-k-1));
  187. }
  188. }
  189. if(is_unicode_Lo_char(value)) curr_char += u8bytes;
  190. else break;
  191. }
  192. int length = (int)(curr_char - token_start);
  193. if(length == 0) return 3;
  194. std::string_view name(token_start, length);
  195. if(src->mode == JSON_MODE){
  196. if(name == "true"){
  197. set_next_token(TK("True"));
  198. } else if(name == "false"){
  199. set_next_token(TK("False"));
  200. } else if(name == "null"){
  201. set_next_token(TK("None"));
  202. } else {
  203. return 4;
  204. }
  205. return 0;
  206. }
  207. if(kTokenKwMap.contains(name)){
  208. if(name == "not"){
  209. if(strncmp(curr_char, " in", 3) == 0){
  210. curr_char += 3;
  211. set_next_token(TK("not in"));
  212. return 0;
  213. }
  214. }else if(name == "is"){
  215. if(strncmp(curr_char, " not", 4) == 0){
  216. curr_char += 4;
  217. set_next_token(TK("is not"));
  218. return 0;
  219. }
  220. }
  221. set_next_token(kTokenKwMap.at(name));
  222. } else {
  223. set_next_token(TK("@id"));
  224. }
  225. return 0;
  226. }
  227. void skip_line_comment() {
  228. char c;
  229. while ((c = peekchar()) != '\0') {
  230. if (c == '\n') return;
  231. eatchar();
  232. }
  233. }
  234. bool matchchar(char c) {
  235. if (peekchar() != c) return false;
  236. eatchar_include_newline();
  237. return true;
  238. }
  239. void set_next_token(TokenIndex type, PyVar value=nullptr) {
  240. switch(type){
  241. case TK("{"): case TK("["): case TK("("): brackets_level++; break;
  242. case TK(")"): case TK("]"): case TK("}"): brackets_level--; break;
  243. }
  244. nexts.push( Token{
  245. type,
  246. token_start,
  247. (int)(curr_char - token_start),
  248. current_line - ((type == TK("@eol")) ? 1 : 0),
  249. value
  250. });
  251. }
  252. void set_next_token_2(char c, TokenIndex one, TokenIndex two) {
  253. if (matchchar(c)) set_next_token(two);
  254. else set_next_token(one);
  255. }
  256. Parser(pkpy::shared_ptr<SourceData> src) {
  257. this->src = src;
  258. this->token_start = src->source;
  259. this->curr_char = src->source;
  260. this->nexts.push(Token{TK("@sof"), token_start, 0, current_line});
  261. this->indents.push(0);
  262. }
  263. };