parser.h 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302
  1. #pragma once
  2. #include "error.h"
  3. #include "obj.h"
  4. namespace pkpy{
  5. typedef uint8_t TokenIndex;
  6. constexpr const char* kTokens[] = {
  7. "@error", "@eof", "@eol", "@sof",
  8. ".", ",", ":", ";", "#", "(", ")", "[", "]", "{", "}", "%", "::",
  9. "+", "-", "*", "/", "//", "**", "=", ">", "<", "...", "->",
  10. "<<", ">>", "&", "|", "^", "?", "@",
  11. "==", "!=", ">=", "<=",
  12. "+=", "-=", "*=", "/=", "//=", "%=", "&=", "|=", "^=", ">>=", "<<=",
  13. /** KW_BEGIN **/
  14. "class", "import", "as", "def", "lambda", "pass", "del", "from", "with", "yield",
  15. "None", "in", "is", "and", "or", "not", "True", "False", "global", "try", "except", "finally",
  16. "goto", "label", // extended keywords, not available in cpython
  17. "while", "for", "if", "elif", "else", "break", "continue", "return", "assert", "raise",
  18. /** KW_END **/
  19. "is not", "not in",
  20. "@id", "@num", "@str", "@fstr",
  21. "@indent", "@dedent"
  22. };
  23. const TokenIndex kTokenCount = sizeof(kTokens) / sizeof(kTokens[0]);
  24. constexpr TokenIndex TK(const char token[]) {
  25. for(int k=0; k<kTokenCount; k++){
  26. const char* i = kTokens[k];
  27. const char* j = token;
  28. while(*i && *j && *i == *j) { i++; j++;}
  29. if(*i == *j) return k;
  30. }
  31. UNREACHABLE();
  32. }
  33. #define TK_STR(t) kTokens[t]
  34. const TokenIndex kTokenKwBegin = TK("class");
  35. const TokenIndex kTokenKwEnd = TK("raise");
  36. const std::map<std::string_view, TokenIndex> kTokenKwMap = [](){
  37. std::map<std::string_view, TokenIndex> map;
  38. for(int k=kTokenKwBegin; k<=kTokenKwEnd; k++) map[kTokens[k]] = k;
  39. return map;
  40. }();
  41. struct Token{
  42. TokenIndex type;
  43. const char* start;
  44. int length;
  45. int line;
  46. PyObject* value;
  47. Str str() const { return Str(start, length);}
  48. Str info() const {
  49. StrStream ss;
  50. Str raw = str();
  51. if (raw == Str("\n")) raw = "\\n";
  52. ss << line << ": " << TK_STR(type) << " '" << raw << "'";
  53. return ss.str();
  54. }
  55. };
  56. // https://docs.python.org/3/reference/expressions.html
  57. enum Precedence {
  58. PREC_NONE,
  59. PREC_ASSIGNMENT, // =
  60. PREC_COMMA, // ,
  61. PREC_TERNARY, // ?:
  62. PREC_LOGICAL_OR, // or
  63. PREC_LOGICAL_AND, // and
  64. PREC_LOGICAL_NOT, // not
  65. PREC_EQUALITY, // == !=
  66. PREC_TEST, // in / is / is not / not in
  67. PREC_COMPARISION, // < > <= >=
  68. PREC_BITWISE_OR, // |
  69. PREC_BITWISE_XOR, // ^
  70. PREC_BITWISE_AND, // &
  71. PREC_BITWISE_SHIFT, // << >>
  72. PREC_TERM, // + -
  73. PREC_FACTOR, // * / % //
  74. PREC_UNARY, // - not
  75. PREC_EXPONENT, // **
  76. PREC_CALL, // ()
  77. PREC_SUBSCRIPT, // []
  78. PREC_ATTRIB, // .index
  79. PREC_PRIMARY,
  80. };
  81. // The context of the parsing phase for the compiler.
  82. struct Parser {
  83. shared_ptr<SourceData> src;
  84. const char* token_start;
  85. const char* curr_char;
  86. int current_line = 1;
  87. Token prev, curr;
  88. std::queue<Token> nexts;
  89. std::stack<int> indents;
  90. int brackets_level = 0;
  91. Token next_token(){
  92. if(nexts.empty()){
  93. return Token{TK("@error"), token_start, (int)(curr_char - token_start), current_line};
  94. }
  95. Token t = nexts.front();
  96. if(t.type == TK("@eof") && indents.size()>1){
  97. nexts.pop();
  98. indents.pop();
  99. return Token{TK("@dedent"), token_start, 0, current_line};
  100. }
  101. nexts.pop();
  102. return t;
  103. }
  104. inline char peekchar() const{ return *curr_char; }
  105. bool match_n_chars(int n, char c0){
  106. const char* c = curr_char;
  107. for(int i=0; i<n; i++){
  108. if(*c == '\0') return false;
  109. if(*c != c0) return false;
  110. c++;
  111. }
  112. for(int i=0; i<n; i++) eatchar_include_newline();
  113. return true;
  114. }
  115. int eat_spaces(){
  116. int count = 0;
  117. while (true) {
  118. switch (peekchar()) {
  119. case ' ' : count+=1; break;
  120. case '\t': count+=4; break;
  121. default: return count;
  122. }
  123. eatchar();
  124. }
  125. }
  126. bool eat_indentation(){
  127. if(brackets_level > 0) return true;
  128. int spaces = eat_spaces();
  129. if(peekchar() == '#') skip_line_comment();
  130. if(peekchar() == '\0' || peekchar() == '\n' || peekchar() == '\r') return true;
  131. // https://docs.python.org/3/reference/lexical_analysis.html#indentation
  132. if(spaces > indents.top()){
  133. indents.push(spaces);
  134. nexts.push(Token{TK("@indent"), token_start, 0, current_line});
  135. } else if(spaces < indents.top()){
  136. while(spaces < indents.top()){
  137. indents.pop();
  138. nexts.push(Token{TK("@dedent"), token_start, 0, current_line});
  139. }
  140. if(spaces != indents.top()){
  141. return false;
  142. }
  143. }
  144. return true;
  145. }
  146. char eatchar() {
  147. char c = peekchar();
  148. if(c == '\n') throw std::runtime_error("eatchar() cannot consume a newline");
  149. curr_char++;
  150. return c;
  151. }
  152. char eatchar_include_newline() {
  153. char c = peekchar();
  154. curr_char++;
  155. if (c == '\n'){
  156. current_line++;
  157. src->line_starts.push_back(curr_char);
  158. }
  159. return c;
  160. }
  161. int eat_name() {
  162. curr_char--;
  163. while(true){
  164. uint8_t c = peekchar();
  165. int u8bytes = 0;
  166. if((c & 0b10000000) == 0b00000000) u8bytes = 1;
  167. else if((c & 0b11100000) == 0b11000000) u8bytes = 2;
  168. else if((c & 0b11110000) == 0b11100000) u8bytes = 3;
  169. else if((c & 0b11111000) == 0b11110000) u8bytes = 4;
  170. else return 1;
  171. if(u8bytes == 1){
  172. if(isalpha(c) || c=='_' || isdigit(c)) {
  173. curr_char++;
  174. continue;
  175. }else{
  176. break;
  177. }
  178. }
  179. // handle multibyte char
  180. std::string u8str(curr_char, u8bytes);
  181. if(u8str.size() != u8bytes) return 2;
  182. uint32_t value = 0;
  183. for(int k=0; k < u8bytes; k++){
  184. uint8_t b = u8str[k];
  185. if(k==0){
  186. if(u8bytes == 2) value = (b & 0b00011111) << 6;
  187. else if(u8bytes == 3) value = (b & 0b00001111) << 12;
  188. else if(u8bytes == 4) value = (b & 0b00000111) << 18;
  189. }else{
  190. value |= (b & 0b00111111) << (6*(u8bytes-k-1));
  191. }
  192. }
  193. if(is_unicode_Lo_char(value)) curr_char += u8bytes;
  194. else break;
  195. }
  196. int length = (int)(curr_char - token_start);
  197. if(length == 0) return 3;
  198. std::string_view name(token_start, length);
  199. if(src->mode == JSON_MODE){
  200. if(name == "true"){
  201. set_next_token(TK("True"));
  202. } else if(name == "false"){
  203. set_next_token(TK("False"));
  204. } else if(name == "null"){
  205. set_next_token(TK("None"));
  206. } else {
  207. return 4;
  208. }
  209. return 0;
  210. }
  211. if(kTokenKwMap.count(name)){
  212. if(name == "not"){
  213. if(strncmp(curr_char, " in", 3) == 0){
  214. curr_char += 3;
  215. set_next_token(TK("not in"));
  216. return 0;
  217. }
  218. }else if(name == "is"){
  219. if(strncmp(curr_char, " not", 4) == 0){
  220. curr_char += 4;
  221. set_next_token(TK("is not"));
  222. return 0;
  223. }
  224. }
  225. set_next_token(kTokenKwMap.at(name));
  226. } else {
  227. set_next_token(TK("@id"));
  228. }
  229. return 0;
  230. }
  231. void skip_line_comment() {
  232. char c;
  233. while ((c = peekchar()) != '\0') {
  234. if (c == '\n') return;
  235. eatchar();
  236. }
  237. }
  238. bool matchchar(char c) {
  239. if (peekchar() != c) return false;
  240. eatchar_include_newline();
  241. return true;
  242. }
  243. void set_next_token(TokenIndex type, PyObject* value=nullptr) {
  244. switch(type){
  245. case TK("{"): case TK("["): case TK("("): brackets_level++; break;
  246. case TK(")"): case TK("]"): case TK("}"): brackets_level--; break;
  247. }
  248. nexts.push( Token{
  249. type,
  250. token_start,
  251. (int)(curr_char - token_start),
  252. current_line - ((type == TK("@eol")) ? 1 : 0),
  253. value
  254. });
  255. }
  256. void set_next_token_2(char c, TokenIndex one, TokenIndex two) {
  257. if (matchchar(c)) set_next_token(two);
  258. else set_next_token(one);
  259. }
  260. Parser(shared_ptr<SourceData> src) {
  261. this->src = src;
  262. this->token_start = src->source;
  263. this->curr_char = src->source;
  264. this->nexts.push(Token{TK("@sof"), token_start, 0, current_line});
  265. this->indents.push(0);
  266. }
  267. };
  268. } // namespace pkpy