// parser.h — tokenizer / parsing context for the compiler.
  1. #pragma once
  2. #include "obj.h"
  3. typedef uint8_t _TokenType;
  4. constexpr const char* __TOKENS[] = {
  5. "@error", "@eof", "@eol", "@sof",
  6. ".", ",", ":", ";", "#", "(", ")", "[", "]", "{", "}", "%",
  7. "+", "-", "*", "/", "//", "**", "=", ">", "<", "...", "->",
  8. "<<", ">>", "&", "|", "^", "?",
  9. "==", "!=", ">=", "<=",
  10. "+=", "-=", "*=", "/=", "//=",
  11. /** KW_BEGIN **/
  12. "class", "import", "as", "def", "lambda", "pass", "del",
  13. "None", "in", "is", "and", "or", "not", "True", "False", "global",
  14. "goto", "label", // extended keywords, not available in cpython
  15. "while", "for", "if", "elif", "else", "break", "continue", "return", "assert", "raise",
  16. /** KW_END **/
  17. "is not", "not in",
  18. "@id", "@num", "@str", "@fstr",
  19. "@indent", "@dedent"
  20. };
  21. const _TokenType __TOKENS_LEN = sizeof(__TOKENS) / sizeof(__TOKENS[0]);
  22. constexpr _TokenType TK(const char* const token) {
  23. for(int k=0; k<__TOKENS_LEN; k++){
  24. const char* i = __TOKENS[k];
  25. const char* j = token;
  26. while(*i && *j && *i == *j){
  27. i++; j++;
  28. }
  29. if(*i == *j) return k;
  30. }
  31. return 0;
  32. }
  33. #define TK_STR(t) __TOKENS[t]
  34. const _TokenType __KW_BEGIN = TK("class");
  35. const _TokenType __KW_END = TK("raise");
  36. const std::unordered_map<std::string_view, _TokenType> __KW_MAP = [](){
  37. std::unordered_map<std::string_view, _TokenType> map;
  38. for(int k=__KW_BEGIN; k<=__KW_END; k++) map[__TOKENS[k]] = k;
  39. return map;
  40. }();
  41. struct Token{
  42. _TokenType type;
  43. const char* start; //< Begining of the token in the source.
  44. int length; //< Number of chars of the token.
  45. int line; //< Line number of the token (1 based).
  46. PyVar value; //< Literal value of the token.
  47. const _Str str() const {
  48. return _Str(start, length);
  49. }
  50. const _Str info() const {
  51. _StrStream ss;
  52. _Str raw = str();
  53. if (raw == _Str("\n")) raw = "\\n";
  54. ss << line << ": " << TK_STR(type) << " '" << raw << "'";
  55. return ss.str();
  56. }
  57. };
// Operator precedence levels for the expression parser, ordered from lowest
// binding power (PREC_NONE) to highest (PREC_PRIMARY). The parser compares
// these values numerically, so the declaration order is significant.
enum Precedence {
  PREC_NONE,
  PREC_ASSIGNMENT,    // =
  PREC_COMMA,         // ,
  PREC_TERNARY,       // ?:
  PREC_LOGICAL_OR,    // or
  PREC_LOGICAL_AND,   // and
  PREC_EQUALITY,      // == !=
  PREC_TEST,          // in / is (membership and identity tests)
  PREC_COMPARISION,   // < > <= >=  (NOTE: historical misspelling of "comparison"; renaming would break references)
  PREC_BITWISE_OR,    // |
  PREC_BITWISE_XOR,   // ^
  PREC_BITWISE_AND,   // &
  PREC_BITWISE_SHIFT, // << >>
  PREC_TERM,          // + -
  PREC_FACTOR,        // * / % //
  PREC_UNARY,         // - not
  PREC_EXPONENT,      // **
  PREC_CALL,          // ()
  PREC_SUBSCRIPT,     // []
  PREC_ATTRIB,        // .index (attribute access)
  PREC_PRIMARY,
};
  81. // The context of the parsing phase for the compiler.
  82. struct Parser {
  83. _Source src;
  84. const char* token_start;
  85. const char* current_char;
  86. int current_line = 1;
  87. Token previous, current;
  88. std::queue<Token> nexts;
  89. std::stack<int> indents;
  90. int brackets_level_0 = 0;
  91. int brackets_level_1 = 0;
  92. int brackets_level_2 = 0;
  93. Token nextToken(){
  94. if(nexts.empty()) return makeErrToken();
  95. Token t = nexts.front();
  96. if(t.type == TK("@eof") && indents.size()>1){
  97. nexts.pop();
  98. indents.pop();
  99. return Token{TK("@dedent"), token_start, 0, current_line};
  100. }
  101. nexts.pop();
  102. return t;
  103. }
  104. char peekChar() {
  105. return *current_char;
  106. }
  107. char peekNextChar() {
  108. if (peekChar() == '\0') return '\0';
  109. return *(current_char + 1);
  110. }
  111. int eatSpaces(){
  112. int count = 0;
  113. while (true) {
  114. switch (peekChar()) {
  115. case ' ': count++; break;
  116. case '\t': count+=4; break;
  117. default: return count;
  118. }
  119. eatChar();
  120. }
  121. }
  122. bool eatIndentation(){
  123. if(brackets_level_0 > 0 || brackets_level_1 > 0 || brackets_level_2 > 0) return true;
  124. int spaces = eatSpaces();
  125. // https://docs.python.org/3/reference/lexical_analysis.html#indentation
  126. if(spaces > indents.top()){
  127. indents.push(spaces);
  128. nexts.push(Token{TK("@indent"), token_start, 0, current_line});
  129. } else if(spaces < indents.top()){
  130. while(spaces < indents.top()){
  131. indents.pop();
  132. nexts.push(Token{TK("@dedent"), token_start, 0, current_line});
  133. }
  134. if(spaces != indents.top()){
  135. return false;
  136. }
  137. }
  138. return true;
  139. }
  140. char eatChar() {
  141. char c = peekChar();
  142. if(c == '\n') throw std::runtime_error("eatChar() cannot consume a newline");
  143. current_char++;
  144. return c;
  145. }
  146. char eatCharIncludeNewLine() {
  147. char c = peekChar();
  148. current_char++;
  149. if (c == '\n'){
  150. current_line++;
  151. src->lineStarts.push_back(current_char);
  152. }
  153. return c;
  154. }
  155. inline bool isNameStart(char c){
  156. if(isalpha(c) || c=='_') return true;
  157. if(!isascii(c)) return true;
  158. return false;
  159. }
  160. int eatName() {
  161. current_char--;
  162. while(true){
  163. uint8_t c = peekChar();
  164. int u8bytes = 0;
  165. if((c & 0b10000000) == 0b00000000) u8bytes = 1;
  166. else if((c & 0b11100000) == 0b11000000) u8bytes = 2;
  167. else if((c & 0b11110000) == 0b11100000) u8bytes = 3;
  168. else if((c & 0b11111000) == 0b11110000) u8bytes = 4;
  169. else return 1;
  170. if(u8bytes == 1){
  171. if(isalpha(c) || c=='_' || isdigit(c)) {
  172. current_char++;
  173. continue;
  174. }else{
  175. break;
  176. }
  177. }
  178. // handle multibyte char
  179. std::string u8str(current_char, u8bytes);
  180. if(u8str.size() != u8bytes) return 2;
  181. uint32_t value = 0;
  182. for(int k=0; k < u8bytes; k++){
  183. uint8_t b = u8str[k];
  184. if(k==0){
  185. if(u8bytes == 2) value = (b & 0b00011111) << 6;
  186. else if(u8bytes == 3) value = (b & 0b00001111) << 12;
  187. else if(u8bytes == 4) value = (b & 0b00000111) << 18;
  188. }else{
  189. value |= (b & 0b00111111) << (6*(u8bytes-k-1));
  190. }
  191. }
  192. if(__isLoChar(value)) current_char += u8bytes;
  193. else break;
  194. }
  195. int length = (int)(current_char - token_start);
  196. if(length == 0) return 3;
  197. std::string_view name(token_start, length);
  198. if(__KW_MAP.count(name)){
  199. if(name == "not"){
  200. if(strncmp(current_char, " in", 3) == 0){
  201. current_char += 3;
  202. setNextToken(TK("not in"));
  203. return 0;
  204. }
  205. }else if(name == "is"){
  206. if(strncmp(current_char, " not", 4) == 0){
  207. current_char += 4;
  208. setNextToken(TK("is not"));
  209. return 0;
  210. }
  211. }
  212. setNextToken(__KW_MAP.at(name));
  213. } else {
  214. setNextToken(TK("@id"));
  215. }
  216. return 0;
  217. }
  218. void skipLineComment() {
  219. char c;
  220. while ((c = peekChar()) != '\0') {
  221. if (c == '\n') return;
  222. eatChar();
  223. }
  224. }
  225. // If the current char is [c] consume it and advance char by 1 and returns
  226. // true otherwise returns false.
  227. bool matchChar(char c) {
  228. if (peekChar() != c) return false;
  229. eatCharIncludeNewLine();
  230. return true;
  231. }
  232. // Returns an error token from the current position for reporting error.
  233. Token makeErrToken() {
  234. return Token{TK("@error"), token_start, (int)(current_char - token_start), current_line};
  235. }
  236. // Initialize the next token as the type.
  237. void setNextToken(_TokenType type, PyVar value=nullptr) {
  238. switch(type){
  239. case TK("("): brackets_level_0++; break;
  240. case TK(")"): brackets_level_0--; break;
  241. case TK("["): brackets_level_1++; break;
  242. case TK("]"): brackets_level_1--; break;
  243. case TK("{"): brackets_level_2++; break;
  244. case TK("}"): brackets_level_2--; break;
  245. }
  246. nexts.push( Token{
  247. type,
  248. token_start,
  249. (int)(current_char - token_start),
  250. current_line - ((type == TK("@eol")) ? 1 : 0),
  251. value
  252. });
  253. }
  254. void setNextTwoCharToken(char c, _TokenType one, _TokenType two) {
  255. if (matchChar(c)) setNextToken(two);
  256. else setNextToken(one);
  257. }
  258. Parser(_Source src) {
  259. this->src = src;
  260. this->token_start = src->source;
  261. this->current_char = src->source;
  262. this->nexts.push(Token{TK("@sof"), token_start, 0, current_line});
  263. this->indents.push(0);
  264. }
  265. };