// parser.h
  1. #pragma once
  2. #include "obj.h"
  3. typedef uint8_t _TokenType;
  4. constexpr const char* __TOKENS[] = {
  5. "@error", "@eof", "@eol", "@sof",
  6. ".", ",", ":", ";", "#", "(", ")", "[", "]", "{", "}", "%",
  7. "+", "-", "*", "/", "//", "**", "=", ">", "<",
  8. "<<", ">>", "&", "|", "^",
  9. "==", "!=", ">=", "<=",
  10. "+=", "-=", "*=", "/=", "//=",
  11. /** KW_BEGIN **/
  12. "class", "import", "as", "def", "lambda", "pass", "del",
  13. "None", "in", "is", "and", "or", "not", "True", "False", "global",
  14. "goto", "label", // extended keywords, not available in cpython
  15. "while", "for", "if", "elif", "else", "break", "continue", "return", "assert", "raise",
  16. /** KW_END **/
  17. "is not", "not in",
  18. "@id", "@num", "@str", "@fstr",
  19. "@indent", "@dedent"
  20. };
  21. const _TokenType __TOKENS_LEN = sizeof(__TOKENS) / sizeof(__TOKENS[0]);
  22. constexpr _TokenType TK(const char* const token) {
  23. for(int k=0; k<__TOKENS_LEN; k++){
  24. const char* i = __TOKENS[k];
  25. const char* j = token;
  26. while(*i && *j && *i == *j){
  27. i++; j++;
  28. }
  29. if(*i == *j) return k;
  30. }
  31. return 0;
  32. }
  33. #define TK_STR(t) __TOKENS[t]
  34. const _TokenType __KW_BEGIN = TK("class");
  35. const _TokenType __KW_END = TK("raise");
  36. const std::unordered_map<std::string_view, _TokenType> __KW_MAP = [](){
  37. std::unordered_map<std::string_view, _TokenType> map;
  38. for(int k=__KW_BEGIN; k<=__KW_END; k++) map[__TOKENS[k]] = k;
  39. return map;
  40. }();
// A single lexical token produced by the Parser.
struct Token{
    _TokenType type;    //< Index into __TOKENS.
    const char* start;  //< Beginning of the token in the source.
    int length;         //< Number of chars of the token.
    int line;           //< Line number of the token (1 based).
    PyVar value;        //< Literal value of the token.

    // Raw text of the token exactly as it appears in the source.
    const _Str str() const {
        return _Str(start, length);
    }

    // Human-readable "line: TYPE 'text'" form, for diagnostics.
    // A bare newline token is escaped so the output stays on one line.
    const _Str info() const {
        _StrStream ss;
        _Str raw = str();
        if (raw == _Str("\n")) raw = "\\n";
        ss << line << ": " << TK_STR(type) << " '" << raw << "'";
        return ss.str();
    }
};
// Binding powers for expression parsing, listed weakest to strongest.
// An operator with a higher precedence value binds tighter than one below it.
enum Precedence {
    PREC_NONE,
    PREC_ASSIGNMENT,    // =
    PREC_COMMA,         // ,
    PREC_LOGICAL_OR,    // or
    PREC_LOGICAL_AND,   // and
    PREC_EQUALITY,      // == !=
    PREC_TEST,          // in is
    PREC_COMPARISION,   // < > <= >=   (sic: enumerator spelling kept for callers)
    PREC_BITWISE_OR,    // |
    PREC_BITWISE_XOR,   // ^
    PREC_BITWISE_AND,   // &
    PREC_BITWISE_SHIFT, // << >>
    PREC_TERM,          // + -
    PREC_FACTOR,        // * / % //
    PREC_UNARY,         // - not
    PREC_EXPONENT,      // **
    PREC_CALL,          // ()
    PREC_SUBSCRIPT,     // []
    PREC_ATTRIB,        // .index
    PREC_PRIMARY,
};
  80. // The context of the parsing phase for the compiler.
  81. struct Parser {
  82. _Source src;
  83. const char* token_start;
  84. const char* current_char;
  85. int current_line = 1;
  86. Token previous, current;
  87. std::queue<Token> nexts;
  88. std::stack<int> indents;
  89. int brackets_level_0 = 0;
  90. int brackets_level_1 = 0;
  91. Token nextToken(){
  92. if(nexts.empty()) return makeErrToken();
  93. Token t = nexts.front();
  94. if(t.type == TK("@eof") && indents.size()>1){
  95. nexts.pop();
  96. indents.pop();
  97. return Token{TK("@dedent"), token_start, 0, current_line};
  98. }
  99. nexts.pop();
  100. return t;
  101. }
  102. char peekChar() {
  103. return *current_char;
  104. }
  105. char peekNextChar() {
  106. if (peekChar() == '\0') return '\0';
  107. return *(current_char + 1);
  108. }
  109. int eatSpaces(){
  110. int count = 0;
  111. while (true) {
  112. switch (peekChar()) {
  113. case ' ': count++; break;
  114. case '\t': count+=4; break;
  115. default: return count;
  116. }
  117. eatChar();
  118. }
  119. }
  120. bool eatIndentation(){
  121. if(brackets_level_0 > 0 || brackets_level_1 > 0) return true;
  122. int spaces = eatSpaces();
  123. // https://docs.python.org/3/reference/lexical_analysis.html#indentation
  124. if(spaces > indents.top()){
  125. indents.push(spaces);
  126. nexts.push(Token{TK("@indent"), token_start, 0, current_line});
  127. } else if(spaces < indents.top()){
  128. while(spaces < indents.top()){
  129. indents.pop();
  130. nexts.push(Token{TK("@dedent"), token_start, 0, current_line});
  131. }
  132. if(spaces != indents.top()){
  133. return false;
  134. }
  135. }
  136. return true;
  137. }
  138. char eatChar() {
  139. char c = peekChar();
  140. if(c == '\n') throw std::runtime_error("eatChar() cannot consume a newline");
  141. current_char++;
  142. return c;
  143. }
  144. char eatCharIncludeNewLine() {
  145. char c = peekChar();
  146. current_char++;
  147. if (c == '\n'){
  148. current_line++;
  149. src->lineStarts.push_back(current_char);
  150. }
  151. return c;
  152. }
  153. inline bool isNameStart(char c){
  154. if(isalpha(c) || c=='_') return true;
  155. if(!isascii(c)) return true;
  156. return false;
  157. }
  158. int eatName() {
  159. current_char--;
  160. while(true){
  161. uint8_t c = peekChar();
  162. //printf("eatName: %d = %c\n", (int)c, c);
  163. int u8bytes = 0;
  164. if((c & 0b10000000) == 0b00000000) u8bytes = 1;
  165. else if((c & 0b11100000) == 0b11000000) u8bytes = 2;
  166. else if((c & 0b11110000) == 0b11100000) u8bytes = 3;
  167. else if((c & 0b11111000) == 0b11110000) u8bytes = 4;
  168. else return 1;
  169. std::string u8str(current_char, u8bytes);
  170. //printf("%s %d %c\n", u8str.c_str(), u8bytes, c);
  171. if(u8str.size() != u8bytes) return 2;
  172. if(u8bytes == 1){
  173. if(isalpha(c) || c=='_' || isdigit(c)) goto __EAT_ALL_BYTES;
  174. }else{
  175. uint32_t value = 0;
  176. for(int k=0; k < u8bytes; k++){
  177. uint8_t b = u8str[k];
  178. if(k==0){
  179. if(u8bytes == 2) value = (b & 0b00011111) << 6;
  180. else if(u8bytes == 3) value = (b & 0b00001111) << 12;
  181. else if(u8bytes == 4) value = (b & 0b00000111) << 18;
  182. }else{
  183. value |= (b & 0b00111111) << (6*(u8bytes-k-1));
  184. }
  185. }
  186. // printf("value: %d", value);
  187. if(__isLoChar(value)) goto __EAT_ALL_BYTES;
  188. }
  189. break;
  190. __EAT_ALL_BYTES:
  191. current_char += u8bytes;
  192. }
  193. int length = (int)(current_char - token_start);
  194. if(length == 0) return 3;
  195. std::string_view name(token_start, length);
  196. if(__KW_MAP.count(name)){
  197. if(name == "not"){
  198. if(strncmp(current_char, " in", 3) == 0){
  199. current_char += 3;
  200. setNextToken(TK("not in"));
  201. return 0;
  202. }
  203. }else if(name == "is"){
  204. if(strncmp(current_char, " not", 4) == 0){
  205. current_char += 4;
  206. setNextToken(TK("is not"));
  207. return 0;
  208. }
  209. }
  210. setNextToken(__KW_MAP.at(name));
  211. } else {
  212. setNextToken(TK("@id"));
  213. }
  214. return 0;
  215. }
  216. void skipLineComment() {
  217. char c;
  218. while ((c = peekChar()) != '\0') {
  219. if (c == '\n') return;
  220. eatChar();
  221. }
  222. }
  223. // If the current char is [c] consume it and advance char by 1 and returns
  224. // true otherwise returns false.
  225. bool matchChar(char c) {
  226. if (peekChar() != c) return false;
  227. eatCharIncludeNewLine();
  228. return true;
  229. }
  230. // Returns an error token from the current position for reporting error.
  231. Token makeErrToken() {
  232. return Token{TK("@error"), token_start, (int)(current_char - token_start), current_line};
  233. }
  234. // Initialize the next token as the type.
  235. void setNextToken(_TokenType type, PyVar value=nullptr) {
  236. switch(type){
  237. case TK("("): brackets_level_0++; break;
  238. case TK(")"): brackets_level_0--; break;
  239. case TK("["): brackets_level_1++; break;
  240. case TK("]"): brackets_level_1--; break;
  241. }
  242. nexts.push( Token{
  243. type,
  244. token_start,
  245. (int)(current_char - token_start),
  246. current_line - ((type == TK("@eol")) ? 1 : 0),
  247. value
  248. });
  249. }
  250. void setNextTwoCharToken(char c, _TokenType one, _TokenType two) {
  251. if (matchChar(c)) setNextToken(two);
  252. else setNextToken(one);
  253. }
  254. Parser(_Source src) {
  255. this->src = src;
  256. this->token_start = src->source;
  257. this->current_char = src->source;
  258. this->nexts.push(Token{TK("@sof"), token_start, 0, current_line});
  259. this->indents.push(0);
  260. }
  261. };