blueloveTH 2 года назад
Родитель
Commit
e78aa44895
11 измененных файлов с 730 добавлено и 630 удалено
  1. 2 2
      amalgamate.py
  2. 2 2
      src/ceval.h
  3. 10 24
      src/common.h
  4. 87 291
      src/compiler.h
  5. 108 0
      src/expr.h
  6. 4 4
      src/frame.h
  7. 3 3
      src/gc.h
  8. 510 0
      src/lexer.h
  9. 0 302
      src/parser.h
  10. 2 0
      src/pocketpy.h
  11. 2 2
      src/vm.h

+ 2 - 2
amalgamate.py

@@ -6,8 +6,8 @@ with open("src/opcodes.h", "rt", encoding='utf-8') as f:
 	OPCODES_TEXT = f.read()
 
 pipeline = [
-	["common.h", "memory.h", "str.h", "tuplelist.h", "namedict.h", "error.h"],
-	["obj.h", "parser.h", "codeobject.h", "frame.h"],
+	["common.h", "memory.h", "str.h", "tuplelist.h", "namedict.h", "error.h", "lexer.h"],
+	["obj.h", "codeobject.h", "frame.h"],
 	["gc.h", "vm.h", "ref.h", "ceval.h", "compiler.h", "repl.h"],
 	["iter.h", "cffi.h", "io.h", "_generated.h", "pocketpy.h"]
 ]

+ 2 - 2
src/ceval.h

@@ -7,7 +7,7 @@ namespace pkpy{
 
 inline PyObject* VM::run_frame(Frame* frame){
     while(frame->has_next_bytecode()){
-        // heap._auto_collect(this);
+        heap._auto_collect(this);
 
         const Bytecode& byte = frame->next_bytecode();
         switch (byte.op)
@@ -325,7 +325,7 @@ inline PyObject* VM::run_frame(Frame* frame){
         if(frame->_data.size() != 1) throw std::runtime_error("_data.size() != 1 in EVAL/JSON_MODE");
         return frame->pop_value(this);
     }
-#if PK_EXTRA_CHECK
+#if DEBUG_EXTRA_CHECK
     if(!frame->_data.empty()) throw std::runtime_error("_data.size() != 0 in EXEC_MODE");
 #endif
     return None;

+ 10 - 24
src/common.h

@@ -10,7 +10,6 @@
 #include <sstream>
 #include <regex>
 #include <cmath>
-#include <cstdlib>
 #include <stdexcept>
 #include <vector>
 #include <string>
@@ -26,10 +25,13 @@
 #include <algorithm>
 #include <random>
 #include <initializer_list>
-#include <list>
+#include <variant>
 
-#define PK_VERSION				"0.9.5"
-#define PK_EXTRA_CHECK 			0
+#define PK_VERSION				"0.9.6"
+
+// debug macros
+#define DEBUG_NO_BUILTIN_MODULES	0
+#define DEBUG_EXTRA_CHECK			1
 
 #if (defined(__ANDROID__) && __ANDROID_API__ <= 22) || defined(__EMSCRIPTEN__)
 #define PK_ENABLE_FILEIO 		0
@@ -40,13 +42,13 @@
 #if defined(__EMSCRIPTEN__) || defined(__arm__) || defined(__i386__)
 typedef int32_t i64;
 typedef float f64;
-#define S_TO_INT std::stoi
-#define S_TO_FLOAT std::stof
+#define S_TO_INT(...) static_cast<i64>(std::stoi(__VA_ARGS__))
+#define S_TO_FLOAT(...) static_cast<f64>(std::stof(__VA_ARGS__))
 #else
 typedef int64_t i64;
 typedef double f64;
-#define S_TO_INT std::stoll
-#define S_TO_FLOAT std::stod
+#define S_TO_INT(...) static_cast<i64>(std::stoll(__VA_ARGS__))
+#define S_TO_FLOAT(...) static_cast<f64>(std::stod(__VA_ARGS__))
 #endif
 
 namespace pkpy{
@@ -100,22 +102,6 @@ inline bool is_both_int(PyObject* a, PyObject* b) noexcept {
     return is_int(a) && is_int(b);
 }
 
-
-template <typename T>
-class queue{
-	std::list<T> list;
-public:
-	void push(const T& t){ list.push_back(t); }
-	void push(T&& t){ list.push_back(std::move(t)); }
-	void pop(){ list.pop_front(); }
-	void clear(){ list.clear(); }
-	bool empty() const { return list.empty(); }
-	size_t size() const { return list.size(); }
-	T& front(){ return list.front(); }
-	const T& front() const { return list.front(); }
-	const std::list<T>& data() const { return list; }
-};
-
 template <typename T>
 class stack{
 	std::vector<T> vec;

+ 87 - 291
src/compiler.h

@@ -2,7 +2,7 @@
 
 #include "codeobject.h"
 #include "common.h"
-#include "parser.h"
+#include "lexer.h"
 #include "error.h"
 #include "ceval.h"
 
@@ -18,24 +18,21 @@ struct GrammarRule{
     Precedence precedence;
 };
 
-enum StringType { NORMAL_STRING, RAW_STRING, F_STRING };
-
 class Compiler {
-    std::unique_ptr<Parser> parser;
+    std::unique_ptr<Lexer> lexer;
     stack<CodeObject_> codes;
-    int lexing_count = 0;
     bool used = false;
     VM* vm;
     std::map<TokenIndex, GrammarRule> rules;
 
     CodeObject_ co() const{ return codes.top(); }
-    CompileMode mode() const{ return parser->src->mode; }
+    CompileMode mode() const{ return lexer->src->mode; }
     NameScope name_scope() const { return codes.size()>1 ? NAME_LOCAL : NAME_GLOBAL; }
 
 public:
     Compiler(VM* vm, const char* source, Str filename, CompileMode mode){
         this->vm = vm;
-        this->parser = std::make_unique<Parser>(
+        this->lexer = std::make_unique<Lexer>(
             make_sp<SourceData>(source, filename, mode)
         );
 
@@ -104,239 +101,36 @@ public:
     }
 
 private:
-    Str eat_string_until(char quote, bool raw) {
-        bool quote3 = parser->match_n_chars(2, quote);
-        std::vector<char> buff;
-        while (true) {
-            char c = parser->eatchar_include_newline();
-            if (c == quote){
-                if(quote3 && !parser->match_n_chars(2, quote)){
-                    buff.push_back(c);
-                    continue;
-                }
-                break;
-            }
-            if (c == '\0'){
-                if(quote3 && parser->src->mode == REPL_MODE){
-                    throw NeedMoreLines(false);
-                }
-                SyntaxError("EOL while scanning string literal");
-            }
-            if (c == '\n'){
-                if(!quote3) SyntaxError("EOL while scanning string literal");
-                else{
-                    buff.push_back(c);
-                    continue;
-                }
-            }
-            if (!raw && c == '\\') {
-                switch (parser->eatchar_include_newline()) {
-                    case '"':  buff.push_back('"');  break;
-                    case '\'': buff.push_back('\''); break;
-                    case '\\': buff.push_back('\\'); break;
-                    case 'n':  buff.push_back('\n'); break;
-                    case 'r':  buff.push_back('\r'); break;
-                    case 't':  buff.push_back('\t'); break;
-                    default: SyntaxError("invalid escape char");
-                }
-            } else {
-                buff.push_back(c);
-            }
-        }
-        return Str(buff.data(), buff.size());
-    }
-
-    void eat_string(char quote, StringType type) {
-        Str s = eat_string_until(quote, type == RAW_STRING);
-        if(type == F_STRING){
-            parser->set_next_token(TK("@fstr"), VAR(s));
-        }else{
-            parser->set_next_token(TK("@str"), VAR(s));
-        }
-    }
-
-    void eat_number() {
-        static const std::regex pattern("^(0x)?[0-9a-fA-F]+(\\.[0-9]+)?");
-        std::smatch m;
-
-        const char* i = parser->token_start;
-        while(*i != '\n' && *i != '\0') i++;
-        std::string s = std::string(parser->token_start, i);
-
-        try{
-            if (std::regex_search(s, m, pattern)) {
-                // here is m.length()-1, since the first char was eaten by lex_token()
-                for(int j=0; j<m.length()-1; j++) parser->eatchar();
-
-                int base = 10;
-                size_t size;
-                if (m[1].matched) base = 16;
-                if (m[2].matched) {
-                    if(base == 16) SyntaxError("hex literal should not contain a dot");
-                    parser->set_next_token(TK("@num"), VAR(S_TO_FLOAT(m[0], &size)));
-                } else {
-                    parser->set_next_token(TK("@num"), VAR(S_TO_INT(m[0], &size, base)));
-                }
-                if (size != m.length()) UNREACHABLE();
-            }
-        }catch(std::exception& _){
-            SyntaxError("invalid number literal");
-        } 
-    }
-
-    void lex_token(){
-        lexing_count++;
-        _lex_token();
-        lexing_count--;
-    }
-
-    // Lex the next token and set it as the next token.
-    void _lex_token() {
-        parser->prev = parser->curr;
-        parser->curr = parser->next_token();
-        //std::cout << parser->curr.info() << std::endl;
-
-        while (parser->peekchar() != '\0') {
-            parser->token_start = parser->curr_char;
-            char c = parser->eatchar_include_newline();
-            switch (c) {
-                case '\'': case '"': eat_string(c, NORMAL_STRING); return;
-                case '#': parser->skip_line_comment(); break;
-                case '{': parser->set_next_token(TK("{")); return;
-                case '}': parser->set_next_token(TK("}")); return;
-                case ',': parser->set_next_token(TK(",")); return;
-                case ':': parser->set_next_token_2(':', TK(":"), TK("::")); return;
-                case ';': parser->set_next_token(TK(";")); return;
-                case '(': parser->set_next_token(TK("(")); return;
-                case ')': parser->set_next_token(TK(")")); return;
-                case '[': parser->set_next_token(TK("[")); return;
-                case ']': parser->set_next_token(TK("]")); return;
-                case '@': parser->set_next_token(TK("@")); return;
-                case '%': parser->set_next_token_2('=', TK("%"), TK("%=")); return;
-                case '&': parser->set_next_token_2('=', TK("&"), TK("&=")); return;
-                case '|': parser->set_next_token_2('=', TK("|"), TK("|=")); return;
-                case '^': parser->set_next_token_2('=', TK("^"), TK("^=")); return;
-                case '?': parser->set_next_token(TK("?")); return;
-                case '.': {
-                    if(parser->matchchar('.')) {
-                        if(parser->matchchar('.')) {
-                            parser->set_next_token(TK("..."));
-                        } else {
-                            SyntaxError("invalid token '..'");
-                        }
-                    } else {
-                        parser->set_next_token(TK("."));
-                    }
-                    return;
-                }
-                case '=': parser->set_next_token_2('=', TK("="), TK("==")); return;
-                case '+': parser->set_next_token_2('=', TK("+"), TK("+=")); return;
-                case '>': {
-                    if(parser->matchchar('=')) parser->set_next_token(TK(">="));
-                    else if(parser->matchchar('>')) parser->set_next_token_2('=', TK(">>"), TK(">>="));
-                    else parser->set_next_token(TK(">"));
-                    return;
-                }
-                case '<': {
-                    if(parser->matchchar('=')) parser->set_next_token(TK("<="));
-                    else if(parser->matchchar('<')) parser->set_next_token_2('=', TK("<<"), TK("<<="));
-                    else parser->set_next_token(TK("<"));
-                    return;
-                }
-                case '-': {
-                    if(parser->matchchar('=')) parser->set_next_token(TK("-="));
-                    else if(parser->matchchar('>')) parser->set_next_token(TK("->"));
-                    else parser->set_next_token(TK("-"));
-                    return;
-                }
-                case '!':
-                    if(parser->matchchar('=')) parser->set_next_token(TK("!="));
-                    else SyntaxError("expected '=' after '!'");
-                    break;
-                case '*':
-                    if (parser->matchchar('*')) {
-                        parser->set_next_token(TK("**"));  // '**'
-                    } else {
-                        parser->set_next_token_2('=', TK("*"), TK("*="));
-                    }
-                    return;
-                case '/':
-                    if(parser->matchchar('/')) {
-                        parser->set_next_token_2('=', TK("//"), TK("//="));
-                    } else {
-                        parser->set_next_token_2('=', TK("/"), TK("/="));
-                    }
-                    return;
-                case '\r': break;       // just ignore '\r'
-                case ' ': case '\t': parser->eat_spaces(); break;
-                case '\n': {
-                    parser->set_next_token(TK("@eol"));
-                    if(!parser->eat_indentation()) IndentationError("unindent does not match any outer indentation level");
-                    return;
-                }
-                default: {
-                    if(c == 'f'){
-                        if(parser->matchchar('\'')) {eat_string('\'', F_STRING); return;}
-                        if(parser->matchchar('"')) {eat_string('"', F_STRING); return;}
-                    }else if(c == 'r'){
-                        if(parser->matchchar('\'')) {eat_string('\'', RAW_STRING); return;}
-                        if(parser->matchchar('"')) {eat_string('"', RAW_STRING); return;}
-                    }
+    int i = 0;  // index of the current token inside `tokens`; read by prev()/curr()/next()/peek(), incremented by advance()
+    std::vector<Token> tokens;  // token buffer; assigned from lexer->run() in compile()
 
-                    if (c >= '0' && c <= '9') {
-                        eat_number();
-                        return;
-                    }
-                    
-                    switch (parser->eat_name())
-                    {
-                        case 0: break;
-                        case 1: SyntaxError("invalid char: " + std::string(1, c));
-                        case 2: SyntaxError("invalid utf8 sequence: " + std::string(1, c));
-                        case 3: SyntaxError("@id contains invalid char"); break;
-                        case 4: SyntaxError("invalid JSON token"); break;
-                        default: UNREACHABLE();
-                    }
-                    return;
-                }
-            }
-        }
-
-        parser->token_start = parser->curr_char;
-        parser->set_next_token(TK("@eof"));
-    }
-
-    TokenIndex peek() {
-        return parser->curr.type;
-    }
-
-    // not sure this will work
-    TokenIndex peek_next() {
-        if(parser->nexts.empty()) return TK("@eof");
-        return parser->nexts.front().type;
-    }
+    const Token& prev() { return tokens.at(i-1); }  // token just before the cursor; at() throws std::out_of_range when i-1 is not a valid index
+    const Token& curr() { return tokens.at(i); }  // token at the cursor; at() throws std::out_of_range when i is past the end
+    const Token& next() { return tokens.at(i+1); }  // one-token lookahead; at() throws std::out_of_range when there is no token after the cursor
+    const Token& peek(int offset=0) { return tokens.at(i+offset); }  // token at cursor+offset (default 0, i.e. the same token as curr()); bounds-checked by at()
+    void advance() { i++; }  // moves the cursor one token forward; no bounds check here, the accessors check on the next read
 
     bool match(TokenIndex expected) {
-        if (peek() != expected) return false;
-        lex_token();
+        if (curr().type != expected) return false;
+        advance();
         return true;
     }
 
     void consume(TokenIndex expected) {
         if (!match(expected)){
             StrStream ss;
-            ss << "expected '" << TK_STR(expected) << "', but got '" << TK_STR(peek()) << "'";
+            ss << "expected '" << TK_STR(expected) << "', but got '" << TK_STR(curr().type) << "'";
             SyntaxError(ss.str());
         }
     }
 
     bool match_newlines(bool repl_throw=false) {
         bool consumed = false;
-        if (peek() == TK("@eol")) {
-            while (peek() == TK("@eol")) lex_token();
+        if (curr().type == TK("@eol")) {
+            while (curr().type == TK("@eol")) advance();
             consumed = true;
         }
-        if (repl_throw && peek() == TK("@eof")){
+        if (repl_throw && curr().type == TK("@eof")){
             throw NeedMoreLines(co()->_is_compiling_class);
         }
         return consumed;
@@ -344,8 +138,8 @@ private:
 
     bool match_end_stmt() {
         if (match(TK(";"))) { match_newlines(); return true; }
-        if (match_newlines() || peek()==TK("@eof")) return true;
-        if (peek() == TK("@dedent")) return true;
+        if (match_newlines() || curr().type == TK("@eof")) return true;
+        if (curr().type == TK("@dedent")) return true;
         return false;
     }
 
@@ -353,15 +147,27 @@ private:
         if (!match_end_stmt()) SyntaxError("expected statement end");
     }
 
+    PyObject* get_value(const Token& token) {  // converts the payload of a literal token (@num, @str, @fstr) into a PyObject* via VAR
+        switch (token.type) {
+            case TK("@num"):  // numeric literal: the variant is tested for i64 first, then f64
+                if(std::holds_alternative<i64>(token.value)) return VAR(std::get<i64>(token.value));
+                if(std::holds_alternative<f64>(token.value)) return VAR(std::get<f64>(token.value));
+                UNREACHABLE();  // a @num token holding neither i64 nor f64; NOTE(review): assumes UNREACHABLE() never returns, otherwise control falls through to the string case
+            case TK("@str"): case TK("@fstr"):  // both string kinds read the Str alternative
+                return VAR(std::get<Str>(token.value));  // std::get throws std::bad_variant_access if the payload is not a Str
+            default: throw std::runtime_error(Str("invalid token type: ") + TK_STR(token.type));  // every other token kind is rejected
+        }
+    }
+
     void exprLiteral() {
-        PyObject* value = parser->prev.value;
+        PyObject* value = get_value(prev());
         int index = co()->add_const(value);
         emit(OP_LOAD_CONST, index);
     }
 
     void exprFString() {
         static const std::regex pattern(R"(\{(.*?)\})");
-        PyObject* value = parser->prev.value;
+        PyObject* value = get_value(prev());
         Str s = CAST(Str, value);
         std::sregex_iterator begin(s.begin(), s.end(), pattern);
         std::sregex_iterator end;
@@ -395,7 +201,7 @@ private:
             _compile_f_args(func, false);
             consume(TK(":"));
         }
-        func.code = make_sp<CodeObject>(parser->src, func.name.str());
+        func.code = make_sp<CodeObject>(lexer->src, func.name.str());
         this->codes.push(func.code);
         co()->_rvalue += 1; EXPR(); co()->_rvalue -= 1;
         emit(OP_RETURN_VALUE);
@@ -414,7 +220,7 @@ private:
         if(is_load_name_ref) co()->codes.pop_back();
 
         co()->_rvalue += 1;
-        TokenIndex op = parser->prev.type;
+        TokenIndex op = prev().type;
         if(op == TK("=")) {     // a = (expr)
             EXPR_TUPLE();
             if(is_load_name_ref){
@@ -487,7 +293,7 @@ private:
     }
 
     void exprBinaryOp() {
-        TokenIndex op = parser->prev.type;
+        TokenIndex op = prev().type;
         parse_expression((Precedence)(rules[op].precedence + 1));
 
         switch (op) {
@@ -525,7 +331,7 @@ private:
     }
 
     void exprUnaryOp() {
-        TokenIndex op = parser->prev.type;
+        TokenIndex op = prev().type;
         parse_expression((Precedence)(PREC_UNARY + 1));
         switch (op) {
             case TK("-"):     emit(OP_UNARY_NEGATIVE); break;
@@ -588,7 +394,7 @@ private:
         int ARGC = 0;
         do {
             match_newlines(mode()==REPL_MODE);
-            if (peek() == TK("]")) break;
+            if (curr().type == TK("]")) break;
             EXPR(); ARGC++;
             match_newlines(mode()==REPL_MODE);
             if(ARGC == 1 && match(TK("for"))){
@@ -609,9 +415,9 @@ private:
         int ARGC = 0;
         do {
             match_newlines(mode()==REPL_MODE);
-            if (peek() == TK("}")) break;
+            if (curr().type == TK("}")) break;
             EXPR();
-            if(peek() == TK(":")) parsing_dict = true;
+            if(curr().type == TK(":")) parsing_dict = true;
             if(parsing_dict){
                 consume(TK(":"));
                 EXPR();
@@ -637,10 +443,10 @@ private:
         bool need_unpack = false;
         do {
             match_newlines(mode()==REPL_MODE);
-            if (peek() == TK(")")) break;
-            if(peek() == TK("@id") && peek_next() == TK("=")) {
+            if (curr().type == TK(")")) break;
+            if(curr().type == TK("@id") && next().type == TK("=")) {
                 consume(TK("@id"));
-                const Str& key = parser->prev.str();
+                const Str& key = prev().str();
                 emit(OP_LOAD_CONST, co()->add_const(VAR(key)));
                 consume(TK("="));
                 co()->_rvalue += 1; EXPR(); co()->_rvalue -= 1;
@@ -666,7 +472,7 @@ private:
     void exprName(){ _exprName(false); }
 
     void _exprName(bool force_lvalue) {
-        Token tkname = parser->prev;
+        const Token& tkname = prev();
         int index = co()->add_name(tkname.str(), name_scope());
         bool fast_load = !force_lvalue && co()->_rvalue>0;
         emit(fast_load ? OP_LOAD_NAME : OP_LOAD_NAME_REF, index);
@@ -674,7 +480,7 @@ private:
 
     void exprAttrib() {
         consume(TK("@id"));
-        const Str& name = parser->prev.str();
+        const Str& name = prev().str();
         int index = co()->add_name(name, NAME_ATTR);
         emit(co()->_rvalue ? OP_BUILD_ATTR : OP_BUILD_ATTR_REF, index);
     }
@@ -710,7 +516,7 @@ private:
     }
 
     void exprValue() {
-        TokenIndex op = parser->prev.type;
+        TokenIndex op = prev().type;
         switch (op) {
             case TK("None"):    emit(OP_LOAD_NONE);  break;
             case TK("True"):    emit(OP_LOAD_TRUE);  break;
@@ -721,7 +527,7 @@ private:
     }
 
     int emit(Opcode opcode, int arg=-1, bool keepline=false) {
-        int line = parser->prev.line;
+        int line = prev().line;
         co()->codes.push_back(
             Bytecode{(uint8_t)opcode, (uint16_t)co()->_curr_block_i, arg, line}
         );
@@ -738,7 +544,7 @@ private:
     void compile_block_body(CompilerAction action=nullptr) {
         if(action == nullptr) action = &Compiler::compile_stmt;
         consume(TK(":"));
-        if(peek()!=TK("@eol") && peek()!=TK("@eof")){
+        if(curr().type!=TK("@eol") && curr().type!=TK("@eof")){
             (this->*action)();  // inline block
             return;
         }
@@ -746,7 +552,7 @@ private:
             SyntaxError("expected a new line after ':'");
         }
         consume(TK("@indent"));
-        while (peek() != TK("@dedent")) {
+        while (curr().type != TK("@dedent")) {
             match_newlines();
             (this->*action)();
             match_newlines();
@@ -756,7 +562,7 @@ private:
 
     Token _compile_import() {
         consume(TK("@id"));
-        Token tkmodule = parser->prev;
+        Token tkmodule = prev();
         int index = co()->add_name(tkmodule.str(), NAME_SPECIAL);
         emit(OP_IMPORT_NAME, index);
         return tkmodule;
@@ -768,7 +574,7 @@ private:
             Token tkmodule = _compile_import();
             if (match(TK("as"))) {
                 consume(TK("@id"));
-                tkmodule = parser->prev;
+                tkmodule = prev();
             }
             int index = co()->add_name(tkmodule.str(), name_scope());
             emit(OP_STORE_NAME, index);
@@ -789,12 +595,12 @@ private:
         do {
             emit(OP_DUP_TOP_VALUE);
             consume(TK("@id"));
-            Token tkname = parser->prev;
+            Token tkname = prev();
             int index = co()->add_name(tkname.str(), NAME_ATTR);
             emit(OP_BUILD_ATTR, index);
             if (match(TK("as"))) {
                 consume(TK("@id"));
-                tkname = parser->prev;
+                tkname = prev();
             }
             index = co()->add_name(tkname.str(), name_scope());
             emit(OP_STORE_NAME, index);
@@ -807,14 +613,14 @@ private:
     // ['a', '1', '2', '+', '=']
     // 
     void parse_expression(Precedence precedence) {
-        lex_token();
-        GrammarFn prefix = rules[parser->prev.type].prefix;
-        if (prefix == nullptr) SyntaxError(Str("expected an expression, but got ") + TK_STR(parser->prev.type));
+        advance();
+        GrammarFn prefix = rules[prev().type].prefix;
+        if (prefix == nullptr) SyntaxError(Str("expected an expression, but got ") + TK_STR(prev().type));
         (this->*prefix)();
         bool meet_assign_token = false;
-        while (rules[peek()].precedence >= precedence) {
-            lex_token();
-            TokenIndex op = parser->prev.type;
+        while (rules[curr().type].precedence >= precedence) {
+            advance();
+            TokenIndex op = prev().type;
             if (op == TK("=")){
                 if(meet_assign_token) SyntaxError();
                 meet_assign_token = true;
@@ -891,7 +697,7 @@ private:
         do {
             consume(TK("except"));
             if(match(TK("@id"))){
-                int name_idx = co()->add_name(parser->prev.str(), NAME_SPECIAL);
+                int name_idx = co()->add_name(prev().str(), NAME_SPECIAL);
                 emit(OP_EXCEPTION_MATCH, name_idx);
             }else{
                 emit(OP_LOAD_TRUE);
@@ -901,7 +707,7 @@ private:
             compile_block_body();
             patches.push_back(emit(OP_JUMP_ABSOLUTE));
             patch_jump(patch);
-        }while(peek() == TK("except"));
+        }while(curr().type == TK("except"));
         emit(OP_RE_RAISE);      // no match, re-raise
         for (int patch : patches) patch_jump(patch);
     }
@@ -968,7 +774,7 @@ private:
             EXPR();
             consume(TK("as"));
             consume(TK("@id"));
-            Token tkname = parser->prev;
+            Token tkname = prev();
             int index = co()->add_name(tkname.str(), name_scope());
             emit(OP_STORE_NAME, index);
             emit(OP_LOAD_NAME_REF, index);
@@ -979,18 +785,18 @@ private:
         } else if(match(TK("label"))){
             if(mode() != EXEC_MODE) SyntaxError("'label' is only available in EXEC_MODE");
             consume(TK(".")); consume(TK("@id"));
-            Str label = parser->prev.str();
+            Str label = prev().str();
             bool ok = co()->add_label(label);
             if(!ok) SyntaxError("label '" + label + "' already exists");
             consume_end_stmt();
         } else if(match(TK("goto"))){ // https://entrian.com/goto/
             if(mode() != EXEC_MODE) SyntaxError("'goto' is only available in EXEC_MODE");
             consume(TK(".")); consume(TK("@id"));
-            emit(OP_GOTO, co()->add_name(parser->prev.str(), NAME_SPECIAL));
+            emit(OP_GOTO, co()->add_name(prev().str(), NAME_SPECIAL));
             consume_end_stmt();
         } else if(match(TK("raise"))){
             consume(TK("@id"));
-            int dummy_t = co()->add_name(parser->prev.str(), NAME_SPECIAL);
+            int dummy_t = co()->add_name(prev().str(), NAME_SPECIAL);
             if(match(TK("(")) && !match(TK(")"))){
                 EXPR(); consume(TK(")"));
             }else{
@@ -1005,7 +811,7 @@ private:
         } else if(match(TK("global"))){
             do {
                 consume(TK("@id"));
-                co()->global_names[parser->prev.str()] = 1;
+                co()->global_names[prev().str()] = 1;
             } while (match(TK(",")));
             consume_end_stmt();
         } else if(match(TK("pass"))){
@@ -1030,10 +836,10 @@ private:
 
     void compile_class(){
         consume(TK("@id"));
-        int cls_name_idx = co()->add_name(parser->prev.str(), NAME_GLOBAL);
+        int cls_name_idx = co()->add_name(prev().str(), NAME_GLOBAL);
         int super_cls_name_idx = -1;
         if(match(TK("(")) && match(TK("@id"))){
-            super_cls_name_idx = co()->add_name(parser->prev.str(), NAME_GLOBAL);
+            super_cls_name_idx = co()->add_name(prev().str(), NAME_GLOBAL);
             consume(TK(")"));
         }
         if(super_cls_name_idx == -1) emit(OP_LOAD_NONE);
@@ -1059,13 +865,13 @@ private:
             }
 
             consume(TK("@id"));
-            const Str& name = parser->prev.str();
+            const Str& name = prev().str();
             if(func.has_name(name)) SyntaxError("duplicate argument name");
 
             // eat type hints
             if(enable_type_hints && match(TK(":"))) consume(TK("@id"));
 
-            if(state == 0 && peek() == TK("=")) state = 2;
+            if(state == 0 && curr().type == TK("=")) state = 2;
 
             switch (state)
             {
@@ -1075,7 +881,7 @@ private:
                     consume(TK("="));
                     PyObject* value = read_literal();
                     if(value == nullptr){
-                        SyntaxError(Str("expect a literal, not ") + TK_STR(parser->curr.type));
+                        SyntaxError(Str("expect a literal, not ") + TK_STR(curr().type));
                     }
                     func.kwargs.set(name, value);
                     func.kwargs_order.push_back(name);
@@ -1090,11 +896,11 @@ private:
         Function func;
         StrName obj_name;
         consume(TK("@id"));
-        func.name = parser->prev.str();
+        func.name = prev().str();
         if(!co()->_is_compiling_class && match(TK("::"))){
             consume(TK("@id"));
             obj_name = func.name;
-            func.name = parser->prev.str();
+            func.name = prev().str();
         }
         consume(TK("("));
         if (!match(TK(")"))) {
@@ -1104,7 +910,7 @@ private:
         if(match(TK("->"))){
             if(!match(TK("None"))) consume(TK("@id"));
         }
-        func.code = make_sp<CodeObject>(parser->src, func.name.str());
+        func.code = make_sp<CodeObject>(lexer->src, func.name.str());
         this->codes.push(func.code);
         compile_block_body();
         func.code->optimize(vm);
@@ -1132,11 +938,11 @@ private:
     PyObject* read_literal(){
         if(match(TK("-"))){
             consume(TK("@num"));
-            PyObject* val = parser->prev.value;
+            PyObject* val = get_value(prev());
             return vm->num_negated(val);
         }
-        if(match(TK("@num"))) return parser->prev.value;
-        if(match(TK("@str"))) return parser->prev.value;
+        if(match(TK("@num"))) return get_value(prev());
+        if(match(TK("@str"))) return get_value(prev());
         if(match(TK("True"))) return VAR(true);
         if(match(TK("False"))) return VAR(false);
         if(match(TK("None"))) return vm->None;
@@ -1144,23 +950,8 @@ private:
         return nullptr;
     }
 
-    /***** Error Reporter *****/
-    void throw_err(Str type, Str msg){
-        int lineno = parser->curr.line;
-        const char* cursor = parser->curr.start;
-        // if error occurs in lexing, lineno should be `parser->current_line`
-        if(lexing_count > 0){
-            lineno = parser->current_line;
-            cursor = parser->curr_char;
-        }
-        if(parser->peekchar() == '\n') lineno--;
-        auto e = Exception("SyntaxError", msg);
-        e.st_push(parser->src->snapshot(lineno, cursor));
-        throw e;
-    }
-    void SyntaxError(Str msg){ throw_err("SyntaxError", msg); }
-    void SyntaxError(){ throw_err("SyntaxError", "invalid syntax"); }
-    void IndentationError(Str msg){ throw_err("IndentationError", msg); }
+    void SyntaxError(Str msg){ lexer->throw_err("SyntaxError", msg, curr().line, curr().start); }  // forwards msg to lexer->throw_err together with the current token's line and start; presumably throw_err throws - confirm in lexer.h
+    void SyntaxError(){ lexer->throw_err("SyntaxError", "invalid syntax", curr().line, curr().start); }  // same as above with the fixed message "invalid syntax"
 
 public:
     CodeObject_ compile(){
@@ -1168,11 +959,16 @@ public:
         if(used) UNREACHABLE();
         used = true;
 
-        CodeObject_ code = make_sp<CodeObject>(parser->src, Str("<module>"));
+        tokens = lexer->run();
+        // if(lexer->src->filename == "tests/01_int.py"){
+        //     for(auto& t: tokens) std::cout << t.info() << std::endl;
+        // }
+
+        CodeObject_ code = make_sp<CodeObject>(lexer->src, lexer->src->filename);
         codes.push(code);
 
-        lex_token(); lex_token();
-        match_newlines();
+        advance();          // skip @sof, so prev() is always valid
+        match_newlines();   // skip leading '\n'
 
         if(mode()==EVAL_MODE) {
             EXPR_TUPLE();

+ 108 - 0
src/expr.h

@@ -0,0 +1,108 @@
+#pragma once
+
+#include "codeobject.h"
+#include "common.h"
+#include "parser.h"
+#include "error.h"
+#include "ceval.h"
+#include <memory>
+
+namespace pkpy{
+
+struct Expression;
+typedef std::unique_ptr<Expression> Expression_;
+
+// Abstract base of every expression node.
+// Nodes are owned through Expression_ (std::unique_ptr<Expression>), so the
+// destructor must be virtual: destroying a derived node through a base pointer
+// is undefined behavior otherwise.
+struct Expression{
+    std::vector<Expression_> children;
+
+    Expression() = default;
+    // Declaring the destructor suppresses the implicit move operations,
+    // so they are defaulted explicitly to keep nodes movable.
+    Expression(Expression&&) = default;
+    Expression& operator=(Expression&&) = default;
+    virtual ~Expression() = default;
+
+    // Short textual label for this node.
+    virtual Str to_string() const = 0;
+};
+
+// Expression node holding a name together with a NameScope value.
+struct NameExpr: Expression{
+    Str name;           // also the text returned by to_string()
+    NameScope scope;
+    NameExpr(Str name, NameScope scope): name(name), scope(scope) {}
+    Str to_string() const override { return name; }
+};
+
+// Expression node that owns exactly one wrapped sub-expression.
+struct GroupExpr: Expression{
+    Expression_ expr;   // owned; moved in by the constructor
+    GroupExpr(Expression_ expr): expr(std::move(expr)) {}
+    Str to_string() const override { return "()"; }
+};
+
+// Expression node tagged with an operator token index.
+struct UnaryExpr: Expression{
+    TokenIndex op;
+    UnaryExpr(TokenIndex op): op(op) {}
+    // Returns the spelling of `op` as looked up by TK_STR.
+    Str to_string() const override { return TK_STR(op); }
+};
+
+// Node labelled "not"; adds no data beyond the base class.
+struct NotExpr: Expression{
+    Str to_string() const override { return "not"; }
+};
+
+// Node labelled "and"; adds no data beyond the base class.
+struct AndExpr: Expression{
+    Str to_string() const override { return "and"; }
+};
+
+// Node labelled "or"; adds no data beyond the base class.
+struct OrExpr: Expression{
+    Str to_string() const override { return "or"; }
+};
+
+// None, True, False, ...
+// The value is identified by its token index rather than by a stored object.
+struct SpecialValueExpr: Expression{
+    TokenIndex token;
+    SpecialValueExpr(TokenIndex token): token(token) {}
+    // Returns the spelling of `token` as looked up by TK_STR.
+    Str to_string() const override { return TK_STR(token); }
+};
+
+// @num, @str which needs to invoke OP_LOAD_CONST
+// Holds the literal as a raw PyObject pointer; nothing in this struct releases it.
+struct LiteralExpr: Expression{
+    PyObject* value;
+    LiteralExpr(PyObject* value): value(value) {}
+    Str to_string() const override { return "literal"; }
+};
+
+// Node labelled "[]"; adds no data beyond the base class.
+struct ListExpr: Expression{
+    Str to_string() const override { return "[]"; }
+};
+
+// Node labelled "{}"; adds no data beyond the base class.
+struct DictExpr: Expression{
+    Str to_string() const override { return "{}"; }
+};
+
+// Node labelled "lambda"; adds no data beyond the base class.
+struct LambdaExpr: Expression{
+    Str to_string() const override { return "lambda"; }
+};
+
+// Node labelled "@fstr" (the lexer's f-string token name); adds no data beyond the base class.
+struct FStringExpr: Expression{
+    Str to_string() const override { return "@fstr"; }
+};
+
+// Node labelled "."; adds no data beyond the base class.
+struct AttribExpr: Expression{
+    Str to_string() const override { return "."; }
+};
+
+// Node labelled "()" (the same label GroupExpr uses); adds no data beyond the base class.
+struct CallExpr: Expression{
+    Str to_string() const override { return "()"; }
+};
+
+// Expression node tagged with an operator token index.
+struct BinaryExpr: Expression{
+    TokenIndex op;
+    BinaryExpr(TokenIndex op): op(op) {}
+    // Returns the spelling of `op` as looked up by TK_STR.
+    Str to_string() const override { return TK_STR(op); }
+};
+
+// Node labelled "?"; adds no data beyond the base class.
+struct TernaryExpr: Expression{
+    Str to_string() const override { return "?"; }
+};
+
+// Node labelled "="; adds no data beyond the base class.
+struct AssignExpr: Expression{
+    Str to_string() const override { return "="; }
+};
+
+// Node labelled ","; adds no data beyond the base class.
+struct CommaExpr: Expression{
+    Str to_string() const override { return ","; }
+};
+
+
+} // namespace pkpy

+ 4 - 4
src/frame.h

@@ -58,7 +58,7 @@ struct Frame {
     }
 
     PyObject* pop(){
-#if PK_EXTRA_CHECK
+#if DEBUG_EXTRA_CHECK
         if(_data.empty()) throw std::runtime_error("_data.empty() is true");
 #endif
         PyObject* v = _data.back();
@@ -67,7 +67,7 @@ struct Frame {
     }
 
     void _pop(){
-#if PK_EXTRA_CHECK
+#if DEBUG_EXTRA_CHECK
         if(_data.empty()) throw std::runtime_error("_data.empty() is true");
 #endif
         _data.pop_back();
@@ -88,14 +88,14 @@ struct Frame {
     }
 
     PyObject*& top(){
-#if PK_EXTRA_CHECK
+#if DEBUG_EXTRA_CHECK
         if(_data.empty()) throw std::runtime_error("_data.empty() is true");
 #endif
         return _data.back();
     }
 
     PyObject*& top_1(){
-#if PK_EXTRA_CHECK
+#if DEBUG_EXTRA_CHECK
         if(_data.size() < 2) throw std::runtime_error("_data.size() < 2");
 #endif
         return _data[_data.size()-2];

+ 3 - 3
src/gc.h

@@ -67,9 +67,9 @@ struct ManagedHeap{
 
     ~ManagedHeap(){
         for(PyObject* obj: _no_gc) delete obj;
-        for(auto& [type, count]: deleted){
-            std::cout << "GC: " << type << "=" << count << std::endl;
-        }
+        // for(auto& [type, count]: deleted){
+        //     std::cout << "GC: " << type << "=" << count << std::endl;
+        // }
     }
 
     int sweep(VM* vm){

+ 510 - 0
src/lexer.h

@@ -0,0 +1,510 @@
+#pragma once
+
+#include "common.h"
+#include "error.h"
+#include "str.h"
+
+namespace pkpy{
+
+typedef uint8_t TokenIndex;
+
+constexpr const char* kTokens[] = {
+    "@eof", "@eol", "@sof",
+    ".", ",", ":", ";", "#", "(", ")", "[", "]", "{", "}", "%", "::",
+    "+", "-", "*", "/", "//", "**", "=", ">", "<", "...", "->",
+    "<<", ">>", "&", "|", "^", "?", "@",
+    "==", "!=", ">=", "<=",
+    "+=", "-=", "*=", "/=", "//=", "%=", "&=", "|=", "^=", ">>=", "<<=",
+    /** KW_BEGIN **/
+    "class", "import", "as", "def", "lambda", "pass", "del", "from", "with", "yield",
+    "None", "in", "is", "and", "or", "not", "True", "False", "global", "try", "except", "finally",
+    "goto", "label",      // extended keywords, not available in cpython
+    "while", "for", "if", "elif", "else", "break", "continue", "return", "assert", "raise",
+    /** KW_END **/
+    "is not", "not in",
+    "@id", "@num", "@str", "@fstr",
+    "@indent", "@dedent"
+};
+
+using TokenValue = std::variant<std::monostate, i64, f64, Str>;
+const TokenIndex kTokenCount = sizeof(kTokens) / sizeof(kTokens[0]);
+
+// Compile-time lookup of a token's index in kTokens by its exact spelling.
+// Reaches UNREACHABLE() when the spelling is not present in the table.
+constexpr TokenIndex TK(const char token[]) {
+    for(int idx=0; idx<kTokenCount; idx++){
+        const char* lhs = kTokens[idx];
+        const char* rhs = token;
+        // walk both strings while they agree and neither has ended
+        for(; *lhs != '\0' && *rhs != '\0' && *lhs == *rhs; ++lhs, ++rhs) {}
+        // equal here only when both strings ended together
+        if(*lhs == *rhs) return idx;
+    }
+    UNREACHABLE();
+}
+
+#define TK_STR(t) kTokens[t]
+const TokenIndex kTokenKwBegin = TK("class");
+const TokenIndex kTokenKwEnd = TK("raise");
+
+// Lookup table from keyword spelling to token index, built once from the
+// kTokens entries in the inclusive range [kTokenKwBegin, kTokenKwEnd].
+// The string_view keys refer to the kTokens string literals.
+const std::map<std::string_view, TokenIndex> kTokenKwMap = [](){
+    std::map<std::string_view, TokenIndex> map;
+    for(int k=kTokenKwBegin; k<=kTokenKwEnd; k++) map[kTokens[k]] = k;
+    return map;
+}();
+
+
+// One lexed token: its kind, the slice of source text it covers, the line it
+// is attributed to, and an optional literal value.
+// Kept as a plain aggregate because the lexer brace-initializes it positionally.
+struct Token{
+  TokenIndex type;
+  const char* start;    // first character of the token inside the source buffer
+  int length;           // number of characters covered, starting at `start`
+  int line;
+  TokenValue value;
+
+  // Copy of the raw source text covered by this token.
+  Str str() const { return Str(start, length);}
+
+  // Debug rendering: `<line>: <token name> '<raw text>'`, with a lone newline shown as \n.
+  Str info() const {
+    Str text = str();
+    if (text == Str("\n")) text = "\\n";
+    StrStream out;
+    out << line << ": " << TK_STR(type) << " '" << text << "'";
+    return out.str();
+  }
+};
+
+// https://docs.python.org/3/reference/expressions.html
+enum Precedence {
+  PREC_NONE,
+  PREC_ASSIGNMENT,    // =
+  PREC_COMMA,         // ,
+  PREC_TERNARY,       // ?:
+  PREC_LOGICAL_OR,    // or
+  PREC_LOGICAL_AND,   // and
+  PREC_LOGICAL_NOT,   // not
+  PREC_EQUALITY,      // == !=
+  PREC_TEST,          // in / is / is not / not in
+  PREC_COMPARISION,   // < > <= >=
+  PREC_BITWISE_OR,    // |
+  PREC_BITWISE_XOR,   // ^
+  PREC_BITWISE_AND,   // &
+  PREC_BITWISE_SHIFT, // << >>
+  PREC_TERM,          // + -
+  PREC_FACTOR,        // * / % //
+  PREC_UNARY,         // - not
+  PREC_EXPONENT,      // **
+  PREC_CALL,          // ()
+  PREC_SUBSCRIPT,     // []
+  PREC_ATTRIB,        // .index
+  PREC_PRIMARY,
+};
+
+enum StringType { NORMAL_STRING, RAW_STRING, F_STRING };
+
+struct Lexer {
+    shared_ptr<SourceData> src;
+    const char* token_start;
+    const char* curr_char;
+    int current_line = 1;
+    std::vector<Token> nexts;
+    stack<int> indents;
+    int brackets_level = 0;
+    bool used = false;
+
+    // Character under the cursor, without consuming it.
+    char peekchar() const{ return *curr_char; }
+
+    // If the next `n` characters all equal `c0`, consume them and return true;
+    // otherwise consume nothing and return false.
+    bool match_n_chars(int n, char c0){
+        for(int k=0; k<n; k++){
+            char ahead = curr_char[k];
+            // reaching the terminator, or any other mismatch, ends the lookahead
+            if(ahead == '\0' || ahead != c0) return false;
+        }
+        int remaining = n;
+        while(remaining-- > 0) eatchar_include_newline();
+        return true;
+    }
+
+    // Consume a run of spaces and tabs and return its width,
+    // counting a space as 1 and a tab as 4.
+    int eat_spaces(){
+        int width = 0;
+        for(;;){
+            char c = peekchar();
+            if(c == ' ') width += 1;
+            else if(c == '\t') width += 4;
+            else return width;
+            eatchar();
+        }
+    }
+
+    // Measure the leading whitespace at the cursor and queue @indent/@dedent
+    // tokens by comparing it against the `indents` stack.
+    // Returns false only when a dedent lands on a width that matches no level on the stack.
+    bool eat_indentation(){
+        // inside an open bracket the indentation is not examined at all
+        if(brackets_level > 0) return true;
+        int spaces = eat_spaces();
+        if(peekchar() == '#') skip_line_comment();
+        // blank or comment-only line, or end of input: leave the indent stack untouched
+        if(peekchar() == '\0' || peekchar() == '\n' || peekchar() == '\r') return true;
+        // https://docs.python.org/3/reference/lexical_analysis.html#indentation
+        if(spaces > indents.top()){
+            indents.push(spaces);
+            nexts.push_back(Token{TK("@indent"), token_start, 0, current_line});
+        } else if(spaces < indents.top()){
+            // emit one @dedent per level that is deeper than the new width
+            while(spaces < indents.top()){
+                indents.pop();
+                nexts.push_back(Token{TK("@dedent"), token_start, 0, current_line});
+            }
+            if(spaces != indents.top()){
+                return false;
+            }
+        }
+        return true;
+    }
+
+    // Consume and return the character under the cursor.
+    // Throws std::runtime_error on a newline: line bookkeeping is done only by
+    // eatchar_include_newline().
+    char eatchar() {
+        if(peekchar() == '\n') throw std::runtime_error("eatchar() cannot consume a newline");
+        return *curr_char++;
+    }
+
+    // Consume and return the character under the cursor, newline included.
+    // On a newline it bumps current_line and records the start of the next
+    // line in src->line_starts.
+    char eatchar_include_newline() {
+        char c = peekchar();
+        curr_char++;
+        if (c == '\n'){
+            current_line++;
+            src->line_starts.push_back(curr_char);
+        }
+        return c;
+    }
+
+    // Scan an identifier or keyword beginning at token_start and append its token.
+    // The caller has already consumed the first character, so the cursor is
+    // stepped back once to re-examine it.
+    // Returns 0 on success, otherwise a code the caller turns into a SyntaxError:
+    //   1 = byte is not a valid UTF-8 lead byte
+    //   2 = bad UTF-8 sequence
+    //   3 = no identifier character was consumed
+    //   4 = word is not true/false/null while in JSON_MODE
+    int eat_name() {
+        curr_char--;
+        while(true){
+            uint8_t c = peekchar();
+            int u8bytes = 0;
+            // classify the lead byte to find the length of the UTF-8 sequence
+            if((c & 0b10000000) == 0b00000000) u8bytes = 1;
+            else if((c & 0b11100000) == 0b11000000) u8bytes = 2;
+            else if((c & 0b11110000) == 0b11100000) u8bytes = 3;
+            else if((c & 0b11111000) == 0b11110000) u8bytes = 4;
+            else return 1;
+            if(u8bytes == 1){
+                if(isalpha(c) || c=='_' || isdigit(c)) {
+                    curr_char++;
+                    continue;
+                }else{
+                    break;
+                }
+            }
+            // handle multibyte char
+            std::string u8str(curr_char, u8bytes);
+            if(u8str.size() != u8bytes) return 2;
+            // decode the sequence into a code point
+            uint32_t value = 0;
+            for(int k=0; k < u8bytes; k++){
+                uint8_t b = u8str[k];
+                if(k==0){
+                    if(u8bytes == 2) value = (b & 0b00011111) << 6;
+                    else if(u8bytes == 3) value = (b & 0b00001111) << 12;
+                    else if(u8bytes == 4) value = (b & 0b00000111) << 18;
+                }else{
+                    value |= (b & 0b00111111) << (6*(u8bytes-k-1));
+                }
+            }
+            if(is_unicode_Lo_char(value)) curr_char += u8bytes;
+            else break;
+        }
+
+        int length = (int)(curr_char - token_start);
+        if(length == 0) return 3;
+        std::string_view name(token_start, length);
+
+        if(src->mode == JSON_MODE){
+            // JSON mode accepts only the three JSON literals, mapped to their Python tokens
+            if(name == "true"){
+                add_token(TK("True"));
+            } else if(name == "false"){
+                add_token(TK("False"));
+            } else if(name == "null"){
+                add_token(TK("None"));
+            } else {
+                return 4;
+            }
+            return 0;
+        }
+
+        if(kTokenKwMap.count(name)){
+            // True when the input continues with `word` (n chars) and that word
+            // ends at a boundary, i.e. is not glued to more identifier characters.
+            // Without the boundary test `not inner` lexes as `not in` + `ner`
+            // and `is nothing` as `is not` + `hing`.
+            auto continues_with = [this](const char* word, int n){
+                if(strncmp(curr_char, word, n) != 0) return false;
+                // in bounds: the n matched characters are all non-NUL
+                uint8_t after = curr_char[n];
+                return !(isalpha(after) || isdigit(after) || after == '_' || after >= 0x80);
+            };
+            if(name == "not"){
+                if(continues_with(" in", 3)){
+                    curr_char += 3;
+                    add_token(TK("not in"));
+                    return 0;
+                }
+            }else if(name == "is"){
+                if(continues_with(" not", 4)){
+                    curr_char += 4;
+                    add_token(TK("is not"));
+                    return 0;
+                }
+            }
+            add_token(kTokenKwMap.at(name));
+        } else {
+            add_token(TK("@id"));
+        }
+        return 0;
+    }
+
+    // Advance the cursor to the end of the current line, stopping on the
+    // newline (left unconsumed) or on the end of input.
+    void skip_line_comment() {
+        for(char c = peekchar(); c != '\0' && c != '\n'; c = peekchar()){
+            eatchar();
+        }
+    }
+    
+    // Consume the next character only if it equals `c`; reports whether it did.
+    bool matchchar(char c) {
+        bool hit = (peekchar() == c);
+        if(hit) eatchar_include_newline();
+        return hit;
+    }
+
+    // Append a token of `type` covering [token_start, curr_char), with an optional literal `value`.
+    // Also maintains brackets_level, which eat_indentation() consults.
+    void add_token(TokenIndex type, TokenValue value={}) {
+        switch(type){
+            case TK("{"): case TK("["): case TK("("): brackets_level++; break;
+            case TK(")"): case TK("]"): case TK("}"): brackets_level--; break;
+        }
+        nexts.push_back( Token{
+            type,
+            token_start,
+            (int)(curr_char - token_start),
+            // the newline was already consumed, which bumped current_line, so @eol is dated one line back
+            current_line - ((type == TK("@eol")) ? 1 : 0),
+            value
+        });
+    }
+
+    // Emit `two` when the next character is `c` (consuming it), otherwise emit `one`.
+    void add_token_2(char c, TokenIndex one, TokenIndex two) {
+        add_token(matchchar(c) ? two : one);
+    }
+
+    // Read the body of a string literal whose opening `quote` was already consumed
+    // and return its contents. Two further quote characters right at the start
+    // switch to triple-quoted mode. When `raw` is false, backslash escapes are decoded.
+    // Raises SyntaxError on an unterminated literal or an unknown escape; in REPL_MODE
+    // an unterminated triple-quoted literal throws NeedMoreLines instead.
+    Str eat_string_until(char quote, bool raw) {
+        bool quote3 = match_n_chars(2, quote);
+        std::vector<char> buff;
+        while (true) {
+            char c = eatchar_include_newline();
+            if (c == quote){
+                // in triple-quoted mode a quote not followed by two more is ordinary text
+                if(quote3 && !match_n_chars(2, quote)){
+                    buff.push_back(c);
+                    continue;
+                }
+                break;
+            }
+            if (c == '\0'){
+                // NOTE(review): the cursor has already moved past the terminator here, and the
+                // error reporter peeks at the cursor - confirm the buffer tolerates that read.
+                if(quote3 && src->mode == REPL_MODE){
+                    throw NeedMoreLines(false);
+                }
+                SyntaxError("EOL while scanning string literal");
+            }
+            if (c == '\n'){
+                // only triple-quoted literals may span lines
+                if(!quote3) SyntaxError("EOL while scanning string literal");
+                else{
+                    buff.push_back(c);
+                    continue;
+                }
+            }
+            if (!raw && c == '\\') {
+                switch (eatchar_include_newline()) {
+                    case '"':  buff.push_back('"');  break;
+                    case '\'': buff.push_back('\''); break;
+                    case '\\': buff.push_back('\\'); break;
+                    case 'n':  buff.push_back('\n'); break;
+                    case 'r':  buff.push_back('\r'); break;
+                    case 't':  buff.push_back('\t'); break;
+                    default: SyntaxError("invalid escape char");
+                }
+            } else {
+                buff.push_back(c);
+            }
+        }
+        return Str(buff.data(), buff.size());
+    }
+
+    // Lex a string literal opened by `quote` and emit it as @fstr for
+    // f-strings or @str otherwise; escapes are left undecoded for raw strings.
+    void eat_string(char quote, StringType type) {
+        bool is_raw = (type == RAW_STRING);
+        Str body = eat_string_until(quote, is_raw);
+        TokenIndex kind = (type == F_STRING) ? TK("@fstr") : TK("@str");
+        add_token(kind, body);
+    }
+
+    // Lex a numeric literal starting at token_start and emit a @num token holding
+    // either an integer (decimal, or hex with a 0x prefix) or a float.
+    // Any std::exception raised while converting is reported as a SyntaxError.
+    void eat_number() {
+        static const std::regex pattern("^(0x)?[0-9a-fA-F]+(\\.[0-9]+)?");
+        std::smatch m;
+
+        // the regex is run against the rest of the current line only
+        const char* i = token_start;
+        while(*i != '\n' && *i != '\0') i++;
+        std::string s = std::string(token_start, i);
+
+        try{
+            if (std::regex_search(s, m, pattern)) {
+                // here is m.length()-1, since the first char was eaten by lex_token()
+                for(int j=0; j<m.length()-1; j++) eatchar();
+
+                int base = 10;
+                size_t size;
+                if (m[1].matched) base = 16;
+                if (m[2].matched) {
+                    if(base == 16) SyntaxError("hex literal should not contain a dot");
+                    add_token(TK("@num"), S_TO_FLOAT(m[0], &size));
+                } else {
+                    add_token(TK("@num"), S_TO_INT(m[0], &size, base));
+                }
+                // the conversion must have consumed exactly the matched text
+                if (size != m.length()) UNREACHABLE();
+            }
+        }catch(std::exception& _){
+            SyntaxError("invalid number literal");
+        } 
+    }
+
+    // Lex the next token from the cursor and append it to `nexts`
+    // (a newline may additionally queue @indent/@dedent tokens).
+    // Whitespace, '\r' and comments are skipped without producing a token.
+    // Returns true while more input may follow; at end of input it emits one
+    // pending @dedent per call and finally @eof, returning false after @eof.
+    bool lex_one_token() {
+        while (peekchar() != '\0') {
+            token_start = curr_char;
+            char c = eatchar_include_newline();
+            switch (c) {
+                case '\'': case '"': eat_string(c, NORMAL_STRING); return true;
+                case '#': skip_line_comment(); break;
+                case '{': add_token(TK("{")); return true;
+                case '}': add_token(TK("}")); return true;
+                case ',': add_token(TK(",")); return true;
+                case ':': add_token_2(':', TK(":"), TK("::")); return true;
+                case ';': add_token(TK(";")); return true;
+                case '(': add_token(TK("(")); return true;
+                case ')': add_token(TK(")")); return true;
+                case '[': add_token(TK("[")); return true;
+                case ']': add_token(TK("]")); return true;
+                case '@': add_token(TK("@")); return true;
+                case '%': add_token_2('=', TK("%"), TK("%=")); return true;
+                case '&': add_token_2('=', TK("&"), TK("&=")); return true;
+                case '|': add_token_2('=', TK("|"), TK("|=")); return true;
+                case '^': add_token_2('=', TK("^"), TK("^=")); return true;
+                case '?': add_token(TK("?")); return true;
+                case '.': {
+                    if(matchchar('.')) {
+                        if(matchchar('.')) {
+                            add_token(TK("..."));
+                        } else {
+                            SyntaxError("invalid token '..'");
+                        }
+                    } else {
+                        add_token(TK("."));
+                    }
+                    return true;
+                }
+                case '=': add_token_2('=', TK("="), TK("==")); return true;
+                case '+': add_token_2('=', TK("+"), TK("+=")); return true;
+                case '>': {
+                    if(matchchar('=')) add_token(TK(">="));
+                    else if(matchchar('>')) add_token_2('=', TK(">>"), TK(">>="));
+                    else add_token(TK(">"));
+                    return true;
+                }
+                case '<': {
+                    if(matchchar('=')) add_token(TK("<="));
+                    else if(matchchar('<')) add_token_2('=', TK("<<"), TK("<<="));
+                    else add_token(TK("<"));
+                    return true;
+                }
+                case '-': {
+                    if(matchchar('=')) add_token(TK("-="));
+                    else if(matchchar('>')) add_token(TK("->"));
+                    else add_token(TK("-"));
+                    return true;
+                }
+                case '!':
+                    if(matchchar('=')) add_token(TK("!="));
+                    else SyntaxError("expected '=' after '!'");
+                    // return like every other token case, so one call yields one token
+                    return true;
+                case '*':
+                    if (matchchar('*')) {
+                        add_token(TK("**"));  // '**'
+                    } else {
+                        add_token_2('=', TK("*"), TK("*="));
+                    }
+                    return true;
+                case '/':
+                    if(matchchar('/')) {
+                        add_token_2('=', TK("//"), TK("//="));
+                    } else {
+                        add_token_2('=', TK("/"), TK("/="));
+                    }
+                    return true;
+                case '\r': break;       // just ignore '\r'
+                case ' ': case '\t': eat_spaces(); break;
+                case '\n': {
+                    add_token(TK("@eol"));
+                    if(!eat_indentation()) IndentationError("unindent does not match any outer indentation level");
+                    return true;
+                }
+                default: {
+                    // an 'f' or 'r' directly followed by a quote is a string prefix
+                    if(c == 'f'){
+                        if(matchchar('\'')) {eat_string('\'', F_STRING); return true;}
+                        if(matchchar('"')) {eat_string('"', F_STRING); return true;}
+                    }else if(c == 'r'){
+                        if(matchchar('\'')) {eat_string('\'', RAW_STRING); return true;}
+                        if(matchchar('"')) {eat_string('"', RAW_STRING); return true;}
+                    }
+                    if (c >= '0' && c <= '9') {
+                        eat_number();
+                        return true;
+                    }
+                    // SyntaxError() throws, so the breaks after it are for readability only
+                    switch (eat_name())
+                    {
+                        case 0: break;
+                        case 1: SyntaxError("invalid char: " + std::string(1, c)); break;
+                        case 2: SyntaxError("invalid utf8 sequence: " + std::string(1, c)); break;
+                        case 3: SyntaxError("@id contains invalid char"); break;
+                        case 4: SyntaxError("invalid JSON token"); break;
+                        default: UNREACHABLE();
+                    }
+                    return true;
+                }
+            }
+        }
+
+        // end of input: close the open indentation levels one call at a time
+        token_start = curr_char;
+        if(indents.size() > 1){
+            indents.pop();
+            add_token(TK("@dedent"));
+            return true;
+        }
+        add_token(TK("@eof"));
+        return false;
+    }
+
+    /***** Error Reporter *****/
+    // Report an error at the lexer's own position (current_line / curr_char).
+    // When the cursor sits on a newline, both the line number and the cursor
+    // are moved back by one before reporting.
+    void throw_err(Str type, Str msg){
+        int lineno = current_line;
+        const char* cursor = curr_char;
+        if(peekchar() == '\n'){
+            lineno--;
+            cursor--;
+        }
+        throw_err(type, msg, lineno, cursor);
+    }
+
+    // Build an exception of `type` with message `msg`, attach a source snapshot
+    // for (lineno, cursor) and throw it. Never returns normally.
+    void throw_err(Str type, Str msg, int lineno, const char* cursor){
+        // Use the caller-supplied type so that IndentationError is not reported as SyntaxError.
+        // NOTE(review): relies on Exception accepting a Str as its type argument - confirm.
+        auto e = Exception(type, msg);
+        e.st_push(src->snapshot(lineno, cursor));
+        throw e;
+    }
+    // Report `msg` through throw_err with the type name "SyntaxError", at the lexer's current position.
+    void SyntaxError(Str msg){ throw_err("SyntaxError", msg); }
+    // Report the generic message "invalid syntax" through throw_err, at the lexer's current position.
+    void SyntaxError(){ throw_err("SyntaxError", "invalid syntax"); }
+    // Report `msg` through throw_err with the type name "IndentationError", at the lexer's current position.
+    void IndentationError(Str msg){ throw_err("IndentationError", msg); }
+
+    // Start lexing at the beginning of `src`: the token list is seeded with
+    // @sof and the indentation stack with the base level 0.
+    Lexer(shared_ptr<SourceData> src)
+        : src(src), token_start(src->source), curr_char(src->source) {
+        nexts.push_back(Token{TK("@sof"), token_start, 0, current_line});
+        indents.push(0);
+    }
+
+    // Lex the whole source and hand the collected tokens to the caller.
+    // May be called once only; a second call reaches UNREACHABLE().
+    std::vector<Token> run() {
+        if(used) UNREACHABLE();
+        used = true;
+        bool more;
+        do {
+            more = lex_one_token();
+        } while(more);
+        // `nexts` is a member, so it has to be moved out explicitly
+        return std::move(nexts);
+    }
+};
+
+} // namespace pkpy

+ 0 - 302
src/parser.h

@@ -1,302 +0,0 @@
-#pragma once
-
-#include "error.h"
-#include "obj.h"
-
-namespace pkpy{
-
-typedef uint8_t TokenIndex;
-
-constexpr const char* kTokens[] = {
-    "@error", "@eof", "@eol", "@sof",
-    ".", ",", ":", ";", "#", "(", ")", "[", "]", "{", "}", "%", "::",
-    "+", "-", "*", "/", "//", "**", "=", ">", "<", "...", "->",
-    "<<", ">>", "&", "|", "^", "?", "@",
-    "==", "!=", ">=", "<=",
-    "+=", "-=", "*=", "/=", "//=", "%=", "&=", "|=", "^=", ">>=", "<<=",
-    /** KW_BEGIN **/
-    "class", "import", "as", "def", "lambda", "pass", "del", "from", "with", "yield",
-    "None", "in", "is", "and", "or", "not", "True", "False", "global", "try", "except", "finally",
-    "goto", "label",      // extended keywords, not available in cpython
-    "while", "for", "if", "elif", "else", "break", "continue", "return", "assert", "raise",
-    /** KW_END **/
-    "is not", "not in",
-    "@id", "@num", "@str", "@fstr",
-    "@indent", "@dedent"
-};
-
-const TokenIndex kTokenCount = sizeof(kTokens) / sizeof(kTokens[0]);
-
-constexpr TokenIndex TK(const char token[]) {
-    for(int k=0; k<kTokenCount; k++){
-        const char* i = kTokens[k];
-        const char* j = token;
-        while(*i && *j && *i == *j) { i++; j++;}
-        if(*i == *j) return k;
-    }
-    UNREACHABLE();
-}
-
-#define TK_STR(t) kTokens[t]
-const TokenIndex kTokenKwBegin = TK("class");
-const TokenIndex kTokenKwEnd = TK("raise");
-
-const std::map<std::string_view, TokenIndex> kTokenKwMap = [](){
-    std::map<std::string_view, TokenIndex> map;
-    for(int k=kTokenKwBegin; k<=kTokenKwEnd; k++) map[kTokens[k]] = k;
-    return map;
-}();
-
-
-struct Token{
-  TokenIndex type;
-
-  const char* start;
-  int length;
-  int line;
-  PyObject* value;
-
-  Str str() const { return Str(start, length);}
-
-  Str info() const {
-    StrStream ss;
-    Str raw = str();
-    if (raw == Str("\n")) raw = "\\n";
-    ss << line << ": " << TK_STR(type) << " '" << raw << "'";
-    return ss.str();
-  }
-};
-
-// https://docs.python.org/3/reference/expressions.html
-enum Precedence {
-  PREC_NONE,
-  PREC_ASSIGNMENT,    // =
-  PREC_COMMA,         // ,
-  PREC_TERNARY,       // ?:
-  PREC_LOGICAL_OR,    // or
-  PREC_LOGICAL_AND,   // and
-  PREC_LOGICAL_NOT,   // not
-  PREC_EQUALITY,      // == !=
-  PREC_TEST,          // in / is / is not / not in
-  PREC_COMPARISION,   // < > <= >=
-  PREC_BITWISE_OR,    // |
-  PREC_BITWISE_XOR,   // ^
-  PREC_BITWISE_AND,   // &
-  PREC_BITWISE_SHIFT, // << >>
-  PREC_TERM,          // + -
-  PREC_FACTOR,        // * / % //
-  PREC_UNARY,         // - not
-  PREC_EXPONENT,      // **
-  PREC_CALL,          // ()
-  PREC_SUBSCRIPT,     // []
-  PREC_ATTRIB,        // .index
-  PREC_PRIMARY,
-};
-
-// The context of the parsing phase for the compiler.
-struct Parser {
-    shared_ptr<SourceData> src;
-
-    const char* token_start;
-    const char* curr_char;
-    int current_line = 1;
-    Token prev, curr;
-    queue<Token> nexts;
-    stack<int> indents;
-
-    int brackets_level = 0;
-
-    Token next_token(){
-        if(nexts.empty()){
-            return Token{TK("@error"), token_start, (int)(curr_char - token_start), current_line};
-        }
-        Token t = nexts.front();
-        if(t.type == TK("@eof") && indents.size()>1){
-            nexts.pop();
-            indents.pop();
-            return Token{TK("@dedent"), token_start, 0, current_line};
-        }
-        nexts.pop();
-        return t;
-    }
-
-    char peekchar() const{ return *curr_char; }
-
-    bool match_n_chars(int n, char c0){
-        const char* c = curr_char;
-        for(int i=0; i<n; i++){
-            if(*c == '\0') return false;
-            if(*c != c0) return false;
-            c++;
-        }
-        for(int i=0; i<n; i++) eatchar_include_newline();
-        return true;
-    }
-
-    int eat_spaces(){
-        int count = 0;
-        while (true) {
-            switch (peekchar()) {
-                case ' ' : count+=1; break;
-                case '\t': count+=4; break;
-                default: return count;
-            }
-            eatchar();
-        }
-    }
-
-    bool eat_indentation(){
-        if(brackets_level > 0) return true;
-        int spaces = eat_spaces();
-        if(peekchar() == '#') skip_line_comment();
-        if(peekchar() == '\0' || peekchar() == '\n' || peekchar() == '\r') return true;
-        // https://docs.python.org/3/reference/lexical_analysis.html#indentation
-        if(spaces > indents.top()){
-            indents.push(spaces);
-            nexts.push(Token{TK("@indent"), token_start, 0, current_line});
-        } else if(spaces < indents.top()){
-            while(spaces < indents.top()){
-                indents.pop();
-                nexts.push(Token{TK("@dedent"), token_start, 0, current_line});
-            }
-            if(spaces != indents.top()){
-                return false;
-            }
-        }
-        return true;
-    }
-
-    char eatchar() {
-        char c = peekchar();
-        if(c == '\n') throw std::runtime_error("eatchar() cannot consume a newline");
-        curr_char++;
-        return c;
-    }
-
-    char eatchar_include_newline() {
-        char c = peekchar();
-        curr_char++;
-        if (c == '\n'){
-            current_line++;
-            src->line_starts.push_back(curr_char);
-        }
-        return c;
-    }
-
-    int eat_name() {
-        curr_char--;
-        while(true){
-            uint8_t c = peekchar();
-            int u8bytes = 0;
-            if((c & 0b10000000) == 0b00000000) u8bytes = 1;
-            else if((c & 0b11100000) == 0b11000000) u8bytes = 2;
-            else if((c & 0b11110000) == 0b11100000) u8bytes = 3;
-            else if((c & 0b11111000) == 0b11110000) u8bytes = 4;
-            else return 1;
-            if(u8bytes == 1){
-                if(isalpha(c) || c=='_' || isdigit(c)) {
-                    curr_char++;
-                    continue;
-                }else{
-                    break;
-                }
-            }
-            // handle multibyte char
-            std::string u8str(curr_char, u8bytes);
-            if(u8str.size() != u8bytes) return 2;
-            uint32_t value = 0;
-            for(int k=0; k < u8bytes; k++){
-                uint8_t b = u8str[k];
-                if(k==0){
-                    if(u8bytes == 2) value = (b & 0b00011111) << 6;
-                    else if(u8bytes == 3) value = (b & 0b00001111) << 12;
-                    else if(u8bytes == 4) value = (b & 0b00000111) << 18;
-                }else{
-                    value |= (b & 0b00111111) << (6*(u8bytes-k-1));
-                }
-            }
-            if(is_unicode_Lo_char(value)) curr_char += u8bytes;
-            else break;
-        }
-
-        int length = (int)(curr_char - token_start);
-        if(length == 0) return 3;
-        std::string_view name(token_start, length);
-
-        if(src->mode == JSON_MODE){
-            if(name == "true"){
-                set_next_token(TK("True"));
-            } else if(name == "false"){
-                set_next_token(TK("False"));
-            } else if(name == "null"){
-                set_next_token(TK("None"));
-            } else {
-                return 4;
-            }
-            return 0;
-        }
-
-        if(kTokenKwMap.count(name)){
-            if(name == "not"){
-                if(strncmp(curr_char, " in", 3) == 0){
-                    curr_char += 3;
-                    set_next_token(TK("not in"));
-                    return 0;
-                }
-            }else if(name == "is"){
-                if(strncmp(curr_char, " not", 4) == 0){
-                    curr_char += 4;
-                    set_next_token(TK("is not"));
-                    return 0;
-                }
-            }
-            set_next_token(kTokenKwMap.at(name));
-        } else {
-            set_next_token(TK("@id"));
-        }
-        return 0;
-    }
-
-    void skip_line_comment() {
-        char c;
-        while ((c = peekchar()) != '\0') {
-            if (c == '\n') return;
-            eatchar();
-        }
-    }
-    
-    bool matchchar(char c) {
-        if (peekchar() != c) return false;
-        eatchar_include_newline();
-        return true;
-    }
-
-    void set_next_token(TokenIndex type, PyObject* value=nullptr) {
-        switch(type){
-            case TK("{"): case TK("["): case TK("("): brackets_level++; break;
-            case TK(")"): case TK("]"): case TK("}"): brackets_level--; break;
-        }
-        nexts.push( Token{
-            type,
-            token_start,
-            (int)(curr_char - token_start),
-            current_line - ((type == TK("@eol")) ? 1 : 0),
-            value
-        });
-    }
-
-    void set_next_token_2(char c, TokenIndex one, TokenIndex two) {
-        if (matchchar(c)) set_next_token(two);
-        else set_next_token(one);
-    }
-
-    Parser(shared_ptr<SourceData> src) {
-        this->src = src;
-        this->token_start = src->source;
-        this->curr_char = src->source;
-        this->nexts.push(Token{TK("@sof"), token_start, 0, current_line});
-        this->indents.push(0);
-    }
-};
-
-} // namespace pkpy

+ 2 - 0
src/pocketpy.h

@@ -760,6 +760,7 @@ inline void add_module_gc(VM* vm){
 
 inline void VM::post_init(){
     init_builtins(this);
+#if !DEBUG_NO_BUILTIN_MODULES
     add_module_sys(this);
     add_module_time(this);
     add_module_json(this);
@@ -793,6 +794,7 @@ inline void VM::post_init(){
         const PyTypeInfo& info = vm->_all_types[OBJ_GET(Type, args[0])];
         return VAR(info.name);
     }));
+#endif
 }
 
 }   // namespace pkpy

+ 2 - 2
src/vm.h

@@ -93,7 +93,7 @@ public:
     }
 
     Frame* top_frame() const {
-#if PK_EXTRA_CHECK
+#if DEBUG_EXTRA_CHECK
         if(callstack.empty()) UNREACHABLE();
 #endif
         return callstack.top().get();
@@ -166,7 +166,7 @@ public:
         if(_module == nullptr) _module = _main;
         try {
             CodeObject_ code = compile(source, filename, mode);
-            if(_module == _main) std::cout << disassemble(code) << '\n';
+            // if(_module == _main) std::cout << disassemble(code) << '\n';
             return _exec(code, _module);
         }catch (const Exception& e){
             *_stderr << e.summary() << '\n';