blueloveTH 1 an în urmă
părinte
comite
fa31f4c5df

+ 59 - 5
include/pocketpy/compiler/lexer.h

@@ -9,11 +9,6 @@ extern "C" {
 
 extern const char* pk_TokenSymbols[];
 
-typedef struct pk_TokenDeserializer {
-    const char* curr;
-    const char* source;
-} pk_TokenDeserializer;
-
 typedef enum TokenIndex{
     TK_EOF, TK_EOL, TK_SOF,
     TK_ID, TK_NUM, TK_STR, TK_FSTR, TK_LONG, TK_BYTES, TK_IMAG,
@@ -39,6 +34,64 @@ typedef enum TokenIndex{
     TK__COUNT__
 } TokenIndex;
 
+typedef struct TokenValue {
+    int index;
+    union {
+        int64_t _i64;   // 0
+        double _f64;    // 1
+        py_Str _str;    // 2
+    };
+} TokenValue;
+
+typedef struct Token {
+    TokenIndex type;
+    const char* start;
+    int length;
+    int line;
+    int brackets_level;
+    TokenValue value;
+} Token;
+
+// https://docs.python.org/3/reference/expressions.html#operator-precedence
+enum Precedence {
+    PREC_LOWEST,
+    PREC_LAMBDA,       // lambda
+    PREC_TERNARY,      // ?:
+    PREC_LOGICAL_OR,   // or
+    PREC_LOGICAL_AND,  // and
+    PREC_LOGICAL_NOT,  // not
+    /* https://docs.python.org/3/reference/expressions.html#comparisons
+     * Unlike C, all comparison operations in Python have the same priority,
+     * which is lower than that of any arithmetic, shifting or bitwise operation.
+     * Also unlike C, expressions like a < b < c have the interpretation that is conventional in mathematics.
+     */
+    PREC_COMPARISION,    // < > <= >= != ==, in / is / is not / not in
+    PREC_BITWISE_OR,     // |
+    PREC_BITWISE_XOR,    // ^
+    PREC_BITWISE_AND,    // &
+    PREC_BITWISE_SHIFT,  // << >>
+    PREC_TERM,           // + -
+    PREC_FACTOR,         // * / % // @
+    PREC_UNARY,          // - not ~
+    PREC_EXPONENT,       // **
+    PREC_PRIMARY,        // f() x[] a.b 1:2
+    PREC_HIGHEST,
+};
+
+enum StringType {
+    NORMAL_STRING,
+    RAW_STRING,
+    F_STRING,
+    NORMAL_BYTES
+};
+
+#define is_raw_string_used(t) ((t) == TK_ID || (t) == TK_LONG)
+
+typedef struct pk_TokenDeserializer {
+    const char* curr;
+    const char* source;
+} pk_TokenDeserializer;
+
 void pk_TokenDeserializer__ctor(pk_TokenDeserializer* self, const char* source);
 bool pk_TokenDeserializer__match_char(pk_TokenDeserializer* self, char c);
 c11_string pk_TokenDeserializer__read_string(pk_TokenDeserializer* self, char c);
@@ -47,6 +100,7 @@ int pk_TokenDeserializer__read_count(pk_TokenDeserializer* self);
 int64_t pk_TokenDeserializer__read_uint(pk_TokenDeserializer* self, char c);
 double pk_TokenDeserializer__read_float(pk_TokenDeserializer* self, char c);
 
+
 #ifdef __cplusplus
 }
 #endif

+ 0 - 44
include/pocketpy/compiler/lexer.hpp

@@ -8,50 +8,6 @@
 
 namespace pkpy {
 
-using TokenValue = std::variant<std::monostate, i64, f64, Str>;
-
-constexpr inline bool is_raw_string_used(TokenIndex t) noexcept{ return t == TK_ID || t == TK_LONG; }
-
-struct Token {
-    TokenIndex type;
-    const char* start;
-    int length;
-    int line;
-    int brackets_level;
-    TokenValue value;
-
-    Str str() const noexcept{ return Str(start, length); }
-
-    std::string_view sv() const noexcept{ return std::string_view(start, length); }
-};
-
-// https://docs.python.org/3/reference/expressions.html#operator-precedence
-enum Precedence {
-    PREC_LOWEST,
-    PREC_LAMBDA,       // lambda
-    PREC_TERNARY,      // ?:
-    PREC_LOGICAL_OR,   // or
-    PREC_LOGICAL_AND,  // and
-    PREC_LOGICAL_NOT,  // not
-    /* https://docs.python.org/3/reference/expressions.html#comparisons
-     * Unlike C, all comparison operations in Python have the same priority,
-     * which is lower than that of any arithmetic, shifting or bitwise operation.
-     * Also unlike C, expressions like a < b < c have the interpretation that is conventional in mathematics.
-     */
-    PREC_COMPARISION,    // < > <= >= != ==, in / is / is not / not in
-    PREC_BITWISE_OR,     // |
-    PREC_BITWISE_XOR,    // ^
-    PREC_BITWISE_AND,    // &
-    PREC_BITWISE_SHIFT,  // << >>
-    PREC_TERM,           // + -
-    PREC_FACTOR,         // * / % // @
-    PREC_UNARY,          // - not ~
-    PREC_EXPONENT,       // **
-    PREC_PRIMARY,        // f() x[] a.b 1:2
-    PREC_HIGHEST,
-};
-
-enum class StringType { NORMAL_STRING, RAW_STRING, F_STRING, NORMAL_BYTES };
 
 struct Lexer {
     PK_ALWAYS_PASS_BY_POINTER(Lexer)

+ 3 - 3
include/pocketpy/objects/codeobject.h

@@ -68,7 +68,7 @@ typedef struct BytecodeEx {
 } BytecodeEx;
 
 typedef struct CodeObject {
-    pkpy_SourceData_ src;
+    pk_SourceData_ src;
     py_Str name;
 
     c11_vector/*T=Bytecode*/                codes;
@@ -88,7 +88,7 @@ typedef struct CodeObject {
     int end_line;
 } CodeObject;
 
-CodeObject* CodeObject__new(pkpy_SourceData_ src, c11_string name);
+CodeObject* CodeObject__new(pk_SourceData_ src, c11_string name);
 void CodeObject__delete(CodeObject* self);
 void CodeObject__gc_mark(const CodeObject* self);
 
@@ -117,7 +117,7 @@ typedef struct FuncDecl {
 
 typedef FuncDecl* FuncDecl_;
 
-FuncDecl_ FuncDecl__rcnew(pkpy_SourceData_ src, c11_string name);
+FuncDecl_ FuncDecl__rcnew(pk_SourceData_ src, c11_string name);
 void FuncDecl__dtor(FuncDecl* self);
 void FuncDecl__add_kwarg(FuncDecl* self, int index, uint16_t key, const PyVar* value);
 void FuncDecl__gc_mark(const FuncDecl* self);

+ 3 - 3
include/pocketpy/objects/error.h

@@ -10,7 +10,7 @@ extern "C" {
 #endif
 
 typedef struct pkpy_ExceptionFrame {
-    pkpy_SourceData_ src;
+    pk_SourceData_ src;
     int lineno;
     const char* cursor;
     py_Str name;
@@ -31,12 +31,12 @@ typedef struct pkpy_Exception {
 
 void pkpy_Exception__ctor(pkpy_Exception* self, StrName type);
 void pkpy_Exception__dtor(pkpy_Exception* self);
-void pkpy_Exception__stpush(pkpy_Exception* self, pkpy_SourceData_ src, int lineno, const char* cursor, const char* name);
+void pkpy_Exception__stpush(pkpy_Exception* self, pk_SourceData_ src, int lineno, const char* cursor, const char* name);
 py_Str pkpy_Exception__summary(pkpy_Exception* self);
 
 struct Error{
     const char* type;
-    pkpy_SourceData_ src;
+    pk_SourceData_ src;
     int lineno;
     const char* cursor;
     char msg[100];

+ 5 - 8
include/pocketpy/objects/sourcedata.h

@@ -11,7 +11,7 @@ extern "C" {
 
 enum CompileMode { EXEC_MODE, EVAL_MODE, REPL_MODE, JSON_MODE, CELL_MODE };
 
-struct pkpy_SourceData {
+struct pk_SourceData {
     RefCounted rc;
     enum CompileMode mode;
     bool is_precompiled;
@@ -23,14 +23,11 @@ struct pkpy_SourceData {
     c11_vector/*T=py_Str*/ _precompiled_tokens;
 };
 
-typedef struct pkpy_SourceData* pkpy_SourceData_;
+typedef struct pk_SourceData* pk_SourceData_;
 
-pkpy_SourceData_ pkpy_SourceData__rcnew(c11_string source, const py_Str *filename, enum CompileMode mode);
-void pkpy_SourceData__ctor(struct pkpy_SourceData *self, c11_string source, const py_Str *filename, enum CompileMode mode);
-void pkpy_SourceData__dtor(struct pkpy_SourceData* self);
-
-bool pkpy_SourceData__get_line(const struct pkpy_SourceData* self, int lineno, const char** st, const char** ed);
-py_Str pkpy_SourceData__snapshot(const struct pkpy_SourceData *self, int lineno, const char *cursor, const char *name);
+pk_SourceData_ pk_SourceData__rcnew(const char* source, const char* filename, enum CompileMode mode);
+bool pk_SourceData__get_line(const struct pk_SourceData* self, int lineno, const char** st, const char** ed);
+py_Str pk_SourceData__snapshot(const struct pk_SourceData *self, int lineno, const char *cursor, const char *name);
 
 #ifdef __cplusplus
 }

+ 25 - 24
src/common/sourcedata.c

@@ -4,40 +4,33 @@
 #include <stdlib.h>
 #include <string.h>
 
-pkpy_SourceData_ pkpy_SourceData__rcnew(c11_string source, const py_Str* filename, enum CompileMode mode) {
-    pkpy_SourceData_ self = malloc(sizeof(struct pkpy_SourceData));
-    pkpy_SourceData__ctor(self, source, filename, mode);
-    self->rc.count = 1;
-    self->rc.dtor = (void(*)(void*))pkpy_SourceData__dtor;
-    return self;
-}
-
-void pkpy_SourceData__ctor(struct pkpy_SourceData* self,
-                           c11_string source,       // may not be null-terminated
-                           const py_Str* filename,
+void pk_SourceData__ctor(struct pk_SourceData* self,
+                           const char* source,
+                           const char* filename,
                            enum CompileMode mode) {
-    self->filename = py_Str__copy(filename);  // OPTIMIZEME?
+    py_Str__ctor(&self->filename, filename);
     self->mode = mode;
     c11_vector__ctor(&self->line_starts, sizeof(const char*));
     c11_vector__ctor(&self->_precompiled_tokens, sizeof(py_Str));
 
-    int index = 0;
     // Skip utf8 BOM if there is any.
-    if (source.size >= 3 && strncmp(source.data, "\xEF\xBB\xBF", 3) == 0) index += 3;
+    if(strncmp(source, "\xEF\xBB\xBF", 3) == 0) source += 3;
     // Drop all '\r'
     pk_SStream ss;
-    pk_SStream__ctor2(&ss, source.size + 1);
-    while(index < source.size){
-        char c = source.data[index];
+    pk_SStream__ctor(&ss);
+    while(true){
+        char c = *source;
+        if(c == '\0') break;
         if(c != '\r') pk_SStream__write_char(&ss, c);
-        index++;
+        source++;
     }
     self->source = pk_SStream__submit(&ss);
-    self->is_precompiled = (strncmp(py_Str__data(&self->source), "pkpy:", 5) == 0);
-    c11_vector__push(const char*, &self->line_starts, py_Str__data(&self->source));
+    source = py_Str__data(&self->source);
+    self->is_precompiled = (strncmp(source, "pkpy:", 5) == 0);
+    c11_vector__push(const char*, &self->line_starts, source);
 }
 
-void pkpy_SourceData__dtor(struct pkpy_SourceData* self) {
+void pk_SourceData__dtor(struct pk_SourceData* self) {
     py_Str__dtor(&self->filename);
     py_Str__dtor(&self->source);
     c11_vector__dtor(&self->line_starts);
@@ -48,7 +41,15 @@ void pkpy_SourceData__dtor(struct pkpy_SourceData* self) {
     c11_vector__dtor(&self->_precompiled_tokens);
 }
 
-bool pkpy_SourceData__get_line(const struct pkpy_SourceData* self, int lineno, const char** st, const char** ed) {
+pk_SourceData_ pk_SourceData__rcnew(const char* source, const char* filename, enum CompileMode mode) {
+    pk_SourceData_ self = malloc(sizeof(struct pk_SourceData));
+    pk_SourceData__ctor(self, source, filename, mode);
+    self->rc.count = 1;
+    self->rc.dtor = (void(*)(void*))pk_SourceData__dtor;
+    return self;
+}
+
+bool pk_SourceData__get_line(const struct pk_SourceData* self, int lineno, const char** st, const char** ed) {
     if(self->is_precompiled || lineno == -1) { return false; }
     lineno -= 1;
     if(lineno < 0) lineno = 0;
@@ -62,7 +63,7 @@ bool pkpy_SourceData__get_line(const struct pkpy_SourceData* self, int lineno, c
     return true;
 }
 
-py_Str pkpy_SourceData__snapshot(const struct pkpy_SourceData* self, int lineno, const char* cursor, const char* name) {
+py_Str pk_SourceData__snapshot(const struct pk_SourceData* self, int lineno, const char* cursor, const char* name) {
     pk_SStream ss;
     pk_SStream__ctor(&ss);
 
@@ -85,7 +86,7 @@ py_Str pkpy_SourceData__snapshot(const struct pkpy_SourceData* self, int lineno,
     if(!self->is_precompiled) {
         pk_SStream__write_char(&ss, '\n');
         const char *st = NULL, *ed;
-        if(pkpy_SourceData__get_line(self, lineno, &st, &ed)) {
+        if(pk_SourceData__get_line(self, lineno, &st, &ed)) {
             while(st < ed && isblank(*st))
                 ++st;
             if(st < ed) {

+ 1 - 0
src/common/sstream.c

@@ -152,6 +152,7 @@ void pk_SStream__write_any(pk_SStream* self, const char* fmt, const pk_AnyStr* a
 
 py_Str pk_SStream__submit(pk_SStream* self) {
     c11_vector__push(char, &self->data, '\0');
+    // TODO: optimize c11__isascii
     py_Str retval = {
         .size = self->data.count - 1,
         .is_ascii = c11__isascii((char*)self->data.data, self->data.count),

+ 1 - 0
src/compiler/lexer.c

@@ -2,6 +2,7 @@
 #include "pocketpy/common/str.h"
 #include "pocketpy/common/smallmap.h"
 #include "pocketpy/compiler/lexer.h"
+#include "pocketpy/objects/sourcedata.h"
 
 const char* pk_TokenSymbols[] = {
     "@eof", "@eol", "@sof",

+ 187 - 0
src/compiler/lexer2.c

@@ -0,0 +1,187 @@
+#include "pocketpy/compiler/lexer.h"
+#include "pocketpy/objects/sourcedata.h"
+
+// Internal lexer state for tokenizing a pk_SourceData_ buffer.
+typedef struct pk_Lexer{
+    pk_SourceData_ src;       // ref-counted source buffer (INCREF'd in ctor, DECREF'd in dtor)
+    const char* token_start;  // start of the token currently being scanned
+    const char* curr_char;    // read cursor into src->source
+    int current_line;         // 1-based line number of curr_char
+    int brackets_level;       // depth of open ( [ { (maintained by add_token)
+
+    c11_vector/*T=Token*/ nexts;    // tokens produced so far
+    c11_vector/*T=int*/ indents;    // indentation-width stack (spaces; '\t' counts as 4)
+} pk_Lexer;
+
+// Zero-initialized sentinel for tokens that carry no semantic value.
+const static TokenValue EmptyTokenValue;
+
+// Initialize lexer state over `src`; takes a strong reference to `src`.
+void pk_Lexer__ctor(pk_Lexer* self, pk_SourceData_ src){
+    const char* begin = py_Str__data(&src->source);
+    self->src = src;
+    PK_INCREF(src);
+    self->token_start = begin;
+    self->curr_char = begin;
+    self->current_line = 1;
+    self->brackets_level = 0;
+    c11_vector__ctor(&self->indents, sizeof(int));
+    c11_vector__ctor(&self->nexts, sizeof(Token));
+}
+
+// Release the source reference and free token/indent storage.
+void pk_Lexer__dtor(pk_Lexer* self){
+    c11_vector__dtor(&self->indents);
+    c11_vector__dtor(&self->nexts);
+    PK_DECREF(self->src);
+}
+
+// Tokenize `src`. Returns NULL on success, or an opaque error object.
+// NOTE(review): `out_tokens` is never written in this body — presumably the
+// token vector should be moved out before the dtor; confirm against callers.
+// NOTE(review): `from_precompiled()` and `lex_one_token(&eof)` are called
+// without the lexer instance — their definitions are not in this file; verify
+// their real signatures. TODO confirm
+void* pk_Lexer__run(pk_SourceData_ src, void** out_tokens){
+    pk_Lexer lexer;
+    pk_Lexer__ctor(&lexer, src);
+
+    // precompiled sources bypass lexing entirely
+    if(src->is_precompiled) {
+        pk_Lexer__dtor(&lexer);
+        return from_precompiled();
+    }
+    // push initial tokens
+    Token sof = {TK_SOF, lexer.token_start, 0, lexer.current_line, lexer.brackets_level, EmptyTokenValue};
+    c11_vector__push(Token, &lexer.nexts, sof);
+    c11_vector__push(int, &lexer.indents, 0);
+
+    // lex until end-of-file or the first error
+    bool eof = false;
+    while(!eof) {
+        void* err = lex_one_token(&eof);
+        if(err){
+            pk_Lexer__dtor(&lexer);
+            return err;
+        }
+    }
+    pk_Lexer__dtor(&lexer);
+    return NULL;
+}
+
+// Consume and return one character. Must never be used on '\n';
+// newlines go through eatchar_include_newline() for line accounting.
+char eatchar(pk_Lexer* self){
+    char consumed = *self->curr_char++;
+    assert(consumed != '\n');  // eatchar() cannot consume a newline
+    return consumed;
+}
+
+// Consume one character; on '\n', bump the line counter and record the
+// start of the new line in src->line_starts.
+char eatchar_include_newline(pk_Lexer* self){
+    char consumed = *self->curr_char;
+    self->curr_char++;
+    if(consumed != '\n') return consumed;
+    self->current_line++;
+    c11_vector__push(const char*, &self->src->line_starts, self->curr_char);
+    return consumed;
+}
+
+// Consume a run of spaces/tabs, returning the indentation width
+// (a tab counts as 4 columns).
+int eat_spaces(pk_Lexer* self){
+    int width = 0;
+    for(;;){
+        char c = *self->curr_char;
+        if(c == ' '){
+            width += 1;
+        } else if(c == '\t'){
+            width += 4;
+        } else {
+            return width;
+        }
+        eatchar(self);
+    }
+}
+
+// If the next character equals `c`, consume it and return true.
+bool matchchar(pk_Lexer* self, char c){
+    bool hit = (*self->curr_char == c);
+    if(hit) eatchar_include_newline(self);
+    return hit;
+}
+
+// If the next `n` characters are all `c0`, consume them and return true;
+// otherwise consume nothing and return false.
+bool match_n_chars(pk_Lexer* self, int n, char c0){
+    const char* p = self->curr_char;
+    for(int i = 0; i < n; i++, p++){
+        if(*p == '\0' || *p != c0) return false;
+    }
+    for(int i = 0; i < n; i++) eatchar_include_newline(self);
+    return true;
+}
+
+// If the input at the cursor begins with `s`, consume it and return true;
+// otherwise consume nothing and return false.
+// Fix: previous version returned the undeclared identifier `ok` and never
+// stored the comparison result.
+bool match_string(pk_Lexer* self, const char* s){
+    int s_len = strlen(s);
+    bool ok = strncmp(self->curr_char, s, s_len) == 0;
+    if(ok){
+        for(int i = 0; i < s_len; i++)
+            eatchar_include_newline(self);
+    }
+    return ok;
+}
+
+// Advance to (but not past) the next '\n' or the end of input.
+void skip_line_comment(pk_Lexer* self){
+    for(char c = *self->curr_char; c && c != '\n'; c = *self->curr_char){
+        eatchar(self);
+    }
+}
+
+// Finalize the token spanning [token_start, curr_char) and append it to
+// `nexts`, maintaining bracket depth and fusing two-word operators.
+void add_token(pk_Lexer* self, TokenIndex type, TokenValue value){
+    switch(type) {
+        case TK_LBRACE:
+        case TK_LBRACKET:
+        case TK_LPAREN: self->brackets_level++; break;
+        case TK_RPAREN:
+        case TK_RBRACKET:
+        case TK_RBRACE: self->brackets_level--; break;
+        default: break;
+    }
+    Token token = {type,
+                       self->token_start,
+                       (int)(self->curr_char - self->token_start),
+                       // an EOL token belongs to the line it terminates
+                       self->current_line - ((type == TK_EOL) ? 1 : 0),
+                       self->brackets_level,
+                       value};
+    // fuse "not in", "is not", "yield from" into single tokens
+    if(self->nexts.count > 0) {
+        Token* back = &c11_vector__back(Token, &self->nexts);
+        if(back->type == TK_NOT_KW && type == TK_IN) {
+            back->type = TK_NOT_IN;
+            return;
+        }
+        if(back->type == TK_IS && type == TK_NOT_KW) {
+            back->type = TK_IS_NOT;
+            return;
+        }
+        if(back->type == TK_YIELD && type == TK_FROM) {
+            back->type = TK_YIELD_FROM;
+            return;
+        }
+    }
+    // Fix: push unconditionally — previously this push was inside the
+    // `if(self->nexts.count > 0)` block, silently dropping the token
+    // whenever `nexts` was empty.
+    c11_vector__push(Token, &self->nexts, token);
+}
+
+
+// Emit `two` if the next character is `c` (consuming it), otherwise emit `one`.
+void add_token_2(pk_Lexer* self, char c, TokenIndex one, TokenIndex two){
+    TokenIndex chosen = matchchar(self, c) ? two : one;
+    add_token(self, chosen, EmptyTokenValue);
+}
+
+// Handle indentation at the start of a logical line, pushing
+// TK_INDENT/TK_DEDENT tokens as needed. Returns false when a dedent does
+// not match any level on the indentation stack.
+bool eat_indentation(pk_Lexer* self){
+    if(self->brackets_level > 0) return true;  // indentation ignored inside brackets
+    int spaces = eat_spaces(self);
+    // Fix: pass `self` — previously called as `skip_line_comment()` with no argument.
+    if(*self->curr_char == '#') skip_line_comment(self);
+    if(*self->curr_char == '\0' || *self->curr_char == '\n'){
+        return true;  // blank or comment-only line: no indentation change
+    }
+    // https://docs.python.org/3/reference/lexical_analysis.html#indentation
+    int indents_back = c11_vector__back(int, &self->indents);
+    if(spaces > indents_back) {
+        c11_vector__push(int, &self->indents, spaces);
+        Token t = {TK_INDENT, self->token_start, 0, self->current_line, self->brackets_level, EmptyTokenValue};
+        c11_vector__push(Token, &self->nexts, t);
+    } else if(spaces < indents_back) {
+        do {
+            c11_vector__pop(int, &self->indents);
+            Token t = {TK_DEDENT, self->token_start, 0, self->current_line, self->brackets_level, EmptyTokenValue};
+            c11_vector__push(Token, &self->nexts, t);
+            indents_back = c11_vector__back(int, &self->indents);
+        } while(spaces < indents_back);
+        // dedent landed between stack levels → inconsistent indentation
+        if(spaces != indents_back) { return false; }
+    }
+    return true;
+}

+ 2 - 2
src/error.c

@@ -23,7 +23,7 @@ void pkpy_Exception__dtor(pkpy_Exception* self){
     c11_vector__dtor(&self->stacktrace);
 }
 
-void pkpy_Exception__stpush(pkpy_Exception* self, pkpy_SourceData_ src, int lineno, const char* cursor, const char* name){
+void pkpy_Exception__stpush(pkpy_Exception* self, pk_SourceData_ src, int lineno, const char* cursor, const char* name){
     if(self->stacktrace.count >= 7) return;
     PK_INCREF(src);
     pkpy_ExceptionFrame* frame = c11_vector__emplace(&self->stacktrace);
@@ -42,7 +42,7 @@ py_Str pkpy_Exception__summary(pkpy_Exception* self){
     }
     for(int i=self->stacktrace.count-1; i >= 0; i--) {
         pkpy_ExceptionFrame* frame = c11__at(pkpy_ExceptionFrame, &self->stacktrace, i);
-        py_Str s = pkpy_SourceData__snapshot(frame->src, frame->lineno, frame->cursor, py_Str__data(&frame->name));
+        py_Str s = pk_SourceData__snapshot(frame->src, frame->lineno, frame->cursor, py_Str__data(&frame->name));
         pk_SStream__write_Str(&ss, &s);
         py_Str__dtor(&s);
         pk_SStream__write_cstr(&ss, "\n");

+ 2 - 2
src/objects/codeobject.c

@@ -13,7 +13,7 @@ bool Bytecode__is_forward_jump(const Bytecode* self) {
     return self->op >= OP_JUMP_FORWARD && self->op <= OP_LOOP_BREAK;
 }
 
-FuncDecl_ FuncDecl__rcnew(pkpy_SourceData_ src, c11_string name){
+FuncDecl_ FuncDecl__rcnew(pk_SourceData_ src, c11_string name){
     FuncDecl* self = malloc(sizeof(FuncDecl));
     self->rc.count = 1;
     self->rc.dtor = (void (*)(void*))FuncDecl__dtor;
@@ -46,7 +46,7 @@ void FuncDecl__add_kwarg(FuncDecl* self, int index, uint16_t key, const PyVar* v
     c11_vector__push(FuncDeclKwArg, &self->kwargs, item);
 }
 
-CodeObject* CodeObject__new(pkpy_SourceData_ src, c11_string name){
+CodeObject* CodeObject__new(pk_SourceData_ src, c11_string name){
     CodeObject* self = malloc(sizeof(CodeObject));
     self->src = src; PK_INCREF(src);
     py_Str__ctor2(&self->name, name.data, name.size);