Browse Source

improve `chr`

blueloveTH 1 year ago
parent
commit
e25cc48463
5 changed files with 52 additions and 5 deletions
  1. 1 0
      include/pocketpy/common/str.h
  2. 32 0
      src/common/str.c
  3. 2 2
      src/interpreter/vm.c
  4. 10 3
      src/public/modules.c
  5. 7 0
      tests/04_str.py

+ 1 - 0
include/pocketpy/common/str.h

@@ -66,6 +66,7 @@ int c11__byte_index_to_unicode(const char* data, int n);
 bool c11__is_unicode_Lo_char(int c);
 int c11__u8_header(unsigned char c, bool suppress);
 int c11__u8_value(int u8bytes, const char* data);
+int c11__u32_to_u8(uint32_t utf32_char, char utf8_output[4]);  // encode one code point as UTF-8 (no NUL written); returns bytes written (1-4), or -1 if utf32_char > 0x10FFFF
 
 typedef enum IntParsingResult {
     IntParsing_SUCCESS,

+ 32 - 0
src/common/str.c

@@ -316,6 +316,38 @@ int c11__u8_value(int u8bytes, const char* data) {
     return (int)value;
 }
 
+int c11__u32_to_u8(uint32_t utf32_char, char utf8_output[4]) {  // encode one code point as UTF-8
+    int length = 0;  // number of bytes written to utf8_output
+    // Writes 1-4 bytes (no NUL terminator) and returns the count, or -1 if utf32_char > 0x10FFFF.
+    if(utf32_char <= 0x7F) {
+        // 1-byte UTF-8: 0xxxxxxx (plain ASCII)
+        utf8_output[0] = (char)utf32_char;
+        length = 1;
+    } else if(utf32_char <= 0x7FF) {
+        // 2-byte UTF-8: 110xxxxx 10xxxxxx (5 + 6 payload bits)
+        utf8_output[0] = (char)(0xC0 | ((utf32_char >> 6) & 0x1F));
+        utf8_output[1] = (char)(0x80 | (utf32_char & 0x3F));
+        length = 2;
+    } else if(utf32_char <= 0xFFFF) {
+        // 3-byte UTF-8: 1110xxxx 10xxxxxx 10xxxxxx (surrogates U+D800-U+DFFF are not rejected here)
+        utf8_output[0] = (char)(0xE0 | ((utf32_char >> 12) & 0x0F));
+        utf8_output[1] = (char)(0x80 | ((utf32_char >> 6) & 0x3F));
+        utf8_output[2] = (char)(0x80 | (utf32_char & 0x3F));
+        length = 3;
+    } else if(utf32_char <= 0x10FFFF) {
+        // 4-byte UTF-8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (3 + 6 + 6 + 6 payload bits)
+        utf8_output[0] = (char)(0xF0 | ((utf32_char >> 18) & 0x07));
+        utf8_output[1] = (char)(0x80 | ((utf32_char >> 12) & 0x3F));
+        utf8_output[2] = (char)(0x80 | ((utf32_char >> 6) & 0x3F));
+        utf8_output[3] = (char)(0x80 | (utf32_char & 0x3F));
+        length = 4;
+    } else {
+        // Above U+10FFFF: not a Unicode code point; utf8_output is left untouched
+        return -1;
+    }
+    return length;
+}
+
 IntParsingResult c11__parse_uint(c11_sv text, int64_t* out, int base) {
     *out = 0;
 

+ 2 - 2
src/interpreter/vm.c

@@ -74,7 +74,7 @@ void VM__ctor(VM* self) {
 
     self->recursion_depth = 0;
     self->max_recursion_depth = 1000;
-    
+
     self->is_curr_exc_handled = false;
 
     self->ctx = NULL;
@@ -92,7 +92,7 @@ void VM__ctor(VM* self) {
         char* p = py_newstrn(&self->ascii_literals[i], 1);
         *p = i;
     }
-    py_newstrn(&self->ascii_literals[128], 0);
+    py_newstrn(&self->ascii_literals[128], 0);  // empty string
 
     // 0: unused
     void* placeholder = TypeList__emplace(&self->types);

+ 10 - 3
src/public/modules.c

@@ -449,9 +449,16 @@ static bool builtins_delattr(int argc, py_Ref argv) {
 static bool builtins_chr(int argc, py_Ref argv) {
     PY_CHECK_ARGC(1);
     PY_CHECK_ARG_TYPE(0, tp_int);
-    py_i64 val = py_toint(py_arg(0));
-    if(val < 0 || val > 128) { return ValueError("chr() arg not in range(128)"); }
-    py_assign(py_retval(), &pk_current_vm->ascii_literals[val]);
+    py_i64 val = py_toint(py_arg(0));  // full width: uint32_t would wrap 2**32 + 65 to 'A'
+    if(val < 0 || val > 0x10FFFF) return ValueError("chr() arg not in range(0x110000)");
+    if(val < 128) {
+        py_assign(py_retval(), &pk_current_vm->ascii_literals[val]);  // cached 1-char ASCII str
+    } else {
+        char utf8[4];  // code points >= 128 need 2-4 UTF-8 bytes
+        int len = c11__u32_to_u8((uint32_t)val, utf8);
+        if(len == -1) return ValueError("invalid unicode code point: %d", (int)val);  // defensive
+        py_newstrv(py_retval(), (c11_sv){utf8, len});
+    }
     return true;
 }
 

+ 7 - 0
tests/04_str.py

@@ -191,6 +191,13 @@ assert (1 == '1') is False
 assert 1 == 1.0
 
 assert chr(97) is 'a'
+assert ord('a') == 97  # 1-byte (ASCII) round trip
+
+assert ord('🥕') == 0x1f955  # code point above 0xFFFF
+assert chr(0x1f955) == '🥕'  # exercises the 4-byte UTF-8 branch of chr
+
+assert ord('测') == 27979  # 27979 == 0x6D4B, within 0x800..0xFFFF
+assert chr(27979) == '测'  # exercises the 3-byte UTF-8 branch of chr
 
 exit()