lexer.cpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455
  1. #include "pocketpy/lexer.h"
  2. namespace pkpy{
  3. bool Lexer::match_n_chars(int n, char c0){
  4. const char* c = curr_char;
  5. for(int i=0; i<n; i++){
  6. if(*c == '\0') return false;
  7. if(*c != c0) return false;
  8. c++;
  9. }
  10. for(int i=0; i<n; i++) eatchar_include_newline();
  11. return true;
  12. }
  13. bool Lexer::match_string(const char* s){
  14. int s_len = strlen(s);
  15. bool ok = strncmp(curr_char, s, s_len) == 0;
  16. if(ok) for(int i=0; i<s_len; i++) eatchar_include_newline();
  17. return ok;
  18. }
  19. int Lexer::eat_spaces(){
  20. int count = 0;
  21. while (true) {
  22. switch (peekchar()) {
  23. case ' ' : count+=1; break;
  24. case '\t': count+=4; break;
  25. default: return count;
  26. }
  27. eatchar();
  28. }
  29. }
  30. bool Lexer::eat_indentation(){
  31. if(brackets_level > 0) return true;
  32. int spaces = eat_spaces();
  33. if(peekchar() == '#') skip_line_comment();
  34. if(peekchar() == '\0' || peekchar() == '\n') return true;
  35. // https://docs.python.org/3/reference/lexical_analysis.html#indentation
  36. if(spaces > indents.top()){
  37. indents.push(spaces);
  38. nexts.push_back(Token{TK("@indent"), token_start, 0, current_line, brackets_level});
  39. } else if(spaces < indents.top()){
  40. while(spaces < indents.top()){
  41. indents.pop();
  42. nexts.push_back(Token{TK("@dedent"), token_start, 0, current_line, brackets_level});
  43. }
  44. if(spaces != indents.top()){
  45. return false;
  46. }
  47. }
  48. return true;
  49. }
  50. char Lexer::eatchar() {
  51. char c = peekchar();
  52. if(c == '\n') throw std::runtime_error("eatchar() cannot consume a newline");
  53. curr_char++;
  54. return c;
  55. }
  56. char Lexer::eatchar_include_newline() {
  57. char c = peekchar();
  58. curr_char++;
  59. if (c == '\n'){
  60. current_line++;
  61. src->line_starts.push_back(curr_char);
  62. }
  63. return c;
  64. }
// Scan an identifier or keyword. The first character was already consumed
// by lex_one_token(), hence the initial curr_char-- rewind.
// Returns 0 on success, or an error code:
//   1 = invalid char, 2 = truncated utf8 sequence,
//   3 = empty identifier, 4 = invalid bare word in JSON mode.
int Lexer::eat_name() {
    curr_char--;
    while(true){
        unsigned char c = peekchar();
        int u8bytes = utf8len(c, true);
        if(u8bytes == 0) return 1;
        if(u8bytes == 1){
            // ASCII path: identifier chars are [A-Za-z0-9_].
            if(isalpha(c) || c=='_' || isdigit(c)) {
                curr_char++;
                continue;
            }else{
                break;
            }
        }
        // handle multibyte char
        std::string u8str(curr_char, u8bytes);
        if(u8str.size() != u8bytes) return 2;
        uint32_t value = 0;
        // Decode the UTF-8 sequence into a single code point:
        // mask the length bits out of the lead byte, then fold in
        // 6 payload bits from each continuation byte.
        for(int k=0; k < u8bytes; k++){
            uint8_t b = u8str[k];
            if(k==0){
                if(u8bytes == 2) value = (b & 0b00011111) << 6;
                else if(u8bytes == 3) value = (b & 0b00001111) << 12;
                else if(u8bytes == 4) value = (b & 0b00000111) << 18;
            }else{
                value |= (b & 0b00111111) << (6*(u8bytes-k-1));
            }
        }
        // Only code points in Unicode category Lo are allowed in identifiers.
        if(is_unicode_Lo_char(value)) curr_char += u8bytes;
        else break;
    }
    int length = (int)(curr_char - token_start);
    if(length == 0) return 3;
    std::string_view name(token_start, length);
    if(src->mode == JSON_MODE){
        // JSON accepts exactly three bare words.
        if(name == "true"){
            add_token(TK("True"));
        } else if(name == "false"){
            add_token(TK("False"));
        } else if(name == "null"){
            add_token(TK("None"));
        } else {
            return 4;
        }
        return 0;
    }
    // Keyword lookup; anything else is a plain identifier.
    if(kTokenKwMap.count(name)){
        add_token(kTokenKwMap.at(name));
    } else {
        add_token(TK("@id"));
    }
    return 0;
}
  118. void Lexer::skip_line_comment() {
  119. char c;
  120. while ((c = peekchar()) != '\0') {
  121. if (c == '\n') return;
  122. eatchar();
  123. }
  124. }
  125. bool Lexer::matchchar(char c) {
  126. if (peekchar() != c) return false;
  127. eatchar_include_newline();
  128. return true;
  129. }
  130. void Lexer::add_token(TokenIndex type, TokenValue value) {
  131. switch(type){
  132. case TK("{"): case TK("["): case TK("("): brackets_level++; break;
  133. case TK(")"): case TK("]"): case TK("}"): brackets_level--; break;
  134. }
  135. auto token = Token{
  136. type,
  137. token_start,
  138. (int)(curr_char - token_start),
  139. current_line - ((type == TK("@eol")) ? 1 : 0),
  140. brackets_level,
  141. value
  142. };
  143. // handle "not in", "is not", "yield from"
  144. if(!nexts.empty()){
  145. auto& back = nexts.back();
  146. if(back.type == TK("not") && type == TK("in")){
  147. back.type = TK("not in");
  148. return;
  149. }
  150. if(back.type == TK("is") && type == TK("not")){
  151. back.type = TK("is not");
  152. return;
  153. }
  154. if(back.type == TK("yield") && type == TK("from")){
  155. back.type = TK("yield from");
  156. return;
  157. }
  158. nexts.push_back(token);
  159. }
  160. }
  161. void Lexer::add_token_2(char c, TokenIndex one, TokenIndex two) {
  162. if (matchchar(c)) add_token(two);
  163. else add_token(one);
  164. }
// Consume characters until the closing quote and return the string body.
// Detects triple-quoted strings and, unless `raw`, interprets backslash
// escape sequences.
Str Lexer::eat_string_until(char quote, bool raw) {
    // Two more quote chars right after the opener => triple-quoted string.
    bool quote3 = match_n_chars(2, quote);
    std::vector<char> buff;
    while (true) {
        char c = eatchar_include_newline();
        if (c == quote){
            // Inside a triple-quoted string a lone quote char is literal
            // unless two more quote chars follow (the closer).
            if(quote3 && !match_n_chars(2, quote)){
                buff.push_back(c);
                continue;
            }
            break;
        }
        if (c == '\0'){
            // In the REPL an unterminated triple-quoted string means the
            // user will continue on the next input line.
            if(quote3 && src->mode == REPL_MODE){
                throw NeedMoreLines(false);
            }
            SyntaxError("EOL while scanning string literal");
        }
        if (c == '\n'){
            // Raw newlines are legal only inside triple-quoted strings.
            if(!quote3) SyntaxError("EOL while scanning string literal");
            else{
                buff.push_back(c);
                continue;
            }
        }
        if (!raw && c == '\\') {
            switch (eatchar_include_newline()) {
                case '"': buff.push_back('"'); break;
                case '\'': buff.push_back('\''); break;
                case '\\': buff.push_back('\\'); break;
                case 'n': buff.push_back('\n'); break;
                case 'r': buff.push_back('\r'); break;
                case 't': buff.push_back('\t'); break;
                case 'x': {
                    // \xHH: exactly two hex digits.
                    char hex[3] = {eatchar(), eatchar(), '\0'};
                    size_t parsed;
                    char code;
                    try{
                        code = (char)Number::stoi(hex, &parsed, 16);
                    }catch(...){
                        SyntaxError("invalid hex char");
                    }
                    if (parsed != 2) SyntaxError("invalid hex char");
                    buff.push_back(code);
                } break;
                default: SyntaxError("invalid escape char");
            }
        } else {
            buff.push_back(c);
        }
    }
    return Str(buff.data(), buff.size());
}
  218. void Lexer::eat_string(char quote, StringType type) {
  219. Str s = eat_string_until(quote, type == RAW_STRING);
  220. if(type == F_STRING){
  221. add_token(TK("@fstr"), s);
  222. return;
  223. }
  224. if(type == NORMAL_BYTES){
  225. add_token(TK("@bytes"), s);
  226. return;
  227. }
  228. add_token(TK("@str"), s);
  229. }
// Lex a numeric literal: decimal or hex (0x) integers, decimal floats,
// and an 'L' suffix for arbitrary-precision longs.
void Lexer::eat_number() {
    // Group 1 = "0x" prefix, group 2 = fractional part, group 3 = 'L' suffix.
    static const std::regex pattern("^(0x)?[0-9a-fA-F]+(\\.[0-9]+)?(L)?");
    std::smatch m;
    // std::regex needs a std::string, so copy from token_start to end of line.
    const char* i = token_start;
    while(*i != '\n' && *i != '\0') i++;
    std::string s = std::string(token_start, i);
    bool ok = std::regex_search(s, m, pattern);
    PK_ASSERT(ok);
    // here is m.length()-1, since the first char was eaten by lex_token()
    for(int j=0; j<m.length()-1; j++) eatchar();
    if(m[3].matched){
        // 'L' suffix -> long literal; value is parsed later from the lexeme.
        add_token(TK("@long"));
        return;
    }
    if(m[1].matched && m[2].matched){
        SyntaxError("hex literal should not contain a dot");
    }
    try{
        int base = 10;
        size_t size;
        if (m[1].matched) base = 16;
        if (m[2].matched) {
            PK_ASSERT(base == 10);
            add_token(TK("@num"), Number::stof(m[0], &size));
        } else {
            add_token(TK("@num"), Number::stoi(m[0], &size, base));
        }
        // The numeric parser must have consumed the entire match.
        PK_ASSERT((int)size == (int)m.length());
    }catch(...){
        SyntaxError("invalid number literal");
    }
}
// Lex exactly one token into `nexts`. Returns true if a token was produced
// and more input may follow; returns false after emitting @eof.
bool Lexer::lex_one_token() {
    while (peekchar() != '\0') {
        token_start = curr_char;
        char c = eatchar_include_newline();
        switch (c) {
            case '\'': case '"': eat_string(c, NORMAL_STRING); return true;
            case '#': skip_line_comment(); break;
            case '~': add_token(TK("~")); return true;
            case '{': add_token(TK("{")); return true;
            case '}': add_token(TK("}")); return true;
            case ',': add_token(TK(",")); return true;
            case ':': add_token(TK(":")); return true;
            case ';': add_token(TK(";")); return true;
            case '(': add_token(TK("(")); return true;
            case ')': add_token(TK(")")); return true;
            case '[': add_token(TK("[")); return true;
            case ']': add_token(TK("]")); return true;
            case '@': add_token(TK("@")); return true;
            case '$': {
                // Special $-tokens occupy a contiguous range of TokenIndex
                // values from $goto to $label.
                for(int i=TK("$goto"); i<=TK("$label"); i++){
                    // +1 to skip the '$'
                    if(match_string(TK_STR(i) + 1)){
                        add_token((TokenIndex)i);
                        return true;
                    }
                }
                SyntaxError("invalid special token");
            } return false;
            case '%': add_token_2('=', TK("%"), TK("%=")); return true;
            case '&': add_token_2('=', TK("&"), TK("&=")); return true;
            case '|': add_token_2('=', TK("|"), TK("|=")); return true;
            case '^': add_token_2('=', TK("^"), TK("^=")); return true;
            case '?': add_token(TK("?")); return true;
            case '.': {
                // '.' or '...'; a bare '..' is an error.
                if(matchchar('.')) {
                    if(matchchar('.')) {
                        add_token(TK("..."));
                    } else {
                        SyntaxError("invalid token '..'");
                    }
                } else {
                    add_token(TK("."));
                }
                return true;
            }
            case '=': add_token_2('=', TK("="), TK("==")); return true;
            case '+':
                if(matchchar('+')){
                    add_token(TK("++"));
                }else{
                    add_token_2('=', TK("+"), TK("+="));
                }
                return true;
            case '>': {
                // '>=', '>>', '>>=' or '>'
                if(matchchar('=')) add_token(TK(">="));
                else if(matchchar('>')) add_token_2('=', TK(">>"), TK(">>="));
                else add_token(TK(">"));
                return true;
            }
            case '<': {
                // '<=', '<<', '<<=' or '<'
                if(matchchar('=')) add_token(TK("<="));
                else if(matchchar('<')) add_token_2('=', TK("<<"), TK("<<="));
                else add_token(TK("<"));
                return true;
            }
            case '-': {
                // '--', '-=', '->' or '-'
                if(matchchar('-')){
                    add_token(TK("--"));
                }else{
                    if(matchchar('=')) add_token(TK("-="));
                    else if(matchchar('>')) add_token(TK("->"));
                    else add_token(TK("-"));
                }
                return true;
            }
            case '!':
                if(matchchar('=')) add_token(TK("!="));
                else SyntaxError("expected '=' after '!'");
                break;
            case '*':
                if (matchchar('*')) {
                    add_token(TK("**")); // '**'
                } else {
                    add_token_2('=', TK("*"), TK("*="));
                }
                return true;
            case '/':
                // '//', '//=', '/=' or '/'
                if(matchchar('/')) {
                    add_token_2('=', TK("//"), TK("//="));
                } else {
                    add_token_2('=', TK("/"), TK("/="));
                }
                return true;
            case ' ': case '\t': eat_spaces(); break;
            case '\n': {
                add_token(TK("@eol"));
                if(!eat_indentation()) IndentationError("unindent does not match any outer indentation level");
                return true;
            }
            default: {
                // String prefixes: f'', f"", r'', r"", b'', b"".
                if(c == 'f'){
                    if(matchchar('\'')) {eat_string('\'', F_STRING); return true;}
                    if(matchchar('"')) {eat_string('"', F_STRING); return true;}
                }else if(c == 'r'){
                    if(matchchar('\'')) {eat_string('\'', RAW_STRING); return true;}
                    if(matchchar('"')) {eat_string('"', RAW_STRING); return true;}
                }else if(c == 'b'){
                    if(matchchar('\'')) {eat_string('\'', NORMAL_BYTES); return true;}
                    if(matchchar('"')) {eat_string('"', NORMAL_BYTES); return true;}
                }
                if (c >= '0' && c <= '9') {
                    eat_number();
                    return true;
                }
                // Anything else must be an identifier/keyword;
                // map eat_name() error codes to diagnostics.
                switch (eat_name())
                {
                    case 0: break;
                    case 1: SyntaxError("invalid char: " + std::string(1, c)); break;
                    case 2: SyntaxError("invalid utf8 sequence: " + std::string(1, c)); break;
                    case 3: SyntaxError("@id contains invalid char"); break;
                    case 4: SyntaxError("invalid JSON token"); break;
                    default: FATAL_ERROR();
                }
                return true;
            }
        }
    }
    // End of input: flush any remaining dedents (one per call), then @eof.
    token_start = curr_char;
    while(indents.size() > 1){
        indents.pop();
        add_token(TK("@dedent"));
        return true;
    }
    add_token(TK("@eof"));
    return false;
}
  398. void Lexer::throw_err(Str type, Str msg){
  399. int lineno = current_line;
  400. const char* cursor = curr_char;
  401. if(peekchar() == '\n'){
  402. lineno--;
  403. cursor--;
  404. }
  405. throw_err(type, msg, lineno, cursor);
  406. }
  407. void Lexer::throw_err(Str type, Str msg, int lineno, const char* cursor){
  408. auto e = Exception(type, msg);
  409. e.st_push(src->snapshot(lineno, cursor));
  410. throw e;
  411. }
// Initialize lexing state over `src`: both cursors start at the beginning
// of the source text, a leading @sof token is queued, and the indentation
// stack starts at column 0. Cursors are set before the @sof push because
// the token references token_start.
Lexer::Lexer(shared_ptr<SourceData> src) {
    this->src = src;
    this->token_start = src->source.c_str();
    this->curr_char = src->source.c_str();
    this->nexts.push_back(Token{TK("@sof"), token_start, 0, current_line, brackets_level});
    this->indents.push(0);
}
  419. std::vector<Token> Lexer::run() {
  420. if(used) FATAL_ERROR();
  421. used = true;
  422. while (lex_one_token());
  423. return std::move(nexts);
  424. }
  425. } // namespace pkpy