physfs_unicode.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550
  1. #define __PHYSICSFS_INTERNAL__
  2. #include "physfs_internal.h"
  3. /*
  4. * From rfc3629, the UTF-8 spec:
  5. * https://www.ietf.org/rfc/rfc3629.txt
  6. *
  7. * Char. number range | UTF-8 octet sequence
  8. * (hexadecimal) | (binary)
  9. * --------------------+---------------------------------------------
  10. * 0000 0000-0000 007F | 0xxxxxxx
  11. * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
  12. * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
  13. * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  14. */
/*
 * This may not be the best value, but it's one that isn't represented
 *  in Unicode (0x10FFFF is the largest codepoint value). utf8codepoint()
 *  returns this value if there are bogus bits in the stream. The
 *  PHYSFS_utf8To*() converters turn this value into something reasonable
 *  (like a question mark), for text that wants to try to recover, whereas
 *  a validator could use the value to determine if a string has bad bits.
 */
  24. #define UNICODE_BOGUS_CHAR_VALUE 0xFFFFFFFF
/*
 * This is the codepoint we currently return when there were bogus bits in
 *  a UTF-8 string. May not fly in Asian locales?
 */
  29. #define UNICODE_BOGUS_CHAR_CODEPOINT '?'
  30. static PHYSFS_uint32 utf8codepoint(const char **_str)
  31. {
  32. const char *str = *_str;
  33. PHYSFS_uint32 retval = 0;
  34. PHYSFS_uint32 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *str);
  35. PHYSFS_uint32 octet2, octet3, octet4;
  36. if (octet == 0) /* null terminator, end of string. */
  37. return 0;
  38. else if (octet < 128) /* one octet char: 0 to 127 */
  39. {
  40. (*_str)++; /* skip to next possible start of codepoint. */
  41. return octet;
  42. } /* else if */
  43. else if ((octet > 127) && (octet < 192)) /* bad (starts with 10xxxxxx). */
  44. {
  45. /*
  46. * Apparently each of these is supposed to be flagged as a bogus
  47. * char, instead of just resyncing to the next valid codepoint.
  48. */
  49. (*_str)++; /* skip to next possible start of codepoint. */
  50. return UNICODE_BOGUS_CHAR_VALUE;
  51. } /* else if */
  52. else if (octet < 224) /* two octets */
  53. {
  54. (*_str)++; /* advance at least one byte in case of an error */
  55. octet -= (128+64);
  56. octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  57. if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  58. return UNICODE_BOGUS_CHAR_VALUE;
  59. *_str += 1; /* skip to next possible start of codepoint. */
  60. retval = ((octet << 6) | (octet2 - 128));
  61. if ((retval >= 0x80) && (retval <= 0x7FF))
  62. return retval;
  63. } /* else if */
  64. else if (octet < 240) /* three octets */
  65. {
  66. (*_str)++; /* advance at least one byte in case of an error */
  67. octet -= (128+64+32);
  68. octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  69. if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  70. return UNICODE_BOGUS_CHAR_VALUE;
  71. octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  72. if ((octet3 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  73. return UNICODE_BOGUS_CHAR_VALUE;
  74. *_str += 2; /* skip to next possible start of codepoint. */
  75. retval = ( ((octet << 12)) | ((octet2-128) << 6) | ((octet3-128)) );
  76. /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */
  77. switch (retval)
  78. {
  79. case 0xD800:
  80. case 0xDB7F:
  81. case 0xDB80:
  82. case 0xDBFF:
  83. case 0xDC00:
  84. case 0xDF80:
  85. case 0xDFFF:
  86. return UNICODE_BOGUS_CHAR_VALUE;
  87. } /* switch */
  88. /* 0xFFFE and 0xFFFF are illegal, too, so we check them at the edge. */
  89. if ((retval >= 0x800) && (retval <= 0xFFFD))
  90. return retval;
  91. } /* else if */
  92. else if (octet < 248) /* four octets */
  93. {
  94. (*_str)++; /* advance at least one byte in case of an error */
  95. octet -= (128+64+32+16);
  96. octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  97. if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  98. return UNICODE_BOGUS_CHAR_VALUE;
  99. octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  100. if ((octet3 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  101. return UNICODE_BOGUS_CHAR_VALUE;
  102. octet4 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  103. if ((octet4 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  104. return UNICODE_BOGUS_CHAR_VALUE;
  105. *_str += 3; /* skip to next possible start of codepoint. */
  106. retval = ( ((octet << 18)) | ((octet2 - 128) << 12) |
  107. ((octet3 - 128) << 6) | ((octet4 - 128)) );
  108. if ((retval >= 0x10000) && (retval <= 0x10FFFF))
  109. return retval;
  110. } /* else if */
  111. /*
  112. * Five and six octet sequences became illegal in rfc3629.
  113. * We throw the codepoint away, but parse them to make sure we move
  114. * ahead the right number of bytes and don't overflow the buffer.
  115. */
  116. else if (octet < 252) /* five octets */
  117. {
  118. (*_str)++; /* advance at least one byte in case of an error */
  119. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  120. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  121. return UNICODE_BOGUS_CHAR_VALUE;
  122. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  123. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  124. return UNICODE_BOGUS_CHAR_VALUE;
  125. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  126. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  127. return UNICODE_BOGUS_CHAR_VALUE;
  128. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  129. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  130. return UNICODE_BOGUS_CHAR_VALUE;
  131. *_str += 4; /* skip to next possible start of codepoint. */
  132. return UNICODE_BOGUS_CHAR_VALUE;
  133. } /* else if */
  134. else /* six octets */
  135. {
  136. (*_str)++; /* advance at least one byte in case of an error */
  137. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  138. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  139. return UNICODE_BOGUS_CHAR_VALUE;
  140. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  141. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  142. return UNICODE_BOGUS_CHAR_VALUE;
  143. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  144. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  145. return UNICODE_BOGUS_CHAR_VALUE;
  146. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  147. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  148. return UNICODE_BOGUS_CHAR_VALUE;
  149. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  150. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  151. return UNICODE_BOGUS_CHAR_VALUE;
  152. *_str += 6; /* skip to next possible start of codepoint. */
  153. return UNICODE_BOGUS_CHAR_VALUE;
  154. } /* else if */
  155. return UNICODE_BOGUS_CHAR_VALUE;
  156. } /* utf8codepoint */
  157. void PHYSFS_utf8ToUcs4(const char *src, PHYSFS_uint32 *dst, PHYSFS_uint64 len)
  158. {
  159. len -= sizeof (PHYSFS_uint32); /* save room for null char. */
  160. while (len >= sizeof (PHYSFS_uint32))
  161. {
  162. PHYSFS_uint32 cp = utf8codepoint(&src);
  163. if (cp == 0)
  164. break;
  165. else if (cp == UNICODE_BOGUS_CHAR_VALUE)
  166. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  167. *(dst++) = cp;
  168. len -= sizeof (PHYSFS_uint32);
  169. } /* while */
  170. *dst = 0;
  171. } /* PHYSFS_utf8ToUcs4 */
  172. void PHYSFS_utf8ToUcs2(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len)
  173. {
  174. len -= sizeof (PHYSFS_uint16); /* save room for null char. */
  175. while (len >= sizeof (PHYSFS_uint16))
  176. {
  177. PHYSFS_uint32 cp = utf8codepoint(&src);
  178. if (cp == 0)
  179. break;
  180. else if (cp == UNICODE_BOGUS_CHAR_VALUE)
  181. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  182. if (cp > 0xFFFF) /* UTF-16 surrogates (bogus chars in UCS-2) */
  183. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  184. *(dst++) = cp;
  185. len -= sizeof (PHYSFS_uint16);
  186. } /* while */
  187. *dst = 0;
  188. } /* PHYSFS_utf8ToUcs2 */
  189. void PHYSFS_utf8ToUtf16(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len)
  190. {
  191. len -= sizeof (PHYSFS_uint16); /* save room for null char. */
  192. while (len >= sizeof (PHYSFS_uint16))
  193. {
  194. PHYSFS_uint32 cp = utf8codepoint(&src);
  195. if (cp == 0)
  196. break;
  197. else if (cp == UNICODE_BOGUS_CHAR_VALUE)
  198. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  199. if (cp > 0xFFFF) /* encode as surrogate pair */
  200. {
  201. if (len < (sizeof (PHYSFS_uint16) * 2))
  202. break; /* not enough room for the pair, stop now. */
  203. cp -= 0x10000; /* Make this a 20-bit value */
  204. *(dst++) = 0xD800 + ((cp >> 10) & 0x3FF);
  205. len -= sizeof (PHYSFS_uint16);
  206. cp = 0xDC00 + (cp & 0x3FF);
  207. } /* if */
  208. *(dst++) = cp;
  209. len -= sizeof (PHYSFS_uint16);
  210. } /* while */
  211. *dst = 0;
  212. } /* PHYSFS_utf8ToUtf16 */
  213. static void utf8fromcodepoint(PHYSFS_uint32 cp, char **_dst, PHYSFS_uint64 *_len)
  214. {
  215. char *dst = *_dst;
  216. PHYSFS_uint64 len = *_len;
  217. if (len == 0)
  218. return;
  219. if (cp > 0x10FFFF)
  220. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  221. else if ((cp == 0xFFFE) || (cp == 0xFFFF)) /* illegal values. */
  222. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  223. else
  224. {
  225. /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */
  226. switch (cp)
  227. {
  228. case 0xD800:
  229. case 0xDB7F:
  230. case 0xDB80:
  231. case 0xDBFF:
  232. case 0xDC00:
  233. case 0xDF80:
  234. case 0xDFFF:
  235. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  236. } /* switch */
  237. } /* else */
  238. /* Do the encoding... */
  239. if (cp < 0x80)
  240. {
  241. *(dst++) = (char) cp;
  242. len--;
  243. } /* if */
  244. else if (cp < 0x800)
  245. {
  246. if (len < 2)
  247. len = 0;
  248. else
  249. {
  250. *(dst++) = (char) ((cp >> 6) | 128 | 64);
  251. *(dst++) = (char) (cp & 0x3F) | 128;
  252. len -= 2;
  253. } /* else */
  254. } /* else if */
  255. else if (cp < 0x10000)
  256. {
  257. if (len < 3)
  258. len = 0;
  259. else
  260. {
  261. *(dst++) = (char) ((cp >> 12) | 128 | 64 | 32);
  262. *(dst++) = (char) ((cp >> 6) & 0x3F) | 128;
  263. *(dst++) = (char) (cp & 0x3F) | 128;
  264. len -= 3;
  265. } /* else */
  266. } /* else if */
  267. else
  268. {
  269. if (len < 4)
  270. len = 0;
  271. else
  272. {
  273. *(dst++) = (char) ((cp >> 18) | 128 | 64 | 32 | 16);
  274. *(dst++) = (char) ((cp >> 12) & 0x3F) | 128;
  275. *(dst++) = (char) ((cp >> 6) & 0x3F) | 128;
  276. *(dst++) = (char) (cp & 0x3F) | 128;
  277. len -= 4;
  278. } /* else if */
  279. } /* else */
  280. *_dst = dst;
  281. *_len = len;
  282. } /* utf8fromcodepoint */
  283. #define UTF8FROMTYPE(typ, src, dst, len) \
  284. if (len == 0) return; \
  285. len--; \
  286. while (len) \
  287. { \
  288. const PHYSFS_uint32 cp = (PHYSFS_uint32) ((typ) (*(src++))); \
  289. if (cp == 0) break; \
  290. utf8fromcodepoint(cp, &dst, &len); \
  291. } \
  292. *dst = '\0'; \
  293. void PHYSFS_utf8FromUcs4(const PHYSFS_uint32 *src, char *dst, PHYSFS_uint64 len)
  294. {
  295. UTF8FROMTYPE(PHYSFS_uint32, src, dst, len);
  296. } /* PHYSFS_utf8FromUcs4 */
  297. void PHYSFS_utf8FromUcs2(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len)
  298. {
  299. UTF8FROMTYPE(PHYSFS_uint64, src, dst, len);
  300. } /* PHYSFS_utf8FromUcs2 */
  301. /* latin1 maps to unicode codepoints directly, we just utf-8 encode it. */
  302. void PHYSFS_utf8FromLatin1(const char *src, char *dst, PHYSFS_uint64 len)
  303. {
  304. UTF8FROMTYPE(PHYSFS_uint8, src, dst, len);
  305. } /* PHYSFS_utf8FromLatin1 */
  306. #undef UTF8FROMTYPE
  307. void PHYSFS_utf8FromUtf16(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len)
  308. {
  309. if (len == 0)
  310. return;
  311. len--;
  312. while (len)
  313. {
  314. PHYSFS_uint32 cp = (PHYSFS_uint32) *(src++);
  315. if (cp == 0)
  316. break;
  317. /* Orphaned second half of surrogate pair? */
  318. if ((cp >= 0xDC00) && (cp <= 0xDFFF))
  319. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  320. else if ((cp >= 0xD800) && (cp <= 0xDBFF)) /* start surrogate pair! */
  321. {
  322. const PHYSFS_uint32 pair = (PHYSFS_uint32) *src;
  323. if ((pair < 0xDC00) || (pair > 0xDFFF))
  324. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  325. else
  326. {
  327. src++; /* eat the other surrogate. */
  328. cp = (((cp - 0xD800) << 10) | (pair - 0xDC00));
  329. } /* else */
  330. } /* else if */
  331. utf8fromcodepoint(cp, &dst, &len);
  332. } /* while */
  333. *dst = '\0';
  334. } /* PHYSFS_utf8FromUtf16 */
  335. typedef struct CaseFoldMapping
  336. {
  337. PHYSFS_uint32 from;
  338. PHYSFS_uint32 to0;
  339. PHYSFS_uint32 to1;
  340. PHYSFS_uint32 to2;
  341. } CaseFoldMapping;
  342. typedef struct CaseFoldHashBucket
  343. {
  344. const PHYSFS_uint8 count;
  345. const CaseFoldMapping *list;
  346. } CaseFoldHashBucket;
  347. #include "physfs_casefolding.h"
  348. static void locate_case_fold_mapping(const PHYSFS_uint32 from,
  349. PHYSFS_uint32 *to)
  350. {
  351. PHYSFS_uint32 i;
  352. const PHYSFS_uint8 hashed = ((from ^ (from >> 8)) & 0xFF);
  353. const CaseFoldHashBucket *bucket = &case_fold_hash[hashed];
  354. const CaseFoldMapping *mapping = bucket->list;
  355. for (i = 0; i < bucket->count; i++, mapping++)
  356. {
  357. if (mapping->from == from)
  358. {
  359. to[0] = mapping->to0;
  360. to[1] = mapping->to1;
  361. to[2] = mapping->to2;
  362. return;
  363. } /* if */
  364. } /* for */
  365. /* Not found...there's no remapping for this codepoint. */
  366. to[0] = from;
  367. to[1] = 0;
  368. to[2] = 0;
  369. } /* locate_case_fold_mapping */
  370. static int utf8codepointcmp(const PHYSFS_uint32 cp1, const PHYSFS_uint32 cp2)
  371. {
  372. PHYSFS_uint32 folded1[3], folded2[3];
  373. if (cp1 == cp2)
  374. return 0; /* obviously matches. */
  375. locate_case_fold_mapping(cp1, folded1);
  376. locate_case_fold_mapping(cp2, folded2);
  377. if (folded1[0] < folded2[0])
  378. return -1;
  379. else if (folded1[0] > folded2[0])
  380. return 1;
  381. else if (folded1[1] < folded2[1])
  382. return -1;
  383. else if (folded1[1] > folded2[1])
  384. return 1;
  385. else if (folded1[2] < folded2[2])
  386. return -1;
  387. else if (folded1[2] > folded2[2])
  388. return 1;
  389. return 0; /* complete match. */
  390. } /* utf8codepointcmp */
  391. int __PHYSFS_utf8stricmp(const char *str1, const char *str2)
  392. {
  393. while (1)
  394. {
  395. const PHYSFS_uint32 cp1 = utf8codepoint(&str1);
  396. const PHYSFS_uint32 cp2 = utf8codepoint(&str2);
  397. const int rc = utf8codepointcmp(cp1, cp2);
  398. if (rc != 0)
  399. return rc;
  400. else if (cp1 == 0)
  401. break; /* complete match. */
  402. } /* while */
  403. return 0;
  404. } /* __PHYSFS_utf8stricmp */
  405. int __PHYSFS_utf8strnicmp(const char *str1, const char *str2, PHYSFS_uint32 n)
  406. {
  407. while (n > 0)
  408. {
  409. const PHYSFS_uint32 cp1 = utf8codepoint(&str1);
  410. const PHYSFS_uint32 cp2 = utf8codepoint(&str2);
  411. const int rc = utf8codepointcmp(cp1, cp2);
  412. if (rc != 0)
  413. return rc;
  414. else if (cp1 == 0)
  415. return 0;
  416. n--;
  417. } /* while */
  418. return 0; /* matched to n chars. */
  419. } /* __PHYSFS_utf8strnicmp */
  420. int __PHYSFS_stricmpASCII(const char *str1, const char *str2)
  421. {
  422. while (1)
  423. {
  424. const char ch1 = *(str1++);
  425. const char ch2 = *(str2++);
  426. const char cp1 = ((ch1 >= 'A') && (ch1 <= 'Z')) ? (ch1+32) : ch1;
  427. const char cp2 = ((ch2 >= 'A') && (ch2 <= 'Z')) ? (ch2+32) : ch2;
  428. if (cp1 < cp2)
  429. return -1;
  430. else if (cp1 > cp2)
  431. return 1;
  432. else if (cp1 == 0) /* they're both null chars? */
  433. break;
  434. } /* while */
  435. return 0;
  436. } /* __PHYSFS_stricmpASCII */
  437. int __PHYSFS_strnicmpASCII(const char *str1, const char *str2, PHYSFS_uint32 n)
  438. {
  439. while (n-- > 0)
  440. {
  441. const char ch1 = *(str1++);
  442. const char ch2 = *(str2++);
  443. const char cp1 = ((ch1 >= 'A') && (ch1 <= 'Z')) ? (ch1+32) : ch1;
  444. const char cp2 = ((ch2 >= 'A') && (ch2 <= 'Z')) ? (ch2+32) : ch2;
  445. if (cp1 < cp2)
  446. return -1;
  447. else if (cp1 > cp2)
  448. return 1;
  449. else if (cp1 == 0) /* they're both null chars? */
  450. return 0;
  451. } /* while */
  452. return 0;
  453. } /* __PHYSFS_strnicmpASCII */
  454. /* end of physfs_unicode.c ... */