/* physfs_unicode.c -- UTF-8 / UCS-2 / UCS-4 / UTF-16 conversion and case-insensitive comparison helpers. */
  1. #define __PHYSICSFS_INTERNAL__
  2. #include "physfs_internal.h"
  3. /*
  4. * From rfc3629, the UTF-8 spec:
  5. * http://www.ietf.org/rfc/rfc3629.txt
  6. *
  7. * Char. number range | UTF-8 octet sequence
  8. * (hexadecimal) | (binary)
  9. * --------------------+---------------------------------------------
  10. * 0000 0000-0000 007F | 0xxxxxxx
  11. * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
  12. * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
  13. * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  14. */
/*
 * This may not be the best value, but it's one that isn't represented
 *  in Unicode (0x10FFFF is the largest codepoint value). utf8codepoint()
 *  returns this value if there are bogus bits in the stream. The
 *  conversion routines (PHYSFS_utf8ToUcs4() and friends) turn this value
 *  into something reasonable (like a question mark), for text that wants
 *  to try to recover, whereas a validator such as utf8valid() can use the
 *  value to determine if a string has bad bits.
 */
  24. #define UNICODE_BOGUS_CHAR_VALUE 0xFFFFFFFF
/*
 * This is the codepoint we currently substitute when there were bogus bits
 *  in a UTF-8 string. May not fly in Asian locales?
 */
  29. #define UNICODE_BOGUS_CHAR_CODEPOINT '?'
  30. static PHYSFS_uint32 utf8codepoint(const char **_str)
  31. {
  32. const char *str = *_str;
  33. PHYSFS_uint32 retval = 0;
  34. PHYSFS_uint32 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *str);
  35. PHYSFS_uint32 octet2, octet3, octet4;
  36. if (octet == 0) /* null terminator, end of string. */
  37. return 0;
  38. else if (octet < 128) /* one octet char: 0 to 127 */
  39. {
  40. (*_str)++; /* skip to next possible start of codepoint. */
  41. return octet;
  42. } /* else if */
  43. else if ((octet > 127) && (octet < 192)) /* bad (starts with 10xxxxxx). */
  44. {
  45. /*
  46. * Apparently each of these is supposed to be flagged as a bogus
  47. * char, instead of just resyncing to the next valid codepoint.
  48. */
  49. (*_str)++; /* skip to next possible start of codepoint. */
  50. return UNICODE_BOGUS_CHAR_VALUE;
  51. } /* else if */
  52. else if (octet < 224) /* two octets */
  53. {
  54. octet -= (128+64);
  55. octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  56. if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  57. return UNICODE_BOGUS_CHAR_VALUE;
  58. *_str += 2; /* skip to next possible start of codepoint. */
  59. retval = ((octet << 6) | (octet2 - 128));
  60. if ((retval >= 0x80) && (retval <= 0x7FF))
  61. return retval;
  62. } /* else if */
  63. else if (octet < 240) /* three octets */
  64. {
  65. octet -= (128+64+32);
  66. octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  67. if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  68. return UNICODE_BOGUS_CHAR_VALUE;
  69. octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  70. if ((octet3 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  71. return UNICODE_BOGUS_CHAR_VALUE;
  72. *_str += 3; /* skip to next possible start of codepoint. */
  73. retval = ( ((octet << 12)) | ((octet2-128) << 6) | ((octet3-128)) );
  74. /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */
  75. switch (retval)
  76. {
  77. case 0xD800:
  78. case 0xDB7F:
  79. case 0xDB80:
  80. case 0xDBFF:
  81. case 0xDC00:
  82. case 0xDF80:
  83. case 0xDFFF:
  84. return UNICODE_BOGUS_CHAR_VALUE;
  85. } /* switch */
  86. /* 0xFFFE and 0xFFFF are illegal, too, so we check them at the edge. */
  87. if ((retval >= 0x800) && (retval <= 0xFFFD))
  88. return retval;
  89. } /* else if */
  90. else if (octet < 248) /* four octets */
  91. {
  92. octet -= (128+64+32+16);
  93. octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  94. if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  95. return UNICODE_BOGUS_CHAR_VALUE;
  96. octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  97. if ((octet3 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  98. return UNICODE_BOGUS_CHAR_VALUE;
  99. octet4 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  100. if ((octet4 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  101. return UNICODE_BOGUS_CHAR_VALUE;
  102. *_str += 4; /* skip to next possible start of codepoint. */
  103. retval = ( ((octet << 18)) | ((octet2 - 128) << 12) |
  104. ((octet3 - 128) << 6) | ((octet4 - 128)) );
  105. if ((retval >= 0x10000) && (retval <= 0x10FFFF))
  106. return retval;
  107. } /* else if */
  108. /*
  109. * Five and six octet sequences became illegal in rfc3629.
  110. * We throw the codepoint away, but parse them to make sure we move
  111. * ahead the right number of bytes and don't overflow the buffer.
  112. */
  113. else if (octet < 252) /* five octets */
  114. {
  115. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  116. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  117. return UNICODE_BOGUS_CHAR_VALUE;
  118. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  119. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  120. return UNICODE_BOGUS_CHAR_VALUE;
  121. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  122. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  123. return UNICODE_BOGUS_CHAR_VALUE;
  124. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  125. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  126. return UNICODE_BOGUS_CHAR_VALUE;
  127. *_str += 5; /* skip to next possible start of codepoint. */
  128. return UNICODE_BOGUS_CHAR_VALUE;
  129. } /* else if */
  130. else /* six octets */
  131. {
  132. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  133. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  134. return UNICODE_BOGUS_CHAR_VALUE;
  135. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  136. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  137. return UNICODE_BOGUS_CHAR_VALUE;
  138. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  139. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  140. return UNICODE_BOGUS_CHAR_VALUE;
  141. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  142. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  143. return UNICODE_BOGUS_CHAR_VALUE;
  144. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  145. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  146. return UNICODE_BOGUS_CHAR_VALUE;
  147. *_str += 6; /* skip to next possible start of codepoint. */
  148. return UNICODE_BOGUS_CHAR_VALUE;
  149. } /* else if */
  150. return UNICODE_BOGUS_CHAR_VALUE;
  151. } /* utf8codepoint */
  152. void PHYSFS_utf8ToUcs4(const char *src, PHYSFS_uint32 *dst, PHYSFS_uint64 len)
  153. {
  154. len -= sizeof (PHYSFS_uint32); /* save room for null char. */
  155. while (len >= sizeof (PHYSFS_uint32))
  156. {
  157. PHYSFS_uint32 cp = utf8codepoint(&src);
  158. if (cp == 0)
  159. break;
  160. else if (cp == UNICODE_BOGUS_CHAR_VALUE)
  161. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  162. *(dst++) = cp;
  163. len -= sizeof (PHYSFS_uint32);
  164. } /* while */
  165. *dst = 0;
  166. } /* PHYSFS_utf8ToUcs4 */
  167. void PHYSFS_utf8ToUcs2(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len)
  168. {
  169. len -= sizeof (PHYSFS_uint16); /* save room for null char. */
  170. while (len >= sizeof (PHYSFS_uint16))
  171. {
  172. PHYSFS_uint32 cp = utf8codepoint(&src);
  173. if (cp == 0)
  174. break;
  175. else if (cp == UNICODE_BOGUS_CHAR_VALUE)
  176. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  177. if (cp > 0xFFFF) /* UTF-16 surrogates (bogus chars in UCS-2) */
  178. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  179. *(dst++) = cp;
  180. len -= sizeof (PHYSFS_uint16);
  181. } /* while */
  182. *dst = 0;
  183. } /* PHYSFS_utf8ToUcs2 */
  184. void PHYSFS_utf8ToUtf16(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len)
  185. {
  186. len -= sizeof (PHYSFS_uint16); /* save room for null char. */
  187. while (len >= sizeof (PHYSFS_uint16))
  188. {
  189. PHYSFS_uint32 cp = utf8codepoint(&src);
  190. if (cp == 0)
  191. break;
  192. else if (cp == UNICODE_BOGUS_CHAR_VALUE)
  193. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  194. if (cp > 0xFFFF) /* encode as surrogate pair */
  195. {
  196. if (len < (sizeof (PHYSFS_uint16) * 2))
  197. break; /* not enough room for the pair, stop now. */
  198. cp -= 0x10000; /* Make this a 20-bit value */
  199. *(dst++) = 0xD800 + ((cp >> 10) & 0x3FF);
  200. len -= sizeof (PHYSFS_uint16);
  201. cp = 0xDC00 + (cp & 0x3FF);
  202. } /* if */
  203. *(dst++) = cp;
  204. len -= sizeof (PHYSFS_uint16);
  205. } /* while */
  206. *dst = 0;
  207. } /* PHYSFS_utf8ToUtf16 */
  208. static void utf8fromcodepoint(PHYSFS_uint32 cp, char **_dst, PHYSFS_uint64 *_len)
  209. {
  210. char *dst = *_dst;
  211. PHYSFS_uint64 len = *_len;
  212. if (len == 0)
  213. return;
  214. if (cp > 0x10FFFF)
  215. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  216. else if ((cp == 0xFFFE) || (cp == 0xFFFF)) /* illegal values. */
  217. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  218. else
  219. {
  220. /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */
  221. switch (cp)
  222. {
  223. case 0xD800:
  224. case 0xDB7F:
  225. case 0xDB80:
  226. case 0xDBFF:
  227. case 0xDC00:
  228. case 0xDF80:
  229. case 0xDFFF:
  230. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  231. } /* switch */
  232. } /* else */
  233. /* Do the encoding... */
  234. if (cp < 0x80)
  235. {
  236. *(dst++) = (char) cp;
  237. len--;
  238. } /* if */
  239. else if (cp < 0x800)
  240. {
  241. if (len < 2)
  242. len = 0;
  243. else
  244. {
  245. *(dst++) = (char) ((cp >> 6) | 128 | 64);
  246. *(dst++) = (char) (cp & 0x3F) | 128;
  247. len -= 2;
  248. } /* else */
  249. } /* else if */
  250. else if (cp < 0x10000)
  251. {
  252. if (len < 3)
  253. len = 0;
  254. else
  255. {
  256. *(dst++) = (char) ((cp >> 12) | 128 | 64 | 32);
  257. *(dst++) = (char) ((cp >> 6) & 0x3F) | 128;
  258. *(dst++) = (char) (cp & 0x3F) | 128;
  259. len -= 3;
  260. } /* else */
  261. } /* else if */
  262. else
  263. {
  264. if (len < 4)
  265. len = 0;
  266. else
  267. {
  268. *(dst++) = (char) ((cp >> 18) | 128 | 64 | 32 | 16);
  269. *(dst++) = (char) ((cp >> 12) & 0x3F) | 128;
  270. *(dst++) = (char) ((cp >> 6) & 0x3F) | 128;
  271. *(dst++) = (char) (cp & 0x3F) | 128;
  272. len -= 4;
  273. } /* else if */
  274. } /* else */
  275. *_dst = dst;
  276. *_len = len;
  277. } /* utf8fromcodepoint */
  278. #define UTF8FROMTYPE(typ, src, dst, len) \
  279. if (len == 0) return; \
  280. len--; \
  281. while (len) \
  282. { \
  283. const PHYSFS_uint32 cp = (PHYSFS_uint32) ((typ) (*(src++))); \
  284. if (cp == 0) break; \
  285. utf8fromcodepoint(cp, &dst, &len); \
  286. } \
  287. *dst = '\0'; \
  288. void PHYSFS_utf8FromUcs4(const PHYSFS_uint32 *src, char *dst, PHYSFS_uint64 len)
  289. {
  290. UTF8FROMTYPE(PHYSFS_uint32, src, dst, len);
  291. } /* PHYSFS_utf8FromUcs4 */
  292. void PHYSFS_utf8FromUcs2(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len)
  293. {
  294. UTF8FROMTYPE(PHYSFS_uint64, src, dst, len);
  295. } /* PHYSFS_utf8FromUcs2 */
  296. /* latin1 maps to unicode codepoints directly, we just utf-8 encode it. */
  297. void PHYSFS_utf8FromLatin1(const char *src, char *dst, PHYSFS_uint64 len)
  298. {
  299. UTF8FROMTYPE(PHYSFS_uint8, src, dst, len);
  300. } /* PHYSFS_utf8FromLatin1 */
  301. #undef UTF8FROMTYPE
  302. void PHYSFS_utf8FromUtf16(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len)
  303. {
  304. if (len == 0)
  305. return;
  306. len--;
  307. while (len)
  308. {
  309. PHYSFS_uint32 cp = (PHYSFS_uint32) *(src++);
  310. if (cp == 0)
  311. break;
  312. /* Orphaned second half of surrogate pair? */
  313. if ((cp >= 0xDC00) && (cp <= 0xDFFF))
  314. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  315. else if ((cp >= 0xD800) && (cp <= 0xDBFF)) /* start surrogate pair! */
  316. {
  317. const PHYSFS_uint32 pair = (PHYSFS_uint32) *src;
  318. if ((pair < 0xDC00) || (pair > 0xDFFF))
  319. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  320. else
  321. {
  322. src++; /* eat the other surrogate. */
  323. cp = (((cp - 0xD800) << 10) | (pair - 0xDC00));
  324. } /* else */
  325. } /* else if */
  326. utf8fromcodepoint(cp, &dst, &len);
  327. } /* while */
  328. *dst = '\0';
  329. } /* PHYSFS_utf8FromUtf16 */
  330. typedef struct CaseFoldMapping
  331. {
  332. PHYSFS_uint32 from;
  333. PHYSFS_uint32 to0;
  334. PHYSFS_uint32 to1;
  335. PHYSFS_uint32 to2;
  336. } CaseFoldMapping;
  337. typedef struct CaseFoldHashBucket
  338. {
  339. const PHYSFS_uint8 count;
  340. const CaseFoldMapping *list;
  341. } CaseFoldHashBucket;
  342. #include "physfs_casefolding.h"
  343. static void locate_case_fold_mapping(const PHYSFS_uint32 from,
  344. PHYSFS_uint32 *to)
  345. {
  346. PHYSFS_uint32 i;
  347. const PHYSFS_uint8 hashed = ((from ^ (from >> 8)) & 0xFF);
  348. const CaseFoldHashBucket *bucket = &case_fold_hash[hashed];
  349. const CaseFoldMapping *mapping = bucket->list;
  350. for (i = 0; i < bucket->count; i++, mapping++)
  351. {
  352. if (mapping->from == from)
  353. {
  354. to[0] = mapping->to0;
  355. to[1] = mapping->to1;
  356. to[2] = mapping->to2;
  357. return;
  358. } /* if */
  359. } /* for */
  360. /* Not found...there's no remapping for this codepoint. */
  361. to[0] = from;
  362. to[1] = 0;
  363. to[2] = 0;
  364. } /* locate_case_fold_mapping */
  365. static int utf8codepointcmp(const PHYSFS_uint32 cp1, const PHYSFS_uint32 cp2)
  366. {
  367. PHYSFS_uint32 folded1[3], folded2[3];
  368. locate_case_fold_mapping(cp1, folded1);
  369. locate_case_fold_mapping(cp2, folded2);
  370. return ( (folded1[0] == folded2[0]) &&
  371. (folded1[1] == folded2[1]) &&
  372. (folded1[2] == folded2[2]) );
  373. } /* utf8codepointcmp */
  374. int __PHYSFS_utf8strcasecmp(const char *str1, const char *str2)
  375. {
  376. while (1)
  377. {
  378. const PHYSFS_uint32 cp1 = utf8codepoint(&str1);
  379. const PHYSFS_uint32 cp2 = utf8codepoint(&str2);
  380. if (!utf8codepointcmp(cp1, cp2)) break;
  381. if (cp1 == 0) return 1;
  382. } /* while */
  383. return 0;
  384. } /* __PHYSFS_utf8strcasecmp */
  385. int __PHYSFS_utf8strnicmp(const char *str1, const char *str2, PHYSFS_uint32 n)
  386. {
  387. while (n > 0)
  388. {
  389. const PHYSFS_uint32 cp1 = utf8codepoint(&str1);
  390. const PHYSFS_uint32 cp2 = utf8codepoint(&str2);
  391. if (!utf8codepointcmp(cp1, cp2)) return 0;
  392. if (cp1 == 0) return 1;
  393. n--;
  394. } /* while */
  395. return 1; /* matched to n chars. */
  396. } /* __PHYSFS_utf8strnicmp */
  397. int __PHYSFS_stricmpASCII(const char *str1, const char *str2)
  398. {
  399. while (1)
  400. {
  401. const char ch1 = *(str1++);
  402. const char ch2 = *(str2++);
  403. const char cp1 = ((ch1 >= 'A') && (ch1 <= 'Z')) ? (ch1+32) : ch1;
  404. const char cp2 = ((ch2 >= 'A') && (ch2 <= 'Z')) ? (ch2+32) : ch2;
  405. if (cp1 < cp2)
  406. return -1;
  407. else if (cp1 > cp2)
  408. return 1;
  409. else if (cp1 == 0) /* they're both null chars? */
  410. break;
  411. } /* while */
  412. return 0;
  413. } /* __PHYSFS_stricmpASCII */
  414. int __PHYSFS_strnicmpASCII(const char *str1, const char *str2, PHYSFS_uint32 n)
  415. {
  416. while (n-- > 0)
  417. {
  418. const char ch1 = *(str1++);
  419. const char ch2 = *(str2++);
  420. const char cp1 = ((ch1 >= 'A') && (ch1 <= 'Z')) ? (ch1+32) : ch1;
  421. const char cp2 = ((ch2 >= 'A') && (ch2 <= 'Z')) ? (ch2+32) : ch2;
  422. if (cp1 < cp2)
  423. return -1;
  424. else if (cp1 > cp2)
  425. return 1;
  426. else if (cp1 == 0) /* they're both null chars? */
  427. return 0;
  428. } /* while */
  429. return 0;
  430. } /* __PHYSFS_strnicmpASCII */
  431. /* end of physfs_unicode.c ... */