physfs_unicode.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460
  1. #include "physfs.h"
  2. #define __PHYSICSFS_INTERNAL__
  3. #include "physfs_internal.h"
  4. /*
  5. * From rfc3629, the UTF-8 spec:
  6. * http://www.ietf.org/rfc/rfc3629.txt
  7. *
  8. * Char. number range | UTF-8 octet sequence
  9. * (hexadecimal) | (binary)
  10. * --------------------+---------------------------------------------
  11. * 0000 0000-0000 007F | 0xxxxxxx
  12. * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
  13. * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
  14. * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  15. */
  16. /*
  17. * This may not be the best value, but it's one that isn't represented
  18. * in Unicode (0x10FFFF is the largest codepoint value). We return this
  19. * value from utf8codepoint() if there's bogus bits in the
  20. * stream. utf8codepoint() will turn this value into something
  21. * reasonable (like a question mark), for text that wants to try to recover,
  22. * whereas utf8valid() will use the value to determine if a string has bad
  23. * bits.
  24. */
  25. #define UNICODE_BOGUS_CHAR_VALUE 0xFFFFFFFF
  26. /*
  27. * This is the codepoint we currently return when there was bogus bits in a
  28. * UTF-8 string. May not fly in Asian locales?
  29. */
  30. #define UNICODE_BOGUS_CHAR_CODEPOINT '?'
  31. static PHYSFS_uint32 utf8codepoint(const char **_str)
  32. {
  33. const char *str = *_str;
  34. PHYSFS_uint32 retval = 0;
  35. PHYSFS_uint32 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *str);
  36. PHYSFS_uint32 octet2, octet3, octet4;
  37. if (octet == 0) /* null terminator, end of string. */
  38. return 0;
  39. else if (octet < 128) /* one octet char: 0 to 127 */
  40. {
  41. (*_str)++; /* skip to next possible start of codepoint. */
  42. return(octet);
  43. } /* else if */
  44. else if ((octet > 127) && (octet < 192)) /* bad (starts with 10xxxxxx). */
  45. {
  46. /*
  47. * Apparently each of these is supposed to be flagged as a bogus
  48. * char, instead of just resyncing to the next valid codepoint.
  49. */
  50. (*_str)++; /* skip to next possible start of codepoint. */
  51. return UNICODE_BOGUS_CHAR_VALUE;
  52. } /* else if */
  53. else if (octet < 224) /* two octets */
  54. {
  55. octet -= (128+64);
  56. octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  57. if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  58. return UNICODE_BOGUS_CHAR_VALUE;
  59. *_str += 2; /* skip to next possible start of codepoint. */
  60. retval = ((octet << 6) | (octet2 - 128));
  61. if ((retval >= 0x80) && (retval <= 0x7FF))
  62. return retval;
  63. } /* else if */
  64. else if (octet < 240) /* three octets */
  65. {
  66. octet -= (128+64+32);
  67. octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  68. if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  69. return UNICODE_BOGUS_CHAR_VALUE;
  70. octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  71. if ((octet3 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  72. return UNICODE_BOGUS_CHAR_VALUE;
  73. *_str += 3; /* skip to next possible start of codepoint. */
  74. retval = ( ((octet << 12)) | ((octet2-128) << 6) | ((octet3-128)) );
  75. /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */
  76. switch (retval)
  77. {
  78. case 0xD800:
  79. case 0xDB7F:
  80. case 0xDB80:
  81. case 0xDBFF:
  82. case 0xDC00:
  83. case 0xDF80:
  84. case 0xDFFF:
  85. return UNICODE_BOGUS_CHAR_VALUE;
  86. } /* switch */
  87. /* 0xFFFE and 0xFFFF are illegal, too, so we check them at the edge. */
  88. if ((retval >= 0x800) && (retval <= 0xFFFD))
  89. return retval;
  90. } /* else if */
  91. else if (octet < 248) /* four octets */
  92. {
  93. octet -= (128+64+32+16);
  94. octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  95. if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  96. return UNICODE_BOGUS_CHAR_VALUE;
  97. octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  98. if ((octet3 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  99. return UNICODE_BOGUS_CHAR_VALUE;
  100. octet4 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  101. if ((octet4 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  102. return UNICODE_BOGUS_CHAR_VALUE;
  103. *_str += 4; /* skip to next possible start of codepoint. */
  104. retval = ( ((octet << 18)) | ((octet2 - 128) << 12) |
  105. ((octet3 - 128) << 6) | ((octet4 - 128)) );
  106. if ((retval >= 0x10000) && (retval <= 0x10FFFF))
  107. return retval;
  108. } /* else if */
  109. /*
  110. * Five and six octet sequences became illegal in rfc3629.
  111. * We throw the codepoint away, but parse them to make sure we move
  112. * ahead the right number of bytes and don't overflow the buffer.
  113. */
  114. else if (octet < 252) /* five octets */
  115. {
  116. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  117. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  118. return UNICODE_BOGUS_CHAR_VALUE;
  119. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  120. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  121. return UNICODE_BOGUS_CHAR_VALUE;
  122. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  123. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  124. return UNICODE_BOGUS_CHAR_VALUE;
  125. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  126. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  127. return UNICODE_BOGUS_CHAR_VALUE;
  128. *_str += 5; /* skip to next possible start of codepoint. */
  129. return UNICODE_BOGUS_CHAR_VALUE;
  130. } /* else if */
  131. else /* six octets */
  132. {
  133. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  134. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  135. return UNICODE_BOGUS_CHAR_VALUE;
  136. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  137. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  138. return UNICODE_BOGUS_CHAR_VALUE;
  139. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  140. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  141. return UNICODE_BOGUS_CHAR_VALUE;
  142. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  143. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  144. return UNICODE_BOGUS_CHAR_VALUE;
  145. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  146. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  147. return UNICODE_BOGUS_CHAR_VALUE;
  148. *_str += 6; /* skip to next possible start of codepoint. */
  149. return UNICODE_BOGUS_CHAR_VALUE;
  150. } /* else if */
  151. return UNICODE_BOGUS_CHAR_VALUE;
  152. } /* utf8codepoint */
  153. void PHYSFS_utf8ToUcs4(const char *src, PHYSFS_uint32 *dst, PHYSFS_uint64 len)
  154. {
  155. len -= sizeof (PHYSFS_uint32); /* save room for null char. */
  156. while (len >= sizeof (PHYSFS_uint32))
  157. {
  158. PHYSFS_uint32 cp = utf8codepoint(&src);
  159. if (cp == 0)
  160. break;
  161. else if (cp == UNICODE_BOGUS_CHAR_VALUE)
  162. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  163. *(dst++) = cp;
  164. len -= sizeof (PHYSFS_uint32);
  165. } /* while */
  166. *dst = 0;
  167. } /* PHYSFS_utf8ToUcs4 */
  168. void PHYSFS_utf8ToUcs2(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len)
  169. {
  170. len -= sizeof (PHYSFS_uint16); /* save room for null char. */
  171. while (len >= sizeof (PHYSFS_uint16))
  172. {
  173. PHYSFS_uint32 cp = utf8codepoint(&src);
  174. if (cp == 0)
  175. break;
  176. else if (cp == UNICODE_BOGUS_CHAR_VALUE)
  177. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  178. /* !!! BLUESKY: UTF-16 surrogates? */
  179. if (cp > 0xFFFF)
  180. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  181. *(dst++) = cp;
  182. len -= sizeof (PHYSFS_uint16);
  183. } /* while */
  184. *dst = 0;
  185. } /* PHYSFS_utf8ToUcs2 */
  186. static void utf8fromcodepoint(PHYSFS_uint32 cp, char **_dst, PHYSFS_uint64 *_len)
  187. {
  188. char *dst = *_dst;
  189. PHYSFS_uint64 len = *_len;
  190. if (len == 0)
  191. return;
  192. if (cp > 0x10FFFF)
  193. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  194. else if ((cp == 0xFFFE) || (cp == 0xFFFF)) /* illegal values. */
  195. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  196. else
  197. {
  198. /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */
  199. switch (cp)
  200. {
  201. case 0xD800:
  202. case 0xDB7F:
  203. case 0xDB80:
  204. case 0xDBFF:
  205. case 0xDC00:
  206. case 0xDF80:
  207. case 0xDFFF:
  208. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  209. } /* switch */
  210. } /* else */
  211. /* Do the encoding... */
  212. if (cp < 0x80)
  213. {
  214. *(dst++) = (char) cp;
  215. len--;
  216. } /* if */
  217. else if (cp < 0x800)
  218. {
  219. if (len < 2)
  220. len = 0;
  221. else
  222. {
  223. *(dst++) = (char) ((cp >> 6) | 128 | 64);
  224. *(dst++) = (char) (cp & 0x3F) | 128;
  225. len -= 2;
  226. } /* else */
  227. } /* else if */
  228. else if (cp < 0x10000)
  229. {
  230. if (len < 3)
  231. len = 0;
  232. else
  233. {
  234. *(dst++) = (char) ((cp >> 12) | 128 | 64 | 32);
  235. *(dst++) = (char) ((cp >> 6) & 0x3F) | 128;
  236. *(dst++) = (char) (cp & 0x3F) | 128;
  237. len -= 3;
  238. } /* else */
  239. } /* else if */
  240. else
  241. {
  242. if (len < 4)
  243. len = 0;
  244. else
  245. {
  246. *(dst++) = (char) ((cp >> 18) | 128 | 64 | 32 | 16);
  247. *(dst++) = (char) ((cp >> 12) & 0x3F) | 128;
  248. *(dst++) = (char) ((cp >> 6) & 0x3F) | 128;
  249. *(dst++) = (char) (cp & 0x3F) | 128;
  250. len -= 4;
  251. } /* else if */
  252. } /* else */
  253. *_dst = dst;
  254. *_len = len;
  255. } /* utf8fromcodepoint */
  256. #define UTF8FROMTYPE(typ, src, dst, len) \
  257. if (len == 0) return; \
  258. len--; \
  259. while (len) \
  260. { \
  261. const PHYSFS_uint32 cp = (PHYSFS_uint32) ((typ) (*(src++))); \
  262. if (cp == 0) break; \
  263. utf8fromcodepoint(cp, &dst, &len); \
  264. } \
  265. *dst = '\0'; \
  266. void PHYSFS_utf8FromUcs4(const PHYSFS_uint32 *src, char *dst, PHYSFS_uint64 len)
  267. {
  268. UTF8FROMTYPE(PHYSFS_uint32, src, dst, len);
  269. } /* PHYSFS_utf8FromUcs4 */
  270. void PHYSFS_utf8FromUcs2(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len)
  271. {
  272. UTF8FROMTYPE(PHYSFS_uint64, src, dst, len);
  273. } /* PHYSFS_utf8FromUcs4 */
  274. /* latin1 maps to unicode codepoints directly, we just utf-8 encode it. */
  275. void PHYSFS_utf8FromLatin1(const char *src, char *dst, PHYSFS_uint64 len)
  276. {
  277. UTF8FROMTYPE(PHYSFS_uint8, src, dst, len);
  278. } /* PHYSFS_utf8FromLatin1 */
  279. #undef UTF8FROMTYPE
  280. typedef struct CaseFoldMapping
  281. {
  282. PHYSFS_uint32 from;
  283. PHYSFS_uint32 to0;
  284. PHYSFS_uint32 to1;
  285. PHYSFS_uint32 to2;
  286. } CaseFoldMapping;
  287. typedef struct CaseFoldHashBucket
  288. {
  289. const PHYSFS_uint8 count;
  290. const CaseFoldMapping *list;
  291. } CaseFoldHashBucket;
  292. #include "physfs_casefolding.h"
  293. static void locate_case_fold_mapping(const PHYSFS_uint32 from,
  294. PHYSFS_uint32 *to)
  295. {
  296. PHYSFS_uint32 i;
  297. const PHYSFS_uint8 hashed = ((from ^ (from >> 8)) & 0xFF);
  298. const CaseFoldHashBucket *bucket = &case_fold_hash[hashed];
  299. const CaseFoldMapping *mapping = bucket->list;
  300. for (i = 0; i < bucket->count; i++, mapping++)
  301. {
  302. if (mapping->from == from)
  303. {
  304. to[0] = mapping->to0;
  305. to[1] = mapping->to1;
  306. to[2] = mapping->to2;
  307. return;
  308. } /* if */
  309. } /* for */
  310. /* Not found...there's no remapping for this codepoint. */
  311. to[0] = from;
  312. to[1] = 0;
  313. to[2] = 0;
  314. } /* locate_case_fold_mapping */
  315. static int utf8codepointcmp(const PHYSFS_uint32 cp1, const PHYSFS_uint32 cp2)
  316. {
  317. PHYSFS_uint32 folded1[3], folded2[3];
  318. locate_case_fold_mapping(cp1, folded1);
  319. locate_case_fold_mapping(cp2, folded2);
  320. return ( (folded1[0] == folded2[0]) &&
  321. (folded1[1] == folded2[1]) &&
  322. (folded1[2] == folded2[2]) );
  323. } /* utf8codepointcmp */
  324. int __PHYSFS_utf8strcasecmp(const char *str1, const char *str2)
  325. {
  326. while (1)
  327. {
  328. const PHYSFS_uint32 cp1 = utf8codepoint(&str1);
  329. const PHYSFS_uint32 cp2 = utf8codepoint(&str2);
  330. if (!utf8codepointcmp(cp1, cp2)) return 0;
  331. if (cp1 == 0) return 1;
  332. } /* while */
  333. return 0; /* shouldn't hit this. */
  334. } /* __PHYSFS_utf8strcasecmp */
  335. int __PHYSFS_utf8strnicmp(const char *str1, const char *str2, PHYSFS_uint32 n)
  336. {
  337. while (n > 0)
  338. {
  339. const PHYSFS_uint32 cp1 = utf8codepoint(&str1);
  340. const PHYSFS_uint32 cp2 = utf8codepoint(&str2);
  341. if (!utf8codepointcmp(cp1, cp2)) return 0;
  342. if (cp1 == 0) return 1;
  343. n--;
  344. } /* while */
  345. return 1; /* matched to n chars. */
  346. } /* __PHYSFS_utf8strnicmp */
  347. int __PHYSFS_stricmpASCII(const char *str1, const char *str2)
  348. {
  349. while (1)
  350. {
  351. const char ch1 = *(str1++);
  352. const char ch2 = *(str2++);
  353. const char cp1 = ((ch1 >= 'A') && (ch1 <= 'Z')) ? (ch1+32) : ch1;
  354. const char cp2 = ((ch2 >= 'A') && (ch2 <= 'Z')) ? (ch2+32) : ch2;
  355. if (cp1 < cp2)
  356. return -1;
  357. else if (cp1 > cp2)
  358. return 1;
  359. else if (cp1 == 0) /* they're both null chars? */
  360. return 0;
  361. } /* while */
  362. return 0; /* shouldn't hit this. */
  363. } /* __PHYSFS_stricmpASCII */
  364. int __PHYSFS_strnicmpASCII(const char *str1, const char *str2, PHYSFS_uint32 n)
  365. {
  366. while (n-- > 0)
  367. {
  368. const char ch1 = *(str1++);
  369. const char ch2 = *(str2++);
  370. const char cp1 = ((ch1 >= 'A') && (ch1 <= 'Z')) ? (ch1+32) : ch1;
  371. const char cp2 = ((ch2 >= 'A') && (ch2 <= 'Z')) ? (ch2+32) : ch2;
  372. if (cp1 < cp2)
  373. return -1;
  374. else if (cp1 > cp2)
  375. return 1;
  376. else if (cp1 == 0) /* they're both null chars? */
  377. return 0;
  378. } /* while */
  379. return 0;
  380. } /* __PHYSFS_stricmpASCII */
  381. /* end of physfs_unicode.c ... */