physfs_unicode.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334
  1. #include "physfs.h"
  2. #define __PHYSICSFS_INTERNAL__
  3. #include "physfs_internal.h"
  4. /*
  5. * From rfc3629, the UTF-8 spec:
  6. * http://www.ietf.org/rfc/rfc3629.txt
  7. *
  8. * Char. number range | UTF-8 octet sequence
  9. * (hexadecimal) | (binary)
  10. * --------------------+---------------------------------------------
  11. * 0000 0000-0000 007F | 0xxxxxxx
  12. * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
  13. * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
  14. * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  15. */
  /*
   * This may not be the best value, but it's one that isn't represented
   *  in Unicode (0x10FFFF is the largest codepoint value). utf8codepoint()
   *  returns this value if there are bogus bits in the stream. The
   *  conversion routines that call utf8codepoint() turn this value into
   *  something reasonable (like a question mark), for text that wants to
   *  try to recover, whereas a validator such as utf8valid() can use the
   *  value to determine if a string has bad bits.
   */
  25. #define UNICODE_BOGUS_CHAR_VALUE 0xFFFFFFFF
  /*
   * This is the codepoint we currently substitute when there were bogus
   *  bits in a UTF-8 string. May not fly in Asian locales?
   */
  30. #define UNICODE_BOGUS_CHAR_CODEPOINT '?'
  31. static PHYSFS_uint32 utf8codepoint(const char **_str)
  32. {
  33. const char *str = *_str;
  34. PHYSFS_uint32 retval = 0;
  35. PHYSFS_uint32 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *str);
  36. PHYSFS_uint32 octet2, octet3, octet4;
  37. if (octet == 0) /* null terminator, end of string. */
  38. return 0;
  39. else if (octet < 128) /* one octet char: 0 to 127 */
  40. {
  41. (*_str)++; /* skip to next possible start of codepoint. */
  42. return(octet);
  43. } /* else if */
  44. else if ((octet > 127) && (octet < 192)) /* bad (starts with 10xxxxxx). */
  45. {
  46. /*
  47. * Apparently each of these is supposed to be flagged as a bogus
  48. * char, instead of just resyncing to the next valid codepoint.
  49. */
  50. (*_str)++; /* skip to next possible start of codepoint. */
  51. return UNICODE_BOGUS_CHAR_VALUE;
  52. } /* else if */
  53. else if (octet < 224) /* two octets */
  54. {
  55. octet -= (128+64);
  56. octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  57. if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  58. return UNICODE_BOGUS_CHAR_VALUE;
  59. *_str += 2; /* skip to next possible start of codepoint. */
  60. retval = ((octet << 6) | (octet2 - 128));
  61. if ((retval >= 0x80) && (retval <= 0x7FF))
  62. return retval;
  63. } /* else if */
  64. else if (octet < 240) /* three octets */
  65. {
  66. octet -= (128+64+32);
  67. octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  68. if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  69. return UNICODE_BOGUS_CHAR_VALUE;
  70. octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  71. if ((octet3 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  72. return UNICODE_BOGUS_CHAR_VALUE;
  73. *_str += 3; /* skip to next possible start of codepoint. */
  74. retval = ( ((octet << 12)) | ((octet2-128) << 6) | ((octet3-128)) );
  75. /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */
  76. switch (retval)
  77. {
  78. case 0xD800:
  79. case 0xDB7F:
  80. case 0xDB80:
  81. case 0xDBFF:
  82. case 0xDC00:
  83. case 0xDF80:
  84. case 0xDFFF:
  85. return UNICODE_BOGUS_CHAR_VALUE;
  86. } /* switch */
  87. /* 0xFFFE and 0xFFFF are illegal, too, so we check them at the edge. */
  88. if ((retval >= 0x800) && (retval <= 0xFFFD))
  89. return retval;
  90. } /* else if */
  91. else if (octet < 248) /* four octets */
  92. {
  93. octet -= (128+64+32+16);
  94. octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  95. if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  96. return UNICODE_BOGUS_CHAR_VALUE;
  97. octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  98. if ((octet3 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  99. return UNICODE_BOGUS_CHAR_VALUE;
  100. octet4 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  101. if ((octet4 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  102. return UNICODE_BOGUS_CHAR_VALUE;
  103. *_str += 4; /* skip to next possible start of codepoint. */
  104. retval = ( ((octet << 18)) | ((octet2 - 128) << 12) |
  105. ((octet3 - 128) << 6) | ((octet4 - 128)) );
  106. if ((retval >= 0x10000) && (retval <= 0x10FFFF))
  107. return retval;
  108. } /* else if */
  109. /*
  110. * Five and six octet sequences became illegal in rfc3629.
  111. * We throw the codepoint away, but parse them to make sure we move
  112. * ahead the right number of bytes and don't overflow the buffer.
  113. */
  114. else if (octet < 252) /* five octets */
  115. {
  116. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  117. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  118. return UNICODE_BOGUS_CHAR_VALUE;
  119. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  120. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  121. return UNICODE_BOGUS_CHAR_VALUE;
  122. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  123. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  124. return UNICODE_BOGUS_CHAR_VALUE;
  125. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  126. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  127. return UNICODE_BOGUS_CHAR_VALUE;
  128. *_str += 5; /* skip to next possible start of codepoint. */
  129. return UNICODE_BOGUS_CHAR_VALUE;
  130. } /* else if */
  131. else /* six octets */
  132. {
  133. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  134. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  135. return UNICODE_BOGUS_CHAR_VALUE;
  136. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  137. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  138. return UNICODE_BOGUS_CHAR_VALUE;
  139. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  140. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  141. return UNICODE_BOGUS_CHAR_VALUE;
  142. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  143. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  144. return UNICODE_BOGUS_CHAR_VALUE;
  145. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  146. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  147. return UNICODE_BOGUS_CHAR_VALUE;
  148. *_str += 6; /* skip to next possible start of codepoint. */
  149. return UNICODE_BOGUS_CHAR_VALUE;
  150. } /* else if */
  151. return UNICODE_BOGUS_CHAR_VALUE;
  152. } /* utf8codepoint */
  153. void PHYSFS_utf8toucs4(const char *src, PHYSFS_uint32 *dst, PHYSFS_uint64 len)
  154. {
  155. len -= sizeof (PHYSFS_uint32); /* save room for null char. */
  156. while (len >= sizeof (PHYSFS_uint32))
  157. {
  158. PHYSFS_uint32 cp = utf8codepoint(&src);
  159. if (cp == 0)
  160. break;
  161. else if (cp == UNICODE_BOGUS_CHAR_VALUE)
  162. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  163. *(dst++) = cp;
  164. len -= sizeof (PHYSFS_uint32);
  165. } /* while */
  166. *dst = 0;
  167. } /* PHYSFS_utf8toucs4 */
  168. void PHYSFS_utf8toucs2(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len)
  169. {
  170. len -= sizeof (PHYSFS_uint16); /* save room for null char. */
  171. while (len >= sizeof (PHYSFS_uint16))
  172. {
  173. PHYSFS_uint32 cp = utf8codepoint(&src);
  174. if (cp == 0)
  175. break;
  176. else if (cp == UNICODE_BOGUS_CHAR_VALUE)
  177. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  178. /* !!! BLUESKY: UTF-16 surrogates? */
  179. if (cp > 0xFFFF)
  180. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  181. *(dst++) = cp;
  182. len -= sizeof (PHYSFS_uint16);
  183. } /* while */
  184. *dst = 0;
  185. } /* PHYSFS_utf8toucs2 */
  186. static void utf8fromcodepoint(PHYSFS_uint32 cp, char **_dst, PHYSFS_uint64 *_len)
  187. {
  188. char *dst = *_dst;
  189. PHYSFS_uint64 len = *_len;
  190. if (len == 0)
  191. return;
  192. if (cp > 0x10FFFF)
  193. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  194. else if ((cp == 0xFFFE) || (cp == 0xFFFF)) /* illegal values. */
  195. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  196. else
  197. {
  198. /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */
  199. switch (cp)
  200. {
  201. case 0xD800:
  202. case 0xDB7F:
  203. case 0xDB80:
  204. case 0xDBFF:
  205. case 0xDC00:
  206. case 0xDF80:
  207. case 0xDFFF:
  208. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  209. } /* switch */
  210. } /* else */
  211. /* Do the encoding... */
  212. if (cp < 0x80)
  213. {
  214. *(dst++) = (char) cp;
  215. len--;
  216. } /* if */
  217. else if (cp < 0x800)
  218. {
  219. if (len < 2)
  220. len = 0;
  221. else
  222. {
  223. *(dst++) = (char) ((cp >> 6) | 128 | 64);
  224. *(dst++) = (char) (cp & 0x3F) | 128;
  225. len -= 2;
  226. } /* else */
  227. } /* else if */
  228. else if (cp < 0x10000)
  229. {
  230. if (len < 3)
  231. len = 0;
  232. else
  233. {
  234. *(dst++) = (char) ((cp >> 12) | 128 | 64 | 32);
  235. *(dst++) = (char) ((cp >> 6) & 0x3F) | 128;
  236. *(dst++) = (char) (cp & 0x3F) | 128;
  237. len -= 3;
  238. } /* else */
  239. } /* else if */
  240. else
  241. {
  242. if (len < 4)
  243. len = 0;
  244. else
  245. {
  246. *(dst++) = (char) ((cp >> 18) | 128 | 64 | 32 | 16);
  247. *(dst++) = (char) ((cp >> 12) & 0x3F) | 128;
  248. *(dst++) = (char) ((cp >> 6) & 0x3F) | 128;
  249. *(dst++) = (char) (cp & 0x3F) | 128;
  250. len -= 4;
  251. } /* else if */
  252. } /* else */
  253. *_dst = dst;
  254. *_len = len;
  255. } /* utf8fromcodepoint */
  256. #define UTF8FROMTYPE(typ, src, dst, len) \
  257. len--; \
  258. while (len) \
  259. { \
  260. const PHYSFS_uint32 cp = (PHYSFS_uint32) *(src++); \
  261. if (cp == 0) break; \
  262. utf8fromcodepoint(cp, &dst, &len); \
  263. } \
  264. *dst = '\0'; \
  265. void PHYSFS_utf8fromucs4(const PHYSFS_uint32 *src, char *dst, PHYSFS_uint64 len)
  266. {
  267. UTF8FROMTYPE(PHYSFS_uint32, src, dst, len);
  268. } /* PHYSFS_utf8fromucs4 */
  269. void PHYSFS_utf8fromucs2(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len)
  270. {
  271. UTF8FROMTYPE(PHYSFS_uint64, src, dst, len);
  272. } /* PHYSFS_utf8fromucs4 */
  273. /* latin1 maps to unicode codepoints directly, we just utf-8 encode it. */
  274. void PHYSFS_utf8fromlatin1(const char *src, char *dst, PHYSFS_uint64 len)
  275. {
  276. UTF8FROMTYPE(PHYSFS_uint8, src, dst, len);
  277. } /* PHYSFS_utf8fromlatin1 */
  278. #undef UTF8FROMTYPE
  279. /* end of physfs_unicode.c ... */