physfs_unicode.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338
  1. #if HAVE_CONFIG_H
  2. # include <config.h>
  3. #endif
  4. #include "physfs.h"
  5. #define __PHYSICSFS_INTERNAL__
  6. #include "physfs_internal.h"
  7. /*
  8. * From rfc3629, the UTF-8 spec:
  9. * http://www.ietf.org/rfc/rfc3629.txt
  10. *
  11. * Char. number range | UTF-8 octet sequence
  12. * (hexadecimal) | (binary)
  13. * --------------------+---------------------------------------------
  14. * 0000 0000-0000 007F | 0xxxxxxx
  15. * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
  16. * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
  17. * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  18. */
/*
 * This may not be the best value, but it's one that isn't represented
 * in Unicode (0x10FFFF is the largest codepoint value). We return this
 * value from utf8codepoint() if there are bogus bits in the
 * stream. The PHYSFS_utf8to*() converters turn this value into something
 * reasonable (like a question mark), for text that wants to try to recover,
 * whereas a validator such as utf8valid() can use the value to determine
 * if a string has bad bits.
 */
  28. #define UNICODE_BOGUS_CHAR_VALUE 0xFFFFFFFF
/*
 * This is the codepoint we currently return when there were bogus bits in a
 * UTF-8 string. May not fly in Asian locales?
 */
  33. #define UNICODE_BOGUS_CHAR_CODEPOINT '?'
  34. static PHYSFS_uint32 utf8codepoint(const char **_str)
  35. {
  36. const char *str = *_str;
  37. PHYSFS_uint32 retval = 0;
  38. PHYSFS_uint32 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *str);
  39. PHYSFS_uint32 octet2, octet3, octet4;
  40. if (octet == 0) /* null terminator, end of string. */
  41. return 0;
  42. else if (octet < 128) /* one octet char: 0 to 127 */
  43. {
  44. (*_str)++; /* skip to next possible start of codepoint. */
  45. return(octet);
  46. } /* else if */
  47. else if ((octet > 127) && (octet < 192)) /* bad (starts with 10xxxxxx). */
  48. {
  49. /*
  50. * Apparently each of these is supposed to be flagged as a bogus
  51. * char, instead of just resyncing to the next valid codepoint.
  52. */
  53. (*_str)++; /* skip to next possible start of codepoint. */
  54. return UNICODE_BOGUS_CHAR_VALUE;
  55. } /* else if */
  56. else if (octet < 224) /* two octets */
  57. {
  58. octet -= (128+64);
  59. octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  60. if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  61. return UNICODE_BOGUS_CHAR_VALUE;
  62. *_str += 2; /* skip to next possible start of codepoint. */
  63. retval = ((octet << 6) | (octet2 - 128));
  64. if ((retval >= 0x80) && (retval <= 0x7FF))
  65. return retval;
  66. } /* else if */
  67. else if (octet < 240) /* three octets */
  68. {
  69. octet -= (128+64+32);
  70. octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  71. if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  72. return UNICODE_BOGUS_CHAR_VALUE;
  73. octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  74. if ((octet3 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  75. return UNICODE_BOGUS_CHAR_VALUE;
  76. *_str += 3; /* skip to next possible start of codepoint. */
  77. retval = ( ((octet << 12)) | ((octet2-128) << 6) | ((octet3-128)) );
  78. /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */
  79. switch (retval)
  80. {
  81. case 0xD800:
  82. case 0xDB7F:
  83. case 0xDB80:
  84. case 0xDBFF:
  85. case 0xDC00:
  86. case 0xDF80:
  87. case 0xDFFF:
  88. return UNICODE_BOGUS_CHAR_VALUE;
  89. } /* switch */
  90. /* 0xFFFE and 0xFFFF are illegal, too, so we check them at the edge. */
  91. if ((retval >= 0x800) && (retval <= 0xFFFD))
  92. return retval;
  93. } /* else if */
  94. else if (octet < 248) /* four octets */
  95. {
  96. octet -= (128+64+32+16);
  97. octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  98. if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  99. return UNICODE_BOGUS_CHAR_VALUE;
  100. octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  101. if ((octet3 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  102. return UNICODE_BOGUS_CHAR_VALUE;
  103. octet4 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  104. if ((octet4 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  105. return UNICODE_BOGUS_CHAR_VALUE;
  106. *_str += 4; /* skip to next possible start of codepoint. */
  107. retval = ( ((octet << 18)) | ((octet2 - 128) << 12) |
  108. ((octet3 - 128) << 6) | ((octet4 - 128)) );
  109. if ((retval >= 0x10000) && (retval <= 0x10FFFF))
  110. return retval;
  111. } /* else if */
  112. /*
  113. * Five and six octet sequences became illegal in rfc3629.
  114. * We throw the codepoint away, but parse them to make sure we move
  115. * ahead the right number of bytes and don't overflow the buffer.
  116. */
  117. else if (octet < 252) /* five octets */
  118. {
  119. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  120. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  121. return UNICODE_BOGUS_CHAR_VALUE;
  122. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  123. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  124. return UNICODE_BOGUS_CHAR_VALUE;
  125. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  126. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  127. return UNICODE_BOGUS_CHAR_VALUE;
  128. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  129. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  130. return UNICODE_BOGUS_CHAR_VALUE;
  131. *_str += 5; /* skip to next possible start of codepoint. */
  132. return UNICODE_BOGUS_CHAR_VALUE;
  133. } /* else if */
  134. else /* six octets */
  135. {
  136. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  137. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  138. return UNICODE_BOGUS_CHAR_VALUE;
  139. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  140. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  141. return UNICODE_BOGUS_CHAR_VALUE;
  142. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  143. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  144. return UNICODE_BOGUS_CHAR_VALUE;
  145. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  146. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  147. return UNICODE_BOGUS_CHAR_VALUE;
  148. octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
  149. if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
  150. return UNICODE_BOGUS_CHAR_VALUE;
  151. *_str += 6; /* skip to next possible start of codepoint. */
  152. return UNICODE_BOGUS_CHAR_VALUE;
  153. } /* else if */
  154. return UNICODE_BOGUS_CHAR_VALUE;
  155. } /* utf8codepoint */
  156. void PHYSFS_utf8toucs4(const char *src, PHYSFS_uint32 *dst, PHYSFS_uint64 len)
  157. {
  158. len -= sizeof (PHYSFS_uint32); /* save room for null char. */
  159. while (len >= sizeof (PHYSFS_uint32))
  160. {
  161. PHYSFS_uint32 cp = utf8codepoint(&src);
  162. if (cp == 0)
  163. break;
  164. else if (cp == UNICODE_BOGUS_CHAR_VALUE)
  165. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  166. *(dst++) = cp;
  167. len -= sizeof (PHYSFS_uint32);
  168. } /* while */
  169. *dst = 0;
  170. } /* PHYSFS_utf8toucs4 */
  171. void PHYSFS_utf8toucs2(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len)
  172. {
  173. len -= sizeof (PHYSFS_uint16); /* save room for null char. */
  174. while (len >= sizeof (PHYSFS_uint16))
  175. {
  176. PHYSFS_uint32 cp = utf8codepoint(&src);
  177. if (cp == 0)
  178. break;
  179. else if (cp == UNICODE_BOGUS_CHAR_VALUE)
  180. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  181. /* !!! BLUESKY: UTF-16 surrogates? */
  182. if (cp > 0xFFFF)
  183. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  184. *(dst++) = cp;
  185. len -= sizeof (PHYSFS_uint16);
  186. } /* while */
  187. *dst = 0;
  188. } /* PHYSFS_utf8toucs2 */
  189. static void utf8fromcodepoint(PHYSFS_uint32 cp, char **_dst, PHYSFS_uint64 *_len)
  190. {
  191. char *dst = *_dst;
  192. PHYSFS_uint64 len = *_len;
  193. if (len == 0)
  194. return;
  195. if (cp > 0x10FFFF)
  196. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  197. else if ((cp == 0xFFFE) || (cp == 0xFFFF)) /* illegal values. */
  198. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  199. else
  200. {
  201. /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */
  202. switch (cp)
  203. {
  204. case 0xD800:
  205. case 0xDB7F:
  206. case 0xDB80:
  207. case 0xDBFF:
  208. case 0xDC00:
  209. case 0xDF80:
  210. case 0xDFFF:
  211. cp = UNICODE_BOGUS_CHAR_CODEPOINT;
  212. } /* switch */
  213. } /* else */
  214. /* Do the encoding... */
  215. if (cp < 0x80)
  216. {
  217. *(dst++) = (char) cp;
  218. len--;
  219. } /* if */
  220. else if (cp < 0x800)
  221. {
  222. if (len < 2)
  223. len = 0;
  224. else
  225. {
  226. *(dst++) = (char) ((cp >> 6) | 128 | 64);
  227. *(dst++) = (char) (cp & 0x3F) | 128;
  228. len -= 2;
  229. } /* else */
  230. } /* else if */
  231. else if (cp < 0x10000)
  232. {
  233. if (len < 3)
  234. len = 0;
  235. else
  236. {
  237. *(dst++) = (char) ((cp >> 12) | 128 | 64 | 32);
  238. *(dst++) = (char) ((cp >> 6) & 0x3F) | 128;
  239. *(dst++) = (char) (cp & 0x3F) | 128;
  240. len -= 3;
  241. } /* else */
  242. } /* else if */
  243. else
  244. {
  245. if (len < 4)
  246. len = 0;
  247. else
  248. {
  249. *(dst++) = (char) ((cp >> 18) | 128 | 64 | 32 | 16);
  250. *(dst++) = (char) ((cp >> 12) & 0x3F) | 128;
  251. *(dst++) = (char) ((cp >> 6) & 0x3F) | 128;
  252. *(dst++) = (char) (cp & 0x3F) | 128;
  253. len -= 4;
  254. } /* else if */
  255. } /* else */
  256. *_dst = dst;
  257. *_len = len;
  258. } /* utf8fromcodepoint */
  259. #define UTF8FROMTYPE(typ, src, dst, len) \
  260. len--; \
  261. while (len) \
  262. { \
  263. const PHYSFS_uint32 cp = (PHYSFS_uint32) *(src++); \
  264. if (cp == 0) break; \
  265. utf8fromcodepoint(cp, &dst, &len); \
  266. } \
  267. *dst = '\0'; \
  268. void PHYSFS_utf8fromucs4(const PHYSFS_uint32 *src, char *dst, PHYSFS_uint64 len)
  269. {
  270. UTF8FROMTYPE(PHYSFS_uint32, src, dst, len);
  271. } /* PHYSFS_utf8fromucs4 */
  272. void PHYSFS_utf8fromucs2(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len)
  273. {
  274. UTF8FROMTYPE(PHYSFS_uint64, src, dst, len);
  275. } /* PHYSFS_utf8fromucs4 */
  276. /* latin1 maps to unicode codepoints directly, we just utf-8 encode it. */
  277. void PHYSFS_utf8fromlatin1(const char *src, char *dst, PHYSFS_uint64 len)
  278. {
  279. UTF8FROMTYPE(PHYSFS_uint8, src, dst, len);
  280. } /* PHYSFS_utf8fromlatin1 */
  281. #undef UTF8FROMTYPE
  282. /* end of physfs_unicode.c ... */