15 роки тому · c88d2c3746
--- a/src/physfs.h
+++ b/src/physfs.h
@@ -165,24 +165,27 @@
 
				  *  This means that if all you care about is English (ASCII characters <= 127)
			
 
				  *  then you just use regular C strings. If you care about Unicode (and you
			
 
				  *  should!) then you need to figure out what your platform wants, needs, and
			
 
				- *  offers. If you are on Windows and build with Unicode support, your TCHAR
			
 
				- *  strings are two bytes per character (this is called "UCS-2 encoding"). You
			
 
				+ *  offers. If you are on Windows before Win2000 and build with Unicode
			
 
				+ *  support, your TCHAR strings are two bytes per character (this is called
			
 
				+ *  "UCS-2 encoding"). Any modern Windows uses UTF-16, which is two bytes
			
 
				+ *  per character for most characters, but some characters are four. You
			
 
				  *  should convert them to UTF-8 before handing them to PhysicsFS with
			
 
				- *  PHYSFS_utf8FromUcs2(). If you're using Unix or Mac OS X, your wchar_t
			
 
				- *  strings are four bytes per character ("UCS-4 encoding"). Use
			
 
				- *  PHYSFS_utf8FromUcs4(). Mac OS X can give you UTF-8 directly from a
			
 
				- *  CFString, and many Unixes generally give you C strings in UTF-8 format
			
 
				- *  everywhere. If you have a single-byte high ASCII charset, like so-many
			
 
				- *  European "codepages" you may be out of luck. We'll convert from "Latin1"
			
 
				- *  to UTF-8 only, and never back to Latin1. If you're above ASCII 127, all
			
 
				- *  bets are off: move to Unicode or use your platform's facilities. Passing a
			
 
				- *  C string with high-ASCII data that isn't UTF-8 encoded will NOT do what
			
 
				- *  you expect!
			
 
				- *
			
 
				- * Naturally, there's also PHYSFS_utf8ToUcs2() and PHYSFS_utf8ToUcs4() to get
			
 
				- *  data back into a format you like. Behind the scenes, PhysicsFS will use
			
 
				- *  Unicode where possible: the UTF-8 strings on Windows will be converted
			
 
				- *  and used with the multibyte Windows APIs, for example.
			
 
				+ *  PHYSFS_utf8FromUcs2() or PHYSFS_utf8FromUtf16(). If you're using Unix or
			
 
				+ *  Mac OS X, your wchar_t strings are four bytes per character ("UCS-4
			
 
				+ *  encoding"). Use PHYSFS_utf8FromUcs4(). Mac OS X can give you UTF-8
			
 
				+ *  directly from a CFString or NSString, and many Unixes generally give you C
			
 
				+ *  strings in UTF-8 format everywhere. If you have a single-byte high ASCII
			
 
				+ *  charset, like so-many European "codepages" you may be out of luck. We'll
			
 
				+ *  convert from "Latin1" to UTF-8 only, and never back to Latin1. If you're
			
 
				+ *  above ASCII 127, all bets are off: move to Unicode or use your platform's
			
 
				+ *  facilities. Passing a C string with high-ASCII data that isn't UTF-8
			
 
				+ *  encoded will NOT do what you expect!
			
 
				+ *
			
 
				+ * Naturally, there's also PHYSFS_utf8ToUcs2(), PHYSFS_utf8ToUtf16(), and
			
 
				+ *  PHYSFS_utf8ToUcs4() to get data back into a format you like. Behind the
			
 
				+ *  scenes, PhysicsFS will use Unicode where possible: the UTF-8 strings on
			
 
				+ *  Windows will be converted and used with the multibyte Windows APIs, for
			
 
				+ *  example.
			
 
				  *
			
 
				  * PhysicsFS offers basic encoding conversion support, but not a whole string
			
 
				  *  library. Get your stuff into whatever format you can work with.
			
@@ -2355,8 +2358,12 @@ PHYSFS_DECL void PHYSFS_utf8ToUcs4(const char *src, PHYSFS_uint32 *dst,
 
				  * \fn void PHYSFS_utf8FromUcs2(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len)
			
 
				  * \brief Convert a UCS-2 string to a UTF-8 string.
			
 
				  *
			
 
				+ * \warning you almost certainly should use PHYSFS_utf8FromUtf16(), which
			
 
				+ *  became available in PhysicsFS 2.1, unless you know what you're doing.
			
 
				+ *
			
 
				  * UCS-2 strings are 16-bits per character: \c TCHAR on Windows, when building
			
 
				- *  with Unicode support.
			
 
				+ *  with Unicode support. Please note that modern versions of Windows use
			
 
				+ *  UTF-16, not UCS-2. You almost certainly want PHYSFS_utf8FromUtf16() instead.
			
 
				  *
			
 
				  * To ensure that the destination buffer is large enough for the conversion,
			
 
				  *  please allocate a buffer that is double the size of the source buffer.
			
@@ -2367,12 +2374,11 @@ PHYSFS_DECL void PHYSFS_utf8ToUcs4(const char *src, PHYSFS_uint32 *dst,
 
				  *  will always be null-terminated and never have an incomplete UTF-8
			
 
				  *  sequence at the end. If the buffer length is 0, this function does nothing.
			
 
				  *
			
 
				- * Please note that UCS-2 is not UTF-16; we do not support the "surrogate"
			
 
				- *  values at this time.
			
 
				- *
			
 
				  *   \param src Null-terminated source string in UCS-2 format.
			
 
				  *   \param dst Buffer to store converted UTF-8 string.
			
 
				  *   \param len Size, in bytes, of destination buffer.
			
 
				+ *
			
 
				+ * \sa PHYSFS_utf8FromUtf16
			
 
				  */
			
 
				 PHYSFS_DECL void PHYSFS_utf8FromUcs2(const PHYSFS_uint16 *src, char *dst,
			
 
				                                      PHYSFS_uint64 len);
			
@@ -2381,8 +2387,13 @@ PHYSFS_DECL void PHYSFS_utf8FromUcs2(const PHYSFS_uint16 *src, char *dst,
 
				  * \fn PHYSFS_utf8ToUcs2(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len)
			
 
				  * \brief Convert a UTF-8 string to a UCS-2 string.
			
 
				  *
			
 
				+ * \warning you almost certainly should use PHYSFS_utf8ToUtf16(), which
			
 
				+ *  became available in PhysicsFS 2.1, unless you know what you're doing.
			
 
				+ *
			
 
				  * UCS-2 strings are 16-bits per character: \c TCHAR on Windows, when building
			
 
				- *  with Unicode support.
			
 
				+ *  with Unicode support. Please note that modern versions of Windows use
			
 
				+ *  UTF-16, not UCS-2. You almost certainly want PHYSFS_utf8ToUtf16() instead,
			
 
				+ *  but you need to understand how that changes things, too.
			
 
				  *
			
 
				  * To ensure that the destination buffer is large enough for the conversion,
			
 
				  *  please allocate a buffer that is double the size of the source buffer.
			
@@ -2393,12 +2404,11 @@ PHYSFS_DECL void PHYSFS_utf8FromUcs2(const PHYSFS_uint16 *src, char *dst,
 
				  *  will always be null-terminated and never have an incomplete UCS-2
			
 
				  *  sequence at the end. If the buffer length is 0, this function does nothing.
			
 
				  *
			
 
				- * Please note that UCS-2 is not UTF-16; we do not support the "surrogate"
			
 
				- *  values at this time.
			
 
				- *
			
 
				  *   \param src Null-terminated source string in UTF-8 format.
			
 
				  *   \param dst Buffer to store converted UCS-2 string.
			
 
				  *   \param len Size, in bytes, of destination buffer.
			
 
				+ *
			
 
				+ * \sa PHYSFS_utf8ToUtf16
			
 
				  */
			
 
				 PHYSFS_DECL void PHYSFS_utf8ToUcs2(const char *src, PHYSFS_uint16 *dst,
			
 
				                                    PHYSFS_uint64 len);
			
@@ -2524,6 +2534,66 @@ typedef struct PHYSFS_Stat
 
				  */
			
 
				 PHYSFS_DECL int PHYSFS_stat(const char *fname, PHYSFS_Stat *stat);
			
 
				 
			
 
				+/**
			
 
				+ * \fn void PHYSFS_utf8FromUtf16(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len)
			
 
				+ * \brief Convert a UTF-16 string to a UTF-8 string.
			
 
				+ *
			
 
				+ * UTF-16 strings are 16-bits per character (except some chars, which are
			
 
				+ *  32-bits): \c TCHAR on Windows, when building with Unicode support. Modern
			
 
				+ *  Windows releases use UTF-16. Windows releases before 2000 used TCHAR, but
			
 
				+ *  only handled UCS-2. UTF-16 _is_ UCS-2, except for the characters that
			
 
				+ *  are 4 bytes, which aren't representable in UCS-2 at all anyhow. If you
			
 
				+ *  aren't sure, you should be using UTF-16 at this point on Windows.
			
 
				+ *
			
 
				+ * To ensure that the destination buffer is large enough for the conversion,
			
 
				+ *  please allocate a buffer that is double the size of the source buffer.
			
 
				+ *  UTF-8 never uses more than 32-bits per character, so while it may shrink
			
 
				+ *  a UTF-16 string, it may also expand it.
			
 
				+ *
			
 
				+ * Strings that don't fit in the destination buffer will be truncated, but
			
 
				+ *  will always be null-terminated and never have an incomplete UTF-8
			
 
				+ *  sequence at the end. If the buffer length is 0, this function does nothing.
			
 
				+ *
			
 
				+ *   \param src Null-terminated source string in UTF-16 format.
			
 
				+ *   \param dst Buffer to store converted UTF-8 string.
			
 
				+ *   \param len Size, in bytes, of destination buffer.
			
 
				+ */
			
 
				+PHYSFS_DECL void PHYSFS_utf8FromUtf16(const PHYSFS_uint16 *src, char *dst,
			
 
				+                                      PHYSFS_uint64 len);
			
 
				+
			
 
				+/**
			
 
				+ * \fn void PHYSFS_utf8ToUtf16(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len)
			
 
				+ * \brief Convert a UTF-8 string to a UTF-16 string.
			
 
				+ *
			
 
				+ * UTF-16 strings are 16-bits per character (except some chars, which are
			
 
				+ *  32-bits): \c TCHAR on Windows, when building with Unicode support. Modern
			
 
				+ *  Windows releases use UTF-16. Windows releases before 2000 used TCHAR, but
			
 
				+ *  only handled UCS-2. UTF-16 _is_ UCS-2, except for the characters that
			
 
				+ *  are 4 bytes, which aren't representable in UCS-2 at all anyhow. If you
			
 
				+ *  aren't sure, you should be using UTF-16 at this point on Windows.
			
 
				+ *
			
 
				+ * To ensure that the destination buffer is large enough for the conversion,
			
 
				+ *  please allocate a buffer that is double the size of the source buffer.
			
 
				+ *  UTF-8 uses from one to four bytes per character, but UTF-16 always uses
			
 
				+ *  two to four, so an entirely low-ASCII string will double in size! The
			
 
				+ *  UTF-16 characters that would take four bytes also take four bytes in UTF-8,
			
 
				+ *  so you don't need to allocate 4x the space just in case: double will do.
			
 
				+ *
			
 
				+ * Strings that don't fit in the destination buffer will be truncated, but
			
 
				+ *  will always be null-terminated and never have an incomplete UTF-16
			
 
				+ *  surrogate pair at the end. If the buffer length is 0, this function does
			
 
				+ *  nothing.
			
 
				+ *
			
 
				+ *   \param src Null-terminated source string in UTF-8 format.
			
 
				+ *   \param dst Buffer to store converted UTF-16 string.
			
 
				+ *   \param len Size, in bytes, of destination buffer.
			
 
				+ *
			
 
				+ * \sa PHYSFS_utf8FromUtf16
			
 
				+ */
			
 
				+PHYSFS_DECL void PHYSFS_utf8ToUtf16(const char *src, PHYSFS_uint16 *dst,
			
 
				+                                    PHYSFS_uint64 len);
			
 
				+
			
 
				+
			
 
				 /* Everything above this line is part of the PhysicsFS 2.1 API. */
			
 
				 
			
 
				 
			
--- a/src/physfs_unicode.c
+++ b/src/physfs_unicode.c
@@ -215,8 +215,7 @@ void PHYSFS_utf8ToUcs2(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len)
 
				         else if (cp == UNICODE_BOGUS_CHAR_VALUE)
			
 
				             cp = UNICODE_BOGUS_CHAR_CODEPOINT;
			
 
				 
			
 
				-        /* !!! BLUESKY: UTF-16 surrogates? */
			
 
				-        if (cp > 0xFFFF)
			
 
				+        if (cp > 0xFFFF)  /* UTF-16 surrogates (bogus chars in UCS-2) */
			
 
				             cp = UNICODE_BOGUS_CHAR_CODEPOINT;
			
 
				 
			
 
				         *(dst++) = cp;
			
@@ -226,6 +225,38 @@ void PHYSFS_utf8ToUcs2(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len)
 
				     *dst = 0;
			
 
				 } /* PHYSFS_utf8ToUcs2 */
			
 
				 
			
 
				+
			
 
				+/*
				+ * Convert the null-terminated UTF-8 string at src into UTF-16 code units
				+ *  stored at dst. len is the size of the dst buffer in BYTES, not in
				+ *  16-bit units.
				+ *
				+ * Codepoints above 0xFFFF are written as a surrogate pair (two 16-bit
				+ *  units). If only one unit of space remains, conversion stops instead of
				+ *  writing half a pair. Anything utf8codepoint() reports as
				+ *  UNICODE_BOGUS_CHAR_VALUE is replaced with UNICODE_BOGUS_CHAR_CODEPOINT.
				+ *
				+ * The output is always null-terminated, unless the buffer cannot hold even
				+ *  a single 16-bit unit, in which case nothing is written at all.
				+ */
				+void PHYSFS_utf8ToUtf16(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len)
				+{
				+    /*
				+     * len is unsigned: without this check, a buffer smaller than one
				+     *  16-bit unit would make the subtraction below wrap around to a huge
				+     *  value and the loop would write far past the end of dst.
				+     */
				+    if (len < sizeof (PHYSFS_uint16))
				+        return;
				+
				+    len -= sizeof (PHYSFS_uint16);   /* save room for null char. */
				+    while (len >= sizeof (PHYSFS_uint16))
				+    {
				+        PHYSFS_uint32 cp = utf8codepoint(&src);
				+        if (cp == 0)
				+            break;
				+        else if (cp == UNICODE_BOGUS_CHAR_VALUE)
				+            cp = UNICODE_BOGUS_CHAR_CODEPOINT;
				+
				+        if (cp > 0xFFFF)  /* encode as surrogate pair */
				+        {
				+            if (len < (sizeof (PHYSFS_uint16) * 2))
				+                break;  /* not enough room for the pair, stop now. */
				+
				+            cp -= 0x10000;  /* Make this a 20-bit value */
				+
				+            /* high (lead) surrogate carries the top 10 bits. */
				+            *(dst++) = (PHYSFS_uint16) (0xD800 + ((cp >> 10) & 0x3FF));
				+            len -= sizeof (PHYSFS_uint16);
				+
				+            /* low (trail) surrogate carries the bottom 10 bits. */
				+            cp = 0xDC00 + (cp & 0x3FF);
				+        } /* if */
				+
				+        *(dst++) = (PHYSFS_uint16) cp;
				+        len -= sizeof (PHYSFS_uint16);
				+    } /* while */
				+
				+    *dst = 0;
				+} /* PHYSFS_utf8ToUtf16 */
			
 
				+
			
 
				 static void utf8fromcodepoint(PHYSFS_uint32 cp, char **_dst, PHYSFS_uint64 *_len)
			
 
				 {
			
 
				     char *dst = *_dst;
			
@@ -334,6 +365,40 @@ void PHYSFS_utf8FromLatin1(const char *src, char *dst, PHYSFS_uint64 len)
 
				 #undef UTF8FROMTYPE
			
 
				 
			
 
				 
			
 
				+/*
				+ * Convert the null-terminated UTF-16 string at src into UTF-8 stored at
				+ *  dst. len is the size of the dst buffer in bytes; if it is 0, nothing is
				+ *  written. Otherwise one byte is reserved for the null terminator.
				+ *
				+ * A valid surrogate pair is combined into a single codepoint in the
				+ *  0x10000..0x10FFFF range. A lone low surrogate, or a high surrogate that
				+ *  is not followed by a low surrogate, becomes
				+ *  UNICODE_BOGUS_CHAR_CODEPOINT.
				+ */
				+void PHYSFS_utf8FromUtf16(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len)
				+{
				+    if (len == 0)
				+        return;
				+
				+    len--;  /* save room for null char. */
				+    while (len)
				+    {
				+        PHYSFS_uint32 cp = (PHYSFS_uint32) *(src++);
				+        if (cp == 0)
				+            break;
				+
				+        /* Orphaned second half of surrogate pair? */
				+        if ((cp >= 0xDC00) && (cp <= 0xDFFF))
				+            cp = UNICODE_BOGUS_CHAR_CODEPOINT;
				+        else if ((cp >= 0xD800) && (cp <= 0xDBFF))  /* start surrogate pair! */
				+        {
				+            /* peek only: if this is the terminator or a non-surrogate, */
				+            /*  it must be left in place for the next loop iteration.   */
				+            const PHYSFS_uint32 pair = (PHYSFS_uint32) *src;
				+            if ((pair < 0xDC00) || (pair > 0xDFFF))
				+                cp = UNICODE_BOGUS_CHAR_CODEPOINT;
				+            else
				+            {
				+                src++;  /* eat the other surrogate. */
				+
				+                /*
				+                 * The two surrogates hold a 20-bit value that is the
				+                 *  codepoint minus 0x10000 (the encoder subtracts it), so
				+                 *  it has to be added back here.
				+                 */
				+                cp = 0x10000 + (((cp - 0xD800) << 10) | (pair - 0xDC00));
				+            } /* else */
				+        } /* else if */
				+
				+        /* NOTE(review): relies on utf8fromcodepoint() advancing dst and */
				+        /*  reducing len (zeroing it when out of room) - confirm there.  */
				+        utf8fromcodepoint(cp, &dst, &len);
				+    } /* while */
				+
				+    *dst = '\0';
				+} /* PHYSFS_utf8FromUtf16 */
			
 
				+
			
 
				+
			
 
				 typedef struct CaseFoldMapping
			
 
				 {
			
 
				     PHYSFS_uint32 from;