2025-09-09 13:50 UTC+0200 Przemyslaw Czerpak (druzus/at/poczta.onet.pl)

* include/hbdefs.h + added new types HB_WCHAR16 and HB_WCHAR32, existing type HB_WCHAR is mapped to HB_WCHAR16 (just like before) * include/hbapicdp.h * src/harbour.def * src/rtl/cdpapi.c + added new C functions for encoding and decoding UTF-8 strings using HB_WCHAR32: int hb_cdpU32CharToUTF8( char * szUTF8, HB_WCHAR32 wc ); HB_BOOL hb_cdpUTF8GetU32( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR32 * pWC ); HB_BOOL hb_cdpUTF8GetUCS( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR32 * pWC ); HB_BOOL hb_cdpUTF8GetU16( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR16 * pWC ); HB_BOOL hb_cdpUTF8Validate( const char * pSrc, HB_SIZE nLen ); They support full UCS and are much stricter about errors and invalid UTF-8 encoding, e.g. overlong encodings are now forbidden. Invalid characters are translated to 0xFFFD and later, if that character does not exist in the final CP, to the ASCII '?' character. * declarations of the following UTF-8 C functions have been changed to operate on HB_WCHAR32 instead of HB_WCHAR: int hb_cdpUTF8CharSize( HB_WCHAR32 wc ); HB_WCHAR32 hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos ); * the following C functions have been changed to internally operate on HB_WCHAR32 instead of HB_WCHAR: hb_cdpUTF8StringLength() hb_cdpUTF8StringAt() hb_cdpUTF8StringSubstr() * the following C functions have been changed to use the new hb_cdpUTF8GetU*() functions instead of step-by-step decoding with hb_cdpUTF8ToU16NextChar(): hb_cdpStrToUTF8Disp() hb_cdpUTF8AsStrLen() hb_cdpUTF8ToStr() hb_cdpStrToU16() hb_cdpUtf8Char() * use HB_CDP_ERROR_* macros to mark wrong encoding * src/rtl/cdpapihb.c * the following UTF-8 PRG functions have been changed to operate on HB_WCHAR32 instead of HB_WCHAR: hb_utf8Chr() hb_utf8Asc() hb_utf8Poke() hb_utf8Peek() Other UTF-8 PRG functions have been adapted to HB_WCHAR32 by changes in the corresponding C functions.
* src/codepage/cp_utf8.c * use new function hb_cdpUTF8GetU16() to decode UTF-8 strings in UTF8EX CP * src/rtl/arc4.c + added new macro HB_NO_SYSCTL which allows disabling sysctl() in Linux builds for GLIBC < 2.30
2025-09-09 13:50:42 +02:00
parent 315887a395
commit 75ff90a49d
8 changed files with 398 additions and 227 deletions
--- a/include/hbapicdp.h
+++ b/include/hbapicdp.h
@@ -401,6 +401,19 @@ extern HB_EXPORT void         hb_vmSetCDP( PHB_CODEPAGE pCDP );
 */
 #define HB_MAX_CHAR_LEN             8

+/* UCS maximal character value */
+#define HB_CDP_UNICODE_MAX          0x10FFFF
+
+/* UTF-16 surrogates for mapping U+010000 to U+10FFFF characters */
+#define HB_CDP_SURROGATE_FIRST      0xD800
+#define HB_CDP_SURROGATE_LAST       0xDFFF
+#define HB_CDP_SURROGATE_HIGH       0xD800
+#define HB_CDP_SURROGATE_LOW        0xDC00
+
+/* character codes to replace sequences with wrong encoding or translation */
+#define HB_CDP_ERROR_UNICHAR        0xFFFD      /* <?> */
+#define HB_CDP_ERROR_ASCCHAR        0x3F        /* ? */
+
 /* codepage uses simple binary sorting */
 #define HB_CDP_ISBINSORT( cdp )     ( ( ( cdp )->type & HB_CDP_TYPE_BINSORT ) != 0 )
 /* codepage uses custom string decoding */
@@ -473,7 +486,7 @@ extern HB_EXPORT HB_BOOL      hb_cdpGetFromUTF8( PHB_CODEPAGE cdp, HB_UCHAR ch,

 extern HB_EXPORT HB_SIZE      hb_cdpUTF8StringLength( const char * pSrc, HB_SIZE nLen );
 extern HB_EXPORT HB_SIZE      hb_cdpUTF8StringAt( const char * szNeedle, HB_SIZE nLenN, const char * szHaystack, HB_SIZE nLenH, HB_SIZE nStart, HB_SIZE nEnd, HB_BOOL fReverse );
-extern HB_EXPORT HB_WCHAR     hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos );
+extern HB_EXPORT HB_WCHAR32   hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos );
 extern HB_EXPORT char *       hb_cdpUTF8StringSubstr( const char * pSrc, HB_SIZE nLen, HB_SIZE nFrom, HB_SIZE nCount, HB_SIZE * pnDest );

 extern HB_EXPORT HB_SIZE      hb_cdpUTF8AsStrLen( PHB_CODEPAGE cdp, const char * pSrc, HB_SIZE nSrc, HB_SIZE nMax );
@@ -491,10 +504,14 @@ extern HB_EXPORT HB_WCHAR *   hb_cdpnStrDupU16( PHB_CODEPAGE cdp, int iEndian, c

 extern HB_EXPORT HB_WCHAR     hb_cdpGetU16Ctrl( HB_WCHAR wc );

-extern HB_EXPORT int          hb_cdpUTF8CharSize( HB_WCHAR wc );
+extern HB_EXPORT int          hb_cdpUTF8CharSize( HB_WCHAR32 wc );
+extern HB_EXPORT int          hb_cdpU32CharToUTF8( char * szUTF8, HB_WCHAR32 wc );
 extern HB_EXPORT int          hb_cdpU16CharToUTF8( char * szUTF8, HB_WCHAR wc );
 extern HB_EXPORT HB_BOOL      hb_cdpUTF8ToU16NextChar( HB_UCHAR ucChar, int * n, HB_WCHAR * pwc );
-
+extern HB_EXPORT HB_BOOL      hb_cdpUTF8GetU32( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR32 * pWC );
+extern HB_EXPORT HB_BOOL      hb_cdpUTF8GetUCS( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR32 * pWC );
+extern HB_EXPORT HB_BOOL      hb_cdpUTF8GetU16( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR16 * pWC );
+extern HB_EXPORT HB_BOOL      hb_cdpUTF8Validate( const char * pSrc, HB_SIZE nLen );

 extern HB_EXPORT PHB_ITEM     hb_itemDeserializeCP( const char ** pBufferPtr, HB_SIZE * pnSize, PHB_CODEPAGE cdpIn, PHB_CODEPAGE cdpOut );
 extern HB_EXPORT char *       hb_itemSerializeCP( PHB_ITEM pItem, int iFlags, PHB_CODEPAGE cdpIn, PHB_CODEPAGE cdpOut, HB_SIZE * pnSize );
--- a/include/hbdefs.h
+++ b/include/hbdefs.h
@@ -639,10 +639,18 @@ typedef HB_U32 HB_FATTR;
 #  endif
 #endif

-#if defined( HB_OS_WIN )
+#if defined( HB_OS_WIN ) || defined( HB_OS_DOS ) || defined( HB_OS_OS2 )
   typedef wchar_t         HB_WCHAR;
+   typedef wchar_t         HB_WCHAR16;
+   typedef HB_I32          HB_WCHAR32;
+#elif defined( __WATCOMC__ )
+   typedef unsigned short  HB_WCHAR;
+   typedef unsigned short  HB_WCHAR16;
+   typedef HB_I32          HB_WCHAR32;
 #else
   typedef unsigned short  HB_WCHAR;
+   typedef unsigned short  HB_WCHAR16;
+   typedef wchar_t         HB_WCHAR32;
 #endif

 /* maximum length of double number in decimal representation: