2025-09-09 13:50 UTC+0200 Przemyslaw Czerpak (druzus/at/poczta.onet.pl)

* include/hbdefs.h
    + added new types HB_WCHAR16 and HB_WCHAR32; the existing type HB_WCHAR
      has the same definition as HB_WCHAR16 (unchanged from before)

  * include/hbapicdp.h
  * src/harbour.def
  * src/rtl/cdpapi.c
    + added new C functions for encoding and decoding UTF-8 strings which
      use HB_WCHAR32:
         int hb_cdpU32CharToUTF8( char * szUTF8, HB_WCHAR32 wc );
         HB_BOOL hb_cdpUTF8GetU32( const char * pSrc, HB_SIZE nLen,
                                   HB_SIZE * pnIndex, HB_WCHAR32 * pWC );
         HB_BOOL hb_cdpUTF8GetUCS( const char * pSrc, HB_SIZE nLen,
                                   HB_SIZE * pnIndex, HB_WCHAR32 * pWC );
         HB_BOOL hb_cdpUTF8GetU16( const char * pSrc, HB_SIZE nLen,
                                   HB_SIZE * pnIndex, HB_WCHAR16 * pWC );
         HB_BOOL hb_cdpUTF8Validate( const char * pSrc, HB_SIZE nLen );
      They support the full UCS range and are much stricter about errors and
      invalid UTF-8 encoding, e.g. overlong encodings are now rejected.
      Invalid characters are translated to 0xFFFD and later, if that
      character does not exist in the final CP, to the ASCII '?' character.
    * declarations of the following UTF-8 C functions have been changed to
      operate on HB_WCHAR32 instead of HB_WCHAR:
         int hb_cdpUTF8CharSize( HB_WCHAR32 wc );
         HB_WCHAR32 hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen,
                                          HB_SIZE nPos );
    * the following C functions have been changed to internally operate on
      HB_WCHAR32 instead of HB_WCHAR:
         hb_cdpUTF8StringLength()
         hb_cdpUTF8StringAt()
         hb_cdpUTF8StringSubstr()
    * the following C functions have been changed to use the new
      hb_cdpUTF8GetU*() functions instead of step-by-step decoding with
      hb_cdpUTF8ToU16NextChar():
         hb_cdpStrToUTF8Disp()
         hb_cdpUTF8AsStrLen()
         hb_cdpUTF8ToStr()
         hb_cdpStrToU16()
         hb_cdpUtf8Char()
    * use HB_CDP_ERROR_* macros to mark wrong encoding

  * src/rtl/cdpapihb.c
    * the following UTF-8 C functions have been changed to operate on
      HB_WCHAR32 instead of HB_WCHAR:
         hb_utf8Chr()
         hb_utf8Asc()
         hb_utf8Poke()
         hb_utf8Peek()
      Other UTF-8 PRG functions have been adapted to HB_WCHAR32 through
      changes in the corresponding C functions.

  * src/codepage/cp_utf8.c
    * use new function hb_cdpUTF8GetU16() to decode UTF-8 strings in UTF8EX CP

  * src/rtl/arc4.c
    + added new macro HB_NO_SYSCTL which allows disabling sysctl() in Linux
      builds for GLIBC < 2.30
This commit is contained in:
Przemysław Czerpak
2025-09-09 13:50:42 +02:00
parent 315887a395
commit 75ff90a49d
8 changed files with 398 additions and 227 deletions

View File

@@ -401,6 +401,19 @@ extern HB_EXPORT void hb_vmSetCDP( PHB_CODEPAGE pCDP );
*/
#define HB_MAX_CHAR_LEN 8
/* UCS maximal character value */
#define HB_CDP_UNICODE_MAX 0x10FFFF
/* UTF-16 surrogates for mapping U+010000 to U+10FFFF characters */
#define HB_CDP_SURROGATE_FIRST 0xD800
#define HB_CDP_SURROGATE_LAST 0xDFFF
#define HB_CDP_SURROGATE_HIGH 0xD800
#define HB_CDP_SURROGATE_LOW 0xDC00
/* character codes to replace sequences with wrong encoding or translation */
#define HB_CDP_ERROR_UNICHAR 0xFFFD /* <?> */
#define HB_CDP_ERROR_ASCCHAR 0x3F /* ? */
/* codepage uses simple binary sorting */
#define HB_CDP_ISBINSORT( cdp ) ( ( ( cdp )->type & HB_CDP_TYPE_BINSORT ) != 0 )
/* codepage uses custom string decoding */
@@ -473,7 +486,7 @@ extern HB_EXPORT HB_BOOL hb_cdpGetFromUTF8( PHB_CODEPAGE cdp, HB_UCHAR ch,
extern HB_EXPORT HB_SIZE hb_cdpUTF8StringLength( const char * pSrc, HB_SIZE nLen );
extern HB_EXPORT HB_SIZE hb_cdpUTF8StringAt( const char * szNeedle, HB_SIZE nLenN, const char * szHaystack, HB_SIZE nLenH, HB_SIZE nStart, HB_SIZE nEnd, HB_BOOL fReverse );
extern HB_EXPORT HB_WCHAR hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos );
extern HB_EXPORT HB_WCHAR32 hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos );
extern HB_EXPORT char * hb_cdpUTF8StringSubstr( const char * pSrc, HB_SIZE nLen, HB_SIZE nFrom, HB_SIZE nCount, HB_SIZE * pnDest );
extern HB_EXPORT HB_SIZE hb_cdpUTF8AsStrLen( PHB_CODEPAGE cdp, const char * pSrc, HB_SIZE nSrc, HB_SIZE nMax );
@@ -491,10 +504,14 @@ extern HB_EXPORT HB_WCHAR * hb_cdpnStrDupU16( PHB_CODEPAGE cdp, int iEndian, c
extern HB_EXPORT HB_WCHAR hb_cdpGetU16Ctrl( HB_WCHAR wc );
extern HB_EXPORT int hb_cdpUTF8CharSize( HB_WCHAR wc );
extern HB_EXPORT int hb_cdpUTF8CharSize( HB_WCHAR32 wc );
extern HB_EXPORT int hb_cdpU32CharToUTF8( char * szUTF8, HB_WCHAR32 wc );
extern HB_EXPORT int hb_cdpU16CharToUTF8( char * szUTF8, HB_WCHAR wc );
extern HB_EXPORT HB_BOOL hb_cdpUTF8ToU16NextChar( HB_UCHAR ucChar, int * n, HB_WCHAR * pwc );
extern HB_EXPORT HB_BOOL hb_cdpUTF8GetU32( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR32 * pWC );
extern HB_EXPORT HB_BOOL hb_cdpUTF8GetUCS( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR32 * pWC );
extern HB_EXPORT HB_BOOL hb_cdpUTF8GetU16( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR16 * pWC );
extern HB_EXPORT HB_BOOL hb_cdpUTF8Validate( const char * pSrc, HB_SIZE nLen );
extern HB_EXPORT PHB_ITEM hb_itemDeserializeCP( const char ** pBufferPtr, HB_SIZE * pnSize, PHB_CODEPAGE cdpIn, PHB_CODEPAGE cdpOut );
extern HB_EXPORT char * hb_itemSerializeCP( PHB_ITEM pItem, int iFlags, PHB_CODEPAGE cdpIn, PHB_CODEPAGE cdpOut, HB_SIZE * pnSize );

View File

@@ -639,10 +639,18 @@ typedef HB_U32 HB_FATTR;
# endif
#endif
#if defined( HB_OS_WIN )
#if defined( HB_OS_WIN ) || defined( HB_OS_DOS ) || defined( HB_OS_OS2 )
typedef wchar_t HB_WCHAR;
typedef wchar_t HB_WCHAR16;
typedef HB_I32 HB_WCHAR32;
#elif defined( __WATCOMC__ )
typedef unsigned short HB_WCHAR;
typedef unsigned short HB_WCHAR16;
typedef HB_I32 HB_WCHAR32;
#else
typedef unsigned short HB_WCHAR;
typedef unsigned short HB_WCHAR16;
typedef wchar_t HB_WCHAR32;
#endif
/* maximum length of double number in decimal representation: