From 75ff90a49d84d73a527b9fba10a56b950177eeac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw=20Czerpak?= Date: Tue, 9 Sep 2025 13:50:42 +0200 Subject: [PATCH] 2025-09-09 13:50 UTC+0200 Przemyslaw Czerpak (druzus/at/poczta.onet.pl) * include/hbdefs.h + added new types HB_WCHAR16 and HB_WCHAR32, existing type HB_WCHAR is mapped to HB_WCHAR16 (just like before) * include/hbapicdp.h * src/harbour.def * src/rtl/cdpapi.c + added new C functions for encoding and decoding UTF-8 strings using HB_WCHAR32: int hb_cdpU32CharToUTF8( char * szUTF8, HB_WCHAR32 wc ); HB_BOOL hb_cdpUTF8GetU32( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR32 * pWC ); HB_BOOL hb_cdpUTF8GetUCS( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR32 * pWC ); HB_BOOL hb_cdpUTF8GetU16( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR16 * pWC ); HB_BOOL hb_cdpUTF8Validate( const char * pSrc, HB_SIZE nLen ); They support full UCS and are much more restrictive against errors and wrong UTF-8 encoding, i.e. now overlong encoding is forbidden. Wrongly encoded characters are translated to 0xFFFD and later, if that character does not exist in the final CP, to the ASCII '?' character. 
* declarations of the following UTF-8 C functions have been changed to operate on HB_WCHAR32 instead of HB_WCHAR: int hb_cdpUTF8CharSize( HB_WCHAR32 wc ); HB_WCHAR32 hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos ); * the following C functions have been changed to internally operate on HB_WCHAR32 instead of HB_WCHAR: hb_cdpUTF8StringLength() hb_cdpUTF8StringAt() hb_cdpUTF8StringSubstr() * the following C functions have been changed to use new hb_cdpUTF8GetU*() instead of step by step decoding with hb_cdpUTF8ToU16NextChar() hb_cdpStrToUTF8Disp() hb_cdpUTF8AsStrLen() hb_cdpUTF8ToStr() hb_cdpStrToU16() hb_cdpUtf8Char() * use HB_CDP_ERROR_* macros to mark wrong encoding * src/rtl/cdpapihb.c * the following UTF-8 C functions have been changed to operate on HB_WCHAR32 instead of HB_WCHAR: hb_utf8Chr() hb_utf8Asc() hb_utf8Poke() hb_utf8Peek() Other UTF-8 PRG functions have been adapted to HB_WCHAR32 by changes in corresponding C functions. * src/codepage/cp_utf8.c * use new function hb_cdpUTF8GetU16() to decode UTF-8 strings in UTF8EX CP * src/rtl/arc4.c + added new macro HB_NO_SYSCTL which allows disabling sysctl() in Linux builds for GLIBC < 2.30 --- ChangeLog.txt | 58 ++++++ include/hbapicdp.h | 23 ++- include/hbdefs.h | 10 +- src/codepage/cp_utf8.c | 19 +- src/harbour.def | 5 + src/rtl/arc4.c | 7 +- src/rtl/cdpapi.c | 444 +++++++++++++++++++++++++---------------- src/rtl/cdpapihb.c | 59 ++---- 8 files changed, 398 insertions(+), 227 deletions(-) diff --git a/ChangeLog.txt b/ChangeLog.txt index 8530fa7da7..030b1a6d19 100644 --- a/ChangeLog.txt +++ b/ChangeLog.txt @@ -7,6 +7,64 @@ Entries may not always be in chronological/commit order. See license at the end of file. 
*/ +2025-09-09 13:50 UTC+0200 Przemyslaw Czerpak (druzus/at/poczta.onet.pl) + * include/hbdefs.h + + added new types HB_WCHAR16 and HB_WCHAR32, existing type HB_WCHAR + is mapped to HB_WCHAR16 (just like before) + + * include/hbapicdp.h + * src/harbour.def + * src/rtl/cdpapi.c + + added new C functions for encoding and decoding UTF-8 strings using + HB_WCHAR32: + int hb_cdpU32CharToUTF8( char * szUTF8, HB_WCHAR32 wc ); + HB_BOOL hb_cdpUTF8GetU32( const char * pSrc, HB_SIZE nLen, + HB_SIZE * pnIndex, HB_WCHAR32 * pWC ); + HB_BOOL hb_cdpUTF8GetUCS( const char * pSrc, HB_SIZE nLen, + HB_SIZE * pnIndex, HB_WCHAR32 * pWC ); + HB_BOOL hb_cdpUTF8GetU16( const char * pSrc, HB_SIZE nLen, + HB_SIZE * pnIndex, HB_WCHAR16 * pWC ); + HB_BOOL hb_cdpUTF8Validate( const char * pSrc, HB_SIZE nLen ); + They support full UCS and are much more restrictive against errors and + wrong UTF-8 encoding, i.e. now overlong encoding is forbidden. + Wrongly encoded characters are translated to 0xFFFD and later, if that + character does not exist in the final CP, to the ASCII '?' character. 
+ * declarations of the following UTF-8 C functions have been changed to + operate on HB_WCHAR32 instead of HB_WCHAR: + int hb_cdpUTF8CharSize( HB_WCHAR32 wc ); + HB_WCHAR32 hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, + HB_SIZE nPos ); + * the following C functions have been changed to internally operate on + HB_WCHAR32 instead of HB_WCHAR: + hb_cdpUTF8StringLength() + hb_cdpUTF8StringAt() + hb_cdpUTF8StringSubstr() + * the following C functions have been changed to use new hb_cdpUTF8GetU*() + instead of step by step decoding with hb_cdpUTF8ToU16NextChar() + hb_cdpStrToUTF8Disp() + hb_cdpUTF8AsStrLen() + hb_cdpUTF8ToStr() + hb_cdpStrToU16() + hb_cdpUtf8Char() + * use HB_CDP_ERROR_* macros to mark wrong encoding + + * src/rtl/cdpapihb.c + * the following UTF-8 C functions have been changed to operate on + HB_WCHAR32 instead of HB_WCHAR: + hb_utf8Chr() + hb_utf8Asc() + hb_utf8Poke() + hb_utf8Peek() + Other UTF-8 PRG functions have been adapted to HB_WCHAR32 by changes + in corresponding C functions. 
+ + * src/codepage/cp_utf8.c + * use new function hb_cdpUTF8GetU16() to decode UTF-8 strings in UTF8EX CP + + * src/rtl/arc4.c + + added new macro HB_NO_SYSCTL which allow to disable sysctl() in Linux + builds for GLIBC < 2.30 + 2025-09-03 12:21 UTC+0200 Przemyslaw Czerpak (druzus/at/poczta.onet.pl) * src/rtl/cdpapi.c + added fallback translation table for different variants of Latin diff --git a/include/hbapicdp.h b/include/hbapicdp.h index 273d4f9853..12a65204c6 100644 --- a/include/hbapicdp.h +++ b/include/hbapicdp.h @@ -401,6 +401,19 @@ extern HB_EXPORT void hb_vmSetCDP( PHB_CODEPAGE pCDP ); */ #define HB_MAX_CHAR_LEN 8 +/* UCS maximal character value */ +#define HB_CDP_UNICODE_MAX 0x10FFFF + +/* UTF-16 surrogates for mapping U+010000 to U+10FFFF characters */ +#define HB_CDP_SURROGATE_FIRST 0xD800 +#define HB_CDP_SURROGATE_LAST 0xDFFF +#define HB_CDP_SURROGATE_HIGH 0xD800 +#define HB_CDP_SURROGATE_LOW 0xDC00 + +/* character codes to replace sequences with wrong encoding or translation */ +#define HB_CDP_ERROR_UNICHAR 0xFFFD /* */ +#define HB_CDP_ERROR_ASCCHAR 0x3F /* ? 
*/ + /* codepage uses simple binary sorting */ #define HB_CDP_ISBINSORT( cdp ) ( ( ( cdp )->type & HB_CDP_TYPE_BINSORT ) != 0 ) /* codepage uses custom string decoding */ @@ -473,7 +486,7 @@ extern HB_EXPORT HB_BOOL hb_cdpGetFromUTF8( PHB_CODEPAGE cdp, HB_UCHAR ch, extern HB_EXPORT HB_SIZE hb_cdpUTF8StringLength( const char * pSrc, HB_SIZE nLen ); extern HB_EXPORT HB_SIZE hb_cdpUTF8StringAt( const char * szNeedle, HB_SIZE nLenN, const char * szHaystack, HB_SIZE nLenH, HB_SIZE nStart, HB_SIZE nEnd, HB_BOOL fReverse ); -extern HB_EXPORT HB_WCHAR hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos ); +extern HB_EXPORT HB_WCHAR32 hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos ); extern HB_EXPORT char * hb_cdpUTF8StringSubstr( const char * pSrc, HB_SIZE nLen, HB_SIZE nFrom, HB_SIZE nCount, HB_SIZE * pnDest ); extern HB_EXPORT HB_SIZE hb_cdpUTF8AsStrLen( PHB_CODEPAGE cdp, const char * pSrc, HB_SIZE nSrc, HB_SIZE nMax ); @@ -491,10 +504,14 @@ extern HB_EXPORT HB_WCHAR * hb_cdpnStrDupU16( PHB_CODEPAGE cdp, int iEndian, c extern HB_EXPORT HB_WCHAR hb_cdpGetU16Ctrl( HB_WCHAR wc ); -extern HB_EXPORT int hb_cdpUTF8CharSize( HB_WCHAR wc ); +extern HB_EXPORT int hb_cdpUTF8CharSize( HB_WCHAR32 wc ); +extern HB_EXPORT int hb_cdpU32CharToUTF8( char * szUTF8, HB_WCHAR32 wc ); extern HB_EXPORT int hb_cdpU16CharToUTF8( char * szUTF8, HB_WCHAR wc ); extern HB_EXPORT HB_BOOL hb_cdpUTF8ToU16NextChar( HB_UCHAR ucChar, int * n, HB_WCHAR * pwc ); - +extern HB_EXPORT HB_BOOL hb_cdpUTF8GetU32( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR32 * pWC ); +extern HB_EXPORT HB_BOOL hb_cdpUTF8GetUCS( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR32 * pWC ); +extern HB_EXPORT HB_BOOL hb_cdpUTF8GetU16( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR16 * pWC ); +extern HB_EXPORT HB_BOOL hb_cdpUTF8Validate( const char * pSrc, HB_SIZE nLen ); extern HB_EXPORT PHB_ITEM hb_itemDeserializeCP( const char ** pBufferPtr, HB_SIZE * 
pnSize, PHB_CODEPAGE cdpIn, PHB_CODEPAGE cdpOut ); extern HB_EXPORT char * hb_itemSerializeCP( PHB_ITEM pItem, int iFlags, PHB_CODEPAGE cdpIn, PHB_CODEPAGE cdpOut, HB_SIZE * pnSize ); diff --git a/include/hbdefs.h b/include/hbdefs.h index 8b9fafb516..41fa8a6916 100644 --- a/include/hbdefs.h +++ b/include/hbdefs.h @@ -639,10 +639,18 @@ typedef HB_U32 HB_FATTR; # endif #endif -#if defined( HB_OS_WIN ) +#if defined( HB_OS_WIN ) || defined( HB_OS_DOS ) || defined( HB_OS_OS2 ) typedef wchar_t HB_WCHAR; + typedef wchar_t HB_WCHAR16; + typedef HB_I32 HB_WCHAR32; +#elif defined( __WATCOMC__ ) + typedef unsigned short HB_WCHAR; + typedef unsigned short HB_WCHAR16; + typedef HB_I32 HB_WCHAR32; #else typedef unsigned short HB_WCHAR; + typedef unsigned short HB_WCHAR16; + typedef wchar_t HB_WCHAR32; #endif /* maximum length of double number in decimal representation: diff --git a/src/codepage/cp_utf8.c b/src/codepage/cp_utf8.c index b264dbd978..4990799270 100644 --- a/src/codepage/cp_utf8.c +++ b/src/codepage/cp_utf8.c @@ -57,27 +57,14 @@ static HB_CDP_GET_FUNC( UTF8_get ) { - HB_SIZE nIndex = *pnIndex; - int n = 0; - HB_SYMBOL_UNUSED( cdp ); - *wc = 0; - while( nIndex < nLen ) + if( *pnIndex < nLen ) { - if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nIndex ], &n, wc ) ) - ++nIndex; - if( n == 0 ) - { - *pnIndex = nIndex; - return HB_TRUE; - } - } - if( n != 0 ) - { - *pnIndex = nIndex; + hb_cdpUTF8GetU16( pSrc, nLen, pnIndex, wc ); return HB_TRUE; } + *wc = 0; return HB_FALSE; } diff --git a/src/harbour.def b/src/harbour.def index 519f6501ff..8558ae0201 100644 --- a/src/harbour.def +++ b/src/harbour.def @@ -2157,14 +2157,19 @@ hb_cdpTranslateDispChar hb_cdpU16AsStrLen hb_cdpU16CharToUTF8 hb_cdpU16ToStr +hb_cdpU32CharToUTF8 hb_cdpUTF8AsStrLen hb_cdpUTF8CharSize +hb_cdpUTF8GetU16 +hb_cdpUTF8GetU32 +hb_cdpUTF8GetUCS hb_cdpUTF8StringAt hb_cdpUTF8StringLength hb_cdpUTF8StringPeek hb_cdpUTF8StringSubstr hb_cdpUTF8ToStr hb_cdpUTF8ToU16NextChar +hb_cdpUTF8Validate hb_cdpUpperWC 
hb_cdpcmp hb_cdpicmp diff --git a/src/rtl/arc4.c b/src/rtl/arc4.c index 4c0d179986..cdfee3b744 100644 --- a/src/rtl/arc4.c +++ b/src/rtl/arc4.c @@ -57,9 +57,10 @@ * sysctl() on Linux has fallen into depreciation. Not available in current * runtime C libraries, like musl and glibc >= 2.30. */ -# if ( ! defined( HB_OS_LINUX ) || \ - ( ( defined( __GLIBC__ ) && ! ( ( __GLIBC__ > 2 ) || ( ( __GLIBC__ == 2 ) && ( __GLIBC_MINOR__ >= 30 ) ) ) ) ) || \ - defined( __UCLIBC__ ) ) +# if ! defined( HB_NO_SYSCTL ) && \ + ( ! defined( HB_OS_LINUX ) || \ + ( ( defined( __GLIBC__ ) && ! ( ( __GLIBC__ > 2 ) || ( ( __GLIBC__ == 2 ) && ( __GLIBC_MINOR__ >= 30 ) ) ) ) ) || \ + defined( __UCLIBC__ ) ) # define HAVE_SYS_SYSCTL_H # endif # define HAVE_DECL_CTL_KERN diff --git a/src/rtl/cdpapi.c b/src/rtl/cdpapi.c index 2ae45f959f..3aa5c396f2 100644 --- a/src/rtl/cdpapi.c +++ b/src/rtl/cdpapi.c @@ -355,7 +355,7 @@ static HB_BOOL hb_cdpStd_put( PHB_CODEPAGE cdp, cdp->uniTable->uniTrans[ wc ] ) pDst[ ( *pnIndex )++ ] = cdp->uniTable->uniTrans[ wc ]; else - pDst[ ( *pnIndex )++ ] = wc >= 0x100 ? '?' : ( HB_UCHAR ) wc; + pDst[ ( *pnIndex )++ ] = wc >= 0x100 ? HB_CDP_ERROR_ASCCHAR : ( HB_UCHAR ) wc; return HB_TRUE; } @@ -519,27 +519,14 @@ static HB_BOOL hb_cdpUTF8_get( PHB_CODEPAGE cdp, const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR * wc ) { - HB_SIZE nIndex = *pnIndex; - int n = 0; - HB_SYMBOL_UNUSED( cdp ); - *wc = 0; - while( nIndex < nLen ) + if( *pnIndex < nLen ) { - if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nIndex ], &n, wc ) ) - ++nIndex; - if( n == 0 ) - { - *pnIndex = nIndex; - return HB_TRUE; - } - } - if( n > 0 ) - { - *pnIndex = nIndex; + hb_cdpUTF8GetU16( pSrc, nLen, pnIndex, wc ); return HB_TRUE; } + *wc = 0; return HB_FALSE; } @@ -650,7 +637,7 @@ static HB_BOOL hb_cdpMulti_put( PHB_CODEPAGE cdp, return HB_TRUE; } } - pDst[ ( *pnIndex )++ ] = wc >= 0x100 ? '?' : ( HB_UCHAR ) wc; + pDst[ ( *pnIndex )++ ] = wc >= 0x100 ? 
HB_CDP_ERROR_ASCCHAR : ( HB_UCHAR ) wc; } return HB_TRUE; } @@ -1156,14 +1143,78 @@ int hb_cdpicmp( const char * szFirst, HB_SIZE nLenFirst, /* * UTF-8 conversions */ -int hb_cdpUTF8CharSize( HB_WCHAR wc ) +int hb_cdpUTF8CharSize( HB_WCHAR32 wc ) { + if ( ( HB_I32 ) wc < 0 ) + wc = HB_CDP_ERROR_UNICHAR; + if( wc < 0x0080 ) return 1; else if( wc < 0x0800 ) return 2; - else /* if( wc <= 0xffff ) */ + else if( wc < 0xFFFF ) return 3; + else if( wc < 0x1FFFFF ) + return 4; + else if( wc < 0x3FFFFFF ) + return 5; + else /* if( wc <= 0x7FFFFFFF ) */ + return 6; +} + +int hb_cdpU32CharToUTF8( char * szUTF8, HB_WCHAR32 wc ) +{ + int n; + + if( ( HB_I32 ) wc < 0 ) + wc = HB_CDP_ERROR_UNICHAR; + + if( wc < 0x0080 ) + { + szUTF8[ 0 ] = wc & 0xFF; + n = 1; + } + else if( wc < 0x0800 ) + { + szUTF8[ 0 ] = 0xc0 | ( ( wc >> 6 ) & 0x1F ); + szUTF8[ 1 ] = 0x80 | ( wc & 0x3F ); + n = 2; + } + else if( wc < 0xFFFF ) + { + szUTF8[ 0 ] = 0xE0 | ( ( wc >> 12 ) & 0x0F ); + szUTF8[ 1 ] = 0x80 | ( ( wc >> 6 ) & 0x3F ); + szUTF8[ 2 ] = 0x80 | ( wc & 0x3F ); + n = 3; + } + else if( wc < 0x1FFFFF ) + { + szUTF8[ 0 ] = 0xF0 | ( ( wc >> 18 ) & 0x07 ); + szUTF8[ 1 ] = 0x80 | ( ( wc >> 12 ) & 0x3F ); + szUTF8[ 2 ] = 0x80 | ( ( wc >> 6 ) & 0x3F ); + szUTF8[ 3 ] = 0x80 | ( wc & 0x3F ); + n = 4; + } + else if( wc < 0x3FFFFFF ) + { + szUTF8[ 0 ] = 0xF8 | ( ( wc >> 24 ) & 0x03 ); + szUTF8[ 1 ] = 0x80 | ( ( wc >> 18 ) & 0x3F ); + szUTF8[ 2 ] = 0x80 | ( ( wc >> 12 ) & 0x3F ); + szUTF8[ 3 ] = 0x80 | ( ( wc >> 6 ) & 0x3F ); + szUTF8[ 4 ] = 0x80 | ( wc & 0x3F ); + n = 5; + } + else /* if( wc <= 0x7FFFFFFF ) */ + { + szUTF8[ 0 ] = 0xFC | ( ( wc >> 30 ) & 0x01 ); + szUTF8[ 1 ] = 0x80 | ( ( wc >> 24 ) & 0x3F ); + szUTF8[ 2 ] = 0x80 | ( ( wc >> 18 ) & 0x3F ); + szUTF8[ 3 ] = 0x80 | ( ( wc >> 12 ) & 0x3F ); + szUTF8[ 4 ] = 0x80 | ( ( wc >> 6 ) & 0x3F ); + szUTF8[ 5 ] = 0x80 | ( wc & 0x3F ); + n = 6; + } + return n; } int hb_cdpU16CharToUTF8( char * szUTF8, HB_WCHAR wc ) @@ -1240,27 +1291,153 @@ HB_BOOL 
hb_cdpUTF8ToU16NextChar( HB_UCHAR ucChar, int * n, HB_WCHAR * pwc ) *pwc &= 0x01; *n = 5; } + else + { + *n = 0; + return HB_FALSE; + } + } + return HB_TRUE; +} + +HB_BOOL hb_cdpUTF8GetU32( const char * pSrc, HB_SIZE nLen, + HB_SIZE * pnIndex, HB_WCHAR32 * pWC ) +{ + HB_SIZE nIndex = *pnIndex; + HB_WCHAR32 wc = 0; + int n = -1; + + if( nIndex < nLen ) + { + HB_WCHAR32 wcMin = 0; /* forbid overlong encodings */ + HB_UCHAR uc = ( HB_UCHAR ) pSrc[ nIndex++ ]; + + if( uc < 0x80 ) + { + wc = uc; + n = 0; + } + else if( uc >= 0xc0 ) + { + if( uc < 0xe0 ) + { + wc = uc & 0x1f; + n = 1; + wcMin = 0x80; + } + else if( uc < 0xf0 ) + { + wc = uc & 0x0f; + n = 2; + wcMin = 0x800; + } + else if( uc < 0xf8 ) + { + wc = uc & 0x07; + n = 3; + wcMin = 0x10000; + } + else if( uc < 0xfc ) + { + wc = uc & 0x03; + n = 4; + wcMin = 0x200000; + } + else if( uc < 0xfe ) + { + wc = uc & 0x01; + n = 5; + wcMin = 0x4000000; + } + while( n > 0 && nIndex < nLen ) + { + uc = ( HB_UCHAR ) pSrc[ nIndex ]; + if( ( uc & 0xc0 ) != 0x80 ) + break; + wc = ( wc << 6 ) | ( uc & 0x3f ); + ++nIndex; + --n; + } + } + + if( n != 0 || wc < wcMin ) + { + wc = HB_CDP_ERROR_UNICHAR; + while( n-- > 0 && nIndex < nLen ) + { + uc = ( HB_UCHAR ) pSrc[ nIndex ]; + if( uc < 0x80 || ( uc >= 0xc2 && uc <= 0xf4 ) ) + break; + ++nIndex; + } + n = -1; + } + } + + *pnIndex = nIndex; + *pWC = wc; + + return n == 0; +} + +HB_BOOL hb_cdpUTF8GetUCS( const char * pSrc, HB_SIZE nLen, + HB_SIZE * pnIndex, HB_WCHAR32 * pWC ) +{ + HB_BOOL fResult; + + fResult = hb_cdpUTF8GetU32( pSrc, nLen, pnIndex, pWC ); + if( fResult && ( *pWC > HB_CDP_UNICODE_MAX || + ( *pWC >= HB_CDP_SURROGATE_FIRST && *pWC <= HB_CDP_SURROGATE_LAST ) ) ) + { + *pWC = HB_CDP_ERROR_UNICHAR; + fResult = HB_FALSE; + } + return fResult; +} + +HB_BOOL hb_cdpUTF8GetU16( const char * pSrc, HB_SIZE nLen, + HB_SIZE * pnIndex, HB_WCHAR16 * pWC ) +{ + HB_WCHAR32 wc; + HB_BOOL fResult; + + fResult = hb_cdpUTF8GetU32( pSrc, nLen, pnIndex, &wc ); + + if( fResult && wc > 
0xFFFF ) + { + wc = HB_CDP_ERROR_UNICHAR; + fResult = HB_FALSE; + } + *pWC = ( HB_WCHAR16 ) wc; + + return fResult; +} + +HB_BOOL hb_cdpUTF8Validate( const char * pSrc, HB_SIZE nLen ) +{ + HB_SIZE nIndex = 0; + + while( nIndex < nLen ) + { + HB_WCHAR32 wc; + if( ! hb_cdpUTF8GetUCS( pSrc, nLen, &nIndex, &wc ) ) + return HB_FALSE; } return HB_TRUE; } HB_SIZE hb_cdpUTF8StringLength( const char * pSrc, HB_SIZE nLen ) { - HB_SIZE nPos, nDst; - HB_WCHAR wc; - int n = 0; + HB_SIZE nIndex = 0, nChars = 0; + HB_WCHAR32 wc; - for( nPos = nDst = 0; nPos < nLen; ) + while( nIndex < nLen ) { - if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPos ], &n, &wc ) ) - ++nPos; - if( n == 0 ) - ++nDst; + hb_cdpUTF8GetU32( pSrc, nLen, &nIndex, &wc ); + ++nChars; } - if( n > 0 ) - ++nDst; - return nDst; + return nChars; } HB_SIZE hb_cdpUTF8StringAt( const char * szNeedle, HB_SIZE nLenN, @@ -1274,31 +1451,16 @@ HB_SIZE hb_cdpUTF8StringAt( const char * szNeedle, HB_SIZE nLenN, HB_SIZE nRAt = 0; HB_SIZE nAt = 0; - HB_WCHAR wcN = 0; - HB_WCHAR wcH = 0; - int nN = 0; - int nH = 0; + HB_WCHAR32 wcN = 0; + HB_WCHAR32 wcH = 0; while( nPosH < nLenH && nPosN < nLenN && nPos < nEnd ) { - do - { - if( ! hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szHaystack[ nPosH ], &nH, &wcH ) ) - break; - ++nPosH; - } - while( nH && nPosH < nLenH ); - + hb_cdpUTF8GetU32( szHaystack, nLenH, &nPosH, &wcH ); if( ++nPos < nStart ) continue; - do - { - if( ! 
hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szNeedle[ nPosN ], &nN, &wcN ) ) - break; - ++nPosN; - } - while( nN && nPosN < nLenN ); + hb_cdpUTF8GetU32( szNeedle, nLenN, &nPosN, &wcN ); if( wcH == wcN ) { @@ -1339,36 +1501,17 @@ HB_SIZE hb_cdpUTF8StringAt( const char * szNeedle, HB_SIZE nLenN, return nRAt; } -HB_WCHAR hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos ) +HB_WCHAR32 hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos ) { - if( nLen ) + HB_SIZE nIndex = 0; + + while( nPos && nIndex < nLen ) { - HB_SIZE nPos2; - HB_WCHAR wc = 0; - int n = 0; - - for( nPos2 = 0; nPos2 < nLen && nPos; ) - { - if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPos2 ], &n, &wc ) ) - ++nPos2; - if( n == 0 ) - --nPos; - } - - if( nPos2 < nLen ) - { - n = 0; - do - { - if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPos2 ], &n, &wc ) ) - ++nPos2; - if( n == 0 ) - return wc; - } - while( nPos2 < nLen ); - } + HB_WCHAR wc; + hb_cdpUTF8GetU16( pSrc, nLen, &nIndex, &wc ); + if( --nPos == 0 ) + return wc; } - return 0; } @@ -1377,36 +1520,29 @@ char * hb_cdpUTF8StringSubstr( const char * pSrc, HB_SIZE nLen, HB_SIZE nFrom, HB_SIZE nCount, HB_SIZE * pulDest ) { HB_SIZE nDst = 0; - HB_WCHAR wc; - int n; char * pDst = NULL; if( nCount && nLen ) { - HB_SIZE nPos; - n = 0; - for( nPos = 0; nPos < nLen && nFrom; ) + HB_WCHAR32 wc; + HB_SIZE nPos = 0; + + while( nPos < nLen && nFrom ) { - if( hb_cdpUTF8ToU16NextChar( pSrc[ nPos ], &n, &wc ) ) - ++nPos; - if( n == 0 ) - --nFrom; + hb_cdpUTF8GetU32( pSrc, nLen, &nPos, &wc ); + --nFrom; } if( nPos < nLen ) { - HB_SIZE nCnt; + HB_SIZE nCnt = nCount; + nFrom = nPos; - nCnt = nCount; - n = 0; do { - if( hb_cdpUTF8ToU16NextChar( pSrc[ nPos ], &n, &wc ) ) - ++nPos; - if( n == 0 ) - --nCnt; + hb_cdpUTF8GetU32( pSrc, nLen, &nPos, &wc ); } - while( nPos < nLen && nCnt ); + while( nPos < nLen && --nCnt ); nDst = nPos - nFrom; pDst = ( char * ) hb_xgrab( nDst + 1 ); @@ -1620,9 +1756,8 @@ HB_SIZE hb_cdpStrToUTF8Disp( 
PHB_CODEPAGE cdp, HB_SIZE hb_cdpUTF8AsStrLen( PHB_CODEPAGE cdp, const char * pSrc, HB_SIZE nSrc, HB_SIZE nMax ) { - HB_WCHAR wc = 0; + HB_WCHAR wc; HB_SIZE nPosS, nPosD; - int n = 0, i; if( HB_CDP_ISUTF8( cdp ) ) return ( nMax && nSrc > nMax ) ? nMax : nSrc; @@ -1630,31 +1765,22 @@ HB_SIZE hb_cdpUTF8AsStrLen( PHB_CODEPAGE cdp, const char * pSrc, HB_SIZE nSrc, { for( nPosS = nPosD = 0; nPosS < nSrc; ) { - if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPosS ], &n, &wc ) ) - ++nPosS; - - if( n == 0 ) - { - i = HB_CDPCHAR_LEN( cdp, wc ); - if( nMax && nPosD + i > nMax ) - break; - nPosD += i; - } + int i; + hb_cdpUTF8GetU16( pSrc, nSrc, &nPosS, &wc ); + i = HB_CDPCHAR_LEN( cdp, wc ); + if( nMax && nPosD + i > nMax ) + break; + nPosD += i; } } else { for( nPosS = nPosD = 0; nPosS < nSrc; ) { - if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPosS ], &n, &wc ) ) - ++nPosS; - - if( n == 0 ) - { - ++nPosD; - if( nMax && nPosD >= nMax ) - break; - } + hb_cdpUTF8GetU16( pSrc, nSrc, &nPosS, &wc ); + ++nPosD; + if( nMax && nPosD >= nMax ) + break; } } @@ -1665,10 +1791,8 @@ HB_SIZE hb_cdpUTF8ToStr( PHB_CODEPAGE cdp, const char * pSrc, HB_SIZE nSrc, char * pDst, HB_SIZE nDst ) { - HB_UCHAR * uniTrans; - HB_WCHAR wcMax, wc = 0; + HB_WCHAR wcMax, wc; HB_SIZE nPosS, nPosD; - int n = 0; if( HB_CDP_ISUTF8( cdp ) ) { @@ -1683,18 +1807,15 @@ HB_SIZE hb_cdpUTF8ToStr( PHB_CODEPAGE cdp, { for( nPosS = nPosD = 0; nPosS < nSrc && nPosD < nDst; ) { - if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPosS ], &n, &wc ) ) - ++nPosS; - - if( n == 0 ) - { - if( ! HB_CDPCHAR_PUT( cdp, pDst, nDst, &nPosD, wc ) ) - break; - } + hb_cdpUTF8GetU16( pSrc, nSrc, &nPosS, &wc ); + if( ! 
HB_CDPCHAR_PUT( cdp, pDst, nDst, &nPosD, wc ) ) + break; } } else { + HB_UCHAR * uniTrans; + if( cdp->uniTable->uniTrans == NULL ) hb_cdpBuildTransTable( cdp->uniTable ); uniTrans = cdp->uniTable->uniTrans; @@ -1702,16 +1823,11 @@ HB_SIZE hb_cdpUTF8ToStr( PHB_CODEPAGE cdp, for( nPosS = nPosD = 0; nPosS < nSrc && nPosD < nDst; ) { - if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPosS ], &n, &wc ) ) - ++nPosS; - - if( n == 0 ) - { - if( wc <= wcMax && uniTrans[ wc ] ) - pDst[ nPosD++ ] = uniTrans[ wc ]; - else - pDst[ nPosD++ ] = wc >= 0x100 ? '?' : ( HB_UCHAR ) wc; - } + hb_cdpUTF8GetU16( pSrc, nSrc, &nPosS, &wc ); + if( wc <= wcMax && uniTrans[ wc ] ) + pDst[ nPosD++ ] = uniTrans[ wc ]; + else + pDst[ nPosD++ ] = wc >= 0x100 ? HB_CDP_ERROR_ASCCHAR : ( HB_UCHAR ) wc; } } @@ -1795,12 +1911,12 @@ HB_UCHAR hb_cdpGetChar( PHB_CODEPAGE cdp, HB_WCHAR wc ) char c; if( ! HB_CDPCHAR_PUT( cdp, &c, 1, &n, wc ) ) - wc = '?'; + wc = HB_CDP_ERROR_ASCCHAR; else wc = ( HB_UCHAR ) c; } else - wc = '?'; + wc = HB_CDP_ERROR_ASCCHAR; } else { @@ -1815,7 +1931,7 @@ HB_UCHAR hb_cdpGetChar( PHB_CODEPAGE cdp, HB_WCHAR wc ) } } } - return wc >= 0x100 ? '?' : ( HB_UCHAR ) wc; + return wc >= 0x100 ? 
HB_CDP_ERROR_ASCCHAR : ( HB_UCHAR ) wc; } HB_UCHAR hb_cdpGetUC( PHB_CODEPAGE cdp, HB_WCHAR wc, HB_UCHAR ucDef ) @@ -1903,30 +2019,24 @@ HB_SIZE hb_cdpStrToU16( PHB_CODEPAGE cdp, int iEndian, if( HB_CDP_ISUTF8( cdp ) ) { - HB_WCHAR wc = 0; - int n = 0; + HB_WCHAR wc; for( nPosS = nPosD = 0; nPosS < nSrc && nPosD < nDst; ) { - if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPosS ], &n, &wc ) ) - ++nPosS; - - if( n == 0 ) - { + hb_cdpUTF8GetU16( pSrc, nSrc, &nPosS, &wc ); #if defined( HB_CDP_ENDIAN_SWAP ) - if( iEndian == HB_CDP_ENDIAN_SWAP ) - wc = HB_SWAP_UINT16( wc ); - pDst[ nPosD++ ] = wc; + if( iEndian == HB_CDP_ENDIAN_SWAP ) + wc = HB_SWAP_UINT16( wc ); + pDst[ nPosD++ ] = wc; #else - if( iEndian == HB_CDP_ENDIAN_LITTLE ) - HB_PUT_LE_UINT16( &pDst[ nPosD ], wc ); - else if( iEndian == HB_CDP_ENDIAN_BIG ) - HB_PUT_BE_UINT16( &pDst[ nPosD ], wc ); - else - pDst[ nPosD ] = wc; - ++nPosD; + if( iEndian == HB_CDP_ENDIAN_LITTLE ) + HB_PUT_LE_UINT16( &pDst[ nPosD ], wc ); + else if( iEndian == HB_CDP_ENDIAN_BIG ) + HB_PUT_BE_UINT16( &pDst[ nPosD ], wc ); + else + pDst[ nPosD ] = wc; + ++nPosD; #endif - } } } else if( HB_CDP_ISCUSTOM( cdp ) ) @@ -2117,7 +2227,7 @@ HB_SIZE hb_cdpU16ToStr( PHB_CODEPAGE cdp, int iEndian, if( wc <= wcMax && uniTrans[ wc ] ) pDst[ nPosD++ ] = uniTrans[ wc ]; else - pDst[ nPosD++ ] = wc >= 0x100 ? '?' : ( HB_UCHAR ) wc; + pDst[ nPosD++ ] = wc >= 0x100 ? HB_CDP_ERROR_ASCCHAR : ( HB_UCHAR ) wc; } } @@ -2240,7 +2350,7 @@ int hb_cdpTranslateChar( int iChar, PHB_CODEPAGE cdpIn, PHB_CODEPAGE cdpOut ) { if( HB_CDPCHAR_PUT( cdpOut, &c, 1, &n, wc ) ) { - if( c != '?' ) + if( c != HB_CDP_ERROR_ASCCHAR ) iChar = ( HB_UCHAR ) c; } } @@ -2288,7 +2398,7 @@ int hb_cdpTranslateDispChar( int iChar, PHB_CODEPAGE cdpIn, PHB_CODEPAGE cdpOut wc = s_uniCtrls[ iChar ]; if( HB_CDPCHAR_PUT( cdpOut, &c, 1, &n, wc ) ) { - if( c != '?' 
) + if( c != HB_CDP_ERROR_ASCCHAR ) iChar = ( HB_UCHAR ) c; } } @@ -2751,19 +2861,19 @@ static HB_UCHAR hb_cdpUtf8Char( const char ** pStrPtr, PHB_UNITABLE uniTable ) { const char * pszString = *pStrPtr; HB_UCHAR uc = 0; - HB_WCHAR wc = 0; - int n = 0; - while( *pszString ) + if( *pszString ) { - if( ! hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) *pszString++, &n, &wc ) ) - break; - if( n == 0 ) + HB_SIZE nIndex = 0; + HB_WCHAR wc; + + if( hb_cdpUTF8GetU16( pszString, hb_strnlen( pszString, 6 ), &nIndex, &wc ) ) { if( wc < 127 ) uc = ( HB_UCHAR ) wc; else { + int n; for( n = 0; n < 256; ++n ) { if( wc == uniTable->uniCodes[ n ] ) @@ -2773,8 +2883,8 @@ static HB_UCHAR hb_cdpUtf8Char( const char ** pStrPtr, PHB_UNITABLE uniTable ) } } } - break; } + pszString += nIndex; } if( uc == 0 ) { diff --git a/src/rtl/cdpapihb.c b/src/rtl/cdpapihb.c index f180099a4d..17d2075a22 100644 --- a/src/rtl/cdpapihb.c +++ b/src/rtl/cdpapihb.c @@ -55,20 +55,14 @@ static HB_SIZE utf8pos( const char * szUTF8, HB_SIZE nLen, HB_SIZE nUTF8Pos ) if( nUTF8Pos > 0 && nUTF8Pos <= nLen ) { HB_SIZE n1, n2; - HB_WCHAR uc; - int n = 0; + HB_WCHAR32 wc; for( n1 = n2 = 0; n1 < nLen; ) { - if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szUTF8[ n1 ], &n, &uc ) ) - ++n1; - - if( n == 0 ) - { - if( --nUTF8Pos == 0 ) - return n2 + 1; - n2 = n1; - } + hb_cdpUTF8GetU32( szUTF8, nLen, &n1, &wc ); + if( --nUTF8Pos == 0 ) + return n2 + 1; + n2 = n1; } } return 0; @@ -201,7 +195,7 @@ HB_FUNC( HB_UTF8CHR ) char utf8Char[ HB_MAX_CHAR_LEN ]; int iLen; - iLen = hb_cdpU16CharToUTF8( utf8Char, ( HB_WCHAR ) hb_parni( 1 ) ); + iLen = hb_cdpU32CharToUTF8( utf8Char, ( HB_WCHAR32 ) hb_parni( 1 ) ); hb_retclen( utf8Char, iLen ); } else @@ -214,19 +208,10 @@ HB_FUNC( HB_UTF8ASC ) if( pszString ) { - HB_SIZE nLen = hb_parclen( 1 ); - HB_WCHAR wc = 0; - int n = 0; + HB_SIZE nLen = hb_parclen( 1 ), nIndex = 0; + HB_WCHAR32 wc = 0; - while( nLen ) - { - if( ! 
hb_cdpUTF8ToU16NextChar( ( unsigned char ) *pszString, &n, &wc ) ) - break; - if( n == 0 ) - break; - pszString++; - nLen--; - } + hb_cdpUTF8GetU32( pszString, nLen, &nIndex, &wc ); hb_retnint( wc ); } else @@ -467,35 +452,35 @@ HB_FUNC( HB_UTF8POKE ) nPos = utf8pos( szString, nLen, hb_parns( 2 ) ); if( nPos ) { - HB_WCHAR uc, uc2; - int n, n2; + HB_WCHAR32 uc, uc2; + HB_SIZE nDstLen = 0; + int n; --nPos; - uc = ( HB_WCHAR ) hb_parni( 3 ); + uc = ( HB_WCHAR32 ) hb_parni( 3 ); n = hb_cdpUTF8CharSize( uc ); - n2 = 0; - hb_cdpUTF8ToU16NextChar( szString[ nPos ], &n2, &uc2 ); - ++n2; - if( n == n2 ) + + hb_cdpUTF8GetU32( &szString[ nPos ], nLen - nPos, &nDstLen, &uc2 ); + if( n == ( int ) nDstLen ) { char * szText; if( hb_itemGetWriteCL( pText, &szText, &nLen ) && nPos + n <= nLen ) { - hb_cdpU16CharToUTF8( &szText[ nPos ], uc ); + hb_cdpU32CharToUTF8( &szText[ nPos ], uc ); } hb_itemReturn( pText ); } else { - char * szResult = ( char * ) hb_xgrab( nLen - n2 + n + 1 ); + char * szResult = ( char * ) hb_xgrab( nLen - nDstLen + n + 1 ); memcpy( szResult, szString, nPos ); - hb_cdpU16CharToUTF8( &szResult[ nPos ], uc ); - memcpy( szResult + nPos + n, szString + nPos + n2, nLen - nPos - n2 ); + hb_cdpU32CharToUTF8( &szResult[ nPos ], uc ); + memcpy( szResult + nPos + n, szString + nPos + nDstLen, nLen - nPos - nDstLen ); if( HB_ISBYREF( 1 ) ) - hb_storclen( szResult, nLen - n2 + n, 1 ); - hb_retclen_buffer( szResult, nLen - n2 + n ); + hb_storclen( szResult, nLen - nDstLen + n, 1 ); + hb_retclen_buffer( szResult, nLen - nDstLen + n ); } } else