2025-09-09 13:50 UTC+0200 Przemyslaw Czerpak (druzus/at/poczta.onet.pl)

* include/hbdefs.h
    + added new types HB_WCHAR16 and HB_WCHAR32, existing type HB_WCHAR
      is mapped to HB_WCHAR16 (just like before)

  * include/hbapicdp.h
  * src/harbour.def
  * src/rtl/cdpapi.c
    + added new C functions for encoding and decoding UTF-8 strings using
      HB_WCHAR32:
         int hb_cdpU32CharToUTF8( char * szUTF8, HB_WCHAR32 wc );
         HB_BOOL hb_cdpUTF8GetU32( const char * pSrc, HB_SIZE nLen,
                                   HB_SIZE * pnIndex, HB_WCHAR32 * pWC );
         HB_BOOL hb_cdpUTF8GetUCS( const char * pSrc, HB_SIZE nLen,
                                   HB_SIZE * pnIndex, HB_WCHAR32 * pWC );
         HB_BOOL hb_cdpUTF8GetU16( const char * pSrc, HB_SIZE nLen,
                                   HB_SIZE * pnIndex, HB_WCHAR16 * pWC );
         HB_BOOL hb_cdpUTF8Validate( const char * pSrc, HB_SIZE nLen );
      They support full UCS and are much more restrictive against errors and
      wrong UTF-8 encoding, i.e. now overlong encoding is forbidden.
      The wrong characters are translated to 0xFFFD and later if such
      character does not exist in final CP to '?' ASCII character.
    * declarations of the following UTF-8 C functions have been changed to
      operate on HB_WCHAR32 instead of HB_WCHAR:
         int hb_cdpUTF8CharSize( HB_WCHAR32 wc );
         HB_WCHAR32 hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen,
                                          HB_SIZE nPos );
    * the following C functions have been changed to internally operate on
      HB_WCHAR32 instead of HB_WCHAR:
         hb_cdpUTF8StringLength()
         hb_cdpUTF8StringAt()
         hb_cdpUTF8StringSubstr()
    * the following C functions have been changed to use new hb_cdpUTF8GetU*()
      instead of step by step decoding with hb_cdpUTF8ToU16NextChar()
         hb_cdpStrToUTF8Disp()
         hb_cdpUTF8AsStrLen()
         hb_cdpUTF8ToStr()
         hb_cdpStrToU16()
         hb_cdpUtf8Char()
    * use HB_CDP_ERROR_* macros to mark wrong encoding

  * src/rtl/cdpapihb.c
    * the following UTF-8 C functions have been changed to operate on
      HB_WCHAR32 instead of HB_WCHAR:
         hb_utf8Chr()
         hb_utf8Asc()
         hb_utf8Poke()
         hb_utf8Peek()
      Other UTF-8 PRG functions have been adapted to HB_WCHAR32 by changes
      in corresponding C functions.

  * src/codepage/cp_utf8.c
    * use new function hb_cdpUTF8GetU16() to decode UTF-8 strings in UTF8EX CP

  * src/rtl/arc4.c
    + added new macro HB_NO_SYSCTL which allows disabling sysctl() in Linux
      builds for GLIBC < 2.30
This commit is contained in:
Przemysław Czerpak
2025-09-09 13:50:42 +02:00
parent 315887a395
commit 75ff90a49d
8 changed files with 398 additions and 227 deletions

View File

@@ -7,6 +7,64 @@
Entries may not always be in chronological/commit order.
See license at the end of file. */
2025-09-09 13:50 UTC+0200 Przemyslaw Czerpak (druzus/at/poczta.onet.pl)
* include/hbdefs.h
+ added new types HB_WCHAR16 and HB_WCHAR32, existing type HB_WCHAR
is mapped to HB_WCHAR16 (just like before)
* include/hbapicdp.h
* src/harbour.def
* src/rtl/cdpapi.c
+ added new C functions for encoding and decoding UTF-8 strings using
HB_WCHAR32:
int hb_cdpU32CharToUTF8( char * szUTF8, HB_WCHAR32 wc );
HB_BOOL hb_cdpUTF8GetU32( const char * pSrc, HB_SIZE nLen,
HB_SIZE * pnIndex, HB_WCHAR32 * pWC );
HB_BOOL hb_cdpUTF8GetUCS( const char * pSrc, HB_SIZE nLen,
HB_SIZE * pnIndex, HB_WCHAR32 * pWC );
HB_BOOL hb_cdpUTF8GetU16( const char * pSrc, HB_SIZE nLen,
HB_SIZE * pnIndex, HB_WCHAR16 * pWC );
HB_BOOL hb_cdpUTF8Validate( const char * pSrc, HB_SIZE nLen );
They support full UCS and are much more restrictive against errors and
wrong UTF-8 encoding, i.e. now overlong encoding is forbidden.
The wrong characters are translated to 0xFFFD and later if such
character does not exist in final CP to '?' ASCII character.
* declarations of the following UTF-8 C functions have been changed to
operate on HB_WCHAR32 instead of HB_WCHAR:
int hb_cdpUTF8CharSize( HB_WCHAR32 wc );
HB_WCHAR32 hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen,
HB_SIZE nPos );
* the following C functions have been changed to internally operate on
HB_WCHAR32 instead of HB_WCHAR:
hb_cdpUTF8StringLength()
hb_cdpUTF8StringAt()
hb_cdpUTF8StringSubstr()
* the following C functions have been changed to use new hb_cdpUTF8GetU*()
instead of step by step decoding with hb_cdpUTF8ToU16NextChar()
hb_cdpStrToUTF8Disp()
hb_cdpUTF8AsStrLen()
hb_cdpUTF8ToStr()
hb_cdpStrToU16()
hb_cdpUtf8Char()
* use HB_CDP_ERROR_* macros to mark wrong encoding
* src/rtl/cdpapihb.c
* the following UTF-8 C functions have been changed to operate on
HB_WCHAR32 instead of HB_WCHAR:
hb_utf8Chr()
hb_utf8Asc()
hb_utf8Poke()
hb_utf8Peek()
Other UTF-8 PRG functions have been adapted to HB_WCHAR32 by changes
in corresponding C functions.
* src/codepage/cp_utf8.c
* use new function hb_cdpUTF8GetU16() to decode UTF-8 strings in UTF8EX CP
* src/rtl/arc4.c
+ added new macro HB_NO_SYSCTL which allows disabling sysctl() in Linux
builds for GLIBC < 2.30
2025-09-03 12:21 UTC+0200 Przemyslaw Czerpak (druzus/at/poczta.onet.pl)
* src/rtl/cdpapi.c
+ added fallback translation table for different variants of Latin

View File

@@ -401,6 +401,19 @@ extern HB_EXPORT void hb_vmSetCDP( PHB_CODEPAGE pCDP );
*/
#define HB_MAX_CHAR_LEN 8
/* UCS maximal character value */
#define HB_CDP_UNICODE_MAX 0x10FFFF
/* UTF-16 surrogates for mapping U+010000 to U+10FFFF characters */
#define HB_CDP_SURROGATE_FIRST 0xD800
#define HB_CDP_SURROGATE_LAST 0xDFFF
#define HB_CDP_SURROGATE_HIGH 0xD800
#define HB_CDP_SURROGATE_LOW 0xDC00
/* character codes to replace sequences with wrong encoding or translation */
#define HB_CDP_ERROR_UNICHAR 0xFFFD /* <?> */
#define HB_CDP_ERROR_ASCCHAR 0x3F /* ? */
/* codepage uses simple binary sorting */
#define HB_CDP_ISBINSORT( cdp ) ( ( ( cdp )->type & HB_CDP_TYPE_BINSORT ) != 0 )
/* codepage uses custom string decoding */
@@ -473,7 +486,7 @@ extern HB_EXPORT HB_BOOL hb_cdpGetFromUTF8( PHB_CODEPAGE cdp, HB_UCHAR ch,
extern HB_EXPORT HB_SIZE hb_cdpUTF8StringLength( const char * pSrc, HB_SIZE nLen );
extern HB_EXPORT HB_SIZE hb_cdpUTF8StringAt( const char * szNeedle, HB_SIZE nLenN, const char * szHaystack, HB_SIZE nLenH, HB_SIZE nStart, HB_SIZE nEnd, HB_BOOL fReverse );
extern HB_EXPORT HB_WCHAR hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos );
extern HB_EXPORT HB_WCHAR32 hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos );
extern HB_EXPORT char * hb_cdpUTF8StringSubstr( const char * pSrc, HB_SIZE nLen, HB_SIZE nFrom, HB_SIZE nCount, HB_SIZE * pnDest );
extern HB_EXPORT HB_SIZE hb_cdpUTF8AsStrLen( PHB_CODEPAGE cdp, const char * pSrc, HB_SIZE nSrc, HB_SIZE nMax );
@@ -491,10 +504,14 @@ extern HB_EXPORT HB_WCHAR * hb_cdpnStrDupU16( PHB_CODEPAGE cdp, int iEndian, c
extern HB_EXPORT HB_WCHAR hb_cdpGetU16Ctrl( HB_WCHAR wc );
extern HB_EXPORT int hb_cdpUTF8CharSize( HB_WCHAR wc );
extern HB_EXPORT int hb_cdpUTF8CharSize( HB_WCHAR32 wc );
extern HB_EXPORT int hb_cdpU32CharToUTF8( char * szUTF8, HB_WCHAR32 wc );
extern HB_EXPORT int hb_cdpU16CharToUTF8( char * szUTF8, HB_WCHAR wc );
extern HB_EXPORT HB_BOOL hb_cdpUTF8ToU16NextChar( HB_UCHAR ucChar, int * n, HB_WCHAR * pwc );
extern HB_EXPORT HB_BOOL hb_cdpUTF8GetU32( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR32 * pWC );
extern HB_EXPORT HB_BOOL hb_cdpUTF8GetUCS( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR32 * pWC );
extern HB_EXPORT HB_BOOL hb_cdpUTF8GetU16( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR16 * pWC );
extern HB_EXPORT HB_BOOL hb_cdpUTF8Validate( const char * pSrc, HB_SIZE nLen );
extern HB_EXPORT PHB_ITEM hb_itemDeserializeCP( const char ** pBufferPtr, HB_SIZE * pnSize, PHB_CODEPAGE cdpIn, PHB_CODEPAGE cdpOut );
extern HB_EXPORT char * hb_itemSerializeCP( PHB_ITEM pItem, int iFlags, PHB_CODEPAGE cdpIn, PHB_CODEPAGE cdpOut, HB_SIZE * pnSize );

View File

@@ -639,10 +639,18 @@ typedef HB_U32 HB_FATTR;
# endif
#endif
#if defined( HB_OS_WIN )
#if defined( HB_OS_WIN ) || defined( HB_OS_DOS ) || defined( HB_OS_OS2 )
typedef wchar_t HB_WCHAR;
typedef wchar_t HB_WCHAR16;
typedef HB_I32 HB_WCHAR32;
#elif defined( __WATCOMC__ )
typedef unsigned short HB_WCHAR;
typedef unsigned short HB_WCHAR16;
typedef HB_I32 HB_WCHAR32;
#else
typedef unsigned short HB_WCHAR;
typedef unsigned short HB_WCHAR16;
typedef wchar_t HB_WCHAR32;
#endif
/* maximum length of double number in decimal representation:

View File

@@ -57,27 +57,14 @@
static HB_CDP_GET_FUNC( UTF8_get )
{
HB_SIZE nIndex = *pnIndex;
int n = 0;
HB_SYMBOL_UNUSED( cdp );
*wc = 0;
while( nIndex < nLen )
if( *pnIndex < nLen )
{
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nIndex ], &n, wc ) )
++nIndex;
if( n == 0 )
{
*pnIndex = nIndex;
return HB_TRUE;
}
}
if( n != 0 )
{
*pnIndex = nIndex;
hb_cdpUTF8GetU16( pSrc, nLen, pnIndex, wc );
return HB_TRUE;
}
*wc = 0;
return HB_FALSE;
}

View File

@@ -2157,14 +2157,19 @@ hb_cdpTranslateDispChar
hb_cdpU16AsStrLen
hb_cdpU16CharToUTF8
hb_cdpU16ToStr
hb_cdpU32CharToUTF8
hb_cdpUTF8AsStrLen
hb_cdpUTF8CharSize
hb_cdpUTF8GetU16
hb_cdpUTF8GetU32
hb_cdpUTF8GetUCS
hb_cdpUTF8StringAt
hb_cdpUTF8StringLength
hb_cdpUTF8StringPeek
hb_cdpUTF8StringSubstr
hb_cdpUTF8ToStr
hb_cdpUTF8ToU16NextChar
hb_cdpUTF8Validate
hb_cdpUpperWC
hb_cdpcmp
hb_cdpicmp

View File

@@ -57,9 +57,10 @@
* sysctl() on Linux has been deprecated. Not available in current
* runtime C libraries, like musl and glibc >= 2.30.
*/
# if ( ! defined( HB_OS_LINUX ) || \
( ( defined( __GLIBC__ ) && ! ( ( __GLIBC__ > 2 ) || ( ( __GLIBC__ == 2 ) && ( __GLIBC_MINOR__ >= 30 ) ) ) ) ) || \
defined( __UCLIBC__ ) )
# if ! defined( HB_NO_SYSCTL ) && \
( ! defined( HB_OS_LINUX ) || \
( ( defined( __GLIBC__ ) && ! ( ( __GLIBC__ > 2 ) || ( ( __GLIBC__ == 2 ) && ( __GLIBC_MINOR__ >= 30 ) ) ) ) ) || \
defined( __UCLIBC__ ) )
# define HAVE_SYS_SYSCTL_H
# endif
# define HAVE_DECL_CTL_KERN

View File

@@ -355,7 +355,7 @@ static HB_BOOL hb_cdpStd_put( PHB_CODEPAGE cdp,
cdp->uniTable->uniTrans[ wc ] )
pDst[ ( *pnIndex )++ ] = cdp->uniTable->uniTrans[ wc ];
else
pDst[ ( *pnIndex )++ ] = wc >= 0x100 ? '?' : ( HB_UCHAR ) wc;
pDst[ ( *pnIndex )++ ] = wc >= 0x100 ? HB_CDP_ERROR_ASCCHAR : ( HB_UCHAR ) wc;
return HB_TRUE;
}
@@ -519,27 +519,14 @@ static HB_BOOL hb_cdpUTF8_get( PHB_CODEPAGE cdp,
const char * pSrc, HB_SIZE nLen,
HB_SIZE * pnIndex, HB_WCHAR * wc )
{
HB_SIZE nIndex = *pnIndex;
int n = 0;
HB_SYMBOL_UNUSED( cdp );
*wc = 0;
while( nIndex < nLen )
if( *pnIndex < nLen )
{
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nIndex ], &n, wc ) )
++nIndex;
if( n == 0 )
{
*pnIndex = nIndex;
return HB_TRUE;
}
}
if( n > 0 )
{
*pnIndex = nIndex;
hb_cdpUTF8GetU16( pSrc, nLen, pnIndex, wc );
return HB_TRUE;
}
*wc = 0;
return HB_FALSE;
}
@@ -650,7 +637,7 @@ static HB_BOOL hb_cdpMulti_put( PHB_CODEPAGE cdp,
return HB_TRUE;
}
}
pDst[ ( *pnIndex )++ ] = wc >= 0x100 ? '?' : ( HB_UCHAR ) wc;
pDst[ ( *pnIndex )++ ] = wc >= 0x100 ? HB_CDP_ERROR_ASCCHAR : ( HB_UCHAR ) wc;
}
return HB_TRUE;
}
@@ -1156,14 +1143,78 @@ int hb_cdpicmp( const char * szFirst, HB_SIZE nLenFirst,
/*
* UTF-8 conversions
*/
/*
 * Return the number of bytes (1..6) needed to store wc in UTF-8.
 *
 * Negative values (when HB_WCHAR32 is a signed type) are treated as
 * HB_CDP_ERROR_UNICHAR.
 *
 * The limits below are the first values which no longer fit in the given
 * number of bytes: 7, 11, 16, 21 and 26 payload bits respectively.
 * They have to be exactly the same as the limits used by
 * hb_cdpU32CharToUTF8() because callers allocate buffers using this
 * function and fill them with the encoder.
 */
int hb_cdpUTF8CharSize( HB_WCHAR32 wc )
{
   if( ( HB_I32 ) wc < 0 )
      wc = HB_CDP_ERROR_UNICHAR;

   if( wc < 0x0080 )
      return 1;
   else if( wc < 0x0800 )
      return 2;
   else if( wc < 0x10000 )       /* wc <= 0xFFFF */
      return 3;
   else if( wc < 0x200000 )      /* wc <= 0x1FFFFF */
      return 4;
   else if( wc < 0x4000000 )     /* wc <= 0x3FFFFFF */
      return 5;
   else /* if( wc <= 0x7FFFFFFF ) */
      return 6;
}

/*
 * Encode wc as UTF-8 into szUTF8 and return the number of bytes written
 * (1..6). No terminating zero is stored. szUTF8 has to be large enough
 * for the longest form (6 bytes).
 *
 * Negative values (when HB_WCHAR32 is a signed type) are encoded as
 * HB_CDP_ERROR_UNICHAR.
 *
 * Each range uses the shortest possible form, so the upper limits are
 * 0x7F, 0x7FF, 0xFFFF, 0x1FFFFF and 0x3FFFFFF. Using a longer form for
 * a value which fits in a shorter one creates overlong encoding.
 */
int hb_cdpU32CharToUTF8( char * szUTF8, HB_WCHAR32 wc )
{
   int n;

   if( ( HB_I32 ) wc < 0 )
      wc = HB_CDP_ERROR_UNICHAR;

   if( wc < 0x0080 )
   {
      szUTF8[ 0 ] = ( char ) ( wc & 0xFF );
      n = 1;
   }
   else if( wc < 0x0800 )
   {
      szUTF8[ 0 ] = ( char ) ( 0xC0 | ( ( wc >> 6 ) & 0x1F ) );
      szUTF8[ 1 ] = ( char ) ( 0x80 | ( wc & 0x3F ) );
      n = 2;
   }
   else if( wc < 0x10000 )       /* wc <= 0xFFFF */
   {
      szUTF8[ 0 ] = ( char ) ( 0xE0 | ( ( wc >> 12 ) & 0x0F ) );
      szUTF8[ 1 ] = ( char ) ( 0x80 | ( ( wc >> 6 ) & 0x3F ) );
      szUTF8[ 2 ] = ( char ) ( 0x80 | ( wc & 0x3F ) );
      n = 3;
   }
   else if( wc < 0x200000 )      /* wc <= 0x1FFFFF */
   {
      szUTF8[ 0 ] = ( char ) ( 0xF0 | ( ( wc >> 18 ) & 0x07 ) );
      szUTF8[ 1 ] = ( char ) ( 0x80 | ( ( wc >> 12 ) & 0x3F ) );
      szUTF8[ 2 ] = ( char ) ( 0x80 | ( ( wc >> 6 ) & 0x3F ) );
      szUTF8[ 3 ] = ( char ) ( 0x80 | ( wc & 0x3F ) );
      n = 4;
   }
   else if( wc < 0x4000000 )     /* wc <= 0x3FFFFFF */
   {
      szUTF8[ 0 ] = ( char ) ( 0xF8 | ( ( wc >> 24 ) & 0x03 ) );
      szUTF8[ 1 ] = ( char ) ( 0x80 | ( ( wc >> 18 ) & 0x3F ) );
      szUTF8[ 2 ] = ( char ) ( 0x80 | ( ( wc >> 12 ) & 0x3F ) );
      szUTF8[ 3 ] = ( char ) ( 0x80 | ( ( wc >> 6 ) & 0x3F ) );
      szUTF8[ 4 ] = ( char ) ( 0x80 | ( wc & 0x3F ) );
      n = 5;
   }
   else /* if( wc <= 0x7FFFFFFF ) */
   {
      szUTF8[ 0 ] = ( char ) ( 0xFC | ( ( wc >> 30 ) & 0x01 ) );
      szUTF8[ 1 ] = ( char ) ( 0x80 | ( ( wc >> 24 ) & 0x3F ) );
      szUTF8[ 2 ] = ( char ) ( 0x80 | ( ( wc >> 18 ) & 0x3F ) );
      szUTF8[ 3 ] = ( char ) ( 0x80 | ( ( wc >> 12 ) & 0x3F ) );
      szUTF8[ 4 ] = ( char ) ( 0x80 | ( ( wc >> 6 ) & 0x3F ) );
      szUTF8[ 5 ] = ( char ) ( 0x80 | ( wc & 0x3F ) );
      n = 6;
   }

   return n;
}
int hb_cdpU16CharToUTF8( char * szUTF8, HB_WCHAR wc )
@@ -1240,27 +1291,153 @@ HB_BOOL hb_cdpUTF8ToU16NextChar( HB_UCHAR ucChar, int * n, HB_WCHAR * pwc )
*pwc &= 0x01;
*n = 5;
}
else
{
*n = 0;
return HB_FALSE;
}
}
return HB_TRUE;
}
/*
 * Decode one UTF-8 sequence from pSrc starting at byte offset *pnIndex.
 *
 * pSrc    - buffer with UTF-8 data
 * nLen    - size of the buffer in bytes
 * pnIndex - in: offset of the first byte to decode,
 *           out: offset just after the consumed bytes
 * pWC     - out: decoded character, HB_CDP_ERROR_UNICHAR on wrong
 *           encoding or 0 when *pnIndex >= nLen
 *
 * Returns true only when a complete sequence which is not overlong
 * has been decoded. Sequences up to 6 bytes long (lead bytes up to 0xFD)
 * are accepted; this function makes no check against the maximal UCS
 * value or UTF-16 surrogates.
 */
HB_BOOL hb_cdpUTF8GetU32( const char * pSrc, HB_SIZE nLen,
HB_SIZE * pnIndex, HB_WCHAR32 * pWC )
{
HB_SIZE nIndex = *pnIndex;
HB_WCHAR32 wc = 0;
/* number of continuation bytes still expected, -1 means error */
int n = -1;
if( nIndex < nLen )
{
HB_WCHAR32 wcMin = 0; /* forbid overlong encodings */
HB_UCHAR uc = ( HB_UCHAR ) pSrc[ nIndex++ ];
if( uc < 0x80 )
{
/* single byte (ASCII) character */
wc = uc;
n = 0;
}
/* bytes 0x80..0xBF cannot start a sequence: n stays -1 */
else if( uc >= 0xc0 )
{
/* lead byte: take its payload bits, the number of continuation
   bytes and the smallest value which needs this sequence length;
   for 0xFE and 0xFF no branch matches and n stays -1 */
if( uc < 0xe0 )
{
wc = uc & 0x1f;
n = 1;
wcMin = 0x80;
}
else if( uc < 0xf0 )
{
wc = uc & 0x0f;
n = 2;
wcMin = 0x800;
}
else if( uc < 0xf8 )
{
wc = uc & 0x07;
n = 3;
wcMin = 0x10000;
}
else if( uc < 0xfc )
{
wc = uc & 0x03;
n = 4;
wcMin = 0x200000;
}
else if( uc < 0xfe )
{
wc = uc & 0x01;
n = 5;
wcMin = 0x4000000;
}
/* collect continuation bytes (10xxxxxx), stop at the first byte
   which is not a continuation one or at the end of buffer */
while( n > 0 && nIndex < nLen )
{
uc = ( HB_UCHAR ) pSrc[ nIndex ];
if( ( uc & 0xc0 ) != 0x80 )
break;
wc = ( wc << 6 ) | ( uc & 0x3f );
++nIndex;
--n;
}
}
/* wrong lead byte, truncated sequence or overlong encoding */
if( n != 0 || wc < wcMin )
{
wc = HB_CDP_ERROR_UNICHAR;
/* skip at most as many bytes as were still expected, stopping at
   an ASCII byte or a byte in the range 0xC2..0xF4 so decoding can
   be restarted from it */
while( n-- > 0 && nIndex < nLen )
{
uc = ( HB_UCHAR ) pSrc[ nIndex ];
if( uc < 0x80 || ( uc >= 0xc2 && uc <= 0xf4 ) )
break;
++nIndex;
}
n = -1;
}
}
*pnIndex = nIndex;
*pWC = wc;
return n == 0;
}
/*
 * Decode one UTF-8 sequence like hb_cdpUTF8GetU32() and additionally
 * reject characters above HB_CDP_UNICODE_MAX and the ones inside
 * HB_CDP_SURROGATE_FIRST..HB_CDP_SURROGATE_LAST range. For rejected
 * characters *pWC is set to HB_CDP_ERROR_UNICHAR and false is returned;
 * *pnIndex is always updated by hb_cdpUTF8GetU32().
 */
HB_BOOL hb_cdpUTF8GetUCS( const char * pSrc, HB_SIZE nLen,
                          HB_SIZE * pnIndex, HB_WCHAR32 * pWC )
{
   HB_BOOL fDecoded = hb_cdpUTF8GetU32( pSrc, nLen, pnIndex, pWC );

   /* decoding error: *pWC already holds the value set by the decoder */
   if( ! fDecoded )
      return fDecoded;

   /* correct character: inside UCS range and not a surrogate */
   if( ! ( *pWC > HB_CDP_UNICODE_MAX ) &&
       ( *pWC < HB_CDP_SURROGATE_FIRST || *pWC > HB_CDP_SURROGATE_LAST ) )
      return fDecoded;

   *pWC = HB_CDP_ERROR_UNICHAR;
   return HB_FALSE;
}
/*
 * Decode one UTF-8 sequence with hb_cdpUTF8GetU32() and store it as
 * 16-bit character. Correctly decoded characters above 0xFFFF cannot be
 * stored in HB_WCHAR16: they are replaced by HB_CDP_ERROR_UNICHAR and
 * false is returned. In all other cases the result of the decoder is
 * passed to the caller.
 */
HB_BOOL hb_cdpUTF8GetU16( const char * pSrc, HB_SIZE nLen,
                          HB_SIZE * pnIndex, HB_WCHAR16 * pWC )
{
   HB_WCHAR32 wc32;
   HB_BOOL fDecoded = hb_cdpUTF8GetU32( pSrc, nLen, pnIndex, &wc32 );

   if( fDecoded && wc32 > 0xFFFF )
   {
      /* character does not fit in 16 bits */
      *pWC = ( HB_WCHAR16 ) HB_CDP_ERROR_UNICHAR;
      return HB_FALSE;
   }

   *pWC = ( HB_WCHAR16 ) wc32;
   return fDecoded;
}
/*
 * Check if the whole buffer pSrc of nLen bytes can be decoded by
 * hb_cdpUTF8GetUCS() without errors. Returns false at the first
 * sequence which fails to decode and true otherwise, also for empty
 * buffer.
 */
HB_BOOL hb_cdpUTF8Validate( const char * pSrc, HB_SIZE nLen )
{
   HB_SIZE nPos;
   HB_WCHAR32 wcIgnored;

   for( nPos = 0; nPos < nLen; )
   {
      /* the decoder moves nPos past the bytes it consumed */
      if( ! hb_cdpUTF8GetUCS( pSrc, nLen, &nPos, &wcIgnored ) )
         return HB_FALSE;
   }

   return HB_TRUE;
}
HB_SIZE hb_cdpUTF8StringLength( const char * pSrc, HB_SIZE nLen )
{
HB_SIZE nPos, nDst;
HB_WCHAR wc;
int n = 0;
HB_SIZE nIndex = 0, nChars = 0;
HB_WCHAR32 wc;
for( nPos = nDst = 0; nPos < nLen; )
while( nIndex < nLen )
{
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPos ], &n, &wc ) )
++nPos;
if( n == 0 )
++nDst;
hb_cdpUTF8GetU32( pSrc, nLen, &nIndex, &wc );
++nChars;
}
if( n > 0 )
++nDst;
return nDst;
return nChars;
}
HB_SIZE hb_cdpUTF8StringAt( const char * szNeedle, HB_SIZE nLenN,
@@ -1274,31 +1451,16 @@ HB_SIZE hb_cdpUTF8StringAt( const char * szNeedle, HB_SIZE nLenN,
HB_SIZE nRAt = 0;
HB_SIZE nAt = 0;
HB_WCHAR wcN = 0;
HB_WCHAR wcH = 0;
int nN = 0;
int nH = 0;
HB_WCHAR32 wcN = 0;
HB_WCHAR32 wcH = 0;
while( nPosH < nLenH && nPosN < nLenN && nPos < nEnd )
{
do
{
if( ! hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szHaystack[ nPosH ], &nH, &wcH ) )
break;
++nPosH;
}
while( nH && nPosH < nLenH );
hb_cdpUTF8GetU32( szHaystack, nLenH, &nPosH, &wcH );
if( ++nPos < nStart )
continue;
do
{
if( ! hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szNeedle[ nPosN ], &nN, &wcN ) )
break;
++nPosN;
}
while( nN && nPosN < nLenN );
hb_cdpUTF8GetU32( szNeedle, nLenN, &nPosN, &wcN );
if( wcH == wcN )
{
@@ -1339,36 +1501,17 @@ HB_SIZE hb_cdpUTF8StringAt( const char * szNeedle, HB_SIZE nLenN,
return nRAt;
}
HB_WCHAR hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos )
HB_WCHAR32 hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos )
{
if( nLen )
HB_SIZE nIndex = 0;
while( nPos && nIndex < nLen )
{
HB_SIZE nPos2;
HB_WCHAR wc = 0;
int n = 0;
for( nPos2 = 0; nPos2 < nLen && nPos; )
{
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPos2 ], &n, &wc ) )
++nPos2;
if( n == 0 )
--nPos;
}
if( nPos2 < nLen )
{
n = 0;
do
{
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPos2 ], &n, &wc ) )
++nPos2;
if( n == 0 )
return wc;
}
while( nPos2 < nLen );
}
HB_WCHAR wc;
hb_cdpUTF8GetU16( pSrc, nLen, &nIndex, &wc );
if( --nPos == 0 )
return wc;
}
return 0;
}
@@ -1377,36 +1520,29 @@ char * hb_cdpUTF8StringSubstr( const char * pSrc, HB_SIZE nLen,
HB_SIZE nFrom, HB_SIZE nCount, HB_SIZE * pulDest )
{
HB_SIZE nDst = 0;
HB_WCHAR wc;
int n;
char * pDst = NULL;
if( nCount && nLen )
{
HB_SIZE nPos;
n = 0;
for( nPos = 0; nPos < nLen && nFrom; )
HB_WCHAR32 wc;
HB_SIZE nPos = 0;
while( nPos < nLen && nFrom )
{
if( hb_cdpUTF8ToU16NextChar( pSrc[ nPos ], &n, &wc ) )
++nPos;
if( n == 0 )
--nFrom;
hb_cdpUTF8GetU32( pSrc, nLen, &nPos, &wc );
--nFrom;
}
if( nPos < nLen )
{
HB_SIZE nCnt;
HB_SIZE nCnt = nCount;
nFrom = nPos;
nCnt = nCount;
n = 0;
do
{
if( hb_cdpUTF8ToU16NextChar( pSrc[ nPos ], &n, &wc ) )
++nPos;
if( n == 0 )
--nCnt;
hb_cdpUTF8GetU32( pSrc, nLen, &nPos, &wc );
}
while( nPos < nLen && nCnt );
while( nPos < nLen && --nCnt );
nDst = nPos - nFrom;
pDst = ( char * ) hb_xgrab( nDst + 1 );
@@ -1620,9 +1756,8 @@ HB_SIZE hb_cdpStrToUTF8Disp( PHB_CODEPAGE cdp,
HB_SIZE hb_cdpUTF8AsStrLen( PHB_CODEPAGE cdp, const char * pSrc, HB_SIZE nSrc,
HB_SIZE nMax )
{
HB_WCHAR wc = 0;
HB_WCHAR wc;
HB_SIZE nPosS, nPosD;
int n = 0, i;
if( HB_CDP_ISUTF8( cdp ) )
return ( nMax && nSrc > nMax ) ? nMax : nSrc;
@@ -1630,31 +1765,22 @@ HB_SIZE hb_cdpUTF8AsStrLen( PHB_CODEPAGE cdp, const char * pSrc, HB_SIZE nSrc,
{
for( nPosS = nPosD = 0; nPosS < nSrc; )
{
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPosS ], &n, &wc ) )
++nPosS;
if( n == 0 )
{
i = HB_CDPCHAR_LEN( cdp, wc );
if( nMax && nPosD + i > nMax )
break;
nPosD += i;
}
int i;
hb_cdpUTF8GetU16( pSrc, nSrc, &nPosS, &wc );
i = HB_CDPCHAR_LEN( cdp, wc );
if( nMax && nPosD + i > nMax )
break;
nPosD += i;
}
}
else
{
for( nPosS = nPosD = 0; nPosS < nSrc; )
{
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPosS ], &n, &wc ) )
++nPosS;
if( n == 0 )
{
++nPosD;
if( nMax && nPosD >= nMax )
break;
}
hb_cdpUTF8GetU16( pSrc, nSrc, &nPosS, &wc );
++nPosD;
if( nMax && nPosD >= nMax )
break;
}
}
@@ -1665,10 +1791,8 @@ HB_SIZE hb_cdpUTF8ToStr( PHB_CODEPAGE cdp,
const char * pSrc, HB_SIZE nSrc,
char * pDst, HB_SIZE nDst )
{
HB_UCHAR * uniTrans;
HB_WCHAR wcMax, wc = 0;
HB_WCHAR wcMax, wc;
HB_SIZE nPosS, nPosD;
int n = 0;
if( HB_CDP_ISUTF8( cdp ) )
{
@@ -1683,18 +1807,15 @@ HB_SIZE hb_cdpUTF8ToStr( PHB_CODEPAGE cdp,
{
for( nPosS = nPosD = 0; nPosS < nSrc && nPosD < nDst; )
{
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPosS ], &n, &wc ) )
++nPosS;
if( n == 0 )
{
if( ! HB_CDPCHAR_PUT( cdp, pDst, nDst, &nPosD, wc ) )
break;
}
hb_cdpUTF8GetU16( pSrc, nSrc, &nPosS, &wc );
if( ! HB_CDPCHAR_PUT( cdp, pDst, nDst, &nPosD, wc ) )
break;
}
}
else
{
HB_UCHAR * uniTrans;
if( cdp->uniTable->uniTrans == NULL )
hb_cdpBuildTransTable( cdp->uniTable );
uniTrans = cdp->uniTable->uniTrans;
@@ -1702,16 +1823,11 @@ HB_SIZE hb_cdpUTF8ToStr( PHB_CODEPAGE cdp,
for( nPosS = nPosD = 0; nPosS < nSrc && nPosD < nDst; )
{
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPosS ], &n, &wc ) )
++nPosS;
if( n == 0 )
{
if( wc <= wcMax && uniTrans[ wc ] )
pDst[ nPosD++ ] = uniTrans[ wc ];
else
pDst[ nPosD++ ] = wc >= 0x100 ? '?' : ( HB_UCHAR ) wc;
}
hb_cdpUTF8GetU16( pSrc, nSrc, &nPosS, &wc );
if( wc <= wcMax && uniTrans[ wc ] )
pDst[ nPosD++ ] = uniTrans[ wc ];
else
pDst[ nPosD++ ] = wc >= 0x100 ? HB_CDP_ERROR_ASCCHAR : ( HB_UCHAR ) wc;
}
}
@@ -1795,12 +1911,12 @@ HB_UCHAR hb_cdpGetChar( PHB_CODEPAGE cdp, HB_WCHAR wc )
char c;
if( ! HB_CDPCHAR_PUT( cdp, &c, 1, &n, wc ) )
wc = '?';
wc = HB_CDP_ERROR_ASCCHAR;
else
wc = ( HB_UCHAR ) c;
}
else
wc = '?';
wc = HB_CDP_ERROR_ASCCHAR;
}
else
{
@@ -1815,7 +1931,7 @@ HB_UCHAR hb_cdpGetChar( PHB_CODEPAGE cdp, HB_WCHAR wc )
}
}
}
return wc >= 0x100 ? '?' : ( HB_UCHAR ) wc;
return wc >= 0x100 ? HB_CDP_ERROR_ASCCHAR : ( HB_UCHAR ) wc;
}
HB_UCHAR hb_cdpGetUC( PHB_CODEPAGE cdp, HB_WCHAR wc, HB_UCHAR ucDef )
@@ -1903,30 +2019,24 @@ HB_SIZE hb_cdpStrToU16( PHB_CODEPAGE cdp, int iEndian,
if( HB_CDP_ISUTF8( cdp ) )
{
HB_WCHAR wc = 0;
int n = 0;
HB_WCHAR wc;
for( nPosS = nPosD = 0; nPosS < nSrc && nPosD < nDst; )
{
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPosS ], &n, &wc ) )
++nPosS;
if( n == 0 )
{
hb_cdpUTF8GetU16( pSrc, nSrc, &nPosS, &wc );
#if defined( HB_CDP_ENDIAN_SWAP )
if( iEndian == HB_CDP_ENDIAN_SWAP )
wc = HB_SWAP_UINT16( wc );
pDst[ nPosD++ ] = wc;
if( iEndian == HB_CDP_ENDIAN_SWAP )
wc = HB_SWAP_UINT16( wc );
pDst[ nPosD++ ] = wc;
#else
if( iEndian == HB_CDP_ENDIAN_LITTLE )
HB_PUT_LE_UINT16( &pDst[ nPosD ], wc );
else if( iEndian == HB_CDP_ENDIAN_BIG )
HB_PUT_BE_UINT16( &pDst[ nPosD ], wc );
else
pDst[ nPosD ] = wc;
++nPosD;
if( iEndian == HB_CDP_ENDIAN_LITTLE )
HB_PUT_LE_UINT16( &pDst[ nPosD ], wc );
else if( iEndian == HB_CDP_ENDIAN_BIG )
HB_PUT_BE_UINT16( &pDst[ nPosD ], wc );
else
pDst[ nPosD ] = wc;
++nPosD;
#endif
}
}
}
else if( HB_CDP_ISCUSTOM( cdp ) )
@@ -2117,7 +2227,7 @@ HB_SIZE hb_cdpU16ToStr( PHB_CODEPAGE cdp, int iEndian,
if( wc <= wcMax && uniTrans[ wc ] )
pDst[ nPosD++ ] = uniTrans[ wc ];
else
pDst[ nPosD++ ] = wc >= 0x100 ? '?' : ( HB_UCHAR ) wc;
pDst[ nPosD++ ] = wc >= 0x100 ? HB_CDP_ERROR_ASCCHAR : ( HB_UCHAR ) wc;
}
}
@@ -2240,7 +2350,7 @@ int hb_cdpTranslateChar( int iChar, PHB_CODEPAGE cdpIn, PHB_CODEPAGE cdpOut )
{
if( HB_CDPCHAR_PUT( cdpOut, &c, 1, &n, wc ) )
{
if( c != '?' )
if( c != HB_CDP_ERROR_ASCCHAR )
iChar = ( HB_UCHAR ) c;
}
}
@@ -2288,7 +2398,7 @@ int hb_cdpTranslateDispChar( int iChar, PHB_CODEPAGE cdpIn, PHB_CODEPAGE cdpOut
wc = s_uniCtrls[ iChar ];
if( HB_CDPCHAR_PUT( cdpOut, &c, 1, &n, wc ) )
{
if( c != '?' )
if( c != HB_CDP_ERROR_ASCCHAR )
iChar = ( HB_UCHAR ) c;
}
}
@@ -2751,19 +2861,19 @@ static HB_UCHAR hb_cdpUtf8Char( const char ** pStrPtr, PHB_UNITABLE uniTable )
{
const char * pszString = *pStrPtr;
HB_UCHAR uc = 0;
HB_WCHAR wc = 0;
int n = 0;
while( *pszString )
if( *pszString )
{
if( ! hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) *pszString++, &n, &wc ) )
break;
if( n == 0 )
HB_SIZE nIndex = 0;
HB_WCHAR wc;
if( hb_cdpUTF8GetU16( pszString, hb_strnlen( pszString, 6 ), &nIndex, &wc ) )
{
if( wc < 127 )
uc = ( HB_UCHAR ) wc;
else
{
int n;
for( n = 0; n < 256; ++n )
{
if( wc == uniTable->uniCodes[ n ] )
@@ -2773,8 +2883,8 @@ static HB_UCHAR hb_cdpUtf8Char( const char ** pStrPtr, PHB_UNITABLE uniTable )
}
}
}
break;
}
pszString += nIndex;
}
if( uc == 0 )
{

View File

@@ -55,20 +55,14 @@ static HB_SIZE utf8pos( const char * szUTF8, HB_SIZE nLen, HB_SIZE nUTF8Pos )
if( nUTF8Pos > 0 && nUTF8Pos <= nLen )
{
HB_SIZE n1, n2;
HB_WCHAR uc;
int n = 0;
HB_WCHAR32 wc;
for( n1 = n2 = 0; n1 < nLen; )
{
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szUTF8[ n1 ], &n, &uc ) )
++n1;
if( n == 0 )
{
if( --nUTF8Pos == 0 )
return n2 + 1;
n2 = n1;
}
hb_cdpUTF8GetU32( szUTF8, nLen, &n1, &wc );
if( --nUTF8Pos == 0 )
return n2 + 1;
n2 = n1;
}
}
return 0;
@@ -201,7 +195,7 @@ HB_FUNC( HB_UTF8CHR )
char utf8Char[ HB_MAX_CHAR_LEN ];
int iLen;
iLen = hb_cdpU16CharToUTF8( utf8Char, ( HB_WCHAR ) hb_parni( 1 ) );
iLen = hb_cdpU32CharToUTF8( utf8Char, ( HB_WCHAR32 ) hb_parni( 1 ) );
hb_retclen( utf8Char, iLen );
}
else
@@ -214,19 +208,10 @@ HB_FUNC( HB_UTF8ASC )
if( pszString )
{
HB_SIZE nLen = hb_parclen( 1 );
HB_WCHAR wc = 0;
int n = 0;
HB_SIZE nLen = hb_parclen( 1 ), nIndex = 0;
HB_WCHAR32 wc = 0;
while( nLen )
{
if( ! hb_cdpUTF8ToU16NextChar( ( unsigned char ) *pszString, &n, &wc ) )
break;
if( n == 0 )
break;
pszString++;
nLen--;
}
hb_cdpUTF8GetU32( pszString, nLen, &nIndex, &wc );
hb_retnint( wc );
}
else
@@ -467,35 +452,35 @@ HB_FUNC( HB_UTF8POKE )
nPos = utf8pos( szString, nLen, hb_parns( 2 ) );
if( nPos )
{
HB_WCHAR uc, uc2;
int n, n2;
HB_WCHAR32 uc, uc2;
HB_SIZE nDstLen = 0;
int n;
--nPos;
uc = ( HB_WCHAR ) hb_parni( 3 );
uc = ( HB_WCHAR32 ) hb_parni( 3 );
n = hb_cdpUTF8CharSize( uc );
n2 = 0;
hb_cdpUTF8ToU16NextChar( szString[ nPos ], &n2, &uc2 );
++n2;
if( n == n2 )
hb_cdpUTF8GetU32( &szString[ nPos ], nLen - nPos, &nDstLen, &uc2 );
if( n == ( int ) nDstLen )
{
char * szText;
if( hb_itemGetWriteCL( pText, &szText, &nLen ) &&
nPos + n <= nLen )
{
hb_cdpU16CharToUTF8( &szText[ nPos ], uc );
hb_cdpU32CharToUTF8( &szText[ nPos ], uc );
}
hb_itemReturn( pText );
}
else
{
char * szResult = ( char * ) hb_xgrab( nLen - n2 + n + 1 );
char * szResult = ( char * ) hb_xgrab( nLen - nDstLen + n + 1 );
memcpy( szResult, szString, nPos );
hb_cdpU16CharToUTF8( &szResult[ nPos ], uc );
memcpy( szResult + nPos + n, szString + nPos + n2, nLen - nPos - n2 );
hb_cdpU32CharToUTF8( &szResult[ nPos ], uc );
memcpy( szResult + nPos + n, szString + nPos + nDstLen, nLen - nPos - nDstLen );
if( HB_ISBYREF( 1 ) )
hb_storclen( szResult, nLen - n2 + n, 1 );
hb_retclen_buffer( szResult, nLen - n2 + n );
hb_storclen( szResult, nLen - nDstLen + n, 1 );
hb_retclen_buffer( szResult, nLen - nDstLen + n );
}
}
else