2025-09-09 13:50 UTC+0200 Przemyslaw Czerpak (druzus/at/poczta.onet.pl)

* include/hbdefs.h + added new types HB_WCHAR16 and HB_WCHAR32, existing type HB_WCHAR is mapped to HB_WCHAR16 (just like before) * include/hbapicdp.h * src/harbour.def * src/rtl/cdpapi.c + added new C functions for encoding and decoding UTF-8 string using which HB_WCHAR32: int hb_cdpU32CharToUTF8( char * szUTF8, HB_WCHAR32 wc ); HB_BOOL hb_cdpUTF8GetU32( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR32 * pWC ); HB_BOOL hb_cdpUTF8GetUCS( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR32 * pWC ); HB_BOOL hb_cdpUTF8GetU16( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR16 * pWC ); HB_BOOL hb_cdpUTF8Validate( const char * pSrc, HB_SIZE nLen ); They support full UCS and are much more restrictive against errors and wrong UTF-8 encoding, i.e. now overlong encoding is forbidden. The wrong characters are translated to 0xFFFD and later if such character does not exist in final CP to '?' ASCII character. * declaration of the following UTF-8 C functions have been changed to operate on HB_WCHAR32 instead of HB_WCHAR: int hb_cdpUTF8CharSize( HB_WCHAR32 wc ); HB_WCHAR32 hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos ); * the following C functions have been changed to internally operate on HB_WCHAR32 instead of HB_WCHAR: hb_cdpUTF8StringLength() hb_cdpUTF8StringAt() hb_cdpUTF8StringSubstr() * the following C functions have been changed to use new hb_cdpUTF8GetU*() instead of step by step decoding with hb_cdpUTF8ToU16NextChar() hb_cdpStrToUTF8Disp() hb_cdpUTF8AsStrLen() hb_cdpUTF8ToStr() hb_cdpStrToU16() hb_cdpUtf8Char() * use HB_CDP_ERROR_* macros to mark wrong encoding * src/rtl/cdpapihb.c * the following UTF-8 C functions have been changed to operate on HB_WCHAR32 instead of HB_WCHAR: hb_utf8Chr() hb_utf8Asc() hb_utf8Poke() hb_utf8Peek() Other UTF-8 PRG functions have been adopted to HB_WCHAR32 by changes in corresponding C functions. * src/codepage/cp_utf8.c * use new function hb_cdpUTF8GetU16() to decode UTF-8 strings in UTF8EX CP * src/rtl/arc4.c + added new macro HB_NO_SYSCTL which allow to disable sysctl() in Linux builds for GLIBC < 2.30
2025-09-09 13:50:42 +02:00
parent 315887a395
commit 75ff90a49d
8 changed files with 398 additions and 227 deletions
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -7,6 +7,64 @@
   Entries may not always be in chronological/commit order.
   See license at the end of file. */

+2025-09-09 13:50 UTC+0200 Przemyslaw Czerpak (druzus/at/poczta.onet.pl)
+  * include/hbdefs.h
+    + added new types HB_WCHAR16 and HB_WCHAR32, existing type HB_WCHAR
+      is mapped to HB_WCHAR16 (just like before)
+
+  * include/hbapicdp.h
+  * src/harbour.def
+  * src/rtl/cdpapi.c
+    + added new C functions for encoding and decoding UTF-8 string using
+      which HB_WCHAR32:
+         int hb_cdpU32CharToUTF8( char * szUTF8, HB_WCHAR32 wc );
+         HB_BOOL hb_cdpUTF8GetU32( const char * pSrc, HB_SIZE nLen,
+                                   HB_SIZE * pnIndex, HB_WCHAR32 * pWC );
+         HB_BOOL hb_cdpUTF8GetUCS( const char * pSrc, HB_SIZE nLen,
+                                   HB_SIZE * pnIndex, HB_WCHAR32 * pWC );
+         HB_BOOL hb_cdpUTF8GetU16( const char * pSrc, HB_SIZE nLen,
+                                   HB_SIZE * pnIndex, HB_WCHAR16 * pWC );
+         HB_BOOL hb_cdpUTF8Validate( const char * pSrc, HB_SIZE nLen );
+      They support full UCS and are much more restrictive against errors and
+      wrong UTF-8 encoding, i.e. now overlong encoding is forbidden.
+      The wrong characters are translated to 0xFFFD and later if such
+      character does not exist in final CP to '?' ASCII character.
+    * declaration of the following UTF-8 C functions have been changed to
+      operate on HB_WCHAR32 instead of HB_WCHAR:
+         int hb_cdpUTF8CharSize( HB_WCHAR32 wc );
+         HB_WCHAR32 hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen,
+                                          HB_SIZE nPos );
+    * the following C functions have been changed to internally operate on
+      HB_WCHAR32 instead of HB_WCHAR:
+         hb_cdpUTF8StringLength()
+         hb_cdpUTF8StringAt()
+         hb_cdpUTF8StringSubstr()
+    * the following C functions have been changed to use new hb_cdpUTF8GetU*()
+      instead of step by step decoding with hb_cdpUTF8ToU16NextChar()
+         hb_cdpStrToUTF8Disp()
+         hb_cdpUTF8AsStrLen()
+         hb_cdpUTF8ToStr()
+         hb_cdpStrToU16()
+         hb_cdpUtf8Char()
+    * use HB_CDP_ERROR_* macros to mark wrong encoding
+
+  * src/rtl/cdpapihb.c
+    * the following UTF-8 C functions have been changed to operate on
+      HB_WCHAR32 instead of HB_WCHAR:
+         hb_utf8Chr()
+         hb_utf8Asc()
+         hb_utf8Poke()
+         hb_utf8Peek()
+      Other UTF-8 PRG functions have been adopted to HB_WCHAR32 by changes
+      in corresponding C functions.
+
+  * src/codepage/cp_utf8.c
+    * use new function hb_cdpUTF8GetU16() to decode UTF-8 strings in UTF8EX CP
+
+  * src/rtl/arc4.c
+    + added new macro HB_NO_SYSCTL which allow to disable sysctl() in Linux
+      builds for GLIBC < 2.30
+
 2025-09-03 12:21 UTC+0200 Przemyslaw Czerpak (druzus/at/poczta.onet.pl)
  * src/rtl/cdpapi.c
    + added fallback translation table for different variants of Latin
--- a/include/hbapicdp.h
+++ b/include/hbapicdp.h
@@ -401,6 +401,19 @@ extern HB_EXPORT void         hb_vmSetCDP( PHB_CODEPAGE pCDP );
 */
 #define HB_MAX_CHAR_LEN             8

+/* UCS maximal character value */
+#define HB_CDP_UNICODE_MAX          0x10FFFF
+
+/* UTF-16 surrogates for mapping U+010000 to U+10FFFF characters */
+#define HB_CDP_SURROGATE_FIRST      0xD800
+#define HB_CDP_SURROGATE_LAST       0xDFFF
+#define HB_CDP_SURROGATE_HIGH       0xD800
+#define HB_CDP_SURROGATE_LOW        0xDC00
+
+/* character codes to replace sequences with wrong encoding or translation */
+#define HB_CDP_ERROR_UNICHAR        0xFFFD      /* <?> */
+#define HB_CDP_ERROR_ASCCHAR        0x3F        /* ? */
+
 /* codepage uses simple binary sorting */
 #define HB_CDP_ISBINSORT( cdp )     ( ( ( cdp )->type & HB_CDP_TYPE_BINSORT ) != 0 )
 /* codepage uses custom string decoding */
@@ -473,7 +486,7 @@ extern HB_EXPORT HB_BOOL      hb_cdpGetFromUTF8( PHB_CODEPAGE cdp, HB_UCHAR ch,

 extern HB_EXPORT HB_SIZE      hb_cdpUTF8StringLength( const char * pSrc, HB_SIZE nLen );
 extern HB_EXPORT HB_SIZE      hb_cdpUTF8StringAt( const char * szNeedle, HB_SIZE nLenN, const char * szHaystack, HB_SIZE nLenH, HB_SIZE nStart, HB_SIZE nEnd, HB_BOOL fReverse );
-extern HB_EXPORT HB_WCHAR     hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos );
+extern HB_EXPORT HB_WCHAR32   hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos );
 extern HB_EXPORT char *       hb_cdpUTF8StringSubstr( const char * pSrc, HB_SIZE nLen, HB_SIZE nFrom, HB_SIZE nCount, HB_SIZE * pnDest );

 extern HB_EXPORT HB_SIZE      hb_cdpUTF8AsStrLen( PHB_CODEPAGE cdp, const char * pSrc, HB_SIZE nSrc, HB_SIZE nMax );
@@ -491,10 +504,14 @@ extern HB_EXPORT HB_WCHAR *   hb_cdpnStrDupU16( PHB_CODEPAGE cdp, int iEndian, c

 extern HB_EXPORT HB_WCHAR     hb_cdpGetU16Ctrl( HB_WCHAR wc );

-extern HB_EXPORT int          hb_cdpUTF8CharSize( HB_WCHAR wc );
+extern HB_EXPORT int          hb_cdpUTF8CharSize( HB_WCHAR32 wc );
+extern HB_EXPORT int          hb_cdpU32CharToUTF8( char * szUTF8, HB_WCHAR32 wc );
 extern HB_EXPORT int          hb_cdpU16CharToUTF8( char * szUTF8, HB_WCHAR wc );
 extern HB_EXPORT HB_BOOL      hb_cdpUTF8ToU16NextChar( HB_UCHAR ucChar, int * n, HB_WCHAR * pwc );
-
+extern HB_EXPORT HB_BOOL      hb_cdpUTF8GetU32( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR32 * pWC );
+extern HB_EXPORT HB_BOOL      hb_cdpUTF8GetUCS( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR32 * pWC );
+extern HB_EXPORT HB_BOOL      hb_cdpUTF8GetU16( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR16 * pWC );
+extern HB_EXPORT HB_BOOL      hb_cdpUTF8Validate( const char * pSrc, HB_SIZE nLen );

 extern HB_EXPORT PHB_ITEM     hb_itemDeserializeCP( const char ** pBufferPtr, HB_SIZE * pnSize, PHB_CODEPAGE cdpIn, PHB_CODEPAGE cdpOut );
 extern HB_EXPORT char *       hb_itemSerializeCP( PHB_ITEM pItem, int iFlags, PHB_CODEPAGE cdpIn, PHB_CODEPAGE cdpOut, HB_SIZE * pnSize );
--- a/include/hbdefs.h
+++ b/include/hbdefs.h
@@ -639,10 +639,18 @@ typedef HB_U32 HB_FATTR;
 #  endif
 #endif

-#if defined( HB_OS_WIN )
+#if defined( HB_OS_WIN ) || defined( HB_OS_DOS ) || defined( HB_OS_OS2 )
   typedef wchar_t         HB_WCHAR;
+   typedef wchar_t         HB_WCHAR16;
+   typedef HB_I32          HB_WCHAR32;
+#elif defined( __WATCOMC__ )
+   typedef unsigned short  HB_WCHAR;
+   typedef unsigned short  HB_WCHAR16;
+   typedef HB_I32          HB_WCHAR32;
 #else
   typedef unsigned short  HB_WCHAR;
+   typedef unsigned short  HB_WCHAR16;
+   typedef wchar_t         HB_WCHAR32;
 #endif

 /* maximum length of double number in decimal representation:
--- a/src/codepage/cp_utf8.c
+++ b/src/codepage/cp_utf8.c
@@ -57,27 +57,14 @@

 static HB_CDP_GET_FUNC( UTF8_get )
 {
-   HB_SIZE nIndex = *pnIndex;
-   int n = 0;
-
   HB_SYMBOL_UNUSED( cdp );

-   *wc = 0;
-   while( nIndex < nLen )
+   if( *pnIndex < nLen )
   {
-      if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nIndex ], &n, wc ) )
-         ++nIndex;
-      if( n == 0 )
-      {
-         *pnIndex = nIndex;
-         return HB_TRUE;
-      }
-   }
-   if( n != 0 )
-   {
-      *pnIndex = nIndex;
+      hb_cdpUTF8GetU16( pSrc, nLen, pnIndex, wc );
      return HB_TRUE;
   }
+   *wc = 0;
   return HB_FALSE;
 }

--- a/src/harbour.def
+++ b/src/harbour.def
@@ -2157,14 +2157,19 @@ hb_cdpTranslateDispChar
 hb_cdpU16AsStrLen
 hb_cdpU16CharToUTF8
 hb_cdpU16ToStr
+hb_cdpU32CharToUTF8
 hb_cdpUTF8AsStrLen
 hb_cdpUTF8CharSize
+hb_cdpUTF8GetU16
+hb_cdpUTF8GetU32
+hb_cdpUTF8GetUCS
 hb_cdpUTF8StringAt
 hb_cdpUTF8StringLength
 hb_cdpUTF8StringPeek
 hb_cdpUTF8StringSubstr
 hb_cdpUTF8ToStr
 hb_cdpUTF8ToU16NextChar
+hb_cdpUTF8Validate
 hb_cdpUpperWC
 hb_cdpcmp
 hb_cdpicmp
--- a/src/rtl/arc4.c
+++ b/src/rtl/arc4.c
@@ -57,9 +57,10 @@
    * sysctl() on Linux has fallen into depreciation. Not available in current
    * runtime C libraries, like musl and glibc >= 2.30.
    */
-#  if ( ! defined( HB_OS_LINUX ) || \
-      ( ( defined( __GLIBC__ ) && ! ( ( __GLIBC__ > 2 ) || ( ( __GLIBC__ == 2 ) && ( __GLIBC_MINOR__ >= 30 ) ) ) ) ) || \
-      defined( __UCLIBC__ ) )
+#  if ! defined( HB_NO_SYSCTL ) && \
+      ( ! defined( HB_OS_LINUX ) || \
+        ( ( defined( __GLIBC__ ) && ! ( ( __GLIBC__ > 2 ) || ( ( __GLIBC__ == 2 ) && ( __GLIBC_MINOR__ >= 30 ) ) ) ) ) || \
+        defined( __UCLIBC__ ) )
 #     define HAVE_SYS_SYSCTL_H
 #  endif
 #  define HAVE_DECL_CTL_KERN
--- a/src/rtl/cdpapi.c
+++ b/src/rtl/cdpapi.c
@@ -355,7 +355,7 @@ static HB_BOOL hb_cdpStd_put( PHB_CODEPAGE cdp,
          cdp->uniTable->uniTrans[ wc ] )
         pDst[ ( *pnIndex )++ ] = cdp->uniTable->uniTrans[ wc ];
      else
-         pDst[ ( *pnIndex )++ ] = wc >= 0x100 ? '?' : ( HB_UCHAR ) wc;
+         pDst[ ( *pnIndex )++ ] = wc >= 0x100 ? HB_CDP_ERROR_ASCCHAR : ( HB_UCHAR ) wc;

      return HB_TRUE;
   }
@@ -519,27 +519,14 @@ static HB_BOOL hb_cdpUTF8_get( PHB_CODEPAGE cdp,
                               const char * pSrc, HB_SIZE nLen,
                               HB_SIZE * pnIndex, HB_WCHAR * wc )
 {
-   HB_SIZE nIndex = *pnIndex;
-   int n = 0;
-
   HB_SYMBOL_UNUSED( cdp );

-   *wc = 0;
-   while( nIndex < nLen )
+   if( *pnIndex < nLen )
   {
-      if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nIndex ], &n, wc ) )
-         ++nIndex;
-      if( n == 0 )
-      {
-         *pnIndex = nIndex;
-         return HB_TRUE;
-      }
-   }
-   if( n > 0 )
-   {
-      *pnIndex = nIndex;
+      hb_cdpUTF8GetU16( pSrc, nLen, pnIndex, wc );
      return HB_TRUE;
   }
+   *wc = 0;
   return HB_FALSE;
 }

@@ -650,7 +637,7 @@ static HB_BOOL hb_cdpMulti_put( PHB_CODEPAGE cdp,
               return HB_TRUE;
            }
         }
-         pDst[ ( *pnIndex )++ ] = wc >= 0x100 ? '?' : ( HB_UCHAR ) wc;
+         pDst[ ( *pnIndex )++ ] = wc >= 0x100 ? HB_CDP_ERROR_ASCCHAR : ( HB_UCHAR ) wc;
      }
      return HB_TRUE;
   }
@@ -1156,14 +1143,78 @@ int hb_cdpicmp( const char * szFirst, HB_SIZE nLenFirst,
 /*
 * UTF-8 conversions
 */
-int hb_cdpUTF8CharSize( HB_WCHAR wc )
+int hb_cdpUTF8CharSize( HB_WCHAR32 wc )
 {
+   if ( ( HB_I32 ) wc < 0 )
+      wc = HB_CDP_ERROR_UNICHAR;
+
   if( wc < 0x0080 )
      return 1;
   else if( wc < 0x0800 )
      return 2;
-   else                         /* if( wc <= 0xffff ) */
+   else if( wc < 0xFFFF )
      return 3;
+   else if( wc < 0x1FFFFF )
+      return 4;
+   else if( wc < 0x3FFFFFF )
+      return 5;
+   else                         /* if( wc <= 0x7FFFFFFF ) */
+      return 6;
+}
+
+int hb_cdpU32CharToUTF8( char * szUTF8, HB_WCHAR32 wc )
+{
+   int n;
+
+   if( ( HB_I32 ) wc < 0 )
+      wc = HB_CDP_ERROR_UNICHAR;
+
+   if( wc < 0x0080 )
+   {
+      szUTF8[ 0 ] = wc & 0xFF;
+      n = 1;
+   }
+   else if( wc < 0x0800 )
+   {
+      szUTF8[ 0 ] = 0xc0 | ( ( wc >> 6 ) & 0x1F );
+      szUTF8[ 1 ] = 0x80 | ( wc & 0x3F );
+      n = 2;
+   }
+   else if( wc < 0xFFFF )
+   {
+      szUTF8[ 0 ] = 0xE0 | ( ( wc >> 12 ) & 0x0F );
+      szUTF8[ 1 ] = 0x80 | ( ( wc >> 6 ) & 0x3F );
+      szUTF8[ 2 ] = 0x80 | ( wc & 0x3F );
+      n = 3;
+   }
+   else if( wc < 0x1FFFFF )
+   {
+      szUTF8[ 0 ] = 0xF0 | ( ( wc >> 18 ) & 0x07 );
+      szUTF8[ 1 ] = 0x80 | ( ( wc >> 12 ) & 0x3F );
+      szUTF8[ 2 ] = 0x80 | ( ( wc >> 6 ) & 0x3F );
+      szUTF8[ 3 ] = 0x80 | ( wc & 0x3F );
+      n = 4;
+   }
+   else if( wc < 0x3FFFFFF )
+   {
+      szUTF8[ 0 ] = 0xF8 | ( ( wc >> 24 ) & 0x03 );
+      szUTF8[ 1 ] = 0x80 | ( ( wc >> 18 ) & 0x3F );
+      szUTF8[ 2 ] = 0x80 | ( ( wc >> 12 ) & 0x3F );
+      szUTF8[ 3 ] = 0x80 | ( ( wc >> 6 ) & 0x3F );
+      szUTF8[ 4 ] = 0x80 | ( wc & 0x3F );
+      n = 5;
+   }
+   else                         /* if( wc <= 0x7FFFFFFF ) */
+   {
+      szUTF8[ 0 ] = 0xFC | ( ( wc >> 30 ) & 0x01 );
+      szUTF8[ 1 ] = 0x80 | ( ( wc >> 24 ) & 0x3F );
+      szUTF8[ 2 ] = 0x80 | ( ( wc >> 18 ) & 0x3F );
+      szUTF8[ 3 ] = 0x80 | ( ( wc >> 12 ) & 0x3F );
+      szUTF8[ 4 ] = 0x80 | ( ( wc >> 6 ) & 0x3F );
+      szUTF8[ 5 ] = 0x80 | ( wc & 0x3F );
+      n = 6;
+   }
+   return n;
 }

 int hb_cdpU16CharToUTF8( char * szUTF8, HB_WCHAR wc )
@@ -1240,27 +1291,153 @@ HB_BOOL hb_cdpUTF8ToU16NextChar( HB_UCHAR ucChar, int * n, HB_WCHAR * pwc )
         *pwc &= 0x01;
         *n = 5;
      }
+      else
+      {
+         *n = 0;
+         return HB_FALSE;
+      }
+   }
+   return HB_TRUE;
+}
+
+HB_BOOL hb_cdpUTF8GetU32( const char * pSrc, HB_SIZE nLen,
+                          HB_SIZE * pnIndex, HB_WCHAR32 * pWC )
+{
+   HB_SIZE nIndex = *pnIndex;
+   HB_WCHAR32 wc = 0;
+   int n = -1;
+
+   if( nIndex < nLen )
+   {
+      HB_WCHAR32 wcMin = 0;   /* forbid overlong encodings */
+      HB_UCHAR uc = ( HB_UCHAR ) pSrc[ nIndex++ ];
+
+      if( uc < 0x80 )
+      {
+         wc = uc;
+         n = 0;
+      }
+      else if( uc >= 0xc0 )
+      {
+         if( uc < 0xe0 )
+         {
+            wc = uc & 0x1f;
+            n = 1;
+            wcMin = 0x80;
+         }
+         else if( uc < 0xf0 )
+         {
+            wc = uc & 0x0f;
+            n = 2;
+            wcMin = 0x800;
+         }
+         else if( uc < 0xf8 )
+         {
+            wc = uc & 0x07;
+            n = 3;
+            wcMin = 0x10000;
+         }
+         else if( uc < 0xfc )
+         {
+            wc = uc & 0x03;
+            n = 4;
+            wcMin = 0x200000;
+         }
+         else if( uc < 0xfe )
+         {
+            wc = uc & 0x01;
+            n = 5;
+            wcMin = 0x4000000;
+         }
+         while( n > 0 && nIndex < nLen )
+         {
+            uc = ( HB_UCHAR ) pSrc[ nIndex ];
+            if( ( uc & 0xc0 ) != 0x80 )
+               break;
+            wc = ( wc << 6 ) | ( uc & 0x3f );
+            ++nIndex;
+            --n;
+         }
+      }
+
+      if( n != 0 || wc < wcMin )
+      {
+         wc = HB_CDP_ERROR_UNICHAR;
+         while( n-- > 0 && nIndex < nLen )
+         {
+            uc = ( HB_UCHAR ) pSrc[ nIndex ];
+            if( uc < 0x80 || ( uc >= 0xc2 && uc <= 0xf4 ) )
+               break;
+            ++nIndex;
+         }
+         n = -1;
+      }
+   }
+
+   *pnIndex = nIndex;
+   *pWC = wc;
+
+   return n == 0;
+}
+
+HB_BOOL hb_cdpUTF8GetUCS( const char * pSrc, HB_SIZE nLen,
+                          HB_SIZE * pnIndex, HB_WCHAR32 * pWC )
+{
+   HB_BOOL fResult;
+
+   fResult = hb_cdpUTF8GetU32( pSrc, nLen, pnIndex, pWC );
+   if( fResult && ( *pWC > HB_CDP_UNICODE_MAX ||
+         ( *pWC >= HB_CDP_SURROGATE_FIRST && *pWC <= HB_CDP_SURROGATE_LAST ) ) )
+   {
+      *pWC = HB_CDP_ERROR_UNICHAR;
+      fResult = HB_FALSE;
+   }
+   return fResult;
+}
+
+HB_BOOL hb_cdpUTF8GetU16( const char * pSrc, HB_SIZE nLen,
+                          HB_SIZE * pnIndex, HB_WCHAR16 * pWC )
+{
+   HB_WCHAR32 wc;
+   HB_BOOL fResult;
+
+   fResult = hb_cdpUTF8GetU32( pSrc, nLen, pnIndex, &wc );
+
+   if( fResult && wc > 0xFFFF )
+   {
+      wc = HB_CDP_ERROR_UNICHAR;
+      fResult = HB_FALSE;
+   }
+   *pWC = ( HB_WCHAR16 ) wc;
+
+   return fResult;
+}
+
+HB_BOOL hb_cdpUTF8Validate( const char * pSrc, HB_SIZE nLen )
+{
+   HB_SIZE nIndex = 0;
+
+   while( nIndex < nLen )
+   {
+      HB_WCHAR32 wc;
+      if( ! hb_cdpUTF8GetUCS( pSrc, nLen, &nIndex, &wc ) )
+         return HB_FALSE;
   }
   return HB_TRUE;
 }

 HB_SIZE hb_cdpUTF8StringLength( const char * pSrc, HB_SIZE nLen )
 {
-   HB_SIZE nPos, nDst;
-   HB_WCHAR wc;
-   int n = 0;
+   HB_SIZE nIndex = 0, nChars = 0;
+   HB_WCHAR32 wc;

-   for( nPos = nDst = 0; nPos < nLen; )
+   while( nIndex < nLen )
   {
-      if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPos ], &n, &wc ) )
-         ++nPos;
-      if( n == 0 )
-         ++nDst;
+       hb_cdpUTF8GetU32( pSrc, nLen, &nIndex, &wc );
+       ++nChars;
   }
-   if( n > 0 )
-      ++nDst;

-   return nDst;
+   return nChars;
 }

 HB_SIZE hb_cdpUTF8StringAt( const char * szNeedle, HB_SIZE nLenN,
@@ -1274,31 +1451,16 @@ HB_SIZE hb_cdpUTF8StringAt( const char * szNeedle, HB_SIZE nLenN,
   HB_SIZE nRAt = 0;
   HB_SIZE nAt = 0;

-   HB_WCHAR wcN = 0;
-   HB_WCHAR wcH = 0;
-   int nN = 0;
-   int nH = 0;
+   HB_WCHAR32 wcN = 0;
+   HB_WCHAR32 wcH = 0;

   while( nPosH < nLenH && nPosN < nLenN && nPos < nEnd )
   {
-      do
-      {
-         if( ! hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szHaystack[ nPosH ], &nH, &wcH ) )
-            break;
-         ++nPosH;
-      }
-      while( nH && nPosH < nLenH );
-
+      hb_cdpUTF8GetU32( szHaystack, nLenH, &nPosH, &wcH );
      if( ++nPos < nStart )
         continue;

-      do
-      {
-         if( ! hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szNeedle[ nPosN ], &nN, &wcN ) )
-            break;
-         ++nPosN;
-      }
-      while( nN && nPosN < nLenN );
+      hb_cdpUTF8GetU32( szNeedle, nLenN, &nPosN, &wcN );

      if( wcH == wcN )
      {
@@ -1339,36 +1501,17 @@ HB_SIZE hb_cdpUTF8StringAt( const char * szNeedle, HB_SIZE nLenN,
   return nRAt;
 }

-HB_WCHAR hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos )
+HB_WCHAR32 hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos )
 {
-   if( nLen )
+   HB_SIZE nIndex = 0;
+
+   while( nPos && nIndex < nLen )
   {
-      HB_SIZE nPos2;
-      HB_WCHAR wc = 0;
-      int n = 0;
-
-      for( nPos2 = 0; nPos2 < nLen && nPos; )
-      {
-         if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPos2 ], &n, &wc ) )
-            ++nPos2;
-         if( n == 0 )
-            --nPos;
-      }
-
-      if( nPos2 < nLen )
-      {
-         n = 0;
-         do
-         {
-            if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPos2 ], &n, &wc ) )
-               ++nPos2;
-            if( n == 0 )
-               return wc;
-         }
-         while( nPos2 < nLen );
-      }
+      HB_WCHAR wc;
+      hb_cdpUTF8GetU16( pSrc, nLen, &nIndex, &wc );
+      if( --nPos == 0 )
+         return wc;
   }
-
   return 0;
 }

@@ -1377,36 +1520,29 @@ char * hb_cdpUTF8StringSubstr( const char * pSrc, HB_SIZE nLen,
                               HB_SIZE nFrom, HB_SIZE nCount, HB_SIZE * pulDest )
 {
   HB_SIZE nDst = 0;
-   HB_WCHAR wc;
-   int n;
   char * pDst = NULL;

   if( nCount && nLen )
   {
-      HB_SIZE nPos;
-      n = 0;
-      for( nPos = 0; nPos < nLen && nFrom; )
+      HB_WCHAR32 wc;
+      HB_SIZE nPos = 0;
+
+      while( nPos < nLen && nFrom )
      {
-         if( hb_cdpUTF8ToU16NextChar( pSrc[ nPos ], &n, &wc ) )
-            ++nPos;
-         if( n == 0 )
-            --nFrom;
+         hb_cdpUTF8GetU32( pSrc, nLen, &nPos, &wc );
+         --nFrom;
      }

      if( nPos < nLen )
      {
-         HB_SIZE nCnt;
+         HB_SIZE nCnt = nCount;
+
         nFrom = nPos;
-         nCnt = nCount;
-         n = 0;
         do
         {
-            if( hb_cdpUTF8ToU16NextChar( pSrc[ nPos ], &n, &wc ) )
-               ++nPos;
-            if( n == 0 )
-               --nCnt;
+            hb_cdpUTF8GetU32( pSrc, nLen, &nPos, &wc );
         }
-         while( nPos < nLen && nCnt );
+         while( nPos < nLen && --nCnt );

         nDst = nPos - nFrom;
         pDst = ( char * ) hb_xgrab( nDst + 1 );
@@ -1620,9 +1756,8 @@ HB_SIZE hb_cdpStrToUTF8Disp( PHB_CODEPAGE cdp,
 HB_SIZE hb_cdpUTF8AsStrLen( PHB_CODEPAGE cdp, const char * pSrc, HB_SIZE nSrc,
                            HB_SIZE nMax )
 {
-   HB_WCHAR wc = 0;
+   HB_WCHAR wc;
   HB_SIZE nPosS, nPosD;
-   int n = 0, i;

   if( HB_CDP_ISUTF8( cdp ) )
      return ( nMax && nSrc > nMax ) ? nMax : nSrc;
@@ -1630,31 +1765,22 @@ HB_SIZE hb_cdpUTF8AsStrLen( PHB_CODEPAGE cdp, const char * pSrc, HB_SIZE nSrc,
   {
      for( nPosS = nPosD = 0; nPosS < nSrc; )
      {
-         if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPosS ], &n, &wc ) )
-            ++nPosS;
-
-         if( n == 0 )
-         {
-            i = HB_CDPCHAR_LEN( cdp, wc );
-            if( nMax && nPosD + i > nMax )
-               break;
-            nPosD += i;
-         }
+         int i;
+         hb_cdpUTF8GetU16( pSrc, nSrc, &nPosS, &wc );
+         i = HB_CDPCHAR_LEN( cdp, wc );
+         if( nMax && nPosD + i > nMax )
+            break;
+         nPosD += i;
      }
   }
   else
   {
      for( nPosS = nPosD = 0; nPosS < nSrc; )
      {
-         if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPosS ], &n, &wc ) )
-            ++nPosS;
-
-         if( n == 0 )
-         {
-            ++nPosD;
-            if( nMax && nPosD >= nMax )
-               break;
-         }
+         hb_cdpUTF8GetU16( pSrc, nSrc, &nPosS, &wc );
+         ++nPosD;
+         if( nMax && nPosD >= nMax )
+            break;
      }
   }

@@ -1665,10 +1791,8 @@ HB_SIZE hb_cdpUTF8ToStr( PHB_CODEPAGE cdp,
                         const char * pSrc, HB_SIZE nSrc,
                         char * pDst, HB_SIZE nDst )
 {
-   HB_UCHAR * uniTrans;
-   HB_WCHAR wcMax, wc = 0;
+   HB_WCHAR wcMax, wc;
   HB_SIZE nPosS, nPosD;
-   int n = 0;

   if( HB_CDP_ISUTF8( cdp ) )
   {
@@ -1683,18 +1807,15 @@ HB_SIZE hb_cdpUTF8ToStr( PHB_CODEPAGE cdp,
   {
      for( nPosS = nPosD = 0; nPosS < nSrc && nPosD < nDst; )
      {
-         if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPosS ], &n, &wc ) )
-            ++nPosS;
-
-         if( n == 0 )
-         {
-            if( ! HB_CDPCHAR_PUT( cdp, pDst, nDst, &nPosD, wc ) )
-               break;
-         }
+         hb_cdpUTF8GetU16( pSrc, nSrc, &nPosS, &wc );
+         if( ! HB_CDPCHAR_PUT( cdp, pDst, nDst, &nPosD, wc ) )
+            break;
      }
   }
   else
   {
+      HB_UCHAR * uniTrans;
+
      if( cdp->uniTable->uniTrans == NULL )
         hb_cdpBuildTransTable( cdp->uniTable );
      uniTrans = cdp->uniTable->uniTrans;
@@ -1702,16 +1823,11 @@ HB_SIZE hb_cdpUTF8ToStr( PHB_CODEPAGE cdp,

      for( nPosS = nPosD = 0; nPosS < nSrc && nPosD < nDst; )
      {
-         if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPosS ], &n, &wc ) )
-            ++nPosS;
-
-         if( n == 0 )
-         {
-            if( wc <= wcMax && uniTrans[ wc ] )
-               pDst[ nPosD++ ] = uniTrans[ wc ];
-            else
-               pDst[ nPosD++ ] = wc >= 0x100 ? '?' : ( HB_UCHAR ) wc;
-         }
+         hb_cdpUTF8GetU16( pSrc, nSrc, &nPosS, &wc );
+         if( wc <= wcMax && uniTrans[ wc ] )
+            pDst[ nPosD++ ] = uniTrans[ wc ];
+         else
+            pDst[ nPosD++ ] = wc >= 0x100 ? HB_CDP_ERROR_ASCCHAR : ( HB_UCHAR ) wc;
      }
   }

@@ -1795,12 +1911,12 @@ HB_UCHAR hb_cdpGetChar( PHB_CODEPAGE cdp, HB_WCHAR wc )
            char c;

            if( ! HB_CDPCHAR_PUT( cdp, &c, 1, &n, wc ) )
-               wc = '?';
+               wc = HB_CDP_ERROR_ASCCHAR;
            else
               wc = ( HB_UCHAR ) c;
         }
         else
-            wc = '?';
+            wc = HB_CDP_ERROR_ASCCHAR;
      }
      else
      {
@@ -1815,7 +1931,7 @@ HB_UCHAR hb_cdpGetChar( PHB_CODEPAGE cdp, HB_WCHAR wc )
         }
      }
   }
-   return wc >= 0x100 ? '?' : ( HB_UCHAR ) wc;
+   return wc >= 0x100 ? HB_CDP_ERROR_ASCCHAR : ( HB_UCHAR ) wc;
 }

 HB_UCHAR hb_cdpGetUC( PHB_CODEPAGE cdp, HB_WCHAR wc, HB_UCHAR ucDef )
@@ -1903,30 +2019,24 @@ HB_SIZE hb_cdpStrToU16( PHB_CODEPAGE cdp, int iEndian,

   if( HB_CDP_ISUTF8( cdp ) )
   {
-      HB_WCHAR wc = 0;
-      int n = 0;
+      HB_WCHAR wc;

      for( nPosS = nPosD = 0; nPosS < nSrc && nPosD < nDst; )
      {
-         if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPosS ], &n, &wc ) )
-            ++nPosS;
-
-         if( n == 0 )
-         {
+         hb_cdpUTF8GetU16( pSrc, nSrc, &nPosS, &wc );
 #if defined( HB_CDP_ENDIAN_SWAP )
-            if( iEndian == HB_CDP_ENDIAN_SWAP )
-               wc = HB_SWAP_UINT16( wc );
-            pDst[ nPosD++ ] = wc;
+         if( iEndian == HB_CDP_ENDIAN_SWAP )
+            wc = HB_SWAP_UINT16( wc );
+         pDst[ nPosD++ ] = wc;
 #else
-            if( iEndian == HB_CDP_ENDIAN_LITTLE )
-               HB_PUT_LE_UINT16( &pDst[ nPosD ], wc );
-            else if( iEndian == HB_CDP_ENDIAN_BIG )
-               HB_PUT_BE_UINT16( &pDst[ nPosD ], wc );
-            else
-               pDst[ nPosD ] = wc;
-            ++nPosD;
+         if( iEndian == HB_CDP_ENDIAN_LITTLE )
+            HB_PUT_LE_UINT16( &pDst[ nPosD ], wc );
+         else if( iEndian == HB_CDP_ENDIAN_BIG )
+            HB_PUT_BE_UINT16( &pDst[ nPosD ], wc );
+         else
+            pDst[ nPosD ] = wc;
+         ++nPosD;
 #endif
-         }
      }
   }
   else if( HB_CDP_ISCUSTOM( cdp ) )
@@ -2117,7 +2227,7 @@ HB_SIZE hb_cdpU16ToStr( PHB_CODEPAGE cdp, int iEndian,
         if( wc <= wcMax && uniTrans[ wc ] )
            pDst[ nPosD++ ] = uniTrans[ wc ];
         else
-            pDst[ nPosD++ ] = wc >= 0x100 ? '?' : ( HB_UCHAR ) wc;
+            pDst[ nPosD++ ] = wc >= 0x100 ? HB_CDP_ERROR_ASCCHAR : ( HB_UCHAR ) wc;
      }
   }

@@ -2240,7 +2350,7 @@ int hb_cdpTranslateChar( int iChar, PHB_CODEPAGE cdpIn, PHB_CODEPAGE cdpOut )
         {
            if( HB_CDPCHAR_PUT( cdpOut, &c, 1, &n, wc ) )
            {
-               if( c != '?' )
+               if( c != HB_CDP_ERROR_ASCCHAR )
                  iChar = ( HB_UCHAR ) c;
            }
         }
@@ -2288,7 +2398,7 @@ int hb_cdpTranslateDispChar( int iChar, PHB_CODEPAGE cdpIn, PHB_CODEPAGE cdpOut
            wc = s_uniCtrls[ iChar ];
         if( HB_CDPCHAR_PUT( cdpOut, &c, 1, &n, wc ) )
         {
-            if( c != '?' )
+            if( c != HB_CDP_ERROR_ASCCHAR )
               iChar = ( HB_UCHAR ) c;
         }
      }
@@ -2751,19 +2861,19 @@ static HB_UCHAR hb_cdpUtf8Char( const char ** pStrPtr, PHB_UNITABLE uniTable )
 {
   const char * pszString = *pStrPtr;
   HB_UCHAR uc = 0;
-   HB_WCHAR wc = 0;
-   int n = 0;

-   while( *pszString )
+   if( *pszString )
   {
-      if( ! hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) *pszString++, &n, &wc ) )
-         break;
-      if( n == 0 )
+      HB_SIZE nIndex = 0;
+      HB_WCHAR wc;
+
+      if( hb_cdpUTF8GetU16( pszString, hb_strnlen( pszString, 6 ), &nIndex, &wc ) )
      {
         if( wc < 127 )
            uc = ( HB_UCHAR ) wc;
         else
         {
+            int n;
            for( n = 0; n < 256; ++n )
            {
               if( wc == uniTable->uniCodes[ n ] )
@@ -2773,8 +2883,8 @@ static HB_UCHAR hb_cdpUtf8Char( const char ** pStrPtr, PHB_UNITABLE uniTable )
               }
            }
         }
-         break;
      }
+      pszString += nIndex;
   }
   if( uc == 0 )
   {
--- a/src/rtl/cdpapihb.c
+++ b/src/rtl/cdpapihb.c
@@ -55,20 +55,14 @@ static HB_SIZE utf8pos( const char * szUTF8, HB_SIZE nLen, HB_SIZE nUTF8Pos )
   if( nUTF8Pos > 0 && nUTF8Pos <= nLen )
   {
      HB_SIZE n1, n2;
-      HB_WCHAR uc;
-      int n = 0;
+      HB_WCHAR32 wc;

      for( n1 = n2 = 0; n1 < nLen; )
      {
-         if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szUTF8[ n1 ], &n, &uc ) )
-            ++n1;
-
-         if( n == 0 )
-         {
-            if( --nUTF8Pos == 0 )
-               return n2 + 1;
-            n2 = n1;
-         }
+         hb_cdpUTF8GetU32( szUTF8, nLen, &n1, &wc );
+         if( --nUTF8Pos == 0 )
+            return n2 + 1;
+         n2 = n1;
      }
   }
   return 0;
@@ -201,7 +195,7 @@ HB_FUNC( HB_UTF8CHR )
      char utf8Char[ HB_MAX_CHAR_LEN ];
      int iLen;

-      iLen = hb_cdpU16CharToUTF8( utf8Char, ( HB_WCHAR ) hb_parni( 1 ) );
+      iLen = hb_cdpU32CharToUTF8( utf8Char, ( HB_WCHAR32 ) hb_parni( 1 ) );
      hb_retclen( utf8Char, iLen );
   }
   else
@@ -214,19 +208,10 @@ HB_FUNC( HB_UTF8ASC )

   if( pszString )
   {
-      HB_SIZE nLen = hb_parclen( 1 );
-      HB_WCHAR wc = 0;
-      int n = 0;
+      HB_SIZE nLen = hb_parclen( 1 ), nIndex = 0;
+      HB_WCHAR32 wc = 0;

-      while( nLen )
-      {
-         if( ! hb_cdpUTF8ToU16NextChar( ( unsigned char ) *pszString, &n, &wc ) )
-            break;
-         if( n == 0 )
-            break;
-         pszString++;
-         nLen--;
-      }
+      hb_cdpUTF8GetU32( pszString, nLen, &nIndex, &wc );
      hb_retnint( wc );
   }
   else
@@ -467,35 +452,35 @@ HB_FUNC( HB_UTF8POKE )
      nPos = utf8pos( szString, nLen, hb_parns( 2 ) );
      if( nPos )
      {
-         HB_WCHAR uc, uc2;
-         int n, n2;
+         HB_WCHAR32 uc, uc2;
+         HB_SIZE nDstLen = 0;
+         int n;

         --nPos;
-         uc = ( HB_WCHAR ) hb_parni( 3 );
+         uc = ( HB_WCHAR32 ) hb_parni( 3 );
         n = hb_cdpUTF8CharSize( uc );
-         n2 = 0;
-         hb_cdpUTF8ToU16NextChar( szString[ nPos ], &n2, &uc2 );
-         ++n2;
-         if( n == n2 )
+
+         hb_cdpUTF8GetU32( &szString[ nPos ], nLen - nPos, &nDstLen, &uc2 );
+         if( n == ( int ) nDstLen )
         {
            char * szText;
            if( hb_itemGetWriteCL( pText, &szText, &nLen ) &&
                nPos + n <= nLen )
            {
-               hb_cdpU16CharToUTF8( &szText[ nPos ], uc );
+               hb_cdpU32CharToUTF8( &szText[ nPos ], uc );
            }
            hb_itemReturn( pText );
         }
         else
         {
-            char * szResult = ( char * ) hb_xgrab( nLen - n2 + n + 1 );
+            char * szResult = ( char * ) hb_xgrab( nLen - nDstLen + n + 1 );

            memcpy( szResult, szString, nPos );
-            hb_cdpU16CharToUTF8( &szResult[ nPos ], uc );
-            memcpy( szResult + nPos + n, szString + nPos + n2, nLen - nPos - n2 );
+            hb_cdpU32CharToUTF8( &szResult[ nPos ], uc );
+            memcpy( szResult + nPos + n, szString + nPos + nDstLen, nLen - nPos - nDstLen );
            if( HB_ISBYREF( 1 ) )
-               hb_storclen( szResult, nLen - n2 + n, 1 );
-            hb_retclen_buffer( szResult, nLen - n2 + n );
+               hb_storclen( szResult, nLen - nDstLen + n, 1 );
+            hb_retclen_buffer( szResult, nLen - nDstLen + n );
         }
      }
      else