From 75ff90a49d84d73a527b9fba10a56b950177eeac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Przemys=C5=82aw=20Czerpak?= <druzus@poczta.onet.pl>
Date: Tue, 9 Sep 2025 13:50:42 +0200
Subject: [PATCH] 2025-09-09 13:50 UTC+0200 Przemyslaw Czerpak
 (druzus/at/poczta.onet.pl)   * include/hbdefs.h     + added new types
 HB_WCHAR16 and HB_WCHAR32, existing type HB_WCHAR       is mapped to
 HB_WCHAR16 (just like before)

  * include/hbapicdp.h
  * src/harbour.def
  * src/rtl/cdpapi.c
    + added new C functions for encoding and decoding UTF-8 strings
      using HB_WCHAR32:
         int hb_cdpU32CharToUTF8( char * szUTF8, HB_WCHAR32 wc );
         HB_BOOL hb_cdpUTF8GetU32( const char * pSrc, HB_SIZE nLen,
                                   HB_SIZE * pnIndex, HB_WCHAR32 * pWC );
         HB_BOOL hb_cdpUTF8GetUCS( const char * pSrc, HB_SIZE nLen,
                                   HB_SIZE * pnIndex, HB_WCHAR32 * pWC );
         HB_BOOL hb_cdpUTF8GetU16( const char * pSrc, HB_SIZE nLen,
                                   HB_SIZE * pnIndex, HB_WCHAR16 * pWC );
         HB_BOOL hb_cdpUTF8Validate( const char * pSrc, HB_SIZE nLen );
      They support full UCS and are much more restrictive against errors and
      wrong UTF-8 encoding, i.e. now overlong encoding is forbidden.
      The wrong characters are translated to 0xFFFD and later if such
      character does not exist in final CP to '?' ASCII character.
    * declarations of the following UTF-8 C functions have been changed to
      operate on HB_WCHAR32 instead of HB_WCHAR:
         int hb_cdpUTF8CharSize( HB_WCHAR32 wc );
         HB_WCHAR32 hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen,
                                          HB_SIZE nPos );
    * the following C functions have been changed to internally operate on
      HB_WCHAR32 instead of HB_WCHAR:
         hb_cdpUTF8StringLength()
         hb_cdpUTF8StringAt()
         hb_cdpUTF8StringSubstr()
    * the following C functions have been changed to use new hb_cdpUTF8GetU*()
      instead of step by step decoding with hb_cdpUTF8ToU16NextChar()
         hb_cdpStrToUTF8Disp()
         hb_cdpUTF8AsStrLen()
         hb_cdpUTF8ToStr()
         hb_cdpStrToU16()
         hb_cdpUtf8Char()
    * use HB_CDP_ERROR_* macros to mark wrong encoding

  * src/rtl/cdpapihb.c
    * the following UTF-8 C functions have been changed to operate on
      HB_WCHAR32 instead of HB_WCHAR:
         hb_utf8Chr()
         hb_utf8Asc()
         hb_utf8Poke()
         hb_utf8Peek()
      Other UTF-8 PRG functions have been adapted to HB_WCHAR32 by changes
      in corresponding C functions.

  * src/codepage/cp_utf8.c
    * use new function hb_cdpUTF8GetU16() to decode UTF-8 strings in UTF8EX CP

  * src/rtl/arc4.c
    + added new macro HB_NO_SYSCTL which allows disabling sysctl() in Linux
      builds for GLIBC < 2.30
---
 ChangeLog.txt          |  58 ++++++
 include/hbapicdp.h     |  23 ++-
 include/hbdefs.h       |  10 +-
 src/codepage/cp_utf8.c |  19 +-
 src/harbour.def        |   5 +
 src/rtl/arc4.c         |   7 +-
 src/rtl/cdpapi.c       | 444 +++++++++++++++++++++++++----------------
 src/rtl/cdpapihb.c     |  59 ++----
 8 files changed, 398 insertions(+), 227 deletions(-)

diff --git a/ChangeLog.txt b/ChangeLog.txt
index 8530fa7da7..030b1a6d19 100644
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -7,6 +7,64 @@
    Entries may not always be in chronological/commit order.
    See license at the end of file. */
 
+2025-09-09 13:50 UTC+0200 Przemyslaw Czerpak (druzus/at/poczta.onet.pl)
+  * include/hbdefs.h
+    + added new types HB_WCHAR16 and HB_WCHAR32, existing type HB_WCHAR
+      is mapped to HB_WCHAR16 (just like before)
+
+  * include/hbapicdp.h
+  * src/harbour.def
+  * src/rtl/cdpapi.c
+    + added new C functions for encoding and decoding UTF-8 strings
+      using HB_WCHAR32:
+         int hb_cdpU32CharToUTF8( char * szUTF8, HB_WCHAR32 wc );
+         HB_BOOL hb_cdpUTF8GetU32( const char * pSrc, HB_SIZE nLen,
+                                   HB_SIZE * pnIndex, HB_WCHAR32 * pWC );
+         HB_BOOL hb_cdpUTF8GetUCS( const char * pSrc, HB_SIZE nLen,
+                                   HB_SIZE * pnIndex, HB_WCHAR32 * pWC );
+         HB_BOOL hb_cdpUTF8GetU16( const char * pSrc, HB_SIZE nLen,
+                                   HB_SIZE * pnIndex, HB_WCHAR16 * pWC );
+         HB_BOOL hb_cdpUTF8Validate( const char * pSrc, HB_SIZE nLen );
+      They support full UCS and are much more restrictive against errors and
+      wrong UTF-8 encoding, i.e. now overlong encoding is forbidden.
+      The wrong characters are translated to 0xFFFD and later if such
+      character does not exist in final CP to '?' ASCII character.
+    * declarations of the following UTF-8 C functions have been changed to
+      operate on HB_WCHAR32 instead of HB_WCHAR:
+         int hb_cdpUTF8CharSize( HB_WCHAR32 wc );
+         HB_WCHAR32 hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen,
+                                          HB_SIZE nPos );
+    * the following C functions have been changed to internally operate on
+      HB_WCHAR32 instead of HB_WCHAR:
+         hb_cdpUTF8StringLength()
+         hb_cdpUTF8StringAt()
+         hb_cdpUTF8StringSubstr()
+    * the following C functions have been changed to use new hb_cdpUTF8GetU*()
+      instead of step by step decoding with hb_cdpUTF8ToU16NextChar()
+         hb_cdpStrToUTF8Disp()
+         hb_cdpUTF8AsStrLen()
+         hb_cdpUTF8ToStr()
+         hb_cdpStrToU16()
+         hb_cdpUtf8Char()
+    * use HB_CDP_ERROR_* macros to mark wrong encoding
+
+  * src/rtl/cdpapihb.c
+    * the following UTF-8 C functions have been changed to operate on
+      HB_WCHAR32 instead of HB_WCHAR:
+         hb_utf8Chr()
+         hb_utf8Asc()
+         hb_utf8Poke()
+         hb_utf8Peek()
+      Other UTF-8 PRG functions have been adapted to HB_WCHAR32 by changes
+      in corresponding C functions.
+
+  * src/codepage/cp_utf8.c
+    * use new function hb_cdpUTF8GetU16() to decode UTF-8 strings in UTF8EX CP
+
+  * src/rtl/arc4.c
+    + added new macro HB_NO_SYSCTL which allows disabling sysctl() in Linux
+      builds for GLIBC < 2.30
+
 2025-09-03 12:21 UTC+0200 Przemyslaw Czerpak (druzus/at/poczta.onet.pl)
   * src/rtl/cdpapi.c
     + added fallback translation table for different variants of Latin
diff --git a/include/hbapicdp.h b/include/hbapicdp.h
index 273d4f9853..12a65204c6 100644
--- a/include/hbapicdp.h
+++ b/include/hbapicdp.h
@@ -401,6 +401,19 @@ extern HB_EXPORT void         hb_vmSetCDP( PHB_CODEPAGE pCDP );
  */
 #define HB_MAX_CHAR_LEN             8
 
+/* UCS maximal character value */
+#define HB_CDP_UNICODE_MAX          0x10FFFF
+
+/* UTF-16 surrogates for mapping U+010000 to U+10FFFF characters */
+#define HB_CDP_SURROGATE_FIRST      0xD800      /* first code of the surrogate range */
+#define HB_CDP_SURROGATE_LAST       0xDFFF      /* last code of the surrogate range (inclusive) */
+#define HB_CDP_SURROGATE_HIGH       0xD800      /* first high (leading) surrogate */
+#define HB_CDP_SURROGATE_LOW        0xDC00      /* first low (trailing) surrogate */
+
+/* character codes to replace sequences with wrong encoding or translation */
+#define HB_CDP_ERROR_UNICHAR        0xFFFD      /* U+FFFD REPLACEMENT CHARACTER */
+#define HB_CDP_ERROR_ASCCHAR        0x3F        /* ASCII '?' */
+
 /* codepage uses simple binary sorting */
 #define HB_CDP_ISBINSORT( cdp )     ( ( ( cdp )->type & HB_CDP_TYPE_BINSORT ) != 0 )
 /* codepage uses custom string decoding */
@@ -473,7 +486,7 @@ extern HB_EXPORT HB_BOOL      hb_cdpGetFromUTF8( PHB_CODEPAGE cdp, HB_UCHAR ch,
 
 extern HB_EXPORT HB_SIZE      hb_cdpUTF8StringLength( const char * pSrc, HB_SIZE nLen );
 extern HB_EXPORT HB_SIZE      hb_cdpUTF8StringAt( const char * szNeedle, HB_SIZE nLenN, const char * szHaystack, HB_SIZE nLenH, HB_SIZE nStart, HB_SIZE nEnd, HB_BOOL fReverse );
-extern HB_EXPORT HB_WCHAR     hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos );
+extern HB_EXPORT HB_WCHAR32   hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos );
 extern HB_EXPORT char *       hb_cdpUTF8StringSubstr( const char * pSrc, HB_SIZE nLen, HB_SIZE nFrom, HB_SIZE nCount, HB_SIZE * pnDest );
 
 extern HB_EXPORT HB_SIZE      hb_cdpUTF8AsStrLen( PHB_CODEPAGE cdp, const char * pSrc, HB_SIZE nSrc, HB_SIZE nMax );
@@ -491,10 +504,14 @@ extern HB_EXPORT HB_WCHAR *   hb_cdpnStrDupU16( PHB_CODEPAGE cdp, int iEndian, c
 
 extern HB_EXPORT HB_WCHAR     hb_cdpGetU16Ctrl( HB_WCHAR wc );
 
-extern HB_EXPORT int          hb_cdpUTF8CharSize( HB_WCHAR wc );
+extern HB_EXPORT int          hb_cdpUTF8CharSize( HB_WCHAR32 wc );
+extern HB_EXPORT int          hb_cdpU32CharToUTF8( char * szUTF8, HB_WCHAR32 wc );
 extern HB_EXPORT int          hb_cdpU16CharToUTF8( char * szUTF8, HB_WCHAR wc );
 extern HB_EXPORT HB_BOOL      hb_cdpUTF8ToU16NextChar( HB_UCHAR ucChar, int * n, HB_WCHAR * pwc );
-
+extern HB_EXPORT HB_BOOL      hb_cdpUTF8GetU32( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR32 * pWC );
+extern HB_EXPORT HB_BOOL      hb_cdpUTF8GetUCS( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR32 * pWC );
+extern HB_EXPORT HB_BOOL      hb_cdpUTF8GetU16( const char * pSrc, HB_SIZE nLen, HB_SIZE * pnIndex, HB_WCHAR16 * pWC );
+extern HB_EXPORT HB_BOOL      hb_cdpUTF8Validate( const char * pSrc, HB_SIZE nLen );
 
 extern HB_EXPORT PHB_ITEM     hb_itemDeserializeCP( const char ** pBufferPtr, HB_SIZE * pnSize, PHB_CODEPAGE cdpIn, PHB_CODEPAGE cdpOut );
 extern HB_EXPORT char *       hb_itemSerializeCP( PHB_ITEM pItem, int iFlags, PHB_CODEPAGE cdpIn, PHB_CODEPAGE cdpOut, HB_SIZE * pnSize );
diff --git a/include/hbdefs.h b/include/hbdefs.h
index 8b9fafb516..41fa8a6916 100644
--- a/include/hbdefs.h
+++ b/include/hbdefs.h
@@ -639,10 +639,18 @@ typedef HB_U32 HB_FATTR;
 #  endif
 #endif
 
-#if defined( HB_OS_WIN )
+#if defined( HB_OS_WIN ) || defined( HB_OS_DOS ) || defined( HB_OS_OS2 )
    typedef wchar_t         HB_WCHAR;
+   typedef wchar_t         HB_WCHAR16;   /* NOTE(review): presumably wchar_t is 16-bit on these platforms - confirm */
+   typedef HB_I32          HB_WCHAR32;   /* NOTE(review): presumably a signed 32-bit integer - confirm */
+#elif defined( __WATCOMC__ )
+   typedef unsigned short  HB_WCHAR;
+   typedef unsigned short  HB_WCHAR16;
+   typedef HB_I32          HB_WCHAR32;
 #else
    typedef unsigned short  HB_WCHAR;
+   typedef unsigned short  HB_WCHAR16;
+   typedef wchar_t         HB_WCHAR32;   /* NOTE(review): assumes wchar_t is at least 32 bits wide here - confirm */
 #endif
 
 /* maximum length of double number in decimal representation:
diff --git a/src/codepage/cp_utf8.c b/src/codepage/cp_utf8.c
index b264dbd978..4990799270 100644
--- a/src/codepage/cp_utf8.c
+++ b/src/codepage/cp_utf8.c
@@ -57,27 +57,14 @@
 
 static HB_CDP_GET_FUNC( UTF8_get )
 {
-   HB_SIZE nIndex = *pnIndex;
-   int n = 0;
-
    HB_SYMBOL_UNUSED( cdp );
 
-   *wc = 0;
-   while( nIndex < nLen )
+   if( *pnIndex < nLen )   /* input left: exactly one character is produced */
    {
-      if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nIndex ], &n, wc ) )
-         ++nIndex;
-      if( n == 0 )
-      {
-         *pnIndex = nIndex;
-         return HB_TRUE;
-      }
-   }
-   if( n != 0 )
-   {
-      *pnIndex = nIndex;
+      hb_cdpUTF8GetU16( pSrc, nLen, pnIndex, wc );   /* result ignored: whatever it stored in *wc is reported as a character */
       return HB_TRUE;
    }
+   *wc = 0;   /* end of input */
    return HB_FALSE;
 }
 
diff --git a/src/harbour.def b/src/harbour.def
index 519f6501ff..8558ae0201 100644
--- a/src/harbour.def
+++ b/src/harbour.def
@@ -2157,14 +2157,19 @@ hb_cdpTranslateDispChar
 hb_cdpU16AsStrLen
 hb_cdpU16CharToUTF8
 hb_cdpU16ToStr
+hb_cdpU32CharToUTF8
 hb_cdpUTF8AsStrLen
 hb_cdpUTF8CharSize
+hb_cdpUTF8GetU16
+hb_cdpUTF8GetU32
+hb_cdpUTF8GetUCS
 hb_cdpUTF8StringAt
 hb_cdpUTF8StringLength
 hb_cdpUTF8StringPeek
 hb_cdpUTF8StringSubstr
 hb_cdpUTF8ToStr
 hb_cdpUTF8ToU16NextChar
+hb_cdpUTF8Validate
 hb_cdpUpperWC
 hb_cdpcmp
 hb_cdpicmp
diff --git a/src/rtl/arc4.c b/src/rtl/arc4.c
index 4c0d179986..cdfee3b744 100644
--- a/src/rtl/arc4.c
+++ b/src/rtl/arc4.c
@@ -57,9 +57,10 @@
     * sysctl() on Linux has fallen into depreciation. Not available in current
     * runtime C libraries, like musl and glibc >= 2.30.
     */
-#  if ( ! defined( HB_OS_LINUX ) || \
-      ( ( defined( __GLIBC__ ) && ! ( ( __GLIBC__ > 2 ) || ( ( __GLIBC__ == 2 ) && ( __GLIBC_MINOR__ >= 30 ) ) ) ) ) || \
-      defined( __UCLIBC__ ) )
+#  if ! defined( HB_NO_SYSCTL ) && \
+      ( ! defined( HB_OS_LINUX ) || \
+        ( ( defined( __GLIBC__ ) && ! ( ( __GLIBC__ > 2 ) || ( ( __GLIBC__ == 2 ) && ( __GLIBC_MINOR__ >= 30 ) ) ) ) ) || \
+        defined( __UCLIBC__ ) )
 #     define HAVE_SYS_SYSCTL_H
 #  endif
 #  define HAVE_DECL_CTL_KERN
diff --git a/src/rtl/cdpapi.c b/src/rtl/cdpapi.c
index 2ae45f959f..3aa5c396f2 100644
--- a/src/rtl/cdpapi.c
+++ b/src/rtl/cdpapi.c
@@ -355,7 +355,7 @@ static HB_BOOL hb_cdpStd_put( PHB_CODEPAGE cdp,
           cdp->uniTable->uniTrans[ wc ] )
          pDst[ ( *pnIndex )++ ] = cdp->uniTable->uniTrans[ wc ];
       else
-         pDst[ ( *pnIndex )++ ] = wc >= 0x100 ? '?' : ( HB_UCHAR ) wc;
+         pDst[ ( *pnIndex )++ ] = wc >= 0x100 ? HB_CDP_ERROR_ASCCHAR : ( HB_UCHAR ) wc;
 
       return HB_TRUE;
    }
@@ -519,27 +519,14 @@ static HB_BOOL hb_cdpUTF8_get( PHB_CODEPAGE cdp,
                                const char * pSrc, HB_SIZE nLen,
                                HB_SIZE * pnIndex, HB_WCHAR * wc )
 {
-   HB_SIZE nIndex = *pnIndex;
-   int n = 0;
-
    HB_SYMBOL_UNUSED( cdp );
 
-   *wc = 0;
-   while( nIndex < nLen )
+   if( *pnIndex < nLen )
    {
-      if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nIndex ], &n, wc ) )
-         ++nIndex;
-      if( n == 0 )
-      {
-         *pnIndex = nIndex;
-         return HB_TRUE;
-      }
-   }
-   if( n > 0 )
-   {
-      *pnIndex = nIndex;
+      hb_cdpUTF8GetU16( pSrc, nLen, pnIndex, wc );
       return HB_TRUE;
    }
+   *wc = 0;
    return HB_FALSE;
 }
 
@@ -650,7 +637,7 @@ static HB_BOOL hb_cdpMulti_put( PHB_CODEPAGE cdp,
                return HB_TRUE;
             }
          }
-         pDst[ ( *pnIndex )++ ] = wc >= 0x100 ? '?' : ( HB_UCHAR ) wc;
+         pDst[ ( *pnIndex )++ ] = wc >= 0x100 ? HB_CDP_ERROR_ASCCHAR : ( HB_UCHAR ) wc;
       }
       return HB_TRUE;
    }
@@ -1156,14 +1143,78 @@ int hb_cdpicmp( const char * szFirst, HB_SIZE nLenFirst,
 /*
  * UTF-8 conversions
  */
-int hb_cdpUTF8CharSize( HB_WCHAR wc )
+int hb_cdpUTF8CharSize( HB_WCHAR32 wc )   /* returns the number of bytes (1-6) needed to encode wc in UTF-8 */
 {
+   if( ( HB_I32 ) wc < 0 )                /* negative codes are sized as the replacement character */
+      wc = HB_CDP_ERROR_UNICHAR;
+
    if( wc < 0x0080 )
       return 1;
    else if( wc < 0x0800 )
       return 2;
-   else                         /* if( wc <= 0xffff ) */
+   else if( wc < 0x10000 )      /* 16 payload bits: U+FFFF still fits in 3 bytes */
       return 3;
+   else if( wc < 0x200000 )     /* 21 payload bits: 0x1FFFFF still fits in 4 bytes */
+      return 4;
+   else if( wc < 0x4000000 )    /* 26 payload bits: 0x3FFFFFF still fits in 5 bytes */
+      return 5;
+   else                         /* if( wc <= 0x7FFFFFFF ) */
+      return 6;
+}
+
+int hb_cdpU32CharToUTF8( char * szUTF8, HB_WCHAR32 wc )   /* stores wc as UTF-8 in szUTF8 (1-6 bytes, no terminator) and returns the byte count */
+{
+   int n;
+
+   if( ( HB_I32 ) wc < 0 )      /* negative codes are emitted as the replacement character */
+      wc = HB_CDP_ERROR_UNICHAR;
+
+   if( wc < 0x0080 )
+   {
+      szUTF8[ 0 ] = wc & 0xFF;
+      n = 1;
+   }
+   else if( wc < 0x0800 )
+   {
+      szUTF8[ 0 ] = 0xc0 | ( ( wc >> 6 ) & 0x1F );
+      szUTF8[ 1 ] = 0x80 | ( wc & 0x3F );
+      n = 2;
+   }
+   else if( wc < 0x10000 )      /* limits match hb_cdpUTF8CharSize() so the shortest form is always used */
+   {
+      szUTF8[ 0 ] = 0xE0 | ( ( wc >> 12 ) & 0x0F );
+      szUTF8[ 1 ] = 0x80 | ( ( wc >> 6 ) & 0x3F );
+      szUTF8[ 2 ] = 0x80 | ( wc & 0x3F );
+      n = 3;
+   }
+   else if( wc < 0x200000 )
+   {
+      szUTF8[ 0 ] = 0xF0 | ( ( wc >> 18 ) & 0x07 );
+      szUTF8[ 1 ] = 0x80 | ( ( wc >> 12 ) & 0x3F );
+      szUTF8[ 2 ] = 0x80 | ( ( wc >> 6 ) & 0x3F );
+      szUTF8[ 3 ] = 0x80 | ( wc & 0x3F );
+      n = 4;
+   }
+   else if( wc < 0x4000000 )
+   {
+      szUTF8[ 0 ] = 0xF8 | ( ( wc >> 24 ) & 0x03 );
+      szUTF8[ 1 ] = 0x80 | ( ( wc >> 18 ) & 0x3F );
+      szUTF8[ 2 ] = 0x80 | ( ( wc >> 12 ) & 0x3F );
+      szUTF8[ 3 ] = 0x80 | ( ( wc >> 6 ) & 0x3F );
+      szUTF8[ 4 ] = 0x80 | ( wc & 0x3F );
+      n = 5;
+   }
+   else                         /* if( wc <= 0x7FFFFFFF ) */
+   {
+      szUTF8[ 0 ] = 0xFC | ( ( wc >> 30 ) & 0x01 );
+      szUTF8[ 1 ] = 0x80 | ( ( wc >> 24 ) & 0x3F );
+      szUTF8[ 2 ] = 0x80 | ( ( wc >> 18 ) & 0x3F );
+      szUTF8[ 3 ] = 0x80 | ( ( wc >> 12 ) & 0x3F );
+      szUTF8[ 4 ] = 0x80 | ( ( wc >> 6 ) & 0x3F );
+      szUTF8[ 5 ] = 0x80 | ( wc & 0x3F );
+      n = 6;
+   }
+   return n;
 }
 
 int hb_cdpU16CharToUTF8( char * szUTF8, HB_WCHAR wc )
@@ -1240,27 +1291,153 @@ HB_BOOL hb_cdpUTF8ToU16NextChar( HB_UCHAR ucChar, int * n, HB_WCHAR * pwc )
          *pwc &= 0x01;
          *n = 5;
       }
+      else
+      {
+         *n = 0;
+         return HB_FALSE;
+      }
+   }
+   return HB_TRUE;
+}
+
+HB_BOOL hb_cdpUTF8GetU32( const char * pSrc, HB_SIZE nLen,
+                          HB_SIZE * pnIndex, HB_WCHAR32 * pWC )
+{
+   HB_SIZE nIndex = *pnIndex;   /* byte offset in pSrc where decoding starts */
+   HB_WCHAR32 wc = 0;           /* value stored in *pWC; stays 0 when *pnIndex >= nLen */
+   int n = -1;                  /* continuation bytes still expected; -1 means nothing valid was decoded */
+
+   if( nIndex < nLen )
+   {
+      HB_WCHAR32 wcMin = 0;   /* forbid overlong encodings */
+      HB_UCHAR uc = ( HB_UCHAR ) pSrc[ nIndex++ ];
+
+      if( uc < 0x80 )         /* single byte character */
+      {
+         wc = uc;
+         n = 0;
+      }
+      else if( uc >= 0xc0 )   /* lead byte; 0x80-0xBF as first byte leaves n == -1 */
+      {
+         if( uc < 0xe0 )
+         {
+            wc = uc & 0x1f;
+            n = 1;
+            wcMin = 0x80;
+         }
+         else if( uc < 0xf0 )
+         {
+            wc = uc & 0x0f;
+            n = 2;
+            wcMin = 0x800;
+         }
+         else if( uc < 0xf8 )
+         {
+            wc = uc & 0x07;
+            n = 3;
+            wcMin = 0x10000;
+         }
+         else if( uc < 0xfc )
+         {
+            wc = uc & 0x03;
+            n = 4;
+            wcMin = 0x200000;
+         }
+         else if( uc < 0xfe )   /* 0xFE and 0xFF match no branch, so n stays -1 */
+         {
+            wc = uc & 0x01;
+            n = 5;
+            wcMin = 0x4000000;
+         }
+         while( n > 0 && nIndex < nLen )   /* collect the expected 10xxxxxx continuation bytes */
+         {
+            uc = ( HB_UCHAR ) pSrc[ nIndex ];
+            if( ( uc & 0xc0 ) != 0x80 )
+               break;
+            wc = ( wc << 6 ) | ( uc & 0x3f );
+            ++nIndex;
+            --n;
+         }
+      }
+
+      if( n != 0 || wc < wcMin )   /* invalid or truncated sequence, or overlong encoding */
+      {
+         wc = HB_CDP_ERROR_UNICHAR;
+         while( n-- > 0 && nIndex < nLen )   /* skip at most the missing bytes, stopping at ASCII or a 0xC2-0xF4 byte */
+         {
+            uc = ( HB_UCHAR ) pSrc[ nIndex ];
+            if( uc < 0x80 || ( uc >= 0xc2 && uc <= 0xf4 ) )
+               break;
+            ++nIndex;
+         }
+         n = -1;
+      }
+   }
+
+   *pnIndex = nIndex;   /* updated on failure too, so callers looping on *pnIndex < nLen always advance */
+   *pWC = wc;
+
+   return n == 0;       /* HB_TRUE only for a complete, non-overlong sequence */
+}
+
+HB_BOOL hb_cdpUTF8GetUCS( const char * pSrc, HB_SIZE nLen,
+                          HB_SIZE * pnIndex, HB_WCHAR32 * pWC )
+{
+   HB_BOOL fResult;   /* HB_TRUE when a valid character in the accepted range was decoded */
+
+   fResult = hb_cdpUTF8GetU32( pSrc, nLen, pnIndex, pWC );
+   if( fResult && ( *pWC > HB_CDP_UNICODE_MAX ||   /* reject values above U+10FFFF ... */
+         ( *pWC >= HB_CDP_SURROGATE_FIRST && *pWC <= HB_CDP_SURROGATE_LAST ) ) )   /* ... and the surrogate range */
+   {
+      *pWC = HB_CDP_ERROR_UNICHAR;   /* *pnIndex already points past the rejected sequence */
+      fResult = HB_FALSE;
+   }
+   return fResult;
+}
+
+HB_BOOL hb_cdpUTF8GetU16( const char * pSrc, HB_SIZE nLen,
+                          HB_SIZE * pnIndex, HB_WCHAR16 * pWC )
+{
+   HB_WCHAR32 wc;     /* always assigned by hb_cdpUTF8GetU32() */
+   HB_BOOL fResult;
+
+   fResult = hb_cdpUTF8GetU32( pSrc, nLen, pnIndex, &wc );
+
+   if( fResult && wc > 0xFFFF )   /* value does not fit into a single 16-bit unit */
+   {
+      wc = HB_CDP_ERROR_UNICHAR;
+      fResult = HB_FALSE;
+   }
+   *pWC = ( HB_WCHAR16 ) wc;      /* stored on failure as well (replacement character, or 0 at end of input) */
+
+   return fResult;
+}
+
+HB_BOOL hb_cdpUTF8Validate( const char * pSrc, HB_SIZE nLen )
+{
+   HB_SIZE nIndex = 0;   /* an empty buffer (nLen == 0) is reported as valid */
+
+   while( nIndex < nLen )
+   {
+      HB_WCHAR32 wc;      /* decoded value is not used, only the result flag */
+      if( ! hb_cdpUTF8GetUCS( pSrc, nLen, &nIndex, &wc ) )   /* stop at the first rejected sequence */
+         return HB_FALSE;
    }
    return HB_TRUE;
 }
 
 HB_SIZE hb_cdpUTF8StringLength( const char * pSrc, HB_SIZE nLen )
 {
-   HB_SIZE nPos, nDst;
-   HB_WCHAR wc;
-   int n = 0;
+   HB_SIZE nIndex = 0, nChars = 0;   /* byte offset and number of characters counted so far */
+   HB_WCHAR32 wc;                    /* decoded value is not used */
 
-   for( nPos = nDst = 0; nPos < nLen; )
+   while( nIndex < nLen )
    {
-      if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPos ], &n, &wc ) )
-         ++nPos;
-      if( n == 0 )
-         ++nDst;
+       hb_cdpUTF8GetU32( pSrc, nLen, &nIndex, &wc );   /* result ignored: an invalid sequence counts as one character */
+       ++nChars;
    }
-   if( n > 0 )
-      ++nDst;
 
-   return nDst;
+   return nChars;
 }
 
 HB_SIZE hb_cdpUTF8StringAt( const char * szNeedle, HB_SIZE nLenN,
@@ -1274,31 +1451,16 @@ HB_SIZE hb_cdpUTF8StringAt( const char * szNeedle, HB_SIZE nLenN,
    HB_SIZE nRAt = 0;
    HB_SIZE nAt = 0;
 
-   HB_WCHAR wcN = 0;
-   HB_WCHAR wcH = 0;
-   int nN = 0;
-   int nH = 0;
+   HB_WCHAR32 wcN = 0;
+   HB_WCHAR32 wcH = 0;
 
    while( nPosH < nLenH && nPosN < nLenN && nPos < nEnd )
    {
-      do
-      {
-         if( ! hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szHaystack[ nPosH ], &nH, &wcH ) )
-            break;
-         ++nPosH;
-      }
-      while( nH && nPosH < nLenH );
-
+      hb_cdpUTF8GetU32( szHaystack, nLenH, &nPosH, &wcH );
       if( ++nPos < nStart )
          continue;
 
-      do
-      {
-         if( ! hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szNeedle[ nPosN ], &nN, &wcN ) )
-            break;
-         ++nPosN;
-      }
-      while( nN && nPosN < nLenN );
+      hb_cdpUTF8GetU32( szNeedle, nLenN, &nPosN, &wcN );
 
       if( wcH == wcN )
       {
@@ -1339,36 +1501,17 @@ HB_SIZE hb_cdpUTF8StringAt( const char * szNeedle, HB_SIZE nLenN,
    return nRAt;
 }
 
-HB_WCHAR hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos )
+HB_WCHAR32 hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos )   /* returns the nPos-th character of pSrc, or 0 when there is none */
 {
-   if( nLen )
+   HB_SIZE nIndex = 0;   /* byte offset of the next sequence to decode */
+
+   while( nPos && nIndex < nLen )   /* NOTE(review): nPos is 1-based here (0 yields 0) while the removed code was 0-based - confirm callers */
    {
-      HB_SIZE nPos2;
-      HB_WCHAR wc = 0;
-      int n = 0;
-
-      for( nPos2 = 0; nPos2 < nLen && nPos; )
-      {
-         if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPos2 ], &n, &wc ) )
-            ++nPos2;
-         if( n == 0 )
-            --nPos;
-      }
-
-      if( nPos2 < nLen )
-      {
-         n = 0;
-         do
-         {
-            if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPos2 ], &n, &wc ) )
-               ++nPos2;
-            if( n == 0 )
-               return wc;
-         }
-         while( nPos2 < nLen );
-      }
+      HB_WCHAR32 wc;   /* 32-bit, so characters above U+FFFF are returned intact instead of as U+FFFD */
+      hb_cdpUTF8GetU32( pSrc, nLen, &nIndex, &wc );   /* an invalid sequence is seen as HB_CDP_ERROR_UNICHAR */
+      if( --nPos == 0 )
+         return wc;
    }
-
    return 0;
 }
 
@@ -1377,36 +1520,29 @@ char * hb_cdpUTF8StringSubstr( const char * pSrc, HB_SIZE nLen,
                                HB_SIZE nFrom, HB_SIZE nCount, HB_SIZE * pulDest )
 {
    HB_SIZE nDst = 0;
-   HB_WCHAR wc;
-   int n;
    char * pDst = NULL;
 
    if( nCount && nLen )
    {
-      HB_SIZE nPos;
-      n = 0;
-      for( nPos = 0; nPos < nLen && nFrom; )
+      HB_WCHAR32 wc;
+      HB_SIZE nPos = 0;
+
+      while( nPos < nLen && nFrom )
       {
-         if( hb_cdpUTF8ToU16NextChar( pSrc[ nPos ], &n, &wc ) )
-            ++nPos;
-         if( n == 0 )
-            --nFrom;
+         hb_cdpUTF8GetU32( pSrc, nLen, &nPos, &wc );
+         --nFrom;
       }
 
       if( nPos < nLen )
       {
-         HB_SIZE nCnt;
+         HB_SIZE nCnt = nCount;
+
          nFrom = nPos;
-         nCnt = nCount;
-         n = 0;
          do
          {
-            if( hb_cdpUTF8ToU16NextChar( pSrc[ nPos ], &n, &wc ) )
-               ++nPos;
-            if( n == 0 )
-               --nCnt;
+            hb_cdpUTF8GetU32( pSrc, nLen, &nPos, &wc );
          }
-         while( nPos < nLen && nCnt );
+         while( nPos < nLen && --nCnt );
 
          nDst = nPos - nFrom;
          pDst = ( char * ) hb_xgrab( nDst + 1 );
@@ -1620,9 +1756,8 @@ HB_SIZE hb_cdpStrToUTF8Disp( PHB_CODEPAGE cdp,
 HB_SIZE hb_cdpUTF8AsStrLen( PHB_CODEPAGE cdp, const char * pSrc, HB_SIZE nSrc,
                             HB_SIZE nMax )
 {
-   HB_WCHAR wc = 0;
+   HB_WCHAR wc;
    HB_SIZE nPosS, nPosD;
-   int n = 0, i;
 
    if( HB_CDP_ISUTF8( cdp ) )
       return ( nMax && nSrc > nMax ) ? nMax : nSrc;
@@ -1630,31 +1765,22 @@ HB_SIZE hb_cdpUTF8AsStrLen( PHB_CODEPAGE cdp, const char * pSrc, HB_SIZE nSrc,
    {
       for( nPosS = nPosD = 0; nPosS < nSrc; )
       {
-         if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPosS ], &n, &wc ) )
-            ++nPosS;
-
-         if( n == 0 )
-         {
-            i = HB_CDPCHAR_LEN( cdp, wc );
-            if( nMax && nPosD + i > nMax )
-               break;
-            nPosD += i;
-         }
+         int i;
+         hb_cdpUTF8GetU16( pSrc, nSrc, &nPosS, &wc );
+         i = HB_CDPCHAR_LEN( cdp, wc );
+         if( nMax && nPosD + i > nMax )
+            break;
+         nPosD += i;
       }
    }
    else
    {
       for( nPosS = nPosD = 0; nPosS < nSrc; )
       {
-         if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPosS ], &n, &wc ) )
-            ++nPosS;
-
-         if( n == 0 )
-         {
-            ++nPosD;
-            if( nMax && nPosD >= nMax )
-               break;
-         }
+         hb_cdpUTF8GetU16( pSrc, nSrc, &nPosS, &wc );
+         ++nPosD;
+         if( nMax && nPosD >= nMax )
+            break;
       }
    }
 
@@ -1665,10 +1791,8 @@ HB_SIZE hb_cdpUTF8ToStr( PHB_CODEPAGE cdp,
                          const char * pSrc, HB_SIZE nSrc,
                          char * pDst, HB_SIZE nDst )
 {
-   HB_UCHAR * uniTrans;
-   HB_WCHAR wcMax, wc = 0;
+   HB_WCHAR wcMax, wc;
    HB_SIZE nPosS, nPosD;
-   int n = 0;
 
    if( HB_CDP_ISUTF8( cdp ) )
    {
@@ -1683,18 +1807,15 @@ HB_SIZE hb_cdpUTF8ToStr( PHB_CODEPAGE cdp,
    {
       for( nPosS = nPosD = 0; nPosS < nSrc && nPosD < nDst; )
       {
-         if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPosS ], &n, &wc ) )
-            ++nPosS;
-
-         if( n == 0 )
-         {
-            if( ! HB_CDPCHAR_PUT( cdp, pDst, nDst, &nPosD, wc ) )
-               break;
-         }
+         hb_cdpUTF8GetU16( pSrc, nSrc, &nPosS, &wc );
+         if( ! HB_CDPCHAR_PUT( cdp, pDst, nDst, &nPosD, wc ) )
+            break;
       }
    }
    else
    {
+      HB_UCHAR * uniTrans;
+
       if( cdp->uniTable->uniTrans == NULL )
          hb_cdpBuildTransTable( cdp->uniTable );
       uniTrans = cdp->uniTable->uniTrans;
@@ -1702,16 +1823,11 @@ HB_SIZE hb_cdpUTF8ToStr( PHB_CODEPAGE cdp,
 
       for( nPosS = nPosD = 0; nPosS < nSrc && nPosD < nDst; )
       {
-         if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPosS ], &n, &wc ) )
-            ++nPosS;
-
-         if( n == 0 )
-         {
-            if( wc <= wcMax && uniTrans[ wc ] )
-               pDst[ nPosD++ ] = uniTrans[ wc ];
-            else
-               pDst[ nPosD++ ] = wc >= 0x100 ? '?' : ( HB_UCHAR ) wc;
-         }
+         hb_cdpUTF8GetU16( pSrc, nSrc, &nPosS, &wc );
+         if( wc <= wcMax && uniTrans[ wc ] )
+            pDst[ nPosD++ ] = uniTrans[ wc ];
+         else
+            pDst[ nPosD++ ] = wc >= 0x100 ? HB_CDP_ERROR_ASCCHAR : ( HB_UCHAR ) wc;
       }
    }
 
@@ -1795,12 +1911,12 @@ HB_UCHAR hb_cdpGetChar( PHB_CODEPAGE cdp, HB_WCHAR wc )
             char c;
 
             if( ! HB_CDPCHAR_PUT( cdp, &c, 1, &n, wc ) )
-               wc = '?';
+               wc = HB_CDP_ERROR_ASCCHAR;
             else
                wc = ( HB_UCHAR ) c;
          }
          else
-            wc = '?';
+            wc = HB_CDP_ERROR_ASCCHAR;
       }
       else
       {
@@ -1815,7 +1931,7 @@ HB_UCHAR hb_cdpGetChar( PHB_CODEPAGE cdp, HB_WCHAR wc )
          }
       }
    }
-   return wc >= 0x100 ? '?' : ( HB_UCHAR ) wc;
+   return wc >= 0x100 ? HB_CDP_ERROR_ASCCHAR : ( HB_UCHAR ) wc;
 }
 
 HB_UCHAR hb_cdpGetUC( PHB_CODEPAGE cdp, HB_WCHAR wc, HB_UCHAR ucDef )
@@ -1903,30 +2019,24 @@ HB_SIZE hb_cdpStrToU16( PHB_CODEPAGE cdp, int iEndian,
 
    if( HB_CDP_ISUTF8( cdp ) )
    {
-      HB_WCHAR wc = 0;
-      int n = 0;
+      HB_WCHAR wc;
 
       for( nPosS = nPosD = 0; nPosS < nSrc && nPosD < nDst; )
       {
-         if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nPosS ], &n, &wc ) )
-            ++nPosS;
-
-         if( n == 0 )
-         {
+         hb_cdpUTF8GetU16( pSrc, nSrc, &nPosS, &wc );
 #if defined( HB_CDP_ENDIAN_SWAP )
-            if( iEndian == HB_CDP_ENDIAN_SWAP )
-               wc = HB_SWAP_UINT16( wc );
-            pDst[ nPosD++ ] = wc;
+         if( iEndian == HB_CDP_ENDIAN_SWAP )
+            wc = HB_SWAP_UINT16( wc );
+         pDst[ nPosD++ ] = wc;
 #else
-            if( iEndian == HB_CDP_ENDIAN_LITTLE )
-               HB_PUT_LE_UINT16( &pDst[ nPosD ], wc );
-            else if( iEndian == HB_CDP_ENDIAN_BIG )
-               HB_PUT_BE_UINT16( &pDst[ nPosD ], wc );
-            else
-               pDst[ nPosD ] = wc;
-            ++nPosD;
+         if( iEndian == HB_CDP_ENDIAN_LITTLE )
+            HB_PUT_LE_UINT16( &pDst[ nPosD ], wc );
+         else if( iEndian == HB_CDP_ENDIAN_BIG )
+            HB_PUT_BE_UINT16( &pDst[ nPosD ], wc );
+         else
+            pDst[ nPosD ] = wc;
+         ++nPosD;
 #endif
-         }
       }
    }
    else if( HB_CDP_ISCUSTOM( cdp ) )
@@ -2117,7 +2227,7 @@ HB_SIZE hb_cdpU16ToStr( PHB_CODEPAGE cdp, int iEndian,
          if( wc <= wcMax && uniTrans[ wc ] )
             pDst[ nPosD++ ] = uniTrans[ wc ];
          else
-            pDst[ nPosD++ ] = wc >= 0x100 ? '?' : ( HB_UCHAR ) wc;
+            pDst[ nPosD++ ] = wc >= 0x100 ? HB_CDP_ERROR_ASCCHAR : ( HB_UCHAR ) wc;
       }
    }
 
@@ -2240,7 +2350,7 @@ int hb_cdpTranslateChar( int iChar, PHB_CODEPAGE cdpIn, PHB_CODEPAGE cdpOut )
          {
             if( HB_CDPCHAR_PUT( cdpOut, &c, 1, &n, wc ) )
             {
-               if( c != '?' )
+               if( c != HB_CDP_ERROR_ASCCHAR )
                   iChar = ( HB_UCHAR ) c;
             }
          }
@@ -2288,7 +2398,7 @@ int hb_cdpTranslateDispChar( int iChar, PHB_CODEPAGE cdpIn, PHB_CODEPAGE cdpOut
             wc = s_uniCtrls[ iChar ];
          if( HB_CDPCHAR_PUT( cdpOut, &c, 1, &n, wc ) )
          {
-            if( c != '?' )
+            if( c != HB_CDP_ERROR_ASCCHAR )
                iChar = ( HB_UCHAR ) c;
          }
       }
@@ -2751,19 +2861,19 @@ static HB_UCHAR hb_cdpUtf8Char( const char ** pStrPtr, PHB_UNITABLE uniTable )
 {
    const char * pszString = *pStrPtr;
    HB_UCHAR uc = 0;
-   HB_WCHAR wc = 0;
-   int n = 0;
 
-   while( *pszString )
+   if( *pszString )
    {
-      if( ! hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) *pszString++, &n, &wc ) )
-         break;
-      if( n == 0 )
+      HB_SIZE nIndex = 0;
+      HB_WCHAR wc;
+
+      if( hb_cdpUTF8GetU16( pszString, hb_strnlen( pszString, 6 ), &nIndex, &wc ) )
       {
          if( wc < 127 )
             uc = ( HB_UCHAR ) wc;
          else
          {
+            int n;
             for( n = 0; n < 256; ++n )
             {
                if( wc == uniTable->uniCodes[ n ] )
@@ -2773,8 +2883,8 @@ static HB_UCHAR hb_cdpUtf8Char( const char ** pStrPtr, PHB_UNITABLE uniTable )
                }
             }
          }
-         break;
       }
+      pszString += nIndex;
    }
    if( uc == 0 )
    {
diff --git a/src/rtl/cdpapihb.c b/src/rtl/cdpapihb.c
index f180099a4d..17d2075a22 100644
--- a/src/rtl/cdpapihb.c
+++ b/src/rtl/cdpapihb.c
@@ -55,20 +55,14 @@ static HB_SIZE utf8pos( const char * szUTF8, HB_SIZE nLen, HB_SIZE nUTF8Pos )
    if( nUTF8Pos > 0 && nUTF8Pos <= nLen )
    {
       HB_SIZE n1, n2;
-      HB_WCHAR uc;
-      int n = 0;
+      HB_WCHAR32 wc;
 
       for( n1 = n2 = 0; n1 < nLen; )
       {
-         if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szUTF8[ n1 ], &n, &uc ) )
-            ++n1;
-
-         if( n == 0 )
-         {
-            if( --nUTF8Pos == 0 )
-               return n2 + 1;
-            n2 = n1;
-         }
+         hb_cdpUTF8GetU32( szUTF8, nLen, &n1, &wc );
+         if( --nUTF8Pos == 0 )
+            return n2 + 1;
+         n2 = n1;
       }
    }
    return 0;
@@ -201,7 +195,7 @@ HB_FUNC( HB_UTF8CHR )
       char utf8Char[ HB_MAX_CHAR_LEN ];
       int iLen;
 
-      iLen = hb_cdpU16CharToUTF8( utf8Char, ( HB_WCHAR ) hb_parni( 1 ) );
+      iLen = hb_cdpU32CharToUTF8( utf8Char, ( HB_WCHAR32 ) hb_parni( 1 ) );
       hb_retclen( utf8Char, iLen );
    }
    else
@@ -214,19 +208,10 @@ HB_FUNC( HB_UTF8ASC )
 
    if( pszString )
    {
-      HB_SIZE nLen = hb_parclen( 1 );
-      HB_WCHAR wc = 0;
-      int n = 0;
+      HB_SIZE nLen = hb_parclen( 1 ), nIndex = 0;
+      HB_WCHAR32 wc = 0;
 
-      while( nLen )
-      {
-         if( ! hb_cdpUTF8ToU16NextChar( ( unsigned char ) *pszString, &n, &wc ) )
-            break;
-         if( n == 0 )
-            break;
-         pszString++;
-         nLen--;
-      }
+      hb_cdpUTF8GetU32( pszString, nLen, &nIndex, &wc );
       hb_retnint( wc );
    }
    else
@@ -467,35 +452,35 @@ HB_FUNC( HB_UTF8POKE )
       nPos = utf8pos( szString, nLen, hb_parns( 2 ) );
       if( nPos )
       {
-         HB_WCHAR uc, uc2;
-         int n, n2;
+         HB_WCHAR32 uc, uc2;
+         HB_SIZE nDstLen = 0;
+         int n;
 
          --nPos;
-         uc = ( HB_WCHAR ) hb_parni( 3 );
+         uc = ( HB_WCHAR32 ) hb_parni( 3 );
          n = hb_cdpUTF8CharSize( uc );
-         n2 = 0;
-         hb_cdpUTF8ToU16NextChar( szString[ nPos ], &n2, &uc2 );
-         ++n2;
-         if( n == n2 )
+
+         hb_cdpUTF8GetU32( &szString[ nPos ], nLen - nPos, &nDstLen, &uc2 );
+         if( n == ( int ) nDstLen )
          {
             char * szText;
             if( hb_itemGetWriteCL( pText, &szText, &nLen ) &&
                 nPos + n <= nLen )
             {
-               hb_cdpU16CharToUTF8( &szText[ nPos ], uc );
+               hb_cdpU32CharToUTF8( &szText[ nPos ], uc );
             }
             hb_itemReturn( pText );
          }
          else
          {
-            char * szResult = ( char * ) hb_xgrab( nLen - n2 + n + 1 );
+            char * szResult = ( char * ) hb_xgrab( nLen - nDstLen + n + 1 );
 
             memcpy( szResult, szString, nPos );
-            hb_cdpU16CharToUTF8( &szResult[ nPos ], uc );
-            memcpy( szResult + nPos + n, szString + nPos + n2, nLen - nPos - n2 );
+            hb_cdpU32CharToUTF8( &szResult[ nPos ], uc );
+            memcpy( szResult + nPos + n, szString + nPos + nDstLen, nLen - nPos - nDstLen );
             if( HB_ISBYREF( 1 ) )
-               hb_storclen( szResult, nLen - n2 + n, 1 );
-            hb_retclen_buffer( szResult, nLen - n2 + n );
+               hb_storclen( szResult, nLen - nDstLen + n, 1 );
+            hb_retclen_buffer( szResult, nLen - nDstLen + n );
          }
       }
       else