2012-03-20 19:06 UTC+0100 Przemyslaw Czerpak (druzus/at/poczta.onet.pl)

* harbour/src/rtl/cdpapi.c
  * harbour/src/rtl/cdpapihb.c
  * harbour/src/codepage/cp_utf8.c
    * slightly modified the algorithms for decoding UTF8 characters so
      they work better with broken UTF8 strings (i.e. they no longer eat
      the characters that follow a broken, unclosed multibyte sequence)
    * some minor improvements in HB_UTF8STUFF()
    * formatting
This commit is contained in:
Przemyslaw Czerpak
2012-03-20 18:06:41 +00:00
parent 0b8e7986c3
commit de28a00d37
4 changed files with 128 additions and 117 deletions

View File

@@ -16,6 +16,16 @@
The license applies to all entries newer than 2009-04-28.
*/
2012-03-20 19:06 UTC+0100 Przemyslaw Czerpak (druzus/at/poczta.onet.pl)
* harbour/src/rtl/cdpapi.c
* harbour/src/rtl/cdpapihb.c
* harbour/src/codepage/cp_utf8.c
* slightly modified the algorithms for decoding UTF8 characters so
they work better with broken UTF8 strings (i.e. they no longer eat
the characters that follow a broken, unclosed multibyte sequence)
* some minor improvements in HB_UTF8STUFF()
* formatting
2012-03-20 18:44 UTC+0100 Przemyslaw Czerpak (druzus/at/poczta.onet.pl)
* harbour/include/hbapicdp.h
+ added missing declaration of hb_fsNameConvU16()

View File

@@ -63,13 +63,12 @@ static HB_CDP_GET_FUNC( UTF8_get )
*wc = 0;
while( nIndex < nLen )
{
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nIndex++ ], &n, wc ) )
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nIndex ], &n, wc ) )
++nIndex;
if( n == 0 )
{
if( n == 0 )
{
*pnIndex = nIndex;
return HB_TRUE;
}
*pnIndex = nIndex;
return HB_TRUE;
}
}
return HB_FALSE;

View File

@@ -234,15 +234,19 @@ static HB_BOOL hb_cdpUTF8_get( PHB_CODEPAGE cdp,
*wc = 0;
while( nIndex < nLen )
{
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nIndex++ ], &n, wc ) )
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nIndex ], &n, wc ) )
++nIndex;
if( n == 0 )
{
if( n == 0 )
{
*pnIndex = nIndex;
return HB_TRUE;
}
*pnIndex = nIndex;
return HB_TRUE;
}
}
if( n > 0 )
{
*pnIndex = nIndex;
return HB_TRUE;
}
return HB_FALSE;
}
@@ -891,7 +895,10 @@ HB_BOOL hb_cdpUTF8ToU16NextChar( HB_UCHAR ucChar, int * n, HB_WCHAR * pwc )
if( *n > 0 )
{
if( ( ucChar & 0xc0 ) != 0x80 )
{
*n = 0;
return HB_FALSE;
}
*pwc = ( *pwc << 6 ) | ( ucChar & 0x3f );
( *n )--;
return HB_TRUE;
@@ -936,14 +943,15 @@ HB_SIZE hb_cdpUTF8StringLength( const char * pSrc, HB_SIZE nLen )
HB_WCHAR wc;
int n = 0;
for( ul = nDst = 0; ul < nLen; ++ul )
for( ul = nDst = 0; ul < nLen; )
{
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ ul ], &n, &wc ) )
{
if( n == 0 )
++nDst;
}
++ul;
if( n == 0 )
++nDst;
}
if( n > 0 )
++nDst;
return nDst;
}
@@ -968,24 +976,20 @@ HB_SIZE hb_cdpUTF8StringAt( const char * szNeedle, HB_SIZE nLenN,
{
do
{
if( !hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szHaystack[ nPosH++ ], &nH, &wcH ) )
{
--nPosH;
nH = 0;
}
} while( nH );
if( !hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szHaystack[ nPosH ], &nH, &wcH ) )
break;
++nPosH;
} while( nH && nPosH < nLenH );
if( ++nPos < nStart )
continue;
do
{
if( !hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szNeedle[ nPosN++ ], &nN, &wcN ) )
{
--nPosN;
nN = 0;
}
} while( nN );
if( !hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szNeedle[ nPosN ], &nN, &wcN ) )
break;
++nPosN;
} while( nN && nPosN < nLenN );
if( wcH == wcN )
{
@@ -1034,13 +1038,12 @@ HB_WCHAR hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos )
HB_WCHAR wc = 0;
int n = 0;
for( ul = 0; ul < nLen && nPos; ++ul )
for( ul = 0; ul < nLen && nPos; )
{
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ ul ], &n, &wc ) )
{
if( n == 0 )
--nPos;
}
++ul;
if( n == 0 )
--nPos;
}
if( ul < nLen )
@@ -1049,12 +1052,11 @@ HB_WCHAR hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos )
do
{
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ ul ], &n, &wc ) )
{
if( n == 0 )
return wc;
}
++ul;
if( n == 0 )
return wc;
}
while( ++ul < nLen );
while( ul < nLen );
}
}
@@ -1073,13 +1075,12 @@ char * hb_cdpUTF8StringSubstr( const char * pSrc, HB_SIZE nLen,
if( nCount && nLen )
{
n = 0;
for( ul = 0; ul < nLen && nFrom; ++ul )
for( ul = 0; ul < nLen && nFrom; )
{
if( hb_cdpUTF8ToU16NextChar( pSrc[ ul ], &n, &wc ) )
{
if( n == 0 )
--nFrom;
}
++ul;
if( n == 0 )
--nFrom;
}
if( ul < nLen )
@@ -1090,12 +1091,11 @@ char * hb_cdpUTF8StringSubstr( const char * pSrc, HB_SIZE nLen,
do
{
if( hb_cdpUTF8ToU16NextChar( pSrc[ ul ], &n, &wc ) )
{
if( n == 0 )
--nCnt;
}
++ul;
if( n == 0 )
--nCnt;
}
while( ++ul < nLen && nCnt );
while( ul < nLen && nCnt );
nDst = ul - nFrom;
pDst = ( char * ) hb_xgrab( nDst + 1 );
@@ -1317,32 +1317,32 @@ HB_SIZE hb_cdpUTF8AsStrLen( PHB_CODEPAGE cdp, const char * pSrc, HB_SIZE nSrc,
return ( nMax && nSrc > nMax ) ? nMax : nSrc;
else if( cdp->fCustom )
{
for( ulS = ulD = 0; ulS < nSrc; ++ulS )
for( ulS = ulD = 0; ulS < nSrc; )
{
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ ulS ], &n, &wc ) )
++ulS;
if( n == 0 )
{
if( n == 0 )
{
i = HB_CDPCHAR_LEN( cdp, wc );
if( nMax && ulD + i > nMax )
break;
ulD += i;
}
i = HB_CDPCHAR_LEN( cdp, wc );
if( nMax && ulD + i > nMax )
break;
ulD += i;
}
}
}
else
{
for( ulS = ulD = 0; ulS < nSrc; ++ulS )
for( ulS = ulD = 0; ulS < nSrc; )
{
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ ulS ], &n, &wc ) )
++ulS;
if( n == 0 )
{
if( n == 0 )
{
++ulD;
if( nMax && ulD >= nMax )
break;
}
++ulD;
if( nMax && ulD >= nMax )
break;
}
}
}
@@ -1370,15 +1370,15 @@ HB_SIZE hb_cdpUTF8ToStr( PHB_CODEPAGE cdp,
}
else if( cdp->fCustom )
{
for( ulS = ulD = 0; ulS < nSrc && ulD < nDst; ++ulS )
for( ulS = ulD = 0; ulS < nSrc && ulD < nDst; )
{
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ ulS ], &n, &wc ) )
++ulS;
if( n == 0 )
{
if( n == 0 )
{
if( ! HB_CDPCHAR_PUT( cdp, pDst, nDst, &ulD, wc ) )
break;
}
if( ! HB_CDPCHAR_PUT( cdp, pDst, nDst, &ulD, wc ) )
break;
}
}
}
@@ -1389,17 +1389,17 @@ HB_SIZE hb_cdpUTF8ToStr( PHB_CODEPAGE cdp,
uniTrans = cdp->uniTable->uniTrans;
wcMax = cdp->uniTable->wcMax;
for( ulS = ulD = 0; ulS < nSrc && ulD < nDst; ++ulS )
for( ulS = ulD = 0; ulS < nSrc && ulD < nDst; )
{
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ ulS ], &n, &wc ) )
++ulS;
if( n == 0 )
{
if( n == 0 )
{
if( wc <= wcMax && uniTrans[ wc ] )
pDst[ ulD++ ] = uniTrans[ wc ];
else
pDst[ ulD++ ] = wc >= 0x100 ? '?' : ( HB_UCHAR ) wc;
}
if( wc <= wcMax && uniTrans[ wc ] )
pDst[ ulD++ ] = uniTrans[ wc ];
else
pDst[ ulD++ ] = wc >= 0x100 ? '?' : ( HB_UCHAR ) wc;
}
}
}
@@ -1533,26 +1533,26 @@ HB_SIZE hb_cdpStrToU16( PHB_CODEPAGE cdp, int iEndian,
HB_WCHAR wc = 0;
int n = 0;
for( ulS = ulD = 0; ulS < nSrc && ulD < nDst; ++ulS )
for( ulS = ulD = 0; ulS < nSrc && ulD < nDst; )
{
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ ulS ], &n, &wc ) )
++ulS;
if( n == 0 )
{
if( n == 0 )
{
#if defined( HB_CDP_ENDIAN_SWAP )
if( iEndian == HB_CDP_ENDIAN_SWAP )
wc = HB_SWAP_UINT16( wc );
pDst[ ulD++ ] = wc;
if( iEndian == HB_CDP_ENDIAN_SWAP )
wc = HB_SWAP_UINT16( wc );
pDst[ ulD++ ] = wc;
#else
if( iEndian == HB_CDP_ENDIAN_LITTLE )
HB_PUT_LE_UINT16( &pDst[ ulD ], wc );
else if( iEndian == HB_CDP_ENDIAN_BIG )
HB_PUT_BE_UINT16( &pDst[ ulD ], wc );
else
pDst[ ulD ] = wc;
++ulD;
if( iEndian == HB_CDP_ENDIAN_LITTLE )
HB_PUT_LE_UINT16( &pDst[ ulD ], wc );
else if( iEndian == HB_CDP_ENDIAN_BIG )
HB_PUT_BE_UINT16( &pDst[ ulD ], wc );
else
pDst[ ulD ] = wc;
++ulD;
#endif
}
}
}
}

View File

@@ -7,7 +7,7 @@
* The CodePages API
*
* Copyright 2002 Alexander S.Kresin <alex@belacy.belgorod.su>
* Copyright 2009 Przemyslaw Czerpak <druzus / at / priv.onet.pl>
* Copyright 2009-2012 Przemyslaw Czerpak <druzus / at / priv.onet.pl>
* www - http://harbour-project.org
*
* This program is free software; you can redistribute it and/or modify
@@ -74,16 +74,16 @@ static HB_SIZE utf8pos( const char * szUTF8, HB_SIZE nLen, HB_SIZE nUTF8Pos )
HB_WCHAR uc;
int n = 0;
for( n1 = n2 = 0; n1 < nLen; ++n1 )
for( n1 = n2 = 0; n1 < nLen; )
{
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szUTF8[ n1 ], &n, &uc ) )
++n1;
if( n == 0 )
{
if( n == 0 )
{
if( --nUTF8Pos == 0 )
return n2 + 1;
n2 = n1 + 1;
}
if( --nUTF8Pos == 0 )
return n2 + 1;
n2 = n1;
}
}
}
@@ -197,13 +197,14 @@ HB_FUNC( HB_UTF8ASC )
HB_WCHAR wc = 0;
int n = 0;
while( nLen-- )
while( nLen )
{
if( hb_cdpUTF8ToU16NextChar( ( unsigned char ) *pszString++, &n, &wc ) )
{
if( n == 0 )
break;
}
if( !hb_cdpUTF8ToU16NextChar( ( unsigned char ) *pszString, &n, &wc ) )
break;
if( n == 0 )
break;
pszString++;
nLen--;
}
hb_retnint( wc );
}
@@ -335,7 +336,7 @@ HB_FUNC( HB_UTF8SUBSTR )
else if( nFrom )
--nFrom;
if( nLen && nCount > 0 )
if( nLen > ( HB_SIZE ) nFrom && nCount > 0 )
szDest = hb_cdpUTF8StringSubstr( szString, nLen,
nFrom, nCount, &nDest );
if( szDest )
@@ -468,9 +469,10 @@ HB_FUNC( HB_UTF8POKE )
HB_FUNC( HB_UTF8STUFF )
{
const char * szString = hb_parc( 1 );
const char * szText = hb_parc( 1 );
const char * szIns = hb_parc( 4 );
if( szString && HB_ISNUM( 2 ) && HB_ISNUM( 3 ) && HB_ISCHAR( 4 ) )
if( szText && szIns && HB_ISNUM( 2 ) && HB_ISNUM( 3 ) )
{
HB_SIZE nLen = hb_parclen( 1 );
HB_SIZE nPos = hb_parns( 2 );
@@ -480,7 +482,7 @@ HB_FUNC( HB_UTF8STUFF )
if( nPos )
{
nPos = utf8pos( szString, nLen, nPos );
nPos = utf8pos( szText, nLen, nPos );
if( nPos == 0 )
nPos = nLen;
else
@@ -490,7 +492,7 @@ HB_FUNC( HB_UTF8STUFF )
{
if( nPos < nLen )
{
nDel = utf8pos( szString + nPos, nLen - nPos, nDel + 1 );
nDel = utf8pos( szText + nPos, nLen - nPos, nDel + 1 );
if( nDel == 0 )
nDel = nLen - nPos;
else
@@ -504,9 +506,9 @@ HB_FUNC( HB_UTF8STUFF )
{
char * szResult = ( char * ) hb_xgrab( nTot + 1 );
hb_xmemcpy( szResult, szString, nPos );
hb_xmemcpy( szResult + nPos, hb_parc( 4 ), nIns );
hb_xmemcpy( szResult + nPos + nIns, szString + nPos + nDel,
hb_xmemcpy( szResult, szText, nPos );
hb_xmemcpy( szResult + nPos, szIns, nIns );
hb_xmemcpy( szResult + nPos + nIns, szText + nPos + nDel,
nLen - ( nPos + nDel ) );
hb_retclen_buffer( szResult, nTot );
}
@@ -519,10 +521,10 @@ HB_FUNC( HB_UTF8STUFF )
HB_FUNC( HB_UTF8LEN )
{
const char * szString = hb_parc( 1 );
const char * szText = hb_parc( 1 );
if( szString )
hb_retnint( hb_cdpUTF8StringLength( szString, hb_parclen( 1 ) ) );
if( szText )
hb_retnint( hb_cdpUTF8StringLength( szText, hb_parclen( 1 ) ) );
else
hb_errRT_BASE_SubstR( EG_ARG, 3012, NULL, HB_ERR_FUNCNAME, HB_ERR_ARGS_BASEPARAMS );
}