2012-03-20 19:06 UTC+0100 Przemyslaw Czerpak (druzus/at/poczta.onet.pl)
* harbour/src/rtl/cdpapi.c
* harbour/src/rtl/cdpapihb.c
* harbour/src/codepage/cp_utf8.c
* slightly modified the algorithms for decoding UTF8 characters to work
better with broken UTF8 strings (i.e. to not eat the characters that follow
a broken, unclosed multibyte one)
* some minor improvements in HB_UTF8STUFF()
* formatting
This commit is contained in:
@@ -16,6 +16,16 @@
|
||||
The license applies to all entries newer than 2009-04-28.
|
||||
*/
|
||||
|
||||
2012-03-20 19:06 UTC+0100 Przemyslaw Czerpak (druzus/at/poczta.onet.pl)
|
||||
* harbour/src/rtl/cdpapi.c
|
||||
* harbour/src/rtl/cdpapihb.c
|
||||
* harbour/src/codepage/cp_utf8.c
|
||||
* slightly modified algorithms for decoding UTF8 characters to work
|
||||
better with broken UTF8 strings (i.e. to not eat characters after
|
||||
broken and unclosed multibyte one)
|
||||
* some minor improvements in HB_UTF8STUFF()
|
||||
* formatting
|
||||
|
||||
2012-03-20 18:44 UTC+0100 Przemyslaw Czerpak (druzus/at/poczta.onet.pl)
|
||||
* harbour/include/hbapicdp.h
|
||||
+ added missing declaration of hb_fsNameConvU16()
|
||||
|
||||
@@ -63,13 +63,12 @@ static HB_CDP_GET_FUNC( UTF8_get )
|
||||
*wc = 0;
|
||||
while( nIndex < nLen )
|
||||
{
|
||||
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nIndex++ ], &n, wc ) )
|
||||
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nIndex ], &n, wc ) )
|
||||
++nIndex;
|
||||
if( n == 0 )
|
||||
{
|
||||
if( n == 0 )
|
||||
{
|
||||
*pnIndex = nIndex;
|
||||
return HB_TRUE;
|
||||
}
|
||||
*pnIndex = nIndex;
|
||||
return HB_TRUE;
|
||||
}
|
||||
}
|
||||
return HB_FALSE;
|
||||
|
||||
@@ -234,15 +234,19 @@ static HB_BOOL hb_cdpUTF8_get( PHB_CODEPAGE cdp,
|
||||
*wc = 0;
|
||||
while( nIndex < nLen )
|
||||
{
|
||||
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nIndex++ ], &n, wc ) )
|
||||
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nIndex ], &n, wc ) )
|
||||
++nIndex;
|
||||
if( n == 0 )
|
||||
{
|
||||
if( n == 0 )
|
||||
{
|
||||
*pnIndex = nIndex;
|
||||
return HB_TRUE;
|
||||
}
|
||||
*pnIndex = nIndex;
|
||||
return HB_TRUE;
|
||||
}
|
||||
}
|
||||
if( n > 0 )
|
||||
{
|
||||
*pnIndex = nIndex;
|
||||
return HB_TRUE;
|
||||
}
|
||||
return HB_FALSE;
|
||||
}
|
||||
|
||||
@@ -891,7 +895,10 @@ HB_BOOL hb_cdpUTF8ToU16NextChar( HB_UCHAR ucChar, int * n, HB_WCHAR * pwc )
|
||||
if( *n > 0 )
|
||||
{
|
||||
if( ( ucChar & 0xc0 ) != 0x80 )
|
||||
{
|
||||
*n = 0;
|
||||
return HB_FALSE;
|
||||
}
|
||||
*pwc = ( *pwc << 6 ) | ( ucChar & 0x3f );
|
||||
( *n )--;
|
||||
return HB_TRUE;
|
||||
@@ -936,14 +943,15 @@ HB_SIZE hb_cdpUTF8StringLength( const char * pSrc, HB_SIZE nLen )
|
||||
HB_WCHAR wc;
|
||||
int n = 0;
|
||||
|
||||
for( ul = nDst = 0; ul < nLen; ++ul )
|
||||
for( ul = nDst = 0; ul < nLen; )
|
||||
{
|
||||
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ ul ], &n, &wc ) )
|
||||
{
|
||||
if( n == 0 )
|
||||
++nDst;
|
||||
}
|
||||
++ul;
|
||||
if( n == 0 )
|
||||
++nDst;
|
||||
}
|
||||
if( n > 0 )
|
||||
++nDst;
|
||||
|
||||
return nDst;
|
||||
}
|
||||
@@ -968,24 +976,20 @@ HB_SIZE hb_cdpUTF8StringAt( const char * szNeedle, HB_SIZE nLenN,
|
||||
{
|
||||
do
|
||||
{
|
||||
if( !hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szHaystack[ nPosH++ ], &nH, &wcH ) )
|
||||
{
|
||||
--nPosH;
|
||||
nH = 0;
|
||||
}
|
||||
} while( nH );
|
||||
if( !hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szHaystack[ nPosH ], &nH, &wcH ) )
|
||||
break;
|
||||
++nPosH;
|
||||
} while( nH && nPosH < nLenH );
|
||||
|
||||
if( ++nPos < nStart )
|
||||
continue;
|
||||
|
||||
do
|
||||
{
|
||||
if( !hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szNeedle[ nPosN++ ], &nN, &wcN ) )
|
||||
{
|
||||
--nPosN;
|
||||
nN = 0;
|
||||
}
|
||||
} while( nN );
|
||||
if( !hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szNeedle[ nPosN ], &nN, &wcN ) )
|
||||
break;
|
||||
++nPosN;
|
||||
} while( nN && nPosN < nLenN );
|
||||
|
||||
if( wcH == wcN )
|
||||
{
|
||||
@@ -1034,13 +1038,12 @@ HB_WCHAR hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos )
|
||||
HB_WCHAR wc = 0;
|
||||
int n = 0;
|
||||
|
||||
for( ul = 0; ul < nLen && nPos; ++ul )
|
||||
for( ul = 0; ul < nLen && nPos; )
|
||||
{
|
||||
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ ul ], &n, &wc ) )
|
||||
{
|
||||
if( n == 0 )
|
||||
--nPos;
|
||||
}
|
||||
++ul;
|
||||
if( n == 0 )
|
||||
--nPos;
|
||||
}
|
||||
|
||||
if( ul < nLen )
|
||||
@@ -1049,12 +1052,11 @@ HB_WCHAR hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos )
|
||||
do
|
||||
{
|
||||
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ ul ], &n, &wc ) )
|
||||
{
|
||||
if( n == 0 )
|
||||
return wc;
|
||||
}
|
||||
++ul;
|
||||
if( n == 0 )
|
||||
return wc;
|
||||
}
|
||||
while( ++ul < nLen );
|
||||
while( ul < nLen );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1073,13 +1075,12 @@ char * hb_cdpUTF8StringSubstr( const char * pSrc, HB_SIZE nLen,
|
||||
if( nCount && nLen )
|
||||
{
|
||||
n = 0;
|
||||
for( ul = 0; ul < nLen && nFrom; ++ul )
|
||||
for( ul = 0; ul < nLen && nFrom; )
|
||||
{
|
||||
if( hb_cdpUTF8ToU16NextChar( pSrc[ ul ], &n, &wc ) )
|
||||
{
|
||||
if( n == 0 )
|
||||
--nFrom;
|
||||
}
|
||||
++ul;
|
||||
if( n == 0 )
|
||||
--nFrom;
|
||||
}
|
||||
|
||||
if( ul < nLen )
|
||||
@@ -1090,12 +1091,11 @@ char * hb_cdpUTF8StringSubstr( const char * pSrc, HB_SIZE nLen,
|
||||
do
|
||||
{
|
||||
if( hb_cdpUTF8ToU16NextChar( pSrc[ ul ], &n, &wc ) )
|
||||
{
|
||||
if( n == 0 )
|
||||
--nCnt;
|
||||
}
|
||||
++ul;
|
||||
if( n == 0 )
|
||||
--nCnt;
|
||||
}
|
||||
while( ++ul < nLen && nCnt );
|
||||
while( ul < nLen && nCnt );
|
||||
|
||||
nDst = ul - nFrom;
|
||||
pDst = ( char * ) hb_xgrab( nDst + 1 );
|
||||
@@ -1317,32 +1317,32 @@ HB_SIZE hb_cdpUTF8AsStrLen( PHB_CODEPAGE cdp, const char * pSrc, HB_SIZE nSrc,
|
||||
return ( nMax && nSrc > nMax ) ? nMax : nSrc;
|
||||
else if( cdp->fCustom )
|
||||
{
|
||||
for( ulS = ulD = 0; ulS < nSrc; ++ulS )
|
||||
for( ulS = ulD = 0; ulS < nSrc; )
|
||||
{
|
||||
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ ulS ], &n, &wc ) )
|
||||
++ulS;
|
||||
|
||||
if( n == 0 )
|
||||
{
|
||||
if( n == 0 )
|
||||
{
|
||||
i = HB_CDPCHAR_LEN( cdp, wc );
|
||||
if( nMax && ulD + i > nMax )
|
||||
break;
|
||||
ulD += i;
|
||||
}
|
||||
i = HB_CDPCHAR_LEN( cdp, wc );
|
||||
if( nMax && ulD + i > nMax )
|
||||
break;
|
||||
ulD += i;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for( ulS = ulD = 0; ulS < nSrc; ++ulS )
|
||||
for( ulS = ulD = 0; ulS < nSrc; )
|
||||
{
|
||||
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ ulS ], &n, &wc ) )
|
||||
++ulS;
|
||||
|
||||
if( n == 0 )
|
||||
{
|
||||
if( n == 0 )
|
||||
{
|
||||
++ulD;
|
||||
if( nMax && ulD >= nMax )
|
||||
break;
|
||||
}
|
||||
++ulD;
|
||||
if( nMax && ulD >= nMax )
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1370,15 +1370,15 @@ HB_SIZE hb_cdpUTF8ToStr( PHB_CODEPAGE cdp,
|
||||
}
|
||||
else if( cdp->fCustom )
|
||||
{
|
||||
for( ulS = ulD = 0; ulS < nSrc && ulD < nDst; ++ulS )
|
||||
for( ulS = ulD = 0; ulS < nSrc && ulD < nDst; )
|
||||
{
|
||||
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ ulS ], &n, &wc ) )
|
||||
++ulS;
|
||||
|
||||
if( n == 0 )
|
||||
{
|
||||
if( n == 0 )
|
||||
{
|
||||
if( ! HB_CDPCHAR_PUT( cdp, pDst, nDst, &ulD, wc ) )
|
||||
break;
|
||||
}
|
||||
if( ! HB_CDPCHAR_PUT( cdp, pDst, nDst, &ulD, wc ) )
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1389,17 +1389,17 @@ HB_SIZE hb_cdpUTF8ToStr( PHB_CODEPAGE cdp,
|
||||
uniTrans = cdp->uniTable->uniTrans;
|
||||
wcMax = cdp->uniTable->wcMax;
|
||||
|
||||
for( ulS = ulD = 0; ulS < nSrc && ulD < nDst; ++ulS )
|
||||
for( ulS = ulD = 0; ulS < nSrc && ulD < nDst; )
|
||||
{
|
||||
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ ulS ], &n, &wc ) )
|
||||
++ulS;
|
||||
|
||||
if( n == 0 )
|
||||
{
|
||||
if( n == 0 )
|
||||
{
|
||||
if( wc <= wcMax && uniTrans[ wc ] )
|
||||
pDst[ ulD++ ] = uniTrans[ wc ];
|
||||
else
|
||||
pDst[ ulD++ ] = wc >= 0x100 ? '?' : ( HB_UCHAR ) wc;
|
||||
}
|
||||
if( wc <= wcMax && uniTrans[ wc ] )
|
||||
pDst[ ulD++ ] = uniTrans[ wc ];
|
||||
else
|
||||
pDst[ ulD++ ] = wc >= 0x100 ? '?' : ( HB_UCHAR ) wc;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1533,26 +1533,26 @@ HB_SIZE hb_cdpStrToU16( PHB_CODEPAGE cdp, int iEndian,
|
||||
HB_WCHAR wc = 0;
|
||||
int n = 0;
|
||||
|
||||
for( ulS = ulD = 0; ulS < nSrc && ulD < nDst; ++ulS )
|
||||
for( ulS = ulD = 0; ulS < nSrc && ulD < nDst; )
|
||||
{
|
||||
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ ulS ], &n, &wc ) )
|
||||
++ulS;
|
||||
|
||||
if( n == 0 )
|
||||
{
|
||||
if( n == 0 )
|
||||
{
|
||||
#if defined( HB_CDP_ENDIAN_SWAP )
|
||||
if( iEndian == HB_CDP_ENDIAN_SWAP )
|
||||
wc = HB_SWAP_UINT16( wc );
|
||||
pDst[ ulD++ ] = wc;
|
||||
if( iEndian == HB_CDP_ENDIAN_SWAP )
|
||||
wc = HB_SWAP_UINT16( wc );
|
||||
pDst[ ulD++ ] = wc;
|
||||
#else
|
||||
if( iEndian == HB_CDP_ENDIAN_LITTLE )
|
||||
HB_PUT_LE_UINT16( &pDst[ ulD ], wc );
|
||||
else if( iEndian == HB_CDP_ENDIAN_BIG )
|
||||
HB_PUT_BE_UINT16( &pDst[ ulD ], wc );
|
||||
else
|
||||
pDst[ ulD ] = wc;
|
||||
++ulD;
|
||||
if( iEndian == HB_CDP_ENDIAN_LITTLE )
|
||||
HB_PUT_LE_UINT16( &pDst[ ulD ], wc );
|
||||
else if( iEndian == HB_CDP_ENDIAN_BIG )
|
||||
HB_PUT_BE_UINT16( &pDst[ ulD ], wc );
|
||||
else
|
||||
pDst[ ulD ] = wc;
|
||||
++ulD;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
* The CodePages API
|
||||
*
|
||||
* Copyright 2002 Alexander S.Kresin <alex@belacy.belgorod.su>
|
||||
* Copyright 2009 Przemyslaw Czerpak <druzus / at / priv.onet.pl>
|
||||
* Copyright 2009-2012 Przemyslaw Czerpak <druzus / at / priv.onet.pl>
|
||||
* www - http://harbour-project.org
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
@@ -74,16 +74,16 @@ static HB_SIZE utf8pos( const char * szUTF8, HB_SIZE nLen, HB_SIZE nUTF8Pos )
|
||||
HB_WCHAR uc;
|
||||
int n = 0;
|
||||
|
||||
for( n1 = n2 = 0; n1 < nLen; ++n1 )
|
||||
for( n1 = n2 = 0; n1 < nLen; )
|
||||
{
|
||||
if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szUTF8[ n1 ], &n, &uc ) )
|
||||
++n1;
|
||||
|
||||
if( n == 0 )
|
||||
{
|
||||
if( n == 0 )
|
||||
{
|
||||
if( --nUTF8Pos == 0 )
|
||||
return n2 + 1;
|
||||
n2 = n1 + 1;
|
||||
}
|
||||
if( --nUTF8Pos == 0 )
|
||||
return n2 + 1;
|
||||
n2 = n1;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -197,13 +197,14 @@ HB_FUNC( HB_UTF8ASC )
|
||||
HB_WCHAR wc = 0;
|
||||
int n = 0;
|
||||
|
||||
while( nLen-- )
|
||||
while( nLen )
|
||||
{
|
||||
if( hb_cdpUTF8ToU16NextChar( ( unsigned char ) *pszString++, &n, &wc ) )
|
||||
{
|
||||
if( n == 0 )
|
||||
break;
|
||||
}
|
||||
if( !hb_cdpUTF8ToU16NextChar( ( unsigned char ) *pszString, &n, &wc ) )
|
||||
break;
|
||||
if( n == 0 )
|
||||
break;
|
||||
pszString++;
|
||||
nLen--;
|
||||
}
|
||||
hb_retnint( wc );
|
||||
}
|
||||
@@ -335,7 +336,7 @@ HB_FUNC( HB_UTF8SUBSTR )
|
||||
else if( nFrom )
|
||||
--nFrom;
|
||||
|
||||
if( nLen && nCount > 0 )
|
||||
if( nLen > ( HB_SIZE ) nFrom && nCount > 0 )
|
||||
szDest = hb_cdpUTF8StringSubstr( szString, nLen,
|
||||
nFrom, nCount, &nDest );
|
||||
if( szDest )
|
||||
@@ -468,9 +469,10 @@ HB_FUNC( HB_UTF8POKE )
|
||||
|
||||
HB_FUNC( HB_UTF8STUFF )
|
||||
{
|
||||
const char * szString = hb_parc( 1 );
|
||||
const char * szText = hb_parc( 1 );
|
||||
const char * szIns = hb_parc( 4 );
|
||||
|
||||
if( szString && HB_ISNUM( 2 ) && HB_ISNUM( 3 ) && HB_ISCHAR( 4 ) )
|
||||
if( szText && szIns && HB_ISNUM( 2 ) && HB_ISNUM( 3 ) )
|
||||
{
|
||||
HB_SIZE nLen = hb_parclen( 1 );
|
||||
HB_SIZE nPos = hb_parns( 2 );
|
||||
@@ -480,7 +482,7 @@ HB_FUNC( HB_UTF8STUFF )
|
||||
|
||||
if( nPos )
|
||||
{
|
||||
nPos = utf8pos( szString, nLen, nPos );
|
||||
nPos = utf8pos( szText, nLen, nPos );
|
||||
if( nPos == 0 )
|
||||
nPos = nLen;
|
||||
else
|
||||
@@ -490,7 +492,7 @@ HB_FUNC( HB_UTF8STUFF )
|
||||
{
|
||||
if( nPos < nLen )
|
||||
{
|
||||
nDel = utf8pos( szString + nPos, nLen - nPos, nDel + 1 );
|
||||
nDel = utf8pos( szText + nPos, nLen - nPos, nDel + 1 );
|
||||
if( nDel == 0 )
|
||||
nDel = nLen - nPos;
|
||||
else
|
||||
@@ -504,9 +506,9 @@ HB_FUNC( HB_UTF8STUFF )
|
||||
{
|
||||
char * szResult = ( char * ) hb_xgrab( nTot + 1 );
|
||||
|
||||
hb_xmemcpy( szResult, szString, nPos );
|
||||
hb_xmemcpy( szResult + nPos, hb_parc( 4 ), nIns );
|
||||
hb_xmemcpy( szResult + nPos + nIns, szString + nPos + nDel,
|
||||
hb_xmemcpy( szResult, szText, nPos );
|
||||
hb_xmemcpy( szResult + nPos, szIns, nIns );
|
||||
hb_xmemcpy( szResult + nPos + nIns, szText + nPos + nDel,
|
||||
nLen - ( nPos + nDel ) );
|
||||
hb_retclen_buffer( szResult, nTot );
|
||||
}
|
||||
@@ -519,10 +521,10 @@ HB_FUNC( HB_UTF8STUFF )
|
||||
|
||||
HB_FUNC( HB_UTF8LEN )
|
||||
{
|
||||
const char * szString = hb_parc( 1 );
|
||||
const char * szText = hb_parc( 1 );
|
||||
|
||||
if( szString )
|
||||
hb_retnint( hb_cdpUTF8StringLength( szString, hb_parclen( 1 ) ) );
|
||||
if( szText )
|
||||
hb_retnint( hb_cdpUTF8StringLength( szText, hb_parclen( 1 ) ) );
|
||||
else
|
||||
hb_errRT_BASE_SubstR( EG_ARG, 3012, NULL, HB_ERR_FUNCNAME, HB_ERR_ARGS_BASEPARAMS );
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user