diff --git a/harbour/ChangeLog b/harbour/ChangeLog index d114803731..c725e60207 100644 --- a/harbour/ChangeLog +++ b/harbour/ChangeLog @@ -16,6 +16,16 @@ The license applies to all entries newer than 2009-04-28. */ +2012-03-20 19:06 UTC+0100 Przemyslaw Czerpak (druzus/at/poczta.onet.pl) + * harbour/src/rtl/cdpapi.c + * harbour/src/rtl/cdpapihb.c + * harbour/src/codepage/cp_utf8.c + * slightly modified algorithms for decoding UTF8 characters to work + better with broken UTF8 strings (i.e. to not eat characters after + broken and unclosed multibyte one) + * some minor improvements in HB_UTF8STUFF() + * formatting + 2012-03-20 18:44 UTC+0100 Przemyslaw Czerpak (druzus/at/poczta.onet.pl) * harbour/include/hbapicdp.h + added missing declaration of hb_fsNameConvU16() diff --git a/harbour/src/codepage/cp_utf8.c b/harbour/src/codepage/cp_utf8.c index 135deb49e1..b99e40db90 100644 --- a/harbour/src/codepage/cp_utf8.c +++ b/harbour/src/codepage/cp_utf8.c @@ -63,13 +63,12 @@ static HB_CDP_GET_FUNC( UTF8_get ) *wc = 0; while( nIndex < nLen ) { - if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nIndex++ ], &n, wc ) ) + if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nIndex ], &n, wc ) ) + ++nIndex; + if( n == 0 ) { - if( n == 0 ) - { - *pnIndex = nIndex; - return HB_TRUE; - } + *pnIndex = nIndex; + return HB_TRUE; } } return HB_FALSE; diff --git a/harbour/src/rtl/cdpapi.c b/harbour/src/rtl/cdpapi.c index 90929da756..73aa963f57 100644 --- a/harbour/src/rtl/cdpapi.c +++ b/harbour/src/rtl/cdpapi.c @@ -234,15 +234,19 @@ static HB_BOOL hb_cdpUTF8_get( PHB_CODEPAGE cdp, *wc = 0; while( nIndex < nLen ) { - if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nIndex++ ], &n, wc ) ) + if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ nIndex ], &n, wc ) ) + ++nIndex; + if( n == 0 ) { - if( n == 0 ) - { - *pnIndex = nIndex; - return HB_TRUE; - } + *pnIndex = nIndex; + return HB_TRUE; } } + if( n > 0 ) + { + *pnIndex = nIndex; + return HB_TRUE; + } return HB_FALSE; } @@ -891,7 +895,10 @@ HB_BOOL 
hb_cdpUTF8ToU16NextChar( HB_UCHAR ucChar, int * n, HB_WCHAR * pwc ) if( *n > 0 ) { if( ( ucChar & 0xc0 ) != 0x80 ) + { + *n = 0; return HB_FALSE; + } *pwc = ( *pwc << 6 ) | ( ucChar & 0x3f ); ( *n )--; return HB_TRUE; @@ -936,14 +943,15 @@ HB_SIZE hb_cdpUTF8StringLength( const char * pSrc, HB_SIZE nLen ) HB_WCHAR wc; int n = 0; - for( ul = nDst = 0; ul < nLen; ++ul ) + for( ul = nDst = 0; ul < nLen; ) { if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ ul ], &n, &wc ) ) - { - if( n == 0 ) - ++nDst; - } + ++ul; + if( n == 0 ) + ++nDst; } + if( n > 0 ) + ++nDst; return nDst; } @@ -968,24 +976,20 @@ HB_SIZE hb_cdpUTF8StringAt( const char * szNeedle, HB_SIZE nLenN, { do { - if( !hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szHaystack[ nPosH++ ], &nH, &wcH ) ) - { - --nPosH; - nH = 0; - } - } while( nH ); + if( !hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szHaystack[ nPosH ], &nH, &wcH ) ) + break; + ++nPosH; + } while( nH && nPosH < nLenH ); if( ++nPos < nStart ) continue; do { - if( !hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szNeedle[ nPosN++ ], &nN, &wcN ) ) - { - --nPosN; - nN = 0; - } - } while( nN ); + if( !hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szNeedle[ nPosN ], &nN, &wcN ) ) + break; + ++nPosN; + } while( nN && nPosN < nLenN ); if( wcH == wcN ) { @@ -1034,13 +1038,12 @@ HB_WCHAR hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos ) HB_WCHAR wc = 0; int n = 0; - for( ul = 0; ul < nLen && nPos; ++ul ) + for( ul = 0; ul < nLen && nPos; ) { if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ ul ], &n, &wc ) ) - { - if( n == 0 ) - --nPos; - } + ++ul; + if( n == 0 ) + --nPos; } if( ul < nLen ) @@ -1049,12 +1052,11 @@ HB_WCHAR hb_cdpUTF8StringPeek( const char * pSrc, HB_SIZE nLen, HB_SIZE nPos ) do { if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ ul ], &n, &wc ) ) - { - if( n == 0 ) - return wc; - } + ++ul; + if( n == 0 ) + return wc; } - while( ++ul < nLen ); + while( ul < nLen ); } } @@ -1073,13 +1075,12 @@ char * hb_cdpUTF8StringSubstr( const char * pSrc, HB_SIZE nLen, 
if( nCount && nLen ) { n = 0; - for( ul = 0; ul < nLen && nFrom; ++ul ) + for( ul = 0; ul < nLen && nFrom; ) { if( hb_cdpUTF8ToU16NextChar( pSrc[ ul ], &n, &wc ) ) - { - if( n == 0 ) - --nFrom; - } + ++ul; + if( n == 0 ) + --nFrom; } if( ul < nLen ) @@ -1090,12 +1091,11 @@ char * hb_cdpUTF8StringSubstr( const char * pSrc, HB_SIZE nLen, do { if( hb_cdpUTF8ToU16NextChar( pSrc[ ul ], &n, &wc ) ) - { - if( n == 0 ) - --nCnt; - } + ++ul; + if( n == 0 ) + --nCnt; } - while( ++ul < nLen && nCnt ); + while( ul < nLen && nCnt ); nDst = ul - nFrom; pDst = ( char * ) hb_xgrab( nDst + 1 ); @@ -1317,32 +1317,32 @@ HB_SIZE hb_cdpUTF8AsStrLen( PHB_CODEPAGE cdp, const char * pSrc, HB_SIZE nSrc, return ( nMax && nSrc > nMax ) ? nMax : nSrc; else if( cdp->fCustom ) { - for( ulS = ulD = 0; ulS < nSrc; ++ulS ) + for( ulS = ulD = 0; ulS < nSrc; ) { if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ ulS ], &n, &wc ) ) + ++ulS; + + if( n == 0 ) { - if( n == 0 ) - { - i = HB_CDPCHAR_LEN( cdp, wc ); - if( nMax && ulD + i > nMax ) - break; - ulD += i; - } + i = HB_CDPCHAR_LEN( cdp, wc ); + if( nMax && ulD + i > nMax ) + break; + ulD += i; } } } else { - for( ulS = ulD = 0; ulS < nSrc; ++ulS ) + for( ulS = ulD = 0; ulS < nSrc; ) { if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ ulS ], &n, &wc ) ) + ++ulS; + + if( n == 0 ) { - if( n == 0 ) - { - ++ulD; - if( nMax && ulD >= nMax ) - break; - } + ++ulD; + if( nMax && ulD >= nMax ) + break; } } } @@ -1370,15 +1370,15 @@ HB_SIZE hb_cdpUTF8ToStr( PHB_CODEPAGE cdp, } else if( cdp->fCustom ) { - for( ulS = ulD = 0; ulS < nSrc && ulD < nDst; ++ulS ) + for( ulS = ulD = 0; ulS < nSrc && ulD < nDst; ) { if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ ulS ], &n, &wc ) ) + ++ulS; + + if( n == 0 ) { - if( n == 0 ) - { - if( ! HB_CDPCHAR_PUT( cdp, pDst, nDst, &ulD, wc ) ) - break; - } + if( ! 
HB_CDPCHAR_PUT( cdp, pDst, nDst, &ulD, wc ) ) + break; } } } @@ -1389,17 +1389,17 @@ HB_SIZE hb_cdpUTF8ToStr( PHB_CODEPAGE cdp, uniTrans = cdp->uniTable->uniTrans; wcMax = cdp->uniTable->wcMax; - for( ulS = ulD = 0; ulS < nSrc && ulD < nDst; ++ulS ) + for( ulS = ulD = 0; ulS < nSrc && ulD < nDst; ) { if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ ulS ], &n, &wc ) ) + ++ulS; + + if( n == 0 ) { - if( n == 0 ) - { - if( wc <= wcMax && uniTrans[ wc ] ) - pDst[ ulD++ ] = uniTrans[ wc ]; - else - pDst[ ulD++ ] = wc >= 0x100 ? '?' : ( HB_UCHAR ) wc; - } + if( wc <= wcMax && uniTrans[ wc ] ) + pDst[ ulD++ ] = uniTrans[ wc ]; + else + pDst[ ulD++ ] = wc >= 0x100 ? '?' : ( HB_UCHAR ) wc; } } } @@ -1533,26 +1533,26 @@ HB_SIZE hb_cdpStrToU16( PHB_CODEPAGE cdp, int iEndian, HB_WCHAR wc = 0; int n = 0; - for( ulS = ulD = 0; ulS < nSrc && ulD < nDst; ++ulS ) + for( ulS = ulD = 0; ulS < nSrc && ulD < nDst; ) { if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) pSrc[ ulS ], &n, &wc ) ) + ++ulS; + + if( n == 0 ) { - if( n == 0 ) - { #if defined( HB_CDP_ENDIAN_SWAP ) - if( iEndian == HB_CDP_ENDIAN_SWAP ) - wc = HB_SWAP_UINT16( wc ); - pDst[ ulD++ ] = wc; + if( iEndian == HB_CDP_ENDIAN_SWAP ) + wc = HB_SWAP_UINT16( wc ); + pDst[ ulD++ ] = wc; #else - if( iEndian == HB_CDP_ENDIAN_LITTLE ) - HB_PUT_LE_UINT16( &pDst[ ulD ], wc ); - else if( iEndian == HB_CDP_ENDIAN_BIG ) - HB_PUT_BE_UINT16( &pDst[ ulD ], wc ); - else - pDst[ ulD ] = wc; - ++ulD; + if( iEndian == HB_CDP_ENDIAN_LITTLE ) + HB_PUT_LE_UINT16( &pDst[ ulD ], wc ); + else if( iEndian == HB_CDP_ENDIAN_BIG ) + HB_PUT_BE_UINT16( &pDst[ ulD ], wc ); + else + pDst[ ulD ] = wc; + ++ulD; #endif - } } } } diff --git a/harbour/src/rtl/cdpapihb.c b/harbour/src/rtl/cdpapihb.c index 725331ffb9..f6680d2a75 100644 --- a/harbour/src/rtl/cdpapihb.c +++ b/harbour/src/rtl/cdpapihb.c @@ -7,7 +7,7 @@ * The CodePages API * * Copyright 2002 Alexander S.Kresin - * Copyright 2009 Przemyslaw Czerpak + * Copyright 2009-2012 Przemyslaw Czerpak * www - 
http://harbour-project.org * * This program is free software; you can redistribute it and/or modify @@ -74,16 +74,16 @@ static HB_SIZE utf8pos( const char * szUTF8, HB_SIZE nLen, HB_SIZE nUTF8Pos ) HB_WCHAR uc; int n = 0; - for( n1 = n2 = 0; n1 < nLen; ++n1 ) + for( n1 = n2 = 0; n1 < nLen; ) { if( hb_cdpUTF8ToU16NextChar( ( HB_UCHAR ) szUTF8[ n1 ], &n, &uc ) ) + ++n1; + + if( n == 0 ) { - if( n == 0 ) - { - if( --nUTF8Pos == 0 ) - return n2 + 1; - n2 = n1 + 1; - } + if( --nUTF8Pos == 0 ) + return n2 + 1; + n2 = n1; } } } @@ -197,13 +197,14 @@ HB_FUNC( HB_UTF8ASC ) HB_WCHAR wc = 0; int n = 0; - while( nLen-- ) + while( nLen ) { - if( hb_cdpUTF8ToU16NextChar( ( unsigned char ) *pszString++, &n, &wc ) ) - { - if( n == 0 ) - break; - } + if( !hb_cdpUTF8ToU16NextChar( ( unsigned char ) *pszString, &n, &wc ) ) + break; + if( n == 0 ) + break; + pszString++; + nLen--; } hb_retnint( wc ); } @@ -335,7 +336,7 @@ HB_FUNC( HB_UTF8SUBSTR ) else if( nFrom ) --nFrom; - if( nLen && nCount > 0 ) + if( nLen > ( HB_SIZE ) nFrom && nCount > 0 ) szDest = hb_cdpUTF8StringSubstr( szString, nLen, nFrom, nCount, &nDest ); if( szDest ) @@ -468,9 +469,10 @@ HB_FUNC( HB_UTF8POKE ) HB_FUNC( HB_UTF8STUFF ) { - const char * szString = hb_parc( 1 ); + const char * szText = hb_parc( 1 ); + const char * szIns = hb_parc( 4 ); - if( szString && HB_ISNUM( 2 ) && HB_ISNUM( 3 ) && HB_ISCHAR( 4 ) ) + if( szText && szIns && HB_ISNUM( 2 ) && HB_ISNUM( 3 ) ) { HB_SIZE nLen = hb_parclen( 1 ); HB_SIZE nPos = hb_parns( 2 ); @@ -480,7 +482,7 @@ HB_FUNC( HB_UTF8STUFF ) if( nPos ) { - nPos = utf8pos( szString, nLen, nPos ); + nPos = utf8pos( szText, nLen, nPos ); if( nPos == 0 ) nPos = nLen; else @@ -490,7 +492,7 @@ HB_FUNC( HB_UTF8STUFF ) { if( nPos < nLen ) { - nDel = utf8pos( szString + nPos, nLen - nPos, nDel + 1 ); + nDel = utf8pos( szText + nPos, nLen - nPos, nDel + 1 ); if( nDel == 0 ) nDel = nLen - nPos; else @@ -504,9 +506,9 @@ HB_FUNC( HB_UTF8STUFF ) { char * szResult = ( char * ) hb_xgrab( nTot + 1 ); 
- hb_xmemcpy( szResult, szString, nPos ); - hb_xmemcpy( szResult + nPos, hb_parc( 4 ), nIns ); - hb_xmemcpy( szResult + nPos + nIns, szString + nPos + nDel, + hb_xmemcpy( szResult, szText, nPos ); + hb_xmemcpy( szResult + nPos, szIns, nIns ); + hb_xmemcpy( szResult + nPos + nIns, szText + nPos + nDel, nLen - ( nPos + nDel ) ); hb_retclen_buffer( szResult, nTot ); } @@ -519,10 +521,10 @@ HB_FUNC( HB_UTF8STUFF ) HB_FUNC( HB_UTF8LEN ) { - const char * szString = hb_parc( 1 ); + const char * szText = hb_parc( 1 ); - if( szString ) - hb_retnint( hb_cdpUTF8StringLength( szString, hb_parclen( 1 ) ) ); + if( szText ) + hb_retnint( hb_cdpUTF8StringLength( szText, hb_parclen( 1 ) ) ); else hb_errRT_BASE_SubstR( EG_ARG, 3012, NULL, HB_ERR_FUNCNAME, HB_ERR_ARGS_BASEPARAMS ); }