Files
harbour-core/harbour/contrib/libct/strdiff.c
Przemyslaw Czerpak 0242db11df 2007-07-19 18:45 UTC+0200 Przemyslaw Czerpak (druzus/at/priv.onet.pl)
* harbour/contrib/libct/strdiff.c
  * harbour/contrib/libct/token2.c
  * harbour/contrib/libct/Makefile
  * harbour/contrib/libct/wordrepl.c
  * harbour/contrib/libct/strswap.c
  * harbour/contrib/libct/makefile.bc
  * harbour/contrib/libct/screen1.c
  * harbour/contrib/libct/token1.c
  * harbour/contrib/libct/makefile.vc
  * harbour/contrib/libct/ctwfunc.c
  * harbour/contrib/libct/relation.c
  + harbour/contrib/libct/setlast.c
  * harbour/contrib/libct/tab.c
  * harbour/contrib/libct/replace.c
  - harbour/contrib/libct/setlast.prg
  * harbour/contrib/libct/wordtoch.c
  * harbour/contrib/libct/remove.c
    * synced with xHarbour modifications and fixes
    * new functions or .prg functions rewritten in C
    ! some fixes
    * indenting
2007-07-19 16:45:42 +00:00

259 lines
9.4 KiB
C

/*
* $Id$
*/
/*
* Harbour Project source code:
* STRDIFF() CT3 string function
*
* Copyright 2002 IntTec GmbH, Neunlindenstr 32, 79106 Freiburg, Germany
* Author: Martin Vogel <vogel@inttec.de>
*
* www - http://www.harbour-project.org
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this software; see the file COPYING. If not, write to
* the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307 USA (or visit the web site http://www.gnu.org/).
*
* As a special exception, the Harbour Project gives permission for
* additional uses of the text contained in its release of Harbour.
*
* The exception is that, if you link the Harbour libraries with other
* files to produce an executable, this does not by itself cause the
* resulting executable to be covered by the GNU General Public License.
* Your use of that executable is in no way restricted on account of
* linking the Harbour library code into it.
*
* This exception does not however invalidate any other reasons why
* the executable file might be covered by the GNU General Public License.
*
* This exception applies only to the code released by the Harbour
* Project under the name Harbour. If you copy code from other
* Harbour Project or Free Software Foundation releases into a copy of
* Harbour, as the General Public License permits, the exception does
* not apply to the code that you add in this way. To avoid misleading
* anyone as to the status of such modified files, you must delete
* this exception notice from them.
*
* If you write modifications of your own for Harbour, it is your choice
* whether to permit this exception to apply to your modifications.
* If you do not wish that, delete this exception notice.
*
*/
#include "ct.h"
#include <limits.h>
/* $DOC$
* $FUNCNAME$
* STRDIFF()
* $CATEGORY$
* CT3 string functions
* $ONELINER$
* Evaluate the "Edit (Levensthein) Distance" of two strings
* $SYNTAX$
* STRDIFF (<cString1>, <cString2>, [<nReplacementPenalty>], [<nDeletionPenalty>],
* [<nInsertionPenalty>]) -> <nDistance>
* $ARGUMENTS$
* <cString1> string at the "starting point" of the transformation process, default is ""
* <cString2> string at the "end point" of the transformation process, default is ""
* <nReplacementPenalty> penalty points for a replacement of one character, default is 3
* <nDeletionPenalty> penalty points for a deletion of one character, default is 6
* <nInsertionPenalty> penalty points for an insertion of one character, default is 1
* $RETURNS$
* <nDistance> penalty point sum of all operations needed to transform <cString1> to <cString2>
* $DESCRIPTION$
* The STRDIFF() functions calculates the so called "Edit" or "Levensthein" distance of two strings.
* This distance is a measure for the number of single character replace/insert/delete operations (so called
* "point mutations") required to transform <cString1> into <cString2> and its value will be the smallest sum of
* the penalty points of the required operations.
*
* Be aware that this function is both quite time - O(len(cString1)*len(cString2)) - and memory consuming -
* O((len(cString1)+1)*(len(cString2)+1)*sizeof(int)) - so keep the strings as short as possible.
* E.g., on common 32 bit systems (sizeof(int) == 4), calling strdiff() with two strings of 1024 bytes
* in length will consume 4 MB of memory. To not impose unneeded restrictions, the function will only check if
* (len(cString1)+1)*(len(cString2)+1)*sizeof(int) <= UINT_MAX, although allocing UINT_MAX bytes will not
* work on most systems. If this simple check fails, -1 is returned.
*
* Also, be aware that there can be an overflow when the penalty points are summed up: Assuming that the
* number of transformation operations is in the order of max(len(cString1),len(cString2)), the penalty point
* sum, that is internally stored in an "int" variable, is in the order of
* (max(len(cString1),len(cString2))*max(nReplacementPenalty,nDeletionPenalty,nInsertionPentaly).
* The STRDIFF() does not do an overflow check due to time performance reasons. Future versions of STRDIFF()
* could use a type different to "int" to store the penalty point sum to save memory or to avoid overflows.
*
* The function is aware of the settings done by SETATLIKE(), that means that the wildchar character
* is considered equal to ALL characters.
*
* $EXAMPLES$
* ? strdiff("ABC", "ADC") // 3, one character replaced
* ? strdiff("ABC", "AEC") // 3, dito
* ? strdiff("CBA", "ABC") // 6, two characters replaced
* ? strdiff("ABC", "AXBC") // 1, one character inserted
* ? strdiff("AXBC", "ABC") // 6, one character removed
* ? strdiff("AXBC", "ADC") // 9, one character removed and one replaced
* $TESTS$
* strdiff("ABC", "ADC") == 3
* strdiff("ABC", "AEC") == 3
* strdiff("CBA", "ABC") == 6
* strdiff("ABC", "AXBC") == 1
* strdiff("AXBC", "ABC") == 6
* strdiff("AXBC", "ADC") == 9
* $STATUS$
* Ready
* $COMPLIANCE$
* STRDIFF() is compatible with CT3's STRDIFF().
* $PLATFORMS$
* All
* $FILES$
* Source is strdiff.c, library is libct.
* $SEEALSO$
* SETATLIKE()
* $END$
*/
#define MATRIXELEMENT(__row,__col) *(piPenalty+((__row)*(sStrLen2+1))+(__col))
static int min3( int a, int b, int c )
{
if( a < b )
{
return ( ( a < c ? a : c ) );
}
return ( ( b < c ? b : c ) );
}
HB_FUNC( STRDIFF )
{
/* param check */
if( ISCHAR( 1 ) || ISCHAR( 2 ) )
{
/* get parameters */
char *pcStr1, *pcStr2;
size_t sStrLen1, sStrLen2;
int iReplace, iDelete, iInsert;
int iAtLike = ct_getatlike();
char cAtLike = ct_getatlikechar();
int *piPenalty;
size_t sRowCnt, sColCnt;
if( ISCHAR( 1 ) )
{
pcStr1 = ( char * ) hb_parc( 1 );
sStrLen1 = ( size_t ) hb_parclen( 1 );
}
else
{
pcStr1 = ( char * ) "";
sStrLen1 = 0;
}
if( ISCHAR( 2 ) )
{
pcStr2 = ( char * ) hb_parc( 2 );
sStrLen2 = ( size_t ) hb_parclen( 2 );
}
else
{
pcStr2 = ( char * ) "";
sStrLen2 = 0;
}
/* check for memory consumption */
if( ( ( double ) sStrLen1 + 1.0 ) *
( ( double ) sStrLen2 + 1.0 ) *
( ( double ) sizeof( int ) ) >= ( double ) UINT_MAX )
{
int iArgErrorMode = ct_getargerrormode();
if( iArgErrorMode != CT_ARGERR_IGNORE )
{
ct_error( ( USHORT ) iArgErrorMode, EG_ARG, CT_ERROR_STRDIFF, NULL,
"STRDIFF", 0, EF_CANDEFAULT, HB_ERR_ARGS_BASEPARAMS );
}
hb_retni( -1 );
return;
}
/* get penalty points */
if( ISNUM( 3 ) )
iReplace = hb_parni( 3 );
else
iReplace = 3;
if( ISNUM( 4 ) )
iDelete = hb_parni( 4 );
else
iDelete = 6;
if( ISNUM( 5 ) )
iInsert = hb_parni( 5 );
else
iInsert = 1;
piPenalty = ( int * ) hb_xgrab( ( sStrLen1 + 1 ) *
( sStrLen2 + 1 ) * sizeof( int ) );
MATRIXELEMENT( 0, 0 ) = 0;
for( sColCnt = 0; sColCnt <= sStrLen2 - 1; sColCnt++ )
{
MATRIXELEMENT( 0, sColCnt + 1 ) = MATRIXELEMENT( 0, sColCnt ) + iInsert;
}
for( sRowCnt = 0; sRowCnt <= sStrLen1 - 1; sRowCnt++ )
{
MATRIXELEMENT( sRowCnt + 1, 0 ) = MATRIXELEMENT( sRowCnt, 0 ) + iDelete;
for( sColCnt = 0; sColCnt <= sStrLen2 - 1; sColCnt++ )
{
int iReplaceCost;
if( pcStr1[sRowCnt] == pcStr2[sColCnt] ||
( iAtLike == CT_SETATLIKE_WILDCARD &&
( pcStr1[sRowCnt] == cAtLike ||
pcStr2[sColCnt] == cAtLike ) ) )
iReplaceCost = 0;
else
iReplaceCost = iReplace;
MATRIXELEMENT( sRowCnt + 1, sColCnt + 1 ) =
min3( MATRIXELEMENT( sRowCnt, sColCnt ) + iReplaceCost,
MATRIXELEMENT( sRowCnt, sColCnt + 1 ) + iDelete,
MATRIXELEMENT( sRowCnt + 1, sColCnt ) + iInsert );
}
}
hb_retni( MATRIXELEMENT( sStrLen1, sStrLen2 ) );
hb_xfree( piPenalty );
}
else /* ISCHAR( 1 ) || ISCHAR( 2 ) */
{
PHB_ITEM pSubst = NULL;
int iArgErrorMode = ct_getargerrormode();
if( iArgErrorMode != CT_ARGERR_IGNORE )
{
pSubst = ct_error_subst( ( USHORT ) iArgErrorMode, EG_ARG,
CT_ERROR_STRDIFF, NULL, "STRDIFF", 0,
EF_CANSUBSTITUTE, HB_ERR_ARGS_BASEPARAMS );
}
if( pSubst != NULL )
hb_itemReturnRelease( pSubst );
else
hb_retni( 0 );
}
}