/* * $Id$ */ /* * Harbour Project source code: * STRDIFF() CT3 string function * * Copyright 2002 IntTec GmbH, Neunlindenstr 32, 79106 Freiburg, Germany * Author: Martin Vogel * * www - http://www.harbour-project.org * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2, or (at your option) * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this software; see the file COPYING. If not, write to * the Free Software Foundation, Inc., 59 Temple Place, Suite 330, * Boston, MA 02111-1307 USA (or visit the web site http://www.gnu.org/). * * As a special exception, the Harbour Project gives permission for * additional uses of the text contained in its release of Harbour. * * The exception is that, if you link the Harbour libraries with other * files to produce an executable, this does not by itself cause the * resulting executable to be covered by the GNU General Public License. * Your use of that executable is in no way restricted on account of * linking the Harbour library code into it. * * This exception does not however invalidate any other reasons why * the executable file might be covered by the GNU General Public License. * * This exception applies only to the code released by the Harbour * Project under the name Harbour. If you copy code from other * Harbour Project or Free Software Foundation releases into a copy of * Harbour, as the General Public License permits, the exception does * not apply to the code that you add in this way. To avoid misleading * anyone as to the status of such modified files, you must delete * this exception notice from them. * * If you write modifications of your own for Harbour, it is your choice * whether to permit this exception to apply to your modifications. * If you do not wish that, delete this exception notice. * */ #include "ct.h" #include /* $DOC$ * $FUNCNAME$ * STRDIFF() * $CATEGORY$ * CT3 string functions * $ONELINER$ * Evaluate the "Edit (Levensthein) Distance" of two strings * $SYNTAX$ * STRDIFF (, , [], [], * []) -> * $ARGUMENTS$ * string at the "starting point" of the transformation process, default is "" * string at the "end point" of the transformation process, default is "" * penalty points for a replacement of one character, default is 3 * penalty points for a deletion of one character, default is 6 * penalty points for an insertion of one character, default is 1 * $RETURNS$ * penalty point sum of all operations needed to transform to * $DESCRIPTION$ * The STRDIFF() functions calculates the so called "Edit" or "Levensthein" distance of two strings. * This distance is a measure for the number of single character replace/insert/delete operations (so called * "point mutations") required to transform into and its value will be the smallest sum of * the penalty points of the required operations. * * Be aware that this function is both quite time - O(len(cString1)*len(cString2)) - and memory consuming - * O((len(cString1)+1)*(len(cString2)+1)*sizeof(int)) - so keep the strings as short as possible. * E.g., on common 32 bit systems (sizeof(int) == 4), calling strdiff() with two strings of 1024 bytes * in length will consume 4 MB of memory. To not impose unneeded restrictions, the function will only check if * (len(cString1)+1)*(len(cString2)+1)*sizeof(int) <= UINT_MAX, although allocing UINT_MAX bytes will not * work on most systems. If this simple check fails, -1 is returned. * * Also, be aware that there can be an overflow when the penalty points are summed up: Assuming that the * number of transformation operations is in the order of max(len(cString1),len(cString2)), the penalty point * sum, that is internally stored in an "int" variable, is in the order of * (max(len(cString1),len(cString2))*max(nReplacementPenalty,nDeletionPenalty,nInsertionPentaly). * The STRDIFF() does not do an overflow check due to time performance reasons. Future versions of STRDIFF() * could use a type different to "int" to store the penalty point sum to save memory or to avoid overflows. * * The function is aware of the settings done by SETATLIKE(), that means that the wildchar character * is considered equal to ALL characters. * * $EXAMPLES$ * ? strdiff("ABC", "ADC") // 3, one character replaced * ? strdiff("ABC", "AEC") // 3, dito * ? strdiff("CBA", "ABC") // 6, two characters replaced * ? strdiff("ABC", "AXBC") // 1, one character inserted * ? strdiff("AXBC", "ABC") // 6, one character removed * ? strdiff("AXBC", "ADC") // 9, one character removed and one replaced * $TESTS$ * strdiff("ABC", "ADC") == 3 * strdiff("ABC", "AEC") == 3 * strdiff("CBA", "ABC") == 6 * strdiff("ABC", "AXBC") == 1 * strdiff("AXBC", "ABC") == 6 * strdiff("AXBC", "ADC") == 9 * $STATUS$ * Ready * $COMPLIANCE$ * STRDIFF() is compatible with CT3's STRDIFF(). * $PLATFORMS$ * All * $FILES$ * Source is strdiff.c, library is libct. * $SEEALSO$ * SETATLIKE() * $END$ */ #define MATRIXELEMENT(__row,__col) *(piPenalty+((__row)*(sStrLen2+1))+(__col)) static int min3( int a, int b, int c ) { if( a < b ) { return ( ( a < c ? a : c ) ); } return ( ( b < c ? b : c ) ); } HB_FUNC( STRDIFF ) { /* param check */ if( ISCHAR( 1 ) || ISCHAR( 2 ) ) { /* get parameters */ char *pcStr1, *pcStr2; size_t sStrLen1, sStrLen2; int iReplace, iDelete, iInsert; int iAtLike = ct_getatlike(); char cAtLike = ct_getatlikechar(); int *piPenalty; size_t sRowCnt, sColCnt; if( ISCHAR( 1 ) ) { pcStr1 = ( char * ) hb_parc( 1 ); sStrLen1 = ( size_t ) hb_parclen( 1 ); } else { pcStr1 = ( char * ) ""; sStrLen1 = 0; } if( ISCHAR( 2 ) ) { pcStr2 = ( char * ) hb_parc( 2 ); sStrLen2 = ( size_t ) hb_parclen( 2 ); } else { pcStr2 = ( char * ) ""; sStrLen2 = 0; } /* check for memory consumption */ if( ( ( double ) sStrLen1 + 1.0 ) * ( ( double ) sStrLen2 + 1.0 ) * ( ( double ) sizeof( int ) ) >= ( double ) UINT_MAX ) { int iArgErrorMode = ct_getargerrormode(); if( iArgErrorMode != CT_ARGERR_IGNORE ) { ct_error( ( USHORT ) iArgErrorMode, EG_ARG, CT_ERROR_STRDIFF, NULL, "STRDIFF", 0, EF_CANDEFAULT, HB_ERR_ARGS_BASEPARAMS ); } hb_retni( -1 ); return; } /* get penalty points */ if( ISNUM( 3 ) ) iReplace = hb_parni( 3 ); else iReplace = 3; if( ISNUM( 4 ) ) iDelete = hb_parni( 4 ); else iDelete = 6; if( ISNUM( 5 ) ) iInsert = hb_parni( 5 ); else iInsert = 1; piPenalty = ( int * ) hb_xgrab( ( sStrLen1 + 1 ) * ( sStrLen2 + 1 ) * sizeof( int ) ); MATRIXELEMENT( 0, 0 ) = 0; for( sColCnt = 0; sColCnt <= sStrLen2 - 1; sColCnt++ ) { MATRIXELEMENT( 0, sColCnt + 1 ) = MATRIXELEMENT( 0, sColCnt ) + iInsert; } for( sRowCnt = 0; sRowCnt <= sStrLen1 - 1; sRowCnt++ ) { MATRIXELEMENT( sRowCnt + 1, 0 ) = MATRIXELEMENT( sRowCnt, 0 ) + iDelete; for( sColCnt = 0; sColCnt <= sStrLen2 - 1; sColCnt++ ) { int iReplaceCost; if( pcStr1[sRowCnt] == pcStr2[sColCnt] || ( iAtLike == CT_SETATLIKE_WILDCARD && ( pcStr1[sRowCnt] == cAtLike || pcStr2[sColCnt] == cAtLike ) ) ) iReplaceCost = 0; else iReplaceCost = iReplace; MATRIXELEMENT( sRowCnt + 1, sColCnt + 1 ) = min3( MATRIXELEMENT( sRowCnt, sColCnt ) + iReplaceCost, MATRIXELEMENT( sRowCnt, sColCnt + 1 ) + iDelete, MATRIXELEMENT( sRowCnt + 1, sColCnt ) + iInsert ); } } hb_retni( MATRIXELEMENT( sStrLen1, sStrLen2 ) ); hb_xfree( piPenalty ); } else /* ISCHAR( 1 ) || ISCHAR( 2 ) */ { PHB_ITEM pSubst = NULL; int iArgErrorMode = ct_getargerrormode(); if( iArgErrorMode != CT_ARGERR_IGNORE ) { pSubst = ct_error_subst( ( USHORT ) iArgErrorMode, EG_ARG, CT_ERROR_STRDIFF, NULL, "STRDIFF", 0, EF_CANSUBSTITUTE, HB_ERR_ARGS_BASEPARAMS ); } if( pSubst != NULL ) hb_itemReturnRelease( pSubst ); else hb_retni( 0 ); } }