291 lines
9.2 KiB
C
291 lines
9.2 KiB
C
/*
|
|
* $Id$
|
|
*/
|
|
|
|
/*
|
|
* Harbour Project source code:
|
|
* STRDIFF() CT3 string function
|
|
*
|
|
* Copyright 2002 IntTec GmbH, Neunlindenstr 32, 79106 Freiburg, Germany
|
|
* Author: Martin Vogel <vogel@inttec.de>
|
|
*
|
|
* www - http://www.harbour-project.org
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2, or (at your option)
|
|
* any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this software; see the file COPYING. If not, write to
|
|
* the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
|
|
* Boston, MA 02111-1307 USA (or visit the web site http://www.gnu.org/).
|
|
*
|
|
* As a special exception, the Harbour Project gives permission for
|
|
* additional uses of the text contained in its release of Harbour.
|
|
*
|
|
* The exception is that, if you link the Harbour libraries with other
|
|
* files to produce an executable, this does not by itself cause the
|
|
* resulting executable to be covered by the GNU General Public License.
|
|
* Your use of that executable is in no way restricted on account of
|
|
* linking the Harbour library code into it.
|
|
*
|
|
* This exception does not however invalidate any other reasons why
|
|
* the executable file might be covered by the GNU General Public License.
|
|
*
|
|
* This exception applies only to the code released by the Harbour
|
|
* Project under the name Harbour. If you copy code from other
|
|
* Harbour Project or Free Software Foundation releases into a copy of
|
|
* Harbour, as the General Public License permits, the exception does
|
|
* not apply to the code that you add in this way. To avoid misleading
|
|
* anyone as to the status of such modified files, you must delete
|
|
* this exception notice from them.
|
|
*
|
|
* If you write modifications of your own for Harbour, it is your choice
|
|
* whether to permit this exception to apply to your modifications.
|
|
* If you do not wish that, delete this exception notice.
|
|
*
|
|
*/
|
|
|
|
|
|
#include "ct.h"
|
|
#include <limits.h>
|
|
|
|
|
|
/* $DOC$
|
|
* $FUNCNAME$
|
|
* STRDIFF()
|
|
* $CATEGORY$
|
|
* CT3 string functions
|
|
* $ONELINER$
|
|
* Evaluate the "Edit (Levenshtein) Distance" of two strings
|
|
* $SYNTAX$
|
|
* STRDIFF (<cString1>, <cString2>, [<nReplacementPenalty>], [<nDeletionPenalty>],
|
|
* [<nInsertionPenalty>]) -> <nDistance>
|
|
* $ARGUMENTS$
|
|
* <cString1> string at the "starting point" of the transformation process, default is ""
|
|
* <cString2> string at the "end point" of the transformation process, default is ""
|
|
* <nReplacementPenalty> penalty points for a replacement of one character, default is 3
|
|
* <nDeletionPenalty> penalty points for a deletion of one character, default is 6
|
|
* <nInsertionPenalty> penalty points for an insertion of one character, default is 1
|
|
* $RETURNS$
|
|
* <nDistance> penalty point sum of all operations needed to transform <cString1> to <cString2>
|
|
* $DESCRIPTION$
|
|
* The STRDIFF() function calculates the so-called "Edit" or "Levenshtein" distance of two strings.
|
|
* This distance is a measure for the number of single character replace/insert/delete operations (so called
|
|
* "point mutations") required to transform <cString1> into <cString2> and its value will be the smallest sum of
|
|
* the penalty points of the required operations.
|
|
*
|
|
* Be aware that this function is both quite time - O(len(cString1)*len(cString2)) - and memory consuming -
|
|
* O((len(cString1)+1)*(len(cString2)+1)*sizeof(int)) - so keep the strings as short as possible.
|
|
* E.g., on common 32 bit systems (sizeof(int) == 4), calling strdiff() with two strings of 1024 bytes
|
|
* in length will consume 4 MB of memory. To not impose unneeded restrictions, the function will only check if
|
|
* (len(cString1)+1)*(len(cString2)+1)*sizeof(int) < UINT_MAX, although allocating UINT_MAX bytes will not
|
|
* work on most systems. If this simple check fails, -1 is returned.
|
|
*
|
|
* Also, be aware that there can be an overflow when the penalty points are summed up: Assuming that the
|
|
* number of transformation operations is in the order of max(len(cString1),len(cString2)), the penalty point
|
|
* sum, that is internally stored in an "int" variable, is in the order of
|
|
* max(len(cString1),len(cString2))*max(nReplacementPenalty,nDeletionPenalty,nInsertionPenalty).
|
|
* STRDIFF() does not perform an overflow check for performance reasons. Future versions of STRDIFF()
|
|
* could use a type different to "int" to store the penalty point sum to save memory or to avoid overflows.
|
|
*
|
|
* The function is aware of the settings done by SETATLIKE(), that means that the wildcard character
|
|
* is considered equal to ALL characters.
|
|
*
|
|
* $EXAMPLES$
|
|
* ? strdiff("ABC", "ADC") // 3, one character replaced
|
|
* ? strdiff("ABC", "AEC") // 3, ditto
|
|
* ? strdiff("CBA", "ABC") // 6, two characters replaced
|
|
* ? strdiff("ABC", "AXBC") // 1, one character inserted
|
|
* ? strdiff("AXBC", "ABC") // 6, one character removed
|
|
* ? strdiff("AXBC", "ADC") // 9, one character removed and one replaced
|
|
* $TESTS$
|
|
* strdiff("ABC", "ADC") == 3
|
|
* strdiff("ABC", "AEC") == 3
|
|
* strdiff("CBA", "ABC") == 6
|
|
* strdiff("ABC", "AXBC") == 1
|
|
* strdiff("AXBC", "ABC") == 6
|
|
* strdiff("AXBC", "ADC") == 9
|
|
* $STATUS$
|
|
* Ready
|
|
* $COMPLIANCE$
|
|
* STRDIFF() is compatible with CT3's STRDIFF().
|
|
* $PLATFORMS$
|
|
* All
|
|
* $FILES$
|
|
* Source is strdiff.c, library is libct.
|
|
* $SEEALSO$
|
|
* SETATLIKE()
|
|
* $END$
|
|
*/
|
|
|
|
/*
 * Access element (row, col) of the penalty matrix as an lvalue.
 *
 * The matrix is kept in the flat int array piPenalty, stored row by row with
 * sStrLen2+1 entries per row; both piPenalty and sStrLen2 must be in scope
 * at the point of use.  The expansion is fully parenthesized so that it
 * composes safely with any surrounding operator (including postfix ones).
 */
#define MATRIXELEMENT(row_,col_) (*(piPenalty+((row_)*(sStrLen2+1))+(col_)))
|
|
|
|
/* Return the smallest of the three int values a, b and c. */
static int min3 (int a, int b, int c)
{
  int iSmallest = a;

  if (b < iSmallest)
  {
    iSmallest = b;
  }
  if (c < iSmallest)
  {
    iSmallest = c;
  }

  return (iSmallest);
}
|
|
|
|
HB_FUNC (STRDIFF)
|
|
{
|
|
|
|
/* param check */
|
|
if ((ISCHAR (1)) ||
|
|
(ISCHAR (2)))
|
|
{
|
|
|
|
/* get parameters */
|
|
char *pcStr1, *pcStr2;
|
|
size_t sStrLen1, sStrLen2;
|
|
int iReplace, iDelete, iInsert;
|
|
int iAtLike = ct_getatlike();
|
|
char cAtLike = ct_getatlikechar();
|
|
int *piPenalty;
|
|
size_t sRowCnt, sColCnt;
|
|
|
|
if (ISCHAR (1))
|
|
{
|
|
pcStr1 = (char *)hb_parc (1);
|
|
sStrLen1 = (size_t)hb_parclen (1);
|
|
}
|
|
else
|
|
{
|
|
pcStr1 = (char *)"";
|
|
sStrLen1 = 0;
|
|
}
|
|
|
|
if (ISCHAR (2))
|
|
{
|
|
pcStr2 = (char *)hb_parc (2);
|
|
sStrLen2 = (size_t)hb_parclen (2);
|
|
}
|
|
else
|
|
{
|
|
pcStr2 = (char *)"";
|
|
sStrLen2 = 0;
|
|
}
|
|
|
|
/* check for memory consumption */
|
|
if ((double)(((double)sStrLen1+1.0)*((double)sStrLen2+1.0)*((double)sizeof(int))) >= ((double)UINT_MAX))
|
|
{
|
|
int iArgErrorMode = ct_getargerrormode();
|
|
if (iArgErrorMode != CT_ARGERR_IGNORE)
|
|
{
|
|
ct_error ((USHORT)iArgErrorMode, EG_ARG, CT_ERROR_STRDIFF,
|
|
NULL, "STRDIFF", 0, EF_CANDEFAULT, 5,
|
|
hb_paramError (1), hb_paramError (2),
|
|
hb_paramError (3), hb_paramError (4),
|
|
hb_paramError (5));
|
|
}
|
|
hb_retni (-1);
|
|
return;
|
|
}
|
|
|
|
/* get penalty points */
|
|
if (ISNUM (3))
|
|
{
|
|
iReplace = hb_parni (3);
|
|
}
|
|
else
|
|
{
|
|
iReplace = 3;
|
|
}
|
|
|
|
if (ISNUM (4))
|
|
{
|
|
iDelete = hb_parni (4);
|
|
}
|
|
else
|
|
{
|
|
iDelete = 6;
|
|
}
|
|
|
|
if (ISNUM (5))
|
|
{
|
|
iInsert = hb_parni (5);
|
|
}
|
|
else
|
|
{
|
|
iInsert = 1;
|
|
}
|
|
|
|
piPenalty = (int *)hb_xgrab ((sStrLen1+1)*(sStrLen2+1)*sizeof(int));
|
|
|
|
MATRIXELEMENT(0,0) = 0;
|
|
|
|
for (sColCnt = 0; sColCnt <= sStrLen2-1; sColCnt++)
|
|
{
|
|
MATRIXELEMENT (0,sColCnt+1) = MATRIXELEMENT (0,sColCnt)+iInsert;
|
|
}
|
|
|
|
for (sRowCnt = 0; sRowCnt <= sStrLen1-1; sRowCnt++)
|
|
{
|
|
MATRIXELEMENT (sRowCnt+1,0) = MATRIXELEMENT (sRowCnt,0)+iDelete;
|
|
for (sColCnt = 0; sColCnt <= sStrLen2-1; sColCnt++)
|
|
{
|
|
int iReplaceCost;
|
|
if ((*(pcStr1+sRowCnt) == *(pcStr2+sColCnt)) ||
|
|
((iAtLike == CT_SETATLIKE_WILDCARD) &&
|
|
((*(pcStr1+sRowCnt) == cAtLike) ||
|
|
(*(pcStr2+sColCnt) == cAtLike))))
|
|
{
|
|
iReplaceCost = 0;
|
|
}
|
|
else
|
|
{
|
|
iReplaceCost = iReplace;
|
|
}
|
|
|
|
MATRIXELEMENT(sRowCnt+1,sColCnt+1) = min3 (MATRIXELEMENT(sRowCnt,sColCnt)+iReplaceCost,
|
|
MATRIXELEMENT(sRowCnt,sColCnt+1)+iDelete,
|
|
MATRIXELEMENT(sRowCnt+1,sColCnt)+iInsert);
|
|
}
|
|
}
|
|
|
|
hb_retni (MATRIXELEMENT(sStrLen1,sStrLen2));
|
|
hb_xfree (piPenalty);
|
|
|
|
}
|
|
else /* (ISCHAR (1)) ||
|
|
(ISCHAR (2)) */
|
|
{
|
|
PHB_ITEM pSubst = NULL;
|
|
int iArgErrorMode = ct_getargerrormode();
|
|
if (iArgErrorMode != CT_ARGERR_IGNORE)
|
|
{
|
|
pSubst = ct_error_subst ((USHORT)iArgErrorMode, EG_ARG, CT_ERROR_STRDIFF,
|
|
NULL, "STRDIFF", 0, EF_CANSUBSTITUTE, 5,
|
|
hb_paramError (1), hb_paramError (2), hb_paramError (3),
|
|
hb_paramError (4), hb_paramError (5));
|
|
}
|
|
|
|
if (pSubst != NULL)
|
|
{
|
|
hb_itemReturn (pSubst);
|
|
hb_itemRelease (pSubst);
|
|
}
|
|
else
|
|
{
|
|
hb_retni (0);
|
|
}
|
|
return;
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
|