harbour-core/contrib/xhb/fparse.c

/*
 * FParse*(), F*Count()
 *
 * Copyright 2004 Andi Jahja <xharbour@cbn.net.id>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; see the file LICENSE.txt.  If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301 USA (or visit https://www.gnu.org/licenses/).
 *
 * As a special exception, the Harbour Project gives permission for
 * additional uses of the text contained in its release of Harbour.
 *
 * The exception is that, if you link the Harbour libraries with other
 * files to produce an executable, this does not by itself cause the
 * resulting executable to be covered by the GNU General Public License.
 * Your use of that executable is in no way restricted on account of
 * linking the Harbour library code into it.
 *
 * This exception does not however invalidate any other reasons why
 * the executable file might be covered by the GNU General Public License.
 *
 * This exception applies only to the code released by the Harbour
 * Project under the name Harbour.  If you copy code from other
 * Harbour Project or Free Software Foundation releases into a copy of
 * Harbour, as the General Public License permits, the exception does
 * not apply to the code that you add in this way.  To avoid misleading
 * anyone as to the status of such modified files, you must delete
 * this exception notice from them.
 *
 * If you write modifications of your own for Harbour, it is your choice
 * whether to permit this exception to apply to your modifications.
 * If you do not wish that, delete this exception notice.
 *
 */

/* FIXME: use Harbour VF IO API */

/* FParse( cFile, cDelimiter ) --> array

   Purpose:
      Parse a delimited text file.

   Parameters:
      cFile - file to process
      cDelimiter - delimiter, default is comma

   Returns:
      Upon success -> Two dimensional array, of which each element contains
                      the results of parsing
      Upon error   -> An empty array
 */

#include "hbapi.h"
#include "hbapifs.h"
#include "hbapiitm.h"
#include "hbfast.h"

/* maximum number of characters stored per read; longer lines are returned
   in several chunks. Adjustable, but this should be sufficient in normal
   situations */
#define MAX_READ  4096

/* Split one line of text into fields and append each field, as a string
 * item, to the array pReturn.
 *
 * pReturn    - array item that receives one element per parsed field
 * szText     - NUL terminated text to parse; NULL or "" leaves pReturn and
 *              *iWord untouched
 * iDelimiter - character separating the fields
 * iWord      - receives the number of fields appended (written only when
 *              szText is non-empty)
 *
 * Text following a double quote is copied up to the next double quote,
 * so a quoted field may contain the delimiter. If the text ends with the
 * delimiter, one extra empty field is appended.
 */
static void hb_ParseLine( PHB_ITEM pReturn, const char * szText, int iDelimiter, int * iWord )
{
   if( szText )
   {
      HB_ISIZ nLen = strlen( szText );

      if( nLen > 0 )
      {
         /* scratch item reused for every field handed to the array */
         PHB_ITEM pTemp      = hb_itemNew( NULL );
         HB_ISIZ  i          = 0;
         int      word_count = 0;
         /* a field can never be longer than the whole line */
         char * szResult = ( char * ) hb_xgrab( nLen + 1 );

         while( i < nLen )
         {
            /* ui is the index of the last character stored in szResult */
            HB_ISIZ ui = 0;

            /* NOTE: the buffer is filled with spaces, not NULs, so every
               branch below has to terminate the field explicitly */
            hb_xmemset( szResult, ' ', nLen + 1 );

            /* an '"' found, loop until the next one is found */
            if( szText[ i ] == '"' )
            {
               /* an '"' after '"' ? */
               if( szText[ i + 1 ] != '"' )
                  szResult[ ui ] = szText[ i + 1 ];
               else
                  szResult[ ui ] = '\0';

               ++i;

               while( ++i < nLen )
               {
                  if( szText[ i - 1 ] == '"' )
                  {
                     szResult[ ui + 1 ] = '\0';
                     break;
                  }
                  else
                  {
                     if( szText[ i ] == '"' )
                        szResult[ ui + 1 ] = '\0';
                     else
                        szResult[ ++ui ] = szText[ i ];
                  }
               }
               /* The loop terminates the field only when it sees a closing
                  quote. For an unterminated quote (e.g. `"abc`) the buffer
                  would otherwise reach hb_itemPutC() without a NUL and be
                  read past its end. When a closing quote was found this
                  position already holds the terminator. */
               szResult[ ui + 1 ] = '\0';
               word_count++;
               hb_arrayAddForward( pReturn, hb_itemPutC( pTemp, szResult ) );
            }
            /* delimiter found */
            else if( szText[ i ] == iDelimiter )
            {
               /* first delimiter found but no word yet */
               if( word_count == 0 )
               {
                  /* add an empty string */
                  szResult[ ui ] = '\0';
               }
               else
               {
                  /* we already have the first word */
                  /* check previous character */
                  if( szText[ i - 1 ] == iDelimiter )
                  {
                     /* delimiter after delimiter */
                     /* just add an empty string */
                     szResult[ ui ] = '\0';
                  }
                  else
                  {
                     /* ",,0" */
                     /* it is not a delimiter */
                     /* move to next character */
                     ++i;
                     szResult[ ui ] = szText[ i ];

                     while( ++i < nLen )
                     {
                        if( szText[ i ] == iDelimiter )
                           break;
                        else
                           szResult[ ++ui ] = szText[ i ];
                     }
                  }
               }
               word_count++;
               szResult[ ui + 1 ] = '\0';
               hb_arrayAddForward( pReturn, hb_itemPutC( pTemp, szResult ) );
            }
            else
            {
               /* plain field: copy up to the next delimiter */
               szResult[ ui ] = szText[ i ];

               while( ++i < nLen )
               {
                  if( szText[ i ] == iDelimiter )
                  {
                     szResult[ ui + 1 ] = '\0';
                     break;
                  }
                  else if( szText[ i ] == '"' )
                  {
                     /* quote inside a field: the character after the quote
                        replaces the last stored one, then copy up to the
                        closing quote */
                     szResult[ ui ] = szText[ i + 1 ];
                     ++i;

                     while( ++i < nLen )
                     {
                        if( szText[ i - 1 ] == '"' )
                        {
                           szResult[ ui + 1 ] = '\0';
                           break;
                        }
                        else
                        {
                           if( szText[ i ] == '"' )
                           {
                              szResult[ ui + 1 ] = '\0';
                              break;
                           }
                           else
                              szResult[ ++ui ] = szText[ i ];
                        }
                     }
                  }
                  else
                     szResult[ ++ui ] = szText[ i ];
               }
               word_count++;
               szResult[ ui + 1 ] = '\0';
               hb_arrayAddForward( pReturn, hb_itemPutC( pTemp, szResult ) );
            }

            i++;
         }

         /* last character in passed string is a delimiter */
         /* just add an empty string */
         if( szText[ nLen - 1 ] == iDelimiter )
         {
            word_count++;
            hb_arrayAddForward( pReturn, hb_itemPutC( pTemp, NULL ) );
         }

         /* store number of words */
         *iWord = word_count;

         /* clean up */
         hb_xfree( szResult );

         hb_itemRelease( pTemp );
      }
   }
}

/* Split `string` into words separated by `delimiter`.
 *
 * string     - NUL terminated text to split
 * delimiter  - separator byte; runs of separators count as one and leading
 *              separators are ignored, so no empty words are produced
 * iCharCount - length of `string`, used to size the work buffer
 * iWord      - receives the number of words found
 *
 * Returns a NULL terminated list of pointers to the words. All words live
 * in one shared buffer whose address is kept in the slot just before the
 * returned list. The caller releases the result with:
 *
 *    tokens--;
 *    hb_xfree( tokens[ 0 ] );
 *    hb_xfree( tokens );
 */
static char ** hb_tokensplit( const char * string, HB_BYTE delimiter, int iCharCount, int * iWord )
{
   char *  buffer, * bufptr;
   char ** token_list;
   char    last_char  = '\0';
   int     word_count = 0, word_nbr;

   buffer = ( char * ) hb_xgrab( iCharCount + 1 );

   bufptr = buffer;

   /* copy the words into the buffer, each one followed by a single NUL */
   while( *string )
   {
      if( ( HB_BYTE ) *string == delimiter )
      {
         while( ( HB_BYTE ) *string == delimiter )
            string++;

         /* close the current word, unless nothing has been stored yet */
         if( bufptr > buffer )
         {
            word_count++;
            last_char = *bufptr++ = '\0';
         }
      }
      else
         last_char = *bufptr++ = *string++;
   }

   /* A trailing word not followed by a delimiter has not been counted yet.
      Compare against NUL rather than using `> 0`: where plain char is
      signed, a word ending in a byte >= 0x80 would otherwise be dropped. */
   if( last_char != '\0' )
      word_count++;

   *bufptr = '\0';

   /* one extra slot in front for the buffer address, one behind for NULL */
   token_list      = ( char ** ) hb_xgrab( sizeof( char * ) * ( word_count + 2 ) );
   token_list[ 0 ] = buffer;
   token_list++;

   bufptr = buffer;

   for( word_nbr = 0; word_nbr < word_count; word_nbr++ )
   {
      token_list[ word_nbr ] = bufptr;
      bufptr += strlen( bufptr ) + 1;
   }

   token_list[ word_count ] = NULL;

   *iWord = word_count;

   return token_list;
}

/* Read the next line of `stream` into `string`.
 *
 * stream     - open input stream
 * string     - destination buffer of at least MAX_READ + 1 bytes
 * iCharCount - receives the number of characters stored
 *
 * Reading stops at '\n', at end of file, at character 26, or once MAX_READ
 * characters have been stored. '\r' characters are skipped. The stored
 * text is always NUL terminated.
 *
 * Returns a true value when a line was read (a '\n' was seen, at least one
 * character was stored, or the buffer became full), otherwise false.
 */
static HB_BOOL file_read( FILE * stream, char * string, int * iCharCount )
{
   int nStored = 0;

   memset( string, ' ', MAX_READ );

   do
   {
      int c = fgetc( stream );

      switch( c )
      {
         case '\n':
         case EOF:
         case 26:
            /* end of the line or of the input */
            string[ nStored ] = '\0';
            *iCharCount       = nStored;
            return c == '\n' || nStored;

         case '\r':
            /* carriage returns are never stored */
            break;

         default:
            string[ nStored++ ] = ( char ) c;
      }
   }
   while( nStored < MAX_READ );

   /* buffer is full: hand back what has been collected so far */
   string[ MAX_READ ] = '\0';
   *iCharCount        = nStored;
   return HB_TRUE;
}

HB_FUNC( FPARSE )
{
   FILE *   inFile;
   PHB_ITEM pSrc   = hb_param( 1, HB_IT_STRING );
   PHB_ITEM pDelim = hb_param( 2, HB_IT_STRING );
   PHB_ITEM pArray;
   PHB_ITEM pItem;
   char *   string;
   int      iToken, iCharCount = 0;
   HB_BYTE  nByte;

   /* file parameter correctly passed */
   if( ! pSrc )
   {
      hb_reta( 0 );
      return;
   }

   if( hb_itemGetCLen( pSrc ) == 0 )
   {
      hb_reta( 0 );
      return;
   }

   /* open file for read */
   inFile = hb_fopen( hb_itemGetCPtr( pSrc ), "r" );

   /* return empty array on failure */
   if( ! inFile )
   {
      hb_reta( 0 );
      return;
   }

   /* default delimiter to comma */
   nByte = pDelim ? ( HB_BYTE ) hb_itemGetCPtr( pDelim )[ 0 ] : ( HB_BYTE ) ',';

   /* the main array */
   pArray = hb_itemArrayNew( 0 );
   pItem  = hb_itemNew( NULL );

   /* book memory for line to read */
   string = ( char * ) hb_xgrab( MAX_READ + 1 );

   /* read the file until EOF */
   while( file_read( inFile, string, &iCharCount ) )
   {
      /* parse the read line */
      int iWord = 0;

      char ** tokens = hb_tokensplit( string, nByte, iCharCount, &iWord );

      /* prepare empty array */
      hb_arrayNew( pItem, iWord );

      /* add parsed text to array */
      for( iToken = 0; tokens[ iToken ]; iToken++ )
         hb_arraySetC( pItem, iToken + 1, tokens[ iToken ] );

      /* add array containing parsed text to main array */
      hb_arrayAddForward( pArray, pItem );

      /* clean up */
      tokens--;
      hb_xfree( tokens[ 0 ] );
      hb_xfree( tokens );
   }

   /* return main array */
   hb_itemReturnRelease( pArray );
   hb_itemRelease( pItem );

   /* clean up */
   hb_xfree( string );
   fclose( inFile );
}

HB_FUNC( FPARSEEX )
{
   FILE *   inFile;
   PHB_ITEM pSrc   = hb_param( 1, HB_IT_STRING );
   PHB_ITEM pDelim = hb_param( 2, HB_IT_STRING );
   PHB_ITEM pArray;
   PHB_ITEM pSubArray;
   char *   string;
   int      iCharCount = 0;
   HB_BYTE  nByte;

   /* file parameter correctly passed */
   if( ! pSrc )
   {
      hb_reta( 0 );
      return;
   }

   if( hb_itemGetCLen( pSrc ) == 0 )
   {
      hb_reta( 0 );
      return;
   }

   /* open file for read */
   inFile = hb_fopen( hb_itemGetCPtr( pSrc ), "r" );

   /* return empty array on failure */
   if( ! inFile )
   {
      hb_reta( 0 );
      return;
   }

   /* default delimiter to comma */
   nByte = pDelim ? ( HB_BYTE ) hb_itemGetCPtr( pDelim )[ 0 ] : ( HB_BYTE ) ',';

   /* the main array */
   pArray    = hb_itemArrayNew( 0 );
   pSubArray = hb_itemNew( NULL );

   /* book memory for line to read */
   string = ( char * ) hb_xgrab( MAX_READ + 1 );

   /* read the file until EOF */
   while( file_read( inFile, string, &iCharCount ) )
   {
      /* parse the read line */
      int iWord = 0;
      hb_arrayNew( pSubArray, 0 );

      hb_ParseLine( pSubArray, string, nByte, &iWord );

      /* add array containing parsed text to main array */
      hb_arrayAddForward( pArray, pSubArray );
   }

   /* return main array */
   hb_itemReturnRelease( pArray );
   hb_itemRelease( pSubArray );

   /* clean up */
   hb_xfree( string );
   fclose( inFile );
}

/* FWordCount( cFile ) --> nWords
 *
 * Counts the space separated words of cFile, summing the word count that
 * hb_tokensplit() reports for each line. Returns 0 when the file name is
 * missing, empty or the file cannot be opened.
 */
HB_FUNC( FWORDCOUNT )
{
   PHB_ITEM pFileName = hb_param( 1, HB_IT_STRING );
   HB_SIZE  nTotal    = 0;
   HB_BYTE  cSpace    = ' ';
   int      nChars    = 0;
   FILE *   fp;
   char *   szLine;

   /* a non-empty file name is required */
   if( ! pFileName || hb_itemGetCLen( pFileName ) == 0 )
   {
      hb_retns( 0 );
      return;
   }

   fp = hb_fopen( hb_itemGetCPtr( pFileName ), "r" );
   if( fp == NULL )
   {
      /* could not open the file */
      hb_retns( 0 );
      return;
   }

   szLine = ( char * ) hb_xgrab( MAX_READ + 1 );

   while( file_read( fp, szLine, &nChars ) )
   {
      int     nTokens = 0;
      char ** ppToken = hb_tokensplit( szLine, cSpace, nChars, &nTokens );

      nTotal += nTokens;

      /* only the count is needed; release the token list right away
         (the slot in front of the list holds the shared text buffer) */
      hb_xfree( ppToken[ -1 ] );
      hb_xfree( ppToken - 1 );
   }

   hb_retns( nTotal );

   hb_xfree( szLine );
   fclose( fp );
}

/* FLineCount( cFile ) --> nLines
 *
 * Counts the '\n' characters in cFile. Returns 0 when the file name is
 * missing, empty or the file cannot be opened.
 */
HB_FUNC( FLINECOUNT )
{
   PHB_ITEM pFileName = hb_param( 1, HB_IT_STRING );
   HB_SIZE  nNewLines = 0;
   FILE *   fp;
   int      c;

   /* a non-empty file name is required */
   if( ! pFileName || hb_itemGetCLen( pFileName ) == 0 )
   {
      hb_retns( 0 );
      return;
   }

   fp = hb_fopen( hb_itemGetCPtr( pFileName ), "r" );
   if( fp == NULL )
   {
      /* could not open the file */
      hb_retns( 0 );
      return;
   }

   /* scan the whole file, one character at a time */
   for( c = fgetc( fp ); c != EOF; c = fgetc( fp ) )
   {
      if( c == '\n' )
         ++nNewLines;
   }

   hb_retns( nNewLines );

   fclose( fp );
}

/* FCharCount( cFile ) --> nChars
 *
 * Counts the characters of cFile, ignoring line feeds, carriage returns,
 * spaces and tabs. Returns 0 when the file name is missing, empty or the
 * file cannot be opened.
 */
HB_FUNC( FCHARCOUNT )
{
   PHB_ITEM pFileName = hb_param( 1, HB_IT_STRING );
   HB_SIZE  nCounted  = 0;
   FILE *   fp;
   int      c;

   /* a non-empty file name is required */
   if( ! pFileName || hb_itemGetCLen( pFileName ) == 0 )
   {
      hb_retns( 0 );
      return;
   }

   fp = hb_fopen( hb_itemGetCPtr( pFileName ), "r" );
   if( fp == NULL )
   {
      /* could not open the file */
      hb_retns( 0 );
      return;
   }

   /* scan the whole file, one character at a time */
   for( c = fgetc( fp ); c != EOF; c = fgetc( fp ) )
   {
      /* skip the four whitespace characters, count everything else */
      if( c != '\n' && c != '\r' && c != ' ' && c != '\t' )
         ++nCounted;
   }

   hb_retns( nCounted );

   fclose( fp );
}

HB_FUNC( FPARSELINE )
{
   PHB_ITEM     pArray;
   HB_ISIZ      nWords = 0;
   const char * szText;

   pArray = hb_itemArrayNew( 0 );
   szText = hb_parc( 1 );

   if( szText )
   {
      const char * szDelim = hb_parc( 2 );
      int          iWords  = 0;
      hb_ParseLine( pArray, szText, szDelim ? ( unsigned char ) *szDelim : ',', &iWords );
      nWords = iWords;
   }

   hb_itemReturnRelease( pArray );
   hb_storns( nWords, 3 );
}