/* * $Id$ */ /* * Harbour Project source code: * HTML Classes * * Copyright 2007 Hannes Ziegler * www - http://harbour-project.org * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2, or (at your option) * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this software; see the file COPYING. If not, write to * the Free Software Foundation, Inc., 59 Temple Place, Suite 330, * Boston, MA 02111-1307 USA (or visit the web site http://www.gnu.org/). * * As a special exception, the Harbour Project gives permission for * additional uses of the text contained in its release of Harbour. * * The exception is that, if you link the Harbour libraries with other * files to produce an executable, this does not by itself cause the * resulting executable to be covered by the GNU General Public License. * Your use of that executable is in no way restricted on account of * linking the Harbour library code into it. * * This exception does not however invalidate any other reasons why * the executable file might be covered by the GNU General Public License. * * This exception applies only to the code released by the Harbour * Project under the name Harbour. If you copy code from other * Harbour Project or Free Software Foundation releases into a copy of * Harbour, as the General Public License permits, the exception does * not apply to the code that you add in this way. To avoid misleading * anyone as to the status of such modified files, you must delete * this exception notice from them. * * If you write modifications of your own for Harbour, it is your choice * whether to permit this exception to apply to your modifications. * */ #include "error.ch" #include "hbclass.ch" #include "thtml.ch" // A Html document can have more than 16 nesting levels. // The current implementation of FOR EACH is not suitable for the HTML classes // Directives for a light weight html parser #xtrans P_PARSER( ) => { , 0, Len( ), 0 } #define P_STR 1 // the string to parse #define P_POS 2 // current parser position #define P_LEN 3 // length of string #define P_END 4 // last parser position #xtrans :p_str => \[P_STR] #xtrans :p_pos => \[P_POS] #xtrans :p_len => \[P_LEN] #xtrans :p_end => \[P_END] #xtrans P_SEEK( , ) => (:p_end:=:p_pos, :p_pos:=hb_At(,:p_str,:p_end+1)) #xtrans P_SEEKI( , ) => (:p_end:=:p_pos, :p_pos:=hb_AtI(,:p_str,:p_end+1)) #xtrans P_PEEK( , ) => (:p_end:=:p_pos, __tip_PStrCompi( :p_str, :p_pos, )) #xtrans P_NEXT( ) => (:p_end:=:p_pos, SubStr(:p_str,++:p_pos,1)) #xtrans P_PREV( ) => (:p_end:=:p_pos, SubStr(:p_str,--:p_pos,1)) // Directives for a light weight stack #define S_DATA 1 // array holding data elements #define S_NUM 2 // number of occupied data elements #define S_SIZE 3 // total size of data array #define S_STEP 4 // number of elements for auto sizing #xtrans S_STACK() => S_STACK(64) #xtrans S_STACK( ) => {Array(),0,,Max(32,Int(/2))} #xtrans S_GROW( ) => (iif(++\[S_NUM]>\[S_SIZE],ASize(\[S_DATA],(\[S_SIZE]+=\[S_STEP])),)) #xtrans S_SHRINK( ) => (iif(\[S_NUM]>0 .AND. --\[S_NUM]\<\[S_SIZE]-\[S_STEP],ASize(\[S_DATA],\[S_SIZE]-=\[S_STEP]),)) #xtrans S_COMPRESS( ) => (ASize(\[S_DATA],\[S_SIZE]:=\[S_NUM])) #xtrans S_PUSH(,) => (S_GROW(),\[S_DATA,\[S_NUM]]:=) #xtrans S_POP(,@) => (:=\[S_DATA,\[S_NUM]],\[S_DATA,\[S_NUM]]:=NIL,S_SHRINK()) #xtrans S_POP() => (\[S_DATA,\[S_NUM]]:=NIL,S_SHRINK()) #xtrans S_TOP() => (\[S_DATA,\[S_NUM]]) THREAD STATIC t_aHtmlAttr // data for HTML attributes THREAD STATIC t_hTagTypes // data for HTML tags THREAD STATIC t_aHtmlAnsiEntities // HTML character entities (ANSI character set) THREAD STATIC t_lInit := .F. // initilization flag for HTML data // #define _DEBUG_ #ifdef _DEBUG_ #xtranslate HIDDEN: => EXPORTED: // debugger can't see HIDDEN iVars #endif /* * Class for handling an entire HTML document */ CREATE CLASS THtmlDocument MODULE FRIENDLY HIDDEN: VAR oIterator VAR nodes EXPORTED: VAR root READONLY VAR head READONLY VAR body READONLY VAR changed INIT .T. METHOD new( cHtmlString ) METHOD readFile( cFileName ) METHOD writeFile( cFileName ) METHOD collect() METHOD toString() METHOD getNode( cTagName ) METHOD getNodes( cTagName ) METHOD findFirst( cName, cAttrib, cValue, cData ) METHOD findFirstRegex( cName, cAttrib, cValue, cData ) METHOD findNext() INLINE ::oIterator:Next() ENDCLASS // accepts a HTML formatted string METHOD new( cHtmlString ) CLASS THtmlDocument LOCAL cEmptyHtmlDoc, oNode, oSubNode, oErrNode, aHead, aBody, nMode := 0 cEmptyHtmlDoc := '' + hb_eol() + ; '' + hb_eol() + ; ' ' + hb_eol() + ; ' ' + hb_eol() + ; ' ' + hb_eol() + ; ' ' + hb_eol() + ; '' IF ! HB_ISSTRING( cHtmlString ) ::root := THtmlNode():new( cEmptyHtmlDoc ) ELSE IF ! ", and tags // Although they are optional, the THtmlDocument class enforces them // so that the instance variables :head and :body are always available aHead := {} aBody := {} FOR EACH oSubNode IN ::root:htmlContent IF oSubNode:isType( CM_HEAD ) AAdd( aHead, oSubNode ) ELSE AAdd( aBody, oSubNode ) ENDIF NEXT ::root := THtmlNode():new( cEmptyHtmlDoc ) ::root:document := Self ::changed := .T. ::head := ::getNode( "head" ) ::body := ::getNode( "body" ) FOR EACH oSubNode IN aHead IF oSubNode:isType( CM_HEAD ) ::head:addNode( oSubNode ) ELSE ::body:addNode( oSubNode ) ENDIF NEXT FOR EACH oSubNode IN aBody IF Lower( oSubNode:htmlTagName ) $ "html,head,body" // This node is an error in the HTML string. // We gracefully add its subnodes to the tag FOR EACH oErrNode IN oSubNode:htmlContent ::body:addNode( oErrNode ) NEXT ELSE IF oSubNode:isType( CM_HEAD ) oSubNode:delete() ::head:addNode( oSubNode ) ELSE ::body:addNode( oSubNode ) ENDIF ENDIF NEXT ELSEIF ::head == NIL ::head := ::body:insertBefore( THtmlNode():new( ::body, "head" ) ) ELSEIF ::body == NIL ::head := ::head:insertAfter( THtmlNode():new( ::head, "body" ) ) ENDIF IF nMode == 1 oNode := THtmlNode():new( cHtmlString ) FOR EACH oSubNode IN oNode:htmlContent IF oSubNode:isType( CM_HEAD ) ::head:addNode( oSubNode ) ELSE ::body:addNode( oSubNode ) ENDIF NEXT ENDIF RETURN Self // Builds a HTML formatted string METHOD toString() CLASS THtmlDocument RETURN ::root:toString() // reads HTML file and parses it into tree of objects METHOD readFile( cFileName ) CLASS THtmlDocument IF ! hb_FileExists( cFileName ) RETURN .F. ENDIF ::changed := .T. ::new( MemoRead( cFileName ) ) RETURN .T. // writes the entire tree of HTML objects into a file METHOD writeFile( cFileName ) CLASS THtmlDocument LOCAL cHtml := ::toString() LOCAL nFileHandle := FCreate( cFileName ) IF FError() != 0 RETURN .F. ENDIF FWrite( nFileHandle, cHtml ) FClose( nFileHandle ) ::changed := .F. RETURN FError() == 0 // builds a one dimensional array of all nodes contained in the HTML document METHOD collect() CLASS THtmlDocument IF ::changed ::nodes := ::root:collect() ::changed := .F. ENDIF RETURN ::nodes // returns the first tag matching the passed tag name METHOD getNode( cTagName ) CLASS THtmlDocument LOCAL oNode IF ::changed ::collect() ENDIF FOR EACH oNode IN ::nodes IF Lower( oNode:htmlTagName ) == Lower( cTagName ) RETURN oNode ENDIF NEXT RETURN NIL // returns all tags matching the passed tag name METHOD getNodes( cTagName ) CLASS THtmlDocument LOCAL oNode, stack := S_STACK() IF ::changed ::collect() ENDIF FOR EACH oNode IN ::nodes IF Lower( oNode:htmlTagName ) == Lower( cTagName ) S_PUSH( stack, oNode ) ENDIF NEXT S_COMPRESS( stack ) RETURN stack[ S_DATA ] // finds the first HTML tag matching the search criteria METHOD findFirst( cName, cAttrib, cValue, cData ) CLASS THtmlDocument ::oIterator := THtmlIteratorScan():New( Self ) RETURN ::oIterator:Find( cName, cAttrib, cValue, cData ) // finds the first HTML tag matching the RegEx search criteria METHOD findFirstRegex( cName, cAttrib, cValue, cData ) CLASS THtmlDocument ::oIterator := THtmlIteratorRegex():New( Self ) RETURN ::oIterator:Find( cName, cAttrib, cValue, cData ) /* * Abstract super class for THtmlIteratorScan and THtmlIteratorScanRegEx * * (Adopted from TXMLIterator -> source\rtl\txml.prg) */ CREATE CLASS THtmlIterator MODULE FRIENDLY METHOD New( oHtml ) CONSTRUCTOR METHOD Next() METHOD Rewind() METHOD Find( cName, cAttribute, cValue, cData ) METHOD GetNode() INLINE ::oNode METHOD SetContext() METHOD Clone() HIDDEN: VAR cName VAR cAttribute VAR cValue VAR cData VAR oNode VAR oTop VAR aNodes VAR nCurrent VAR nLast METHOD MatchCriteria() ENDCLASS // accepts a THtmlNode or THtmlDocument object METHOD New( oHtml ) CLASS THtmlIterator IF oHtml:isDerivedFrom ( "THtmlDocument" ) ::oNode := oHtml:root ::aNodes := oHtml:nodes ELSE ::oNode := oHtml ::aNodes := ::oNode:collect() ENDIF ::oTop := ::oNode ::nCurrent := 1 ::nLast := Len( ::aNodes ) RETURN Self METHOD rewind() CLASS THtmlIterator ::oNode := ::oTop ::nCurrent := 0 RETURN Self METHOD Clone() CLASS THtmlIterator LOCAL oRet oRet := THtmlIterator():New( ::oTop ) oRet:cName := ::cName oRet:cAttribute := ::cAttribute oRet:cValue := ::cValue oRet:cData := ::cData oRet:nCurrent := 0 oRet:nLast := Len( ::aNodes ) oRet:aNodes := ::aNodes RETURN oRet METHOD SetContext() CLASS THtmlIterator ::oTop := ::oNode ::aNodes := ::oNode:collect() ::nCurrent := 0 ::nLast := Len( ::aNodes ) RETURN Self METHOD Find( cName, cAttribute, cValue, cData ) CLASS THtmlIterator ::cName := cName ::cAttribute := cAttribute ::cValue := cValue ::cData := cData IF ::nLast == 0 ::nCurrent := 0 RETURN NIL ENDIF IF ::MatchCriteria( ::oNode ) RETURN ::oNode ENDIF RETURN ::Next() METHOD Next() CLASS THtmlIterator LOCAL oFound, lExit := .F. DO WHILE ! lExit BEGIN SEQUENCE WITH {| oErr | Break( oErr ) } oFound := ::aNodes[ ++::nCurrent ] IF ::MatchCriteria( oFound ) ::oNode := oFound lExit := .T. ENDIF RECOVER lExit := .T. oFound := NIL ::nCurrent := 0 END SEQUENCE ENDDO RETURN oFound METHOD MatchCriteria() CLASS THtmlIterator RETURN .T. /******************************************** Iterator scan class *********************************************/ CLASS THtmlIteratorScan FROM THtmlIterator MODULE FRIENDLY METHOD New( oNodeTop ) CONSTRUCTOR HIDDEN: METHOD MatchCriteria( oFound ) ENDCLASS METHOD New( oNodeTop ) CLASS THtmlIteratorScan ::Super:New( oNodeTop ) RETURN Self METHOD MatchCriteria( oFound ) CLASS THtmlIteratorScan LOCAL xData IF ::cName != NIL .AND. !( Lower( ::cName ) == Lower( oFound:htmlTagName ) ) RETURN .F. ENDIF IF ::cAttribute != NIL .AND. ! hb_HHasKey( oFound:getAttributes(), ::cAttribute ) RETURN .F. ENDIF IF ::cValue != NIL xData := oFound:getAttributes() IF hb_HScan( xData, {| xKey, cValue | HB_SYMBOL_UNUSED( xKey ), Lower( ::cValue ) == Lower( cValue ) } ) == 0 RETURN .F. ENDIF ENDIF IF ::cData != NIL xData := oFound:getText( " " ) /* NOTE: != changed to !( == ) */ IF Empty( xData ) .OR. !( AllTrim( ::cData ) == AllTrim( xData ) ) RETURN .F. ENDIF ENDIF RETURN .T. /******************************************** Iterator regex class *********************************************/ CLASS THtmlIteratorRegex FROM THtmlIterator MODULE FRIENDLY METHOD New( oNodeTop ) CONSTRUCTOR HIDDEN: METHOD MatchCriteria( oFound ) ENDCLASS METHOD New( oNodeTop ) CLASS THtmlIteratorRegex ::Super:New( oNodeTop ) RETURN Self METHOD MatchCriteria( oFound ) CLASS THtmlIteratorRegex LOCAL xData IF ::cName != NIL .AND. ! hb_regexLike( Lower( oFound:htmlTagName ), Lower( ::cName ) ) RETURN .F. ENDIF IF ::cAttribute != NIL .AND. ; hb_HScan( oFound:getAttributes(), {| cKey | hb_regexLike( Lower( ::cAttribute ), cKey ) } ) == 0 RETURN .F. ENDIF IF ::cValue != NIL .AND. ; hb_HScan( oFound:getAttributes(), {| xKey, cValue | HB_SYMBOL_UNUSED( xKey ), hb_regexLike( ::cValue, cValue ) } ) == 0 RETURN .F. ENDIF IF ::cData != NIL xData := oFound:getText( " " ) IF Empty( xData ) .OR. ! hb_regexHas( AllTrim( ::cData ), AllTrim( xData ) ) RETURN .F. ENDIF ENDIF RETURN .T. /* * Class representing a HTML node tree. * It parses a HTML formatted string */ CREATE CLASS THtmlNode MODULE FRIENDLY HIDDEN: VAR root VAR _document VAR parent VAR htmlContent METHOD parseHtml( parser ) METHOD parseHtmlFixed( parser ) METHOD _getTextNode() METHOD _setTextNode( cText ) METHOD keepFormatting() EXPORTED: VAR htmlTagName READONLY VAR htmlEndTagName READONLY VAR htmlTagType READONLY VAR htmlAttributes READONLY METHOD New( oParent, cTagName, cAttrib, cContent ) METHOD isType( nType ) ACCESS isEmpty() ACCESS isInline() ACCESS isOptional() ACCESS isNode() ACCESS HB_ISBLOCK() METHOD addNode( oTHtmlNode ) METHOD insertAfter( oTHtmlNode ) METHOD insertBefore( oTHtmlNode ) METHOD Delete() // Messages from TXmlNode MESSAGE insertBelow METHOD addNode MESSAGE unlink METHOD Delete METHOD firstNode( lRoot ) METHOD lastNode( lRoot ) ACCESS nextNode() ACCESS prevNode() ACCESS siblingNodes() INLINE iif( ::parent == NIL, NIL, ::parent:htmlContent ) ACCESS childNodes() INLINE iif( ::isNode(), ::htmlContent, NIL ) ACCESS parentNode() INLINE ::parent ACCESS document() INLINE iif( ::root == NIL, NIL, ::root:_document ) METHOD toString( nIndent ) METHOD attrToString() METHOD collect( oEndNode ) METHOD getText( cEOL ) METHOD getAttribute( cName ) METHOD getAttributes() METHOD setAttribute( cName, cValue ) METHOD setAttributes( cHtml ) METHOD delAttribute( cName ) METHOD delAttributes() METHOD isAttribute( cName ) ACCESS TEXT INLINE ::_getTextNode() ASSIGN TEXT( x ) INLINE ::_setTextNode( x ) ACCESS attr INLINE ::getAttributes() ASSIGN attr( x ) INLINE ::setAttributes( x ) METHOD pushNode OPERATOR + METHOD popNode OPERATOR - METHOD findNodeByTagName METHOD findNodesByTagName ERROR HANDLER noMessage METHOD noAttribute ENDCLASS METHOD new( oParent, cTagName, cAttrib, cContent ) CLASS THtmlNode IF ! t_lInit THtmlInit( .T. ) ENDIF IF HB_ISSTRING( oParent ) // a HTML string is passed -> build new tree of objects IF Chr( 9 ) $ oParent oParent := StrTran( oParent, Chr( 9 ), Chr( 32 ) ) ENDIF ::root := Self ::htmlTagName := "_root_" ::htmlTagType := THtmlTagType( "_root_" ) ::htmlContent := {} ::parseHtml( P_PARSER( oParent ) ) ELSEIF HB_ISOBJECT( oParent ) // a HTML object is passed -> we are in the course of building an object tree ::root := oParent:root ::parent := oParent IF HB_ISSTRING( cAttrib ) IF Right( cAttrib, 1 ) == "/" cAttrib := Stuff( cAttrib, Len( cAttrib ), 1, " " ) ::htmlEndTagName := "/" ::htmlAttributes := RTrim( cAttrib ) ELSE ::htmlAttributes := cAttrib ENDIF ELSE ::htmlAttributes := cAttrib ENDIF ::htmlTagName := cTagName ::htmlTagType := THtmlTagType( cTagName ) ::htmlContent := iif( cContent == NIL, {}, cContent ) ELSE RETURN ::error( "Parameter error", ::className(), ":new()", EG_ARG, hb_AParams() ) ENDIF RETURN Self METHOD isType( nType ) CLASS THtmlNode LOCAL lRet BEGIN SEQUENCE WITH {| oErr | Break( oErr ) } lRet := hb_bitAnd( ::htmlTagType[ 2 ], nType ) > 0 RECOVER lRet := .F. END SEQUENCE RETURN lRet // checks if this is a node that is always empty and never has HTML text, e.g. ,, METHOD isEmpty() CLASS THtmlNode RETURN hb_bitAnd( ::htmlTagType[ 2 ], CM_EMPTY ) > 0 // checks if this is a node that may occur inline, eg. , METHOD isInline() CLASS THtmlNode RETURN hb_bitAnd( ::htmlTagType[ 2 ], CM_INLINE ) > 0 // checks if this is a node that may appear without a closing tag, eg.
,, METHOD isOptional() CLASS THtmlNode RETURN hb_bitAnd( ::htmlTagType[ 2 ], CM_OPT ) > 0 // checks if this is a node (leafs contain no further nodes, e.g.
,
,_text_) METHOD isNode() CLASS THtmlNode RETURN HB_ISARRAY( ::htmlContent ) .AND. Len( ::htmlContent ) > 0 // checks if this is a block node that must be closed with an ending tag: eg:
,
METHOD HB_ISBLOCK() CLASS THtmlNode RETURN hb_bitAnd( ::htmlTagType[ 2 ], CM_BLOCK ) > 0 // checks if this is a node whose text line formatting must be preserved:
,