/* * HTML Classes * * Copyright 2007 Hannes Ziegler * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2, or (at your option) * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; see the file LICENSE.txt. If not, write to * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, * Boston, MA 02110-1301 USA (or visit https://www.gnu.org/licenses/). * * As a special exception, the Harbour Project gives permission for * additional uses of the text contained in its release of Harbour. * * The exception is that, if you link the Harbour libraries with other * files to produce an executable, this does not by itself cause the * resulting executable to be covered by the GNU General Public License. * Your use of that executable is in no way restricted on account of * linking the Harbour library code into it. * * This exception does not however invalidate any other reasons why * the executable file might be covered by the GNU General Public License. * * This exception applies only to the code released by the Harbour * Project under the name Harbour. If you copy code from other * Harbour Project or Free Software Foundation releases into a copy of * Harbour, as the General Public License permits, the exception does * not apply to the code that you add in this way. To avoid misleading * anyone as to the status of such modified files, you must delete * this exception notice from them. * * If you write modifications of your own for Harbour, it is your choice * whether to permit this exception to apply to your modifications. * If you do not wish that, delete this exception notice. * */ #include "error.ch" #include "hbclass.ch" #include "thtml.ch" // A Html document can have more than 16 nesting levels. // The current implementation of FOR EACH is not suitable for the HTML classes // Directives for a light weight html parser #xtrans P_PARSER( ) => { , 0, Len( ), 0 } #xtrans :p_str => \[ 1 ] // the string to parse #xtrans :p_pos => \[ 2 ] // current parser position #xtrans :p_len => \[ 3 ] // length of string #xtrans :p_end => \[ 4 ] // last parser position #xtrans P_SEEK( , ) => ( :p_end := :p_pos, :p_pos := hb_At( , :p_str, :p_end + 1 ) ) #xtrans P_SEEKI( , ) => ( :p_end := :p_pos, :p_pos := hb_AtI( , :p_str, :p_end + 1 ) ) #xtrans P_PEEK( , ) => ( :p_end := :p_pos, hb_LeftEqI( SubStr( :p_str, :p_pos ), ) ) #xtrans P_NEXT( ) => ( :p_end := :p_pos, SubStr( :p_str, ++:p_pos, 1 ) ) #xtrans P_PREV( ) => ( :p_end := :p_pos, SubStr( :p_str, --:p_pos, 1 ) ) // Directives for a light weight stack #define S_DATA 1 // array holding data elements #define S_NUM 2 // number of occupied data elements #define S_SIZE 3 // total size of data array #define S_STEP 4 // number of elements for auto sizing #xtrans S_STACK() => S_STACK( 64 ) #xtrans S_STACK( ) => { Array( ), 0, , Max( 32, Int( / 2 ) ) } #xtrans S_GROW( ) => ( iif( ++\[S_NUM] > \[S_SIZE], ASize( \[S_DATA], ( \[S_SIZE] += \[S_STEP] ) ), ) ) #xtrans S_SHRINK( ) => ( iif( \[S_NUM] > 0 .AND. --\[S_NUM] \< \[S_SIZE] - \[S_STEP], ASize( \[S_DATA], \[S_SIZE] -= \[S_STEP] ), ) ) #xtrans S_COMPRESS( ) => ( ASize( \[S_DATA], \[S_SIZE] := \[S_NUM] ) ) #xtrans S_PUSH( , ) => ( S_GROW( ), \[S_DATA, \[S_NUM]] := ) #xtrans S_POP( , @ ) => ( := \[S_DATA, \[S_NUM]], \[S_DATA, \[S_NUM]] := NIL, S_SHRINK( ) ) #xtrans S_POP( ) => ( \[S_DATA, \[S_NUM]] := NIL, S_SHRINK( ) ) #xtrans S_TOP( ) => ( \[S_DATA, \[S_NUM]] ) THREAD STATIC t_aHA // data for HTML attributes THREAD STATIC t_hHT // data for HTML tags THREAD STATIC t_cHtmlCP := "" THREAD STATIC t_aHtmlUnicEntities // HTML character entities THREAD STATIC t_cHtmlUnicChars #ifdef HB_LEGACY_LEVEL4 THREAD STATIC t_aHtmlAnsiEntities // HTML character entities (ANSI character set) THREAD STATIC t_cHtmlAnsiChars #endif THREAD STATIC t_lInit := .F. // initialization flag for HTML data #ifdef _DEBUG_ #xtranslate HIDDEN: => EXPORTED: // debugger cannot see HIDDEN iVars #endif /* Class for handling an entire HTML document */ CREATE CLASS THtmlDocument MODULE FRIENDLY HIDDEN: VAR oIterator VAR nodes EXPORTED: VAR root READONLY VAR head READONLY VAR body READONLY VAR changed INIT .T. METHOD new( cHtmlString ) METHOD readFile( cFileName ) METHOD writeFile( cFileName ) METHOD collect() METHOD toString() METHOD getNode( cTagName ) METHOD getNodes( cTagName ) METHOD findFirst( cName, cAttrib, cValue, cData ) METHOD findFirstRegex( cName, cAttrib, cValue, cData ) METHOD findNext() INLINE ::oIterator:Next() ENDCLASS // accepts a HTML formatted string METHOD new( cHtmlString ) CLASS THtmlDocument LOCAL oSubNode, oErrNode, aHead, aBody, nMode := 0 LOCAL cEmptyHtmlDoc := ; "" + hb_eol() + ; "" + hb_eol() + ; " " + hb_eol() + ; " " + hb_eol() + ; " " + hb_eol() + ; " " + hb_eol() + ; "" IF ! HB_ISSTRING( cHtmlString ) ::root := THtmlNode():new( cEmptyHtmlDoc ) ELSEIF ", and tags // Although they are optional, the THtmlDocument class enforces them // so that the instance variables :head and :body are always available aHead := {} aBody := {} FOR EACH oSubNode IN ::root:htmlContent IF oSubNode:isType( CM_HEAD ) AAdd( aHead, oSubNode ) ELSE AAdd( aBody, oSubNode ) ENDIF NEXT ::root := THtmlNode():new( cEmptyHtmlDoc ) ::root:document := Self ::changed := .T. ::head := ::getNode( "head" ) ::body := ::getNode( "body" ) FOR EACH oSubNode IN aHead IF oSubNode:isType( CM_HEAD ) ::head:addNode( oSubNode ) ELSE ::body:addNode( oSubNode ) ENDIF NEXT FOR EACH oSubNode IN aBody IF Lower( oSubNode:htmlTagName ) $ "html,head,body" // This node is an error in the HTML string. // We gracefully add its subnodes to the tag FOR EACH oErrNode IN oSubNode:htmlContent ::body:addNode( oErrNode ) NEXT ELSE IF oSubNode:isType( CM_HEAD ) oSubNode:delete() ::head:addNode( oSubNode ) ELSE ::body:addNode( oSubNode ) ENDIF ENDIF NEXT ELSEIF ::head == NIL ::head := ::body:insertBefore( THtmlNode():new( ::body, "head" ) ) ELSEIF ::body == NIL ::head := ::head:insertAfter( THtmlNode():new( ::head, "body" ) ) ENDIF IF nMode == 1 FOR EACH oSubNode IN THtmlNode():new( cHtmlString ):htmlContent IF oSubNode:isType( CM_HEAD ) ::head:addNode( oSubNode ) ELSE ::body:addNode( oSubNode ) ENDIF NEXT ENDIF RETURN Self // Builds a HTML formatted string METHOD toString() CLASS THtmlDocument RETURN ::root:toString() // reads HTML file and parses it into tree of objects METHOD readFile( cFileName ) CLASS THtmlDocument IF hb_vfExists( cFileName ) ::changed := .T. ::new( MemoRead( cFileName ) ) RETURN .T. ENDIF RETURN .F. // writes the entire tree of HTML objects into a file METHOD writeFile( cFileName ) CLASS THtmlDocument LOCAL lSuccess := hb_MemoWrit( cFileName, ::toString() ) IF lSuccess ::changed := .F. ENDIF RETURN lSuccess // builds a one dimensional array of all nodes contained in the HTML document METHOD collect() CLASS THtmlDocument IF ::changed ::nodes := ::root:collect() ::changed := .F. ENDIF RETURN ::nodes // returns the first tag matching the passed tag name METHOD getNode( cTagName ) CLASS THtmlDocument LOCAL oNode IF ::changed ::collect() ENDIF FOR EACH oNode IN ::nodes IF Lower( oNode:htmlTagName ) == Lower( cTagName ) RETURN oNode ENDIF NEXT RETURN NIL // returns all tags matching the passed tag name METHOD getNodes( cTagName ) CLASS THtmlDocument LOCAL oNode, stack := S_STACK() IF ::changed ::collect() ENDIF FOR EACH oNode IN ::nodes IF Lower( oNode:htmlTagName ) == Lower( cTagName ) S_PUSH( stack, oNode ) ENDIF NEXT S_COMPRESS( stack ) RETURN stack[ S_DATA ] // finds the first HTML tag matching the search criteria METHOD findFirst( cName, cAttrib, cValue, cData ) CLASS THtmlDocument ::oIterator := THtmlIteratorScan():New( Self ) RETURN ::oIterator:Find( cName, cAttrib, cValue, cData ) // finds the first HTML tag matching the RegEx search criteria METHOD findFirstRegex( cName, cAttrib, cValue, cData ) CLASS THtmlDocument ::oIterator := THtmlIteratorRegex():New( Self ) RETURN ::oIterator:Find( cName, cAttrib, cValue, cData ) /* Abstract super class for THtmlIteratorScan and THtmlIteratorScanRegEx (Adopted from TXMLIterator -> contrib/xhb/txml.prg) */ CREATE CLASS THtmlIterator MODULE FRIENDLY METHOD New( oHtml ) CONSTRUCTOR METHOD Next() METHOD Rewind() METHOD Find( cName, cAttribute, cValue, cData ) METHOD GetNode() INLINE ::oNode METHOD SetContext() METHOD Clone() HIDDEN: VAR cName VAR cAttribute VAR cValue VAR cData VAR oNode VAR oTop VAR aNodes VAR nCurrent VAR nLast METHOD MatchCriteria() ENDCLASS // accepts a THtmlNode or THtmlDocument object METHOD New( oHtml ) CLASS THtmlIterator IF oHtml:isDerivedFrom ( "THtmlDocument" ) ::oNode := oHtml:root ::aNodes := oHtml:nodes ELSE ::oNode := oHtml ::aNodes := ::oNode:collect() ENDIF ::oTop := ::oNode ::nCurrent := 1 ::nLast := Len( ::aNodes ) RETURN Self METHOD rewind() CLASS THtmlIterator ::oNode := ::oTop ::nCurrent := 0 RETURN Self METHOD Clone() CLASS THtmlIterator LOCAL oRet oRet := THtmlIterator():New( ::oTop ) oRet:cName := ::cName oRet:cAttribute := ::cAttribute oRet:cValue := ::cValue oRet:cData := ::cData oRet:nCurrent := 0 oRet:nLast := Len( ::aNodes ) oRet:aNodes := ::aNodes RETURN oRet METHOD SetContext() CLASS THtmlIterator ::oTop := ::oNode ::aNodes := ::oNode:collect() ::nCurrent := 0 ::nLast := Len( ::aNodes ) RETURN Self METHOD Find( cName, cAttribute, cValue, cData ) CLASS THtmlIterator ::cName := cName ::cAttribute := cAttribute ::cValue := cValue ::cData := cData IF ::nLast == 0 ::nCurrent := 0 RETURN NIL ENDIF IF ::MatchCriteria( ::oNode ) RETURN ::oNode ENDIF RETURN ::Next() METHOD Next() CLASS THtmlIterator LOCAL oFound, lExit := .F. DO WHILE ! lExit BEGIN SEQUENCE WITH __BreakBlock() oFound := ::aNodes[ ++::nCurrent ] IF ::MatchCriteria( oFound ) ::oNode := oFound lExit := .T. ENDIF RECOVER lExit := .T. oFound := NIL ::nCurrent := 0 END SEQUENCE ENDDO RETURN oFound METHOD MatchCriteria() CLASS THtmlIterator RETURN .T. /* Iterator scan class */ CREATE CLASS THtmlIteratorScan INHERIT THtmlIterator MODULE FRIENDLY METHOD New( oNodeTop ) CONSTRUCTOR HIDDEN: METHOD MatchCriteria( oFound ) ENDCLASS METHOD New( oNodeTop ) CLASS THtmlIteratorScan ::Super:New( oNodeTop ) RETURN Self METHOD MatchCriteria( oFound ) CLASS THtmlIteratorScan LOCAL xData IF ::cName != NIL .AND. ! Lower( ::cName ) == Lower( oFound:htmlTagName ) RETURN .F. ENDIF IF ::cAttribute != NIL .AND. ! ::cAttribute $ oFound:getAttributes() RETURN .F. ENDIF IF ::cValue != NIL xData := oFound:getAttributes() IF hb_HScan( xData, {| xKey, cValue | HB_SYMBOL_UNUSED( xKey ), Lower( ::cValue ) == Lower( cValue ) } ) == 0 RETURN .F. ENDIF ENDIF IF ::cData != NIL xData := oFound:getText( " " ) /* NOTE: != changed to ! == */ IF Empty( xData ) .OR. ! AllTrim( ::cData ) == AllTrim( xData ) RETURN .F. ENDIF ENDIF RETURN .T. /* Iterator regex class */ CREATE CLASS THtmlIteratorRegex INHERIT THtmlIterator MODULE FRIENDLY METHOD New( oNodeTop ) CONSTRUCTOR HIDDEN: METHOD MatchCriteria( oFound ) ENDCLASS METHOD New( oNodeTop ) CLASS THtmlIteratorRegex ::Super:New( oNodeTop ) RETURN Self METHOD MatchCriteria( oFound ) CLASS THtmlIteratorRegex LOCAL xData IF ::cName != NIL .AND. ! hb_regexLike( Lower( oFound:htmlTagName ), Lower( ::cName ) ) RETURN .F. ENDIF IF ::cAttribute != NIL .AND. ; hb_HScan( oFound:getAttributes(), {| cKey | hb_regexLike( Lower( ::cAttribute ), cKey ) } ) == 0 RETURN .F. ENDIF IF ::cValue != NIL .AND. ; hb_HScan( oFound:getAttributes(), {| xKey, cValue | HB_SYMBOL_UNUSED( xKey ), hb_regexLike( ::cValue, cValue ) } ) == 0 RETURN .F. ENDIF IF ::cData != NIL xData := oFound:getText( " " ) IF Empty( xData ) .OR. ! hb_regexHas( AllTrim( ::cData ), AllTrim( xData ) ) RETURN .F. ENDIF ENDIF RETURN .T. /* Class representing a HTML node tree. It parses a HTML formatted string */ CREATE CLASS THtmlNode MODULE FRIENDLY HIDDEN: VAR root VAR _document VAR parent VAR htmlContent METHOD parseHtml( parser ) METHOD parseHtmlFixed( parser ) METHOD _getTextNode() METHOD _setTextNode( cText ) METHOD keepFormatting() EXPORTED: VAR htmlTagName READONLY VAR htmlEndTagName READONLY VAR htmlTagType READONLY VAR htmlAttributes READONLY METHOD New( oParent, cTagName, cAttrib, cContent ) METHOD isType( nType ) ACCESS isEmpty() ACCESS isInline() ACCESS isOptional() ACCESS isNode() ACCESS isBlock() METHOD addNode( oTHtmlNode ) METHOD insertAfter( oTHtmlNode ) METHOD insertBefore( oTHtmlNode ) METHOD Delete() // Messages from TXmlNode MESSAGE insertBelow METHOD addNode MESSAGE unlink METHOD Delete METHOD firstNode( lRoot ) METHOD lastNode( lRoot ) ACCESS nextNode() ACCESS prevNode() ACCESS siblingNodes() INLINE iif( ::parent == NIL, NIL, ::parent:htmlContent ) ACCESS childNodes() INLINE iif( ::isNode(), ::htmlContent, NIL ) ACCESS parentNode() INLINE ::parent ACCESS document() INLINE iif( ::root == NIL, NIL, ::root:_document ) METHOD toString( nIndent ) METHOD attrToString() METHOD collect( oEndNode ) METHOD getText( cEOL ) METHOD getAttribute( cName ) METHOD getAttributes() METHOD setAttribute( cName, cValue ) METHOD setAttributes( cHtml ) METHOD delAttribute( cName ) METHOD delAttributes() METHOD isAttribute( cName ) ACCESS TEXT INLINE ::_getTextNode() ASSIGN TEXT( x ) INLINE ::_setTextNode( x ) ACCESS attr INLINE ::getAttributes() ASSIGN attr( x ) INLINE ::setAttributes( x ) METHOD pushNode OPERATOR + METHOD popNode OPERATOR - METHOD findNodeByTagName METHOD findNodesByTagName ERROR HANDLER noMessage METHOD noAttribute ENDCLASS METHOD new( oParent, cTagName, cAttrib, cContent ) CLASS THtmlNode IF ! t_lInit THtmlInit( .T. ) ENDIF IF HB_ISSTRING( oParent ) // a HTML string is passed -> build new tree of objects oParent := StrTran( oParent, Chr( 9 ), " " ) ::root := Self ::htmlTagName := "_root_" ::htmlTagType := THtmlTagType( "_root_" ) ::htmlContent := {} ::parseHtml( P_PARSER( oParent ) ) ELSEIF HB_ISOBJECT( oParent ) // a HTML object is passed -> we are in the course of building an object tree ::root := oParent:root ::parent := oParent IF HB_ISSTRING( cAttrib ) IF Right( cAttrib, 1 ) == "/" ::htmlEndTagName := "/" ::htmlAttributes := hb_StrShrink( cAttrib ) ELSE ::htmlAttributes := cAttrib ENDIF ELSE ::htmlAttributes := cAttrib ENDIF ::htmlTagName := cTagName ::htmlTagType := THtmlTagType( cTagName ) ::htmlContent := iif( cContent == NIL, {}, cContent ) ELSE RETURN ::error( "Parameter error", ::className(), ":new()", EG_ARG, hb_AParams() ) ENDIF RETURN Self METHOD isType( nType ) CLASS THtmlNode LOCAL lRet BEGIN SEQUENCE WITH __BreakBlock() lRet := hb_bitAnd( ::htmlTagType[ 2 ], nType ) != 0 RECOVER lRet := .F. END SEQUENCE RETURN lRet // checks if this is a node that is always empty and never has HTML text, e.g. ,, METHOD isEmpty() CLASS THtmlNode RETURN hb_bitAnd( ::htmlTagType[ 2 ], CM_EMPTY ) != 0 // checks if this is a node that may occur inline, eg. , METHOD isInline() CLASS THtmlNode RETURN hb_bitAnd( ::htmlTagType[ 2 ], CM_INLINE ) != 0 // checks if this is a node that may appear without a closing tag, eg.
,, METHOD isOptional() CLASS THtmlNode RETURN hb_bitAnd( ::htmlTagType[ 2 ], CM_OPT ) != 0 // checks if this is a node (leafs contain no further nodes, e.g.
,
,_text_) METHOD isNode() CLASS THtmlNode RETURN HB_ISARRAY( ::htmlContent ) .AND. Len( ::htmlContent ) > 0 // checks if this is a block node that must be closed with an ending tag: eg:
,
METHOD isBlock() CLASS THtmlNode RETURN hb_bitAnd( ::htmlTagType[ 2 ], CM_BLOCK ) != 0 // checks if this is a node whose text line formatting must be preserved:
,