feat(pp): Phase B — COUNT / SUM / AVERAGE via std.ch

Three xBase analytical commands that were silent no-ops in the parser now execute as Harbour-style PP rewrites:

    COUNT [TO <v>] [FOR <for>] [WHILE <while>] ... -> dbEval()
    SUM <x> TO <v> [FOR <for>] [WHILE <while>] ... -> dbEval()
    AVERAGE <x> TO <v> [FOR ...] -> __dbAverage()

COUNT and SUM expand to a `<v> := 0 ; dbEval( {|| ... } )` pair matching harbour-core/include/std.ch verbatim. AVERAGE delegates to a new RTL function rtlDbAverage (sum + count + divide; returns 0 on empty match) — the chained-private-variable trick Harbour uses to keep AVERAGE inline doesn't translate cleanly through Five's PP.

Wiring up these rules surfaced four PP issues that had to be fixed for the rewrite to even reach the parser:

* Result template did not implement <{name}> blockify. So a rule body like `{|| x := x + <x> }, <{for}>` left the literal text `<{for}>` in the output. Added blockify substitution: captured -> `{|| <captured> }`, missing -> NIL.
* findMarkerEnd did not recognise `{`/`}` so unreferenced blockify markers were not cleaned up either. Added `{`/`}` to its prefix/suffix sets.
* Optional-clause matching had no view of the outer pattern, so a regular marker at the end of `[TO <v>]` would swallow the rest of the line — `COUNT TO n FOR x>5` captured `<v>` as "n FOR x>5". matchSegment now takes outerTail and stops at its first literal.
* `#command` directives could not span multiple physical lines. A trailing `;` is harbour-core's line-continuation marker for std.ch and now joins the next line into the directive before parsing.

Parser cleanup: COUNT, SUM, AVERAGE removed from the IDENT-statement no-op switch in parseIdentStmt + parseExprStmt. The remaining xBase verbs (COPY, SORT, TOTAL, JOIN, LIST, DISPLAY, LABEL, REPORT, ...) stay in the parser until their RTL backends arrive.

Gates green:

    go test ./...     : PASS
    FiveSql2 SQL:1999 : 43/43
    Harbour compat    : 56/56

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 14:11:20 +09:00
parent c4f85f494c
commit c2e7f7ea27
6 changed files with 209 additions and 13 deletions
--- a/compiler/parser/parser.go
+++ b/compiler/parser/parser.go
@@ -1155,7 +1155,7 @@ func (p *Parser) parseIdentStmt() ast.Stmt {
 	// rewritten by compiler/pp/std.ch into function calls before the
 	// parser sees them.
 	switch upper {
-	case "COPY", "SORT", "COUNT", "SUM", "AVERAGE", "TOTAL", "UPDATE",
+	case "COPY", "SORT", "TOTAL", "UPDATE",
 		"LABEL", "REPORT", "ACCEPT", "INPUT",
 		"JOIN", "RELEASE", "SAVE", "RESTORE",
 		"DIR", "STORE", "NOTE", "TEXT", "ENDTEXT",
--- a/compiler/pp/command.go
+++ b/compiler/pp/command.go
@@ -304,9 +304,16 @@ func (r *Rule) matchPattern(line string) map[string]string {
 				bodyEnd++
 			}
 			body := patternWords[bodyStart:bodyEnd]
+			// Outer-pattern tail (everything after the matching `]`) is
+			// needed so a regular marker at the end of `body` knows where
+			// to stop capturing. Without this, `[TO <v>] [FOR <for>]`
+			// against `TO n FOR age >= 30` would let `<v>` swallow the
+			// rest of the line because `body` itself has no literal that
+			// follows the marker.
+			outerTail := patternWords[bodyEnd+1:]
 			for li < len(lineWords) {
 				snapshotLi := li
-				iterCaps, newLi, ok := matchSegment(body, lineWords, li, r.CaseSens)
+				iterCaps, newLi, ok := matchSegment(body, lineWords, li, r.CaseSens, outerTail)
 				if !ok {
 					li = snapshotLi
 					break
@@ -380,7 +387,7 @@ func (r *Rule) matchPattern(line string) map[string]string {
 // A "mini-matcher" that mirrors the main loop for MarkerRegular and
 // literal keywords. MarkerList and MarkerWild inside `[...]` would
 // need additional plumbing; defer those until real patterns need them.
-func matchSegment(segment, lineWords []string, startLi int, caseSens bool) (map[string]string, int, bool) {
+func matchSegment(segment, lineWords []string, startLi int, caseSens bool, outerTail []string) (map[string]string, int, bool) {
 	caps := make(map[string]string)
 	li := startLi

@@ -406,12 +413,20 @@ func matchSegment(segment, lineWords []string, startLi int, caseSens bool) (map[
 				return nil, startLi, false
 			}
 			// Build a pseudo-pattern tail so captureExpression picks the
-			// right delimiter. If there's a next literal inside `segment`,
-			// use it; otherwise fall back to the repeat boundary so the
-			// capture stops before the next iteration starts.
+			// right delimiter. Priority:
+			//   1. Next literal inside the same segment.
+			//   2. First literal in the outer-pattern tail — this is what
+			//      stops `[TO <v>] [FOR <for>]` from letting `<v>` swallow
+			//      the FOR clause.
+			//   3. Repeat boundary (the segment's leading literal) so a
+			//      multi-iteration capture stops before the next iter.
 			tail := segment[pi+1:]
-			if !hasLiteralAfter(tail) && repeatBoundary != "" {
-				tail = []string{repeatBoundary}
+			if !hasLiteralAfter(tail) {
+				if outerLit := firstLiteral(outerTail); outerLit != "" {
+					tail = []string{outerLit}
+				} else if repeatBoundary != "" {
+					tail = []string{repeatBoundary}
+				}
 			}
 			captured := captureExpression(lineWords, &li, tail, 0, caseSens)
 			caps[m.Name] = captured
@@ -425,6 +440,22 @@ func matchSegment(segment, lineWords []string, startLi int, caseSens bool) (map[
 	return caps, li, true
 }

+// firstLiteral returns the first token of pw that is not "[", "]", empty,
+// or wrapped in `<`...`>` (a match marker); it returns "" if none is left.
+// matchSegment uses it to take a capture stop-boundary from the outer tail.
+func firstLiteral(pw []string) string {
+	for _, w := range pw {
+		if w == "[" || w == "]" || w == "" { // optional-clause brackets and empty tokens
+			continue
+		}
+		if strings.HasPrefix(w, "<") && strings.HasSuffix(w, ">") { // any `<...>` token counts as a marker
+			continue
+		}
+		return w
+	}
+	return ""
+}
+
 // hasLiteralAfter reports whether a pattern slice contains any literal
 // keyword token (non-marker, non-bracket) — used to decide whether a
 // marker's capture has a real delimiter or needs a synthetic one.
@@ -510,16 +541,58 @@ func (r *Rule) applyResult(captures map[string]string) string {
 		} else {
 			result = strings.ReplaceAll(result, "<."+name+".>", ".F.")
 		}
+		// <{name}> — blockify: wrap captured expression in {|| ... }.
+		// Empty capture → NIL so the call site sees a nil block, matching
+		// how Harbour's std.ch expects __dbLocate / dbEval to interpret a
+		// missing FOR/WHILE clause.
+		if val != "" {
+			result = strings.ReplaceAll(result, "<{"+name+"}>", "{|| "+val+" }")
+		} else {
+			result = strings.ReplaceAll(result, "<{"+name+"}>", "NIL")
+		}
 		// <name> — bare substitution (must be LAST, after all wrappers).
 		result = strings.ReplaceAll(result, "<"+name+">", val)
 	}

+	// Any `<{name}>` still in the template means `name` was never
+	// captured — emit NIL so call sites see a missing block argument
+	// (matches Harbour: empty FOR/WHILE → NIL → bypass the condition).
+	result = replaceUnreferencedBlockify(result)
+
 	// Clean up unreferenced markers: <name>, <(name)>, <.name.>, #<name>, <"name">
 	result = cleanUnreferencedMarkers(result)

 	return result
 }

+// replaceUnreferencedBlockify returns s with every remaining `<{ident}>`
+// rewritten to NIL (ident: ASCII letter or `_`, then letters/digits/`_`).
+// Run after the main substitution loop, before the generic marker cleanup.
+func replaceUnreferencedBlockify(s string) string {
+	var out strings.Builder
+	i := 0
+	for i < len(s) {
+		if i+2 < len(s) && s[i] == '<' && s[i+1] == '{' { // candidate marker opener "<{"
+			j := i + 2
+			// Identifier: must start with an ASCII letter or underscore.
+			if j < len(s) && (s[j] == '_' || (s[j] >= 'a' && s[j] <= 'z') || (s[j] >= 'A' && s[j] <= 'Z')) {
+				j++
+				for j < len(s) && (s[j] == '_' || (s[j] >= 'a' && s[j] <= 'z') || (s[j] >= 'A' && s[j] <= 'Z') || (s[j] >= '0' && s[j] <= '9')) { // rest of the identifier
+					j++
+				}
+				if j+1 < len(s) && s[j] == '}' && s[j+1] == '>' { // closing "}>" completes the marker
+					out.WriteString("NIL")
+					i = j + 2
+					continue
+				}
+			}
+		}
+		out.WriteByte(s[i]) // not a complete marker: copy the byte through unchanged
+		i++
+	}
+	return out.String()
+}
+
 // expandOptionalRepeat walks a result template and rewrites each top-
 // level `[ ... ]` block by examining the captures referenced inside:
 //
@@ -737,8 +810,9 @@ func findMarkerEnd(s string, start int) int {
 		return 0
 	}
 	i := start + 1
-	// Skip optional ( or . prefix
-	if i < len(s) && (s[i] == '(' || s[i] == '.' || s[i] == '"') {
+	// Skip optional ( or . or " or { prefix (smart-stringify, logify,
+	// stringify, blockify respectively)
+	if i < len(s) && (s[i] == '(' || s[i] == '.' || s[i] == '"' || s[i] == '{') {
 		i++
 	}
 	// Must start with letter or underscore (identifier)
@@ -749,8 +823,8 @@ func findMarkerEnd(s string, start int) int {
 	for i < len(s) && (s[i] >= 'a' && s[i] <= 'z' || s[i] >= 'A' && s[i] <= 'Z' || s[i] >= '0' && s[i] <= '9' || s[i] == '_') {
 		i++
 	}
-	// Skip optional ) or . or " or ,... suffix
-	for i < len(s) && (s[i] == ')' || s[i] == '.' || s[i] == '"' || s[i] == ',' || s[i] == ' ') {
+	// Skip optional ) or . or " or } or , or space suffix
+	for i < len(s) && (s[i] == ')' || s[i] == '.' || s[i] == '"' || s[i] == '}' || s[i] == ',' || s[i] == ' ') {
 		i++
 	}
 	if i < len(s) && s[i] == '>' {
--- a/compiler/pp/pp.go
+++ b/compiler/pp/pp.go
@@ -97,7 +97,19 @@ func (pp *Preprocessor) processLines(filename, source string, depth int) string
 	dumpStartLine := 0      // 1-based line where BEGINDUMP appeared
 	var dumpLines []string  // accumulate Go code lines

-	for i, line := range lines {
+	for i := 0; i < len(lines); i++ {
+		line := lines[i]
+		// `#command`/`#translate` directives that end with a trailing `;`
+		// continue on the next physical line — this is how harbour-core
+		// formats its std.ch rules. Join the continuation here so the
+		// directive parser sees one logical line. Only `#`-directives
+		// participate; user code uses `;` differently.
+		if t := strings.TrimSpace(line); strings.HasPrefix(t, "#") {
+			for strings.HasSuffix(strings.TrimRight(line, " \t"), ";") && i+1 < len(lines) {
+				line = strings.TrimRight(line, " \t;") + " " + strings.TrimSpace(lines[i+1])
+				i++
+			}
+		}
 		// Handle #pragma BEGINDUMP ... ENDDUMP (inline Go code blocks)
 		if inPragmaDump {
 			trimCheck := strings.TrimSpace(line)
--- a/compiler/pp/std.ch
+++ b/compiler/pp/std.ch
@@ -40,6 +40,28 @@

 #command CONTINUE                        => __dbContinue()

+/* --- analytical (COUNT/SUM via dbEval; AVERAGE via __dbAverage) ---
+   These mirror Harbour's std.ch but use single-value forms. Multi-
+   expression SUM/AVERAGE (`SUM x, y TO sx, sy`) use optional-repeat
+   syntax in Harbour and can be added here once a real test exercises
+   the more elaborate form. */
+#command COUNT [TO <v>] [FOR <for>] [WHILE <while>] ;
+               [NEXT <next>] [RECORD <rec>] [<rest:REST>] [ALL] => ;
+         <v> := 0 ; dbEval( {|| <v> := <v> + 1 }, ;
+                            <{for}>, <{while}>, <next>, <rec>, <.rest.> )
+
+#command SUM <x> TO <v> ;
+             [FOR <for>] [WHILE <while>] [NEXT <next>] ;
+             [RECORD <rec>] [<rest:REST>] [ALL] => ;
+         <v> := 0 ; dbEval( {|| <v> := <v> + <x> }, ;
+                            <{for}>, <{while}>, <next>, <rec>, <.rest.> )
+
+#command AVERAGE <x> TO <v> ;
+                 [FOR <for>] [WHILE <while>] [NEXT <next>] ;
+                 [RECORD <rec>] [<rest:REST>] [ALL] => ;
+         <v> := __dbAverage( <{x}>, ;
+                             <{for}>, <{while}>, <next>, <rec>, <.rest.> )
+
+
 /* --- bulk maintenance --- */
 #command REINDEX                         => DbReindex()
 #command PACK                            => DbPack()