Files
five/compiler/pp/pp.go
CharlesKWON c2e7f7ea27 feat(pp): Phase B — COUNT / SUM / AVERAGE via std.ch
Three xBase analytical commands that were silent no-ops in the
parser now execute as Harbour-style PP rewrites:

  COUNT [TO <v>]   [FOR <for>] [WHILE <while>] ... -> dbEval()
  SUM <x> TO <v>   [FOR <for>] [WHILE <while>] ... -> dbEval()
  AVERAGE <x> TO <v> [FOR ...]                     -> __dbAverage()

COUNT and SUM expand to a `<v> := 0 ; dbEval( {|| ... } )` pair
matching harbour-core/include/std.ch verbatim. AVERAGE delegates to
a new RTL function rtlDbAverage (sum + count + divide; returns 0 on
empty match) — the chained-private-variable trick Harbour uses to
keep AVERAGE inline doesn't translate cleanly through Five's PP.

Wiring up these rules surfaced four PP issues that had to be fixed
for the rewrite to even reach the parser:

  * Result template did not implement <{name}> blockify. So a rule
    body like `{|| x := x + <x> }, <{for}>` left the literal text
    `<{for}>` in the output. Added blockify substitution: captured
    -> `{|| <captured> }`, missing -> NIL.
  * findMarkerEnd did not recognise `{`/`}` so unreferenced
    blockify markers were not cleaned up either. Added `{`/`}` to
    its prefix/suffix sets.
  * Optional-clause matching had no view of the outer pattern, so a
    regular marker at the end of `[TO <v>]` would swallow the rest
    of the line — `COUNT TO n FOR x>5` captured `<v>` as
    "n FOR x>5". matchSegment now takes outerTail and stops at its
    first literal.
  * `#command` directives could not span multiple physical lines.
    A trailing `;` is harbour-core's line-continuation marker for
    std.ch and now joins the next line into the directive before
    parsing.

Parser cleanup: COUNT, SUM, AVERAGE removed from the IDENT-statement
no-op switch in parseIdentStmt + parseExprStmt. The remaining xBase
verbs (COPY, SORT, TOTAL, JOIN, LIST, DISPLAY, LABEL, REPORT, ...)
stay in the parser until their RTL backends arrive.

Gates green:
  go test ./...      : PASS
  FiveSql2 SQL:1999  : 43/43
  Harbour compat     : 56/56

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-30 14:11:20 +09:00

639 lines
19 KiB
Go

// Copyright (c) 2026 Charles KWON OhJun (charleskwonohjun@gmail.com)
// All rights reserved.
// Preprocessor for Five — handles #include, #define, #ifdef/#endif.
// Harbour: /mnt/d/harbour-core/src/pp/ppcore.c (6383 lines)
//
// Five PP is simplified but covers the essential directives:
// #include "file.ch" — file inclusion
// #define NAME VALUE — simple text substitution
// #undef NAME — remove definition
// #ifdef NAME / #ifndef NAME / #else / #endif — conditional compilation
// #pragma — compiler hints
//
// #command/#translate (and the #xcommand/#xtranslate variants) are supported via command.go (pattern matching + substitution).
// Five also handles CLASS syntax natively in the parser.
package pp
import (
_ "embed"
"fmt"
"os"
"path/filepath"
"strings"
)
// embeddedStdCh is include/std.ch baked into the compiler binary so
// xBase commands like ERASE, RENAME, COMMIT, LOCATE, ... reach the
// parser already rewritten as plain function calls. Equivalent to
// Harbour's auto-included std.ch. addStdRules runs this text through
// processLines so its #command entries are registered.
//
//go:embed std.ch
var embeddedStdCh string
// Preprocessor processes source code before lexing. It carries the
// macro table, the #include search configuration, the registered
// #command/#translate rules and the diagnostics gathered while
// processing. Use New to obtain an instance: it initialises the maps
// and registers the built-in std.ch rules.
type Preprocessor struct {
	defines     map[string]string // #define name → value ("" is a flag-only define, never substituted)
	includeDirs []string          // search paths for #include, tried in the order they were added
	included    map[string]bool   // prevent circular inclusion (keyed by the include name as written)
	commands    []*Rule           // #command and #xcommand rules, in registration order
	translates  []*Rule           // #translate and #xtranslate rules, in registration order
	errors      []string          // diagnostics; reset at the start of every Process call
	GoDumps     []string          // collected #pragma BEGINDUMP Go code blocks
}
// New returns a ready-to-use Preprocessor: the define and
// include-tracking tables are allocated and the built-in std.ch rules
// are registered before it is handed back.
func New() *Preprocessor {
	pp := new(Preprocessor)
	pp.defines = map[string]string{}
	pp.included = map[string]bool{}
	pp.addStdRules()
	return pp
}
// addStdRules loads the built-in rule set by running the embedded
// std.ch text through processLines. Only the side effect matters here:
// every #command found is stored on the preprocessor, and the rewritten
// text that processLines returns is deliberately thrown away.
//
// Constructs that cannot be expressed safely as a #command (for example
// the parser-handled @ SAY/GET, READ, TRY/CATCH, WITH TIMEOUT) are not
// part of std.ch and stay with the parser.
func (pp *Preprocessor) addStdRules() {
	const origin = "std.ch"
	_ = pp.processLines(origin, embeddedStdCh, 0)
}
// AddIncludeDir appends dir to the list of directories searched when
// resolving #include files. Directories are consulted in the order in
// which they were added.
func (pp *Preprocessor) AddIncludeDir(dir string) {
	dirs := append(pp.includeDirs, dir)
	pp.includeDirs = dirs
}
// Define registers name with the given replacement text, exactly as a
// `#define name value` line in the source would. A previous definition
// of the same name is overwritten.
func (pp *Preprocessor) Define(name, value string) {
	table := pp.defines
	table[name] = value
}
// Process runs the preprocessor over source and returns the rewritten
// text together with the diagnostics produced by this call. filename is
// used in diagnostics and as the base for relative #include lookups.
// Diagnostics left over from an earlier call are discarded first.
func (pp *Preprocessor) Process(filename, source string) (string, []string) {
	pp.errors = nil
	output := pp.processLines(filename, source, 0)
	// Read the diagnostics only after processLines has finished so every
	// message it appended is included.
	diagnostics := pp.errors
	return output, diagnostics
}
// maxIncludeDepth is the deepest #include nesting processLines accepts
// before it records an error instead of processing further.
const maxIncludeDepth = 20

// processLines preprocesses source one physical line at a time and
// returns the rewritten text joined with "\n".
//
// filename is used in diagnostics and passed on to handleDirective;
// depth is the current #include nesting level (0 for the top-level
// file). Diagnostics are appended to pp.errors. When depth exceeds
// maxIncludeDepth an error is recorded and source is returned as-is.
//
// Per line, in order:
//   - a `#` directive ending in `;` is joined with the following
//     line(s) — trailing "\r" is ignored so CRLF sources continue too;
//   - lines between #pragma BEGINDUMP and #pragma ENDDUMP are collected
//     verbatim; a dump opened in an active section is stored in
//     pp.GoDumps (or rejected when it looks like C), a dump opened in an
//     inactive #ifdef section is discarded without emitting anything;
//   - /* */ comments are removed, including ones spanning lines;
//   - conditional directives update the #ifdef stack; other directives
//     are handled only in active sections;
//   - remaining active lines get #command/#translate rules and then
//     #define substitution applied.
func (pp *Preprocessor) processLines(filename, source string, depth int) string {
	if depth > maxIncludeDepth {
		pp.errors = append(pp.errors, fmt.Sprintf("%s: #include depth exceeded (max %d)", filename, maxIncludeDepth))
		return source
	}

	lines := strings.Split(source, "\n")
	var result []string
	var ifStack []bool // true = active section, false = skipping
	active := true
	inBlockComment := false // inside a multi-line /* */ comment
	inPragmaDump := false   // inside #pragma BEGINDUMP ... ENDDUMP
	dumpActive := false     // whether the current dump was opened in an active section
	dumpStartLine := 0      // 1-based line where BEGINDUMP appeared
	var dumpLines []string  // accumulated Go code lines

	for i := 0; i < len(lines); i++ {
		line := lines[i]

		// `#command`/`#translate` directives that end with a trailing `;`
		// continue on the next physical line — this is how harbour-core
		// formats its std.ch rules. Join the continuation here so the
		// directive parser sees one logical line. Only `#`-directives
		// participate; user code uses `;` differently. "\r" is part of
		// the trim set so files with CRLF line endings continue as well.
		if t := strings.TrimSpace(line); strings.HasPrefix(t, "#") {
			for strings.HasSuffix(strings.TrimRight(line, " \t\r"), ";") && i+1 < len(lines) {
				line = strings.TrimRight(line, " \t\r;") + " " + strings.TrimSpace(lines[i+1])
				i++
			}
		}

		// Handle #pragma BEGINDUMP ... ENDDUMP (inline Go code blocks).
		if inPragmaDump {
			trimCheck := strings.TrimSpace(line)
			if strings.HasPrefix(trimCheck, "#") {
				dir := strings.ToUpper(strings.TrimSpace(strings.TrimPrefix(trimCheck, "#")))
				if strings.HasPrefix(dir, "PRAGMA ") && strings.Contains(dir, "ENDDUMP") {
					inPragmaDump = false
					body := strings.Join(dumpLines, "\n")
					dumpLines = nil
					if !dumpActive {
						// The dump sat inside a skipped #ifdef section:
						// drop it like any other inactive text.
						continue
					}
					// Five's inline dumps are Go, not C. Harbour's own
					// #pragma BEGINDUMP convention is C (hb_ret*, HB_FUNC,
					// #include <stdio.h> etc.), so `.prg` files ported
					// from Harbour will attempt to shove C through Five's
					// Go-emit pipeline and fail with cryptic errors like
					// "invalid character U+0023 '#'". Detect the C shape
					// and report a clear, actionable error up front.
					if looksLikeInlineC(body) {
						pp.errors = append(pp.errors, fmt.Sprintf(
							"%s:%d: #pragma BEGINDUMP contains C code — Five accepts inline Go only. Port the block to Go (or use an RTL function), then wrap in #pragma BEGINDUMP ... #pragma ENDDUMP.",
							filename, dumpStartLine))
						// Emit a syntactically invalid line so the parser
						// also fails at the expected position rather than
						// the build silently continuing.
						result = append(result, "__FIVE_INLINE_C_ERROR__")
						continue
					}
					pp.GoDumps = append(pp.GoDumps, body)
					result = append(result, fmt.Sprintf("FIVE_GODUMP__ %d", len(pp.GoDumps)-1))
					continue
				}
			}
			dumpLines = append(dumpLines, line)
			if dumpActive {
				result = append(result, "") // blank out for line counting
			}
			continue
		}

		// Handle multi-line block comments.
		if inBlockComment {
			idx := strings.Index(line, "*/")
			if idx < 0 {
				result = append(result, "") // blank out comment lines
				continue
			}
			inBlockComment = false
			line = line[idx+2:] // keep content after */
			if strings.TrimSpace(line) == "" {
				result = append(result, "")
				continue
			}
		}

		// Strip block comments within a single line and detect opening /*.
		line = stripBlockComments(line, &inBlockComment)
		trimmed := strings.TrimSpace(line)

		// The innermost conditional decides whether this line is live.
		active = len(ifStack) == 0 || ifStack[len(ifStack)-1]

		if strings.HasPrefix(trimmed, "#") {
			directive := strings.TrimSpace(strings.TrimPrefix(trimmed, "#"))

			// Detect #pragma BEGINDUMP. Dump mode is entered even in an
			// inactive section so the dump body is never interpreted as
			// directives; dumpActive remembers whether to keep it.
			upperDir := strings.ToUpper(directive)
			if strings.HasPrefix(upperDir, "PRAGMA ") && strings.Contains(upperDir, "BEGINDUMP") {
				inPragmaDump = true
				dumpActive = active
				dumpStartLine = i + 1 // 1-based for error reporting
				dumpLines = nil
				if active {
					result = append(result, "")
				}
				continue
			}

			// Conditionals are processed regardless of the active state
			// so nesting stays balanced inside skipped sections.
			if pp.handleConditional(directive, &ifStack, active) {
				continue
			}
			if !active {
				continue // skip non-conditional directives in inactive sections
			}
			if pp.handleDirective(filename, directive, depth, &result, i+1) {
				continue
			}
		}

		if !active {
			continue // skip lines in inactive #ifdef sections
		}

		// Apply #command/#translate rules.
		if len(pp.commands) > 0 || len(pp.translates) > 0 {
			line = pp.applyRules(line)
		}
		// Apply #define substitutions.
		if len(pp.defines) > 0 {
			line = pp.applyDefines(line)
		}
		result = append(result, line)
	}

	if len(ifStack) > 0 {
		pp.errors = append(pp.errors, fmt.Sprintf("%s: unterminated #ifdef/#ifndef", filename))
	}
	return strings.Join(result, "\n")
}
// handleConditional processes #ifdef, #ifndef, #if, #else and #endif.
//
// directive is the directive text without the leading `#`. ifStack is
// the stack of conditional states (true = section is live) and is
// pushed, flipped or popped in place. active is the state of the
// section the directive appears in. It returns true when directive was
// a conditional and has been consumed, false otherwise.
//
// The keyword is the first whitespace-delimited word and a trailing
// `//` comment is ignored, so `#ifdef DEBUG // tracing`,
// `#ifndef<TAB>NAME` and `#endif // NAME` are all understood. A bare
// #ifdef/#ifndef without a name is not treated as a conditional.
//
// #if is simplified: `1` and `.T.` (any case) are true; `0`, `.F.` and
// every other expression are false.
func (pp *Preprocessor) handleConditional(directive string, ifStack *[]bool, active bool) bool {
	text := directive
	if idx := strings.Index(text, "//"); idx >= 0 {
		text = text[:idx] // drop a trailing line comment
	}
	fields := strings.Fields(text)
	if len(fields) == 0 {
		return false
	}
	args := fields[1:]

	switch strings.ToUpper(fields[0]) {
	case "IFDEF":
		if len(args) == 0 {
			return false
		}
		_, defined := pp.defines[args[0]]
		*ifStack = append(*ifStack, defined && active)
		return true

	case "IFNDEF":
		if len(args) == 0 {
			return false
		}
		_, defined := pp.defines[args[0]]
		*ifStack = append(*ifStack, !defined && active)
		return true

	case "IF":
		// Unknown expressions default to false (conservative).
		expr := strings.Join(args, " ")
		val := expr == "1" || strings.EqualFold(expr, ".T.")
		*ifStack = append(*ifStack, val && active)
		return true

	case "ELSE":
		if n := len(*ifStack); n > 0 {
			// The #else branch can only become live when the section
			// enclosing this conditional is itself live.
			parentActive := true
			if n > 1 {
				parentActive = (*ifStack)[n-2]
			}
			(*ifStack)[n-1] = !(*ifStack)[n-1] && parentActive
		}
		return true

	case "ENDIF":
		if n := len(*ifStack); n > 0 {
			*ifStack = (*ifStack)[:n-1]
		}
		return true
	}
	return false
}
// handleDirective processes non-conditional directives: #include,
// #define, #undef, #pragma, #warning/#error/#stdout and the
// #command/#translate family.
//
// directive is the directive text without the leading `#`; filename and
// lineNo are used in diagnostics; depth is the current #include nesting
// level; lines a directive produces are appended to *result. It returns
// true when the directive was recognised and consumed, false when the
// caller should treat the line as ordinary source.
//
// The keyword may be followed by a space or a tab. Notes per directive:
//   - #include: a file that is already being processed further up the
//     include chain is skipped, so circular includes terminate at once
//     instead of running into the depth limit. Files that cannot be
//     found are skipped with a comment, not an error.
//   - #define: function-like macros (`NAME(` ...) are accepted but not
//     registered. A trailing `//` or `/*` comment is removed from the
//     value unless it sits inside a quoted string.
//   - #pragma, #warning, #error, #stdout: emitted as a `//` comment.
func (pp *Preprocessor) handleDirective(filename, directive string, depth int, result *[]string, lineNo int) bool {
	upper := strings.ToUpper(directive)

	// #warning, #error, #stdout — skip (emit as comment).
	if strings.HasPrefix(upper, "WARNING") || strings.HasPrefix(upper, "ERROR") || strings.HasPrefix(upper, "STDOUT") {
		*result = append(*result, "// #"+directive)
		return true
	}

	// Every remaining directive needs a keyword followed by an argument.
	sep := strings.IndexAny(directive, " \t")
	if sep <= 0 {
		return false
	}
	keyword, arg := strings.ToUpper(directive[:sep]), directive[sep+1:]

	switch keyword {
	case "INCLUDE":
		// #include "file" or #include <file>
		inclFile := pp.extractIncludeFile(strings.TrimSpace(arg))
		if inclFile == "" {
			pp.errors = append(pp.errors, fmt.Sprintf("%s:%d: invalid #include", filename, lineNo))
			return true
		}
		if pp.included[inclFile] {
			// Already being processed by an enclosing #include.
			*result = append(*result, fmt.Sprintf("// #include %q — circular (skipped)", inclFile))
			return true
		}
		content := pp.resolveInclude(filename, inclFile)
		if content == "" {
			// Not found — not an error for Five (some .ch files are optional)
			*result = append(*result, fmt.Sprintf("// #include %q — not found (skipped)", inclFile))
			return true
		}
		// Keep the file marked while its content is processed so a
		// nested #include of the same name is detected above.
		pp.included[inclFile] = true
		processed := pp.processLines(inclFile, content, depth+1)
		delete(pp.included, inclFile)
		*result = append(*result, strings.Split(processed, "\n")...)
		return true

	case "DEFINE":
		// #define NAME [VALUE]
		rest := strings.TrimSpace(arg)
		// Function-like macro (`NAME(` before any whitespace) — not yet
		// supported, so do not register it as a text substitution.
		if idx := strings.IndexByte(rest, '('); idx > 0 && idx < strings.IndexAny(rest+" ", " \t") {
			return true
		}
		name, value := rest, ""
		if idx := strings.IndexAny(rest, " \t"); idx >= 0 {
			name, value = rest[:idx], strings.TrimSpace(rest[idx+1:])
		}
		// Cut the value at the first comment opener outside a string.
		quote := byte(0)
	scan:
		for j := 0; j < len(value); j++ {
			c := value[j]
			switch {
			case quote != 0:
				if c == quote {
					quote = 0
				}
			case c == '"' || c == '\'':
				quote = c
			case c == '/' && j+1 < len(value) && (value[j+1] == '/' || value[j+1] == '*'):
				value = strings.TrimSpace(value[:j])
				break scan
			}
		}
		pp.defines[name] = value
		return true

	case "UNDEF":
		// #undef NAME
		delete(pp.defines, strings.TrimSpace(arg))
		return true

	case "PRAGMA":
		// #pragma — just pass through as comment
		*result = append(*result, "// "+directive)
		return true

	case "COMMAND":
		if rule := ParseRule(arg, true, false); rule != nil {
			pp.commands = append(pp.commands, rule)
		}
		return true

	case "TRANSLATE":
		if rule := ParseRule(arg, false, false); rule != nil {
			pp.translates = append(pp.translates, rule)
		}
		return true

	case "XCOMMAND":
		if rule := ParseRule(arg, true, true); rule != nil {
			pp.commands = append(pp.commands, rule)
		}
		return true

	case "XTRANSLATE":
		if rule := ParseRule(arg, false, true); rule != nil {
			pp.translates = append(pp.translates, rule)
		}
		return true
	}
	return false
}
// extractIncludeFile returns the file name from the argument of an
// #include directive.
//
// For the quoted forms `"file"` and `<file>` it returns the text between
// the opening delimiter and the first matching closing delimiter, so
// anything after the name (such as a trailing `// comment`) is ignored.
// Any other input — including a quoted form with no closing delimiter —
// is returned trimmed and otherwise unchanged, as a bare file name.
func (pp *Preprocessor) extractIncludeFile(s string) string {
	s = strings.TrimSpace(s)
	if len(s) < 2 {
		return s
	}

	var closer byte
	switch s[0] {
	case '"':
		closer = '"'
	case '<':
		closer = '>'
	default:
		return s // bare filename
	}

	if end := strings.IndexByte(s[1:], closer); end >= 0 {
		return s[1 : 1+end]
	}
	return s // no closing delimiter: treat as bare filename
}
// resolveInclude locates inclFile and returns its content, or "" when
// it cannot be read from any candidate location (an empty file is
// therefore indistinguishable from a missing one).
//
// Candidates are tried in this order:
//  1. the directory of currentFile (skipped when currentFile is empty);
//  2. each directory registered through AddIncludeDir, in order.
//
// inclFile is flagged in pp.included for the duration of this call and
// an already-flagged name yields "". The flag is cleared again when the
// function returns, so it covers only the lookup itself.
func (pp *Preprocessor) resolveInclude(currentFile, inclFile string) string {
	if pp.included[inclFile] {
		return ""
	}
	pp.included[inclFile] = true
	defer delete(pp.included, inclFile)

	candidates := make([]string, 0, len(pp.includeDirs)+1)
	if currentFile != "" {
		candidates = append(candidates, filepath.Join(filepath.Dir(currentFile), inclFile))
	}
	for _, dir := range pp.includeDirs {
		candidates = append(candidates, filepath.Join(dir, inclFile))
	}

	// The first candidate that can be read wins.
	for _, candidate := range candidates {
		if data, err := os.ReadFile(candidate); err == nil {
			return string(data)
		}
	}
	return ""
}
// applyRules rewrites line with the first rule that matches it.
// All #command rules are tried before any #translate rule, each set in
// registration order. Rules are matched against the line with its
// surrounding whitespace removed; the original leading indentation
// (spaces and tabs) is put back in front of the rewritten text. Blank
// lines, lines that start with `//`, and lines no rule matches are
// returned unchanged.
func (pp *Preprocessor) applyRules(line string) string {
	body := strings.TrimSpace(line)
	if body == "" || strings.HasPrefix(body, "//") {
		return line
	}

	for _, ruleSet := range [][]*Rule{pp.commands, pp.translates} {
		for _, rule := range ruleSet {
			rewritten, ok := rule.MatchLine(body)
			if !ok {
				continue
			}
			indentLen := len(line) - len(strings.TrimLeft(line, " \t"))
			return line[:indentLen] + rewritten
		}
	}
	return line
}
// stripBlockComments removes /* ... */ comments from a single line and
// returns what is left; each removed comment is replaced by one space.
//
// When a /* has no closing */ on the line, everything from the /* on is
// dropped and *inBlock is set to true so the caller can keep skipping on
// the following lines. *inBlock is never reset here.
//
// Comment openers inside "..." or '...' string literals are left alone.
// A `//` line comment ends the scan: the rest of the line is copied
// through untouched, so a `/*` mentioned inside a line comment does not
// open a block comment.
func stripBlockComments(line string, inBlock *bool) string {
	var out strings.Builder
	quote := byte(0) // active string delimiter, 0 when outside a string

	for i := 0; i < len(line); {
		c := line[i]
		switch {
		case quote != 0:
			if c == quote {
				quote = 0
			}

		case c == '"' || c == '\'':
			quote = c

		case c == '/' && i+1 < len(line) && line[i+1] == '/':
			// Line comment: nothing after it can start a block comment.
			out.WriteString(line[i:])
			return out.String()

		case c == '/' && i+1 < len(line) && line[i+1] == '*':
			end := strings.Index(line[i+2:], "*/")
			if end < 0 {
				*inBlock = true
				return out.String() // rest of line is comment
			}
			out.WriteByte(' ') // replace comment with space
			i += 2 + end + 2   // skip past */
			continue
		}
		out.WriteByte(c)
		i++
	}
	return out.String()
}
// applyDefines substitutes #define macros in line and returns the
// expanded text.
//
// The line is scanned once and split into identifier tokens (runs of
// letters, digits and `_`); a token that names a define with a
// non-empty value is replaced by that value. The value is expanded in
// turn, so a define may refer to other defines. A name that is already
// being expanded is left as written, which keeps self-referential and
// mutually recursive defines from looping.
//
// Flag-only defines (empty value) are never substituted, and text inside
// "..." or '...' string literals is copied through untouched.
//
// Because lookup is per token, the result does not depend on the
// iteration order of the define table.
func (pp *Preprocessor) applyDefines(line string) string {
	expanding := map[string]bool{} // names on the current expansion path

	var expand func(text string) string
	expand = func(text string) string {
		var out strings.Builder
		out.Grow(len(text))
		quote := byte(0) // active string delimiter, 0 when outside a string

		for i := 0; i < len(text); {
			c := text[i]
			switch {
			case quote != 0:
				if c == quote {
					quote = 0
				}
				out.WriteByte(c)
				i++

			case c == '"' || c == '\'':
				quote = c
				out.WriteByte(c)
				i++

			case !isWordChar(c):
				out.WriteByte(c)
				i++

			default:
				// Take the whole identifier so only complete words match.
				end := i + 1
				for end < len(text) && isWordChar(text[end]) {
					end++
				}
				word := text[i:end]
				value, defined := pp.defines[word]
				if defined && value != "" && !expanding[word] {
					expanding[word] = true
					out.WriteString(expand(value))
					delete(expanding, word)
				} else {
					out.WriteString(word)
				}
				i = end
			}
		}
		return out.String()
	}

	return expand(line)
}
// replaceWord returns line with every whole-word occurrence of old
// replaced by repl. An occurrence counts as a whole word when the bytes
// directly before and after it are not word characters (see
// isWordChar) or the occurrence touches the start or end of the line.
// Text inside "..." or '...' string literals is never replaced.
//
// An empty old matches nothing and line is returned unchanged; without
// that guard the scan could not advance past a zero-length match.
func replaceWord(line, old, repl string) string {
	if old == "" || !strings.Contains(line, old) {
		return line
	}

	var out strings.Builder
	quote := byte(0) // active string delimiter, 0 when outside a string

	for i := 0; i < len(line); {
		c := line[i]
		switch {
		case quote != 0:
			if c == quote {
				quote = 0
			}

		case c == '"' || c == '\'':
			quote = c

		case strings.HasPrefix(line[i:], old):
			startsWord := i == 0 || !isWordChar(line[i-1])
			endsWord := i+len(old) >= len(line) || !isWordChar(line[i+len(old)])
			if startsWord && endsWord {
				out.WriteString(repl)
				i += len(old)
				continue
			}
		}
		out.WriteByte(c)
		i++
	}
	return out.String()
}

// isWordChar reports whether c is an ASCII letter, an ASCII digit or an
// underscore — the bytes that may appear inside an identifier.
func isWordChar(c byte) bool {
	switch {
	case c >= 'a' && c <= 'z', c >= 'A' && c <= 'Z', c >= '0' && c <= '9':
		return true
	}
	return c == '_'
}
// looksLikeInlineC reports whether body — the text of a
// #pragma BEGINDUMP block — looks like Harbour-style inline C rather
// than Go. It lets the caller report a readable error early instead of
// an obscure Go syntax complaint further downstream.
//
// body is examined line by line, ignoring leading and trailing
// whitespace. A single line starting with any of these is enough:
//   - `#include <` or `#include "` — C preprocessor include;
//   - `HB_FUNC(`, `HB_FUNC_STATIC(`, `HB_FUNC_TRANSLATE(` — Harbour's
//     C FFI macros. The Go-side counterpart is written
//     `hbrt.HB_FUNC("NAME", fn)` and so never starts a line this way;
//   - `typedef `, `struct `, `int main(`, `void main(` — C
//     declarations at the start of a line.
//
// An empty or whitespace-only body is never classified as C.
func looksLikeInlineC(body string) bool {
	cLineStarts := [...]string{
		"#include <",
		`#include "`,
		"HB_FUNC(",
		"HB_FUNC_STATIC(",
		"HB_FUNC_TRANSLATE(",
		"typedef ",
		"struct ",
		"int main(",
		"void main(",
	}

	if strings.TrimSpace(body) == "" {
		return false
	}
	for _, raw := range strings.Split(body, "\n") {
		stmt := strings.TrimSpace(raw)
		for _, start := range cLineStarts {
			if strings.HasPrefix(stmt, start) {
				return true
			}
		}
	}
	return false
}