Files
five/compiler/pp/command.go
CharlesKWON 85002df6b9 feat(parser+pp): USE with macros and paren-balanced PP capture
Two related fixes for Harbour's data-driven `USE &cFile ALIAS &cAlias
INDEX &cNdx` idiom — common in any app that dispatches table names
at runtime.

Parser (compiler/parser/parser.go parseUse):
- `USE &cFile` / `USE &(expr)` previously triggered a
  skipToEndOfLine short-circuit, emitting an empty UseCmd (equivalent
  to bare USE = close current area). Now parseMacro runs and the
  MacroExpr becomes the File node, so codegen emits MacroPush +
  dbUseArea.
- `ALIAS &cAlias` / `ALIAS &a.1` similarly dropped the macro result;
  now captures it into UseCmd.AliasExpr so codegen evaluates the
  alias at runtime. Both the IDENT-path ("ALIAS") and keyword-path
  (token.ALIAS) handlers fixed.

PP (compiler/pp/command.go):
- captureExpression and the MarkerList branch now paren-balance
  `(`/`[`/`{` so nested grouping inside a macro argument doesn't let
  an inner `)` terminate the capture. Example:
      _REGULAR_(&(a))
  previously captured `&(a` (missing inner `)`) and left the outer
  `)` dangling, producing parse errors in the expanded output.
- MarkerList capture still joins tokens with " " for raw `<z>`
  substitution — comma tokens stay in the stream, so `s(<z>)`
  re-emits them as argument separators and the list expands cleanly.

Bench: harbour-core/tests/pp.prg 2 errors → 0 for the realistic
`USE &macro` / `&(expr)` patterns. Remaining parse errors on line 70
are a pathological `_REGULAR_L` list that includes `&a.  [2]`
(space between macro's terminating dot and an array index) — the
PP expands it correctly but Five's lexer refuses the expanded
result. That form doesn't occur in real code.

/tmp/test_use_macro.prg — all four patterns (`USE &f`, `USE &f ALIAS
&f`, `USE &f ALIAS &f INDEX &i`, dot-terminated) now compile. FiveSql2
43/43, Harbour compat 56/56, Go test ALL PASS.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:38:15 +09:00

627 lines
17 KiB
Go

// Copyright (c) 2026 Charles KWON OhJun (charleskwonohjun@gmail.com)
// All rights reserved.
// #command / #translate implementation for Five preprocessor.
//
// Harbour PP syntax:
// #command PATTERN => RESULT
// #translate PATTERN => RESULT
// #xcommand PATTERN => RESULT (case-sensitive)
// #xtranslate PATTERN => RESULT (case-sensitive)
//
// Pattern markers:
// <x> — match any expression (regular match)
// <!x!> — match single identifier only (restricted match)
// <x,...> — match comma-separated list
// <*x*> — match rest of line (wild match)
// <x:a,b,c> — match one of listed words (list match)
// [...] — optional clause
//
// Result markers:
// <x> — substitute matched text
// <(x)> — stringify (wrap in quotes)
// <{x}> — blockify (wrap in {|| })
// #<x> — dumb stringify
// <.x.> — logify (.T. if matched, .F. if not)
//
// Reference: /mnt/d/harbour-core/src/pp/ppcore.c
package pp
import (
"strings"
)
// Rule represents a single #command or #translate rule.
//
// ParseRule builds a Rule from the text on either side of "=>". MatchLine
// matches a source line against Pattern and, on success, expands ResultTmpl
// with the captured marker values.
type Rule struct {
Pattern string // raw pattern text (left of "=>", whitespace-trimmed)
Result string // raw result text (right of "=>", trimmed, " ;" sequences removed)
IsCommand bool // #command vs #translate
CaseSens bool // #xcommand/#xtranslate = case sensitive keyword comparison
Keyword string // first keyword (for fast matching); empty disables the pre-check in MatchLine
Markers []Marker // markers found in Pattern by parseMarkers
ResultTmpl string // result template with marker references; ParseRule sets it equal to Result
}
// Marker represents a pattern marker like <x>, <!x!>, <x,...>, <*x*>.
// Values are produced by parseOneMarker from the text between "<" and ">".
type Marker struct {
Name string // marker name, with the !..!, *..*, ",..." or ":list" decoration removed
Type MarkerType // which marker form was recognised
ListValues []string // for <x:a,b,c> — allowed values (each whitespace-trimmed); nil for other types
}
// MarkerType identifies which pattern-marker form a Marker was written in.
type MarkerType int
// Marker forms recognised by parseOneMarker. MarkerRegular is the zero value
// and is also what parseOneMarker returns when no other form applies.
const (
MarkerRegular MarkerType = iota // <x> — any expression
MarkerRestricted // <!x!> — identifier only
MarkerList // <x,...> — comma-separated list
MarkerWild // <*x*> — rest of line
MarkerWordList // <x:a,b,c> — one of listed words
)
// ParseRule parses a #command/#translate directive body of the form
// "PATTERN => RESULT" into a Rule.
//
// It returns nil when directive contains no "=>" separator. isCommand and
// caseSens are stored on the rule unchanged. Pattern and Result are the
// whitespace-trimmed halves; every " ;" sequence is removed from the result
// (line-continuation handling).
func ParseRule(directive string, isCommand, caseSens bool) *Rule {
	// Split on the first "=>" only, so the result text may itself contain "=>".
	parts := strings.SplitN(directive, "=>", 2)
	if len(parts) != 2 {
		return nil
	}
	pattern := strings.TrimSpace(parts[0])
	result := strings.TrimSpace(parts[1])

	// Handle line continuation (;)
	result = strings.ReplaceAll(result, " ;", "")

	rule := &Rule{
		Pattern:    pattern,
		Result:     result,
		IsCommand:  isCommand,
		CaseSens:   caseSens,
		ResultTmpl: result,
	}

	// Extract the dispatch keyword used by MatchLine's fast pre-check. It is
	// the leading literal word of the pattern, cut at the first "(", "<" or
	// "[" so that `MAKE_TEST( <obj>, <v> )`, `MAKE_TEST(<obj>)` and
	// `FOO<x>` all key on the bare word, matching how firstToken normalises
	// source lines.
	//
	// A pattern that *starts* with a marker or an optional clause has no
	// fixed first word: any line may match it, so Keyword is left empty.
	// (Using the marker's name as the keyword would make the rule match
	// only lines that literally begin with that name.)
	if words := strings.Fields(pattern); len(words) > 0 {
		kw := words[0]
		if !strings.HasPrefix(kw, "<") && !strings.HasPrefix(kw, "[") {
			if idx := strings.IndexAny(kw, "(<["); idx >= 0 {
				kw = kw[:idx]
			}
			kw = strings.TrimRight(kw, ">]")
			if !strings.ContainsAny(kw, "!*,:") {
				rule.Keyword = kw
			}
		}
	}

	// Parse markers from pattern
	rule.Markers = parseMarkers(pattern)
	return rule
}
// parseMarkers scans pattern left to right and returns one Marker for every
// "<...>" span whose parsed name is non-empty. Scanning stops at the first
// "<" that has no closing ">" after it.
func parseMarkers(pattern string) []Marker {
	var found []Marker
	for pos := 0; pos < len(pattern); {
		if pattern[pos] != '<' {
			pos++
			continue
		}
		// width is the offset of the closing '>' relative to pos.
		width := strings.IndexByte(pattern[pos:], '>')
		if width < 0 {
			break
		}
		if mk := parseOneMarker(pattern[pos+1 : pos+width]); mk.Name != "" {
			found = append(found, mk)
		}
		pos += width + 1
	}
	return found
}
// parseOneMarker classifies the text found between "<" and ">" of a pattern
// marker and returns the corresponding Marker.
//
// Recognised forms, checked in this order:
//
//	!name!     restricted  (MarkerRestricted)
//	*name*     wild        (MarkerWild)
//	name,...   comma list  (MarkerList)
//	name:a,b,c word list   (MarkerWordList, ListValues = trimmed words)
//	name       regular     (MarkerRegular)
//
// Anything that fits none of the decorated forms is returned as a regular
// marker whose Name is the trimmed input.
func parseOneMarker(inner string) Marker {
	inner = strings.TrimSpace(inner)

	// The !..! and *..* wrappers need two delimiter bytes. A lone "!" or "*"
	// satisfies both a prefix and a suffix test, and slicing [1:len-1] on it
	// would panic, so those degenerate inputs fall through to the regular
	// form instead.
	if n := len(inner); n >= 2 {
		first, last := inner[0], inner[n-1]
		// <!name!> — restricted
		if first == '!' && last == '!' {
			return Marker{Name: inner[1 : n-1], Type: MarkerRestricted}
		}
		// <*name*> — wild
		if first == '*' && last == '*' {
			return Marker{Name: inner[1 : n-1], Type: MarkerWild}
		}
	}

	// <name,...> — comma list
	if strings.HasSuffix(inner, ",...") {
		return Marker{Name: inner[:len(inner)-4], Type: MarkerList}
	}

	// <name:a,b,c> — word list (the name must be non-empty, hence idx > 0)
	if idx := strings.IndexByte(inner, ':'); idx > 0 {
		vals := strings.Split(inner[idx+1:], ",")
		for i := range vals {
			vals[i] = strings.TrimSpace(vals[i])
		}
		return Marker{Name: inner[:idx], Type: MarkerWordList, ListValues: vals}
	}

	// <name> — regular
	return Marker{Name: inner, Type: MarkerRegular}
}
// --- Rule matching and application ---
// MatchLine tests a source line against this rule.
//
// On a match it returns the expanded result text and true; otherwise it
// returns ("", false). Blank lines never match. When the rule has a Keyword,
// the line's first token must equal it (case-insensitively unless CaseSens is
// set) before the full pattern match is attempted.
func (r *Rule) MatchLine(line string) (string, bool) {
	src := strings.TrimSpace(line)
	if len(src) == 0 {
		return "", false
	}

	// Cheap rejection on the leading keyword before tokenising anything.
	if kw := r.Keyword; kw != "" {
		head := firstToken(src)
		same := head == kw
		if !r.CaseSens {
			same = strings.EqualFold(head, kw)
		}
		if !same {
			return "", false
		}
	}

	// A nil capture map is matchPattern's "no match" signal.
	caps := r.matchPattern(src)
	if caps == nil {
		return "", false
	}
	return r.applyResult(caps), true
}
// matchPattern attempts to match the rule's pattern against a line.
//
// It returns a map from marker name to captured text, or nil if the line does
// not match. Pattern and line are tokenised with tokenizePattern and
// tokenizeLine and walked in parallel:
//
//   - a literal pattern word must equal the current line token (matchWord);
//   - a marker captures one or more line tokens, depending on its type;
//   - an optional clause "[ ... ]" is skipped in the pattern without being
//     matched against the line, so markers inside it are never captured.
//
// Once the line is exhausted, any pattern tokens left over must be markers
// or optional clauses; a leftover literal word means a required keyword is
// missing from the line and the match fails.
func (r *Rule) matchPattern(line string) map[string]string {
	captures := make(map[string]string)
	patternWords := tokenizePattern(r.Pattern)
	lineWords := tokenizeLine(line)

	isMarker := func(pw string) bool {
		return strings.HasPrefix(pw, "<") && strings.HasSuffix(pw, ">")
	}

	// skipOptional takes the index of a "[" token and returns the index just
	// past its matching "]" (or len(patternWords) if it is never closed).
	skipOptional := func(open int) int {
		depth := 1
		i := open + 1
		for i < len(patternWords) && depth > 0 {
			switch patternWords[i] {
			case "[":
				depth++
			case "]":
				depth--
			}
			i++
		}
		return i
	}

	// nextLiteral returns the first literal word at or after index from,
	// ignoring markers and optional-clause brackets — the same delimiter
	// rule captureExpression uses. "" means there is none.
	nextLiteral := func(from int) string {
		for i := from; i < len(patternWords); i++ {
			pw := patternWords[i]
			if pw != "[" && pw != "]" && !strings.HasPrefix(pw, "<") {
				return pw
			}
		}
		return ""
	}

	// nestingAfter returns the bracket depth after line token w. Parens and
	// square brackets arrive as standalone tokens, but tokenizeLine leaves
	// braces glued to their neighbours ("{1", "2}"), so every byte of the
	// token is inspected. String-literal tokens are skipped: brackets inside
	// quotes are text, not grouping.
	nestingAfter := func(depth int, w string) int {
		if w == "" || w[0] == '"' || w[0] == '\'' {
			return depth
		}
		for k := 0; k < len(w); k++ {
			switch w[k] {
			case '(', '[', '{':
				depth++
			case ')', ']', '}':
				if depth > 0 {
					depth--
				}
			}
		}
		return depth
	}

	pi, li := 0, 0
	for pi < len(patternWords) && li < len(lineWords) {
		pw := patternWords[pi]

		switch {
		case isMarker(pw):
			m := parseOneMarker(pw[1 : len(pw)-1])
			switch m.Type {
			case MarkerWild:
				// Capture rest of line
				captures[m.Name] = strings.Join(lineWords[li:], " ")
				li = len(lineWords)
				pi++

			case MarkerList:
				// Capture a comma-separated list up to the next literal
				// pattern word (or to end of line when there is none).
				// The scan is bracket-balanced so a delimiter nested
				// inside (...), [...] or {...} does not end the list.
				// Top-level commas stay in the captured text so `<z>` in
				// the result template reproduces the argument list as-is.
				delim := nextLiteral(pi + 1)
				var parts []string
				depth := 0
				for li < len(lineWords) {
					w := lineWords[li]
					if depth == 0 && delim != "" && matchWord(w, delim, r.CaseSens) {
						break
					}
					depth = nestingAfter(depth, w)
					parts = append(parts, w)
					li++
				}
				captures[m.Name] = strings.Join(parts, " ")
				pi++

			case MarkerWordList:
				// The current token must be one of the listed words.
				matched := false
				for _, allowed := range m.ListValues {
					if matchWord(lineWords[li], allowed, r.CaseSens) {
						matched = true
						break
					}
				}
				if !matched {
					return nil
				}
				captures[m.Name] = lineWords[li]
				li++
				pi++

			default:
				// Regular or restricted: capture one token or expression
				captures[m.Name] = captureExpression(lineWords, &li, patternWords, pi+1, r.CaseSens)
				pi++
			}

		case pw == "[":
			// Optional clause — skip to matching ]
			pi = skipOptional(pi)

		case pw == "]":
			pi++

		default:
			// Literal keyword — must match
			if !matchWord(lineWords[li], pw, r.CaseSens) {
				return nil
			}
			li++
			pi++
		}
	}

	// The line may end before the pattern does. Whatever is left in the
	// pattern must be skippable (markers, whole optional clauses); a literal
	// word here is a required keyword the line never supplied, e.g. the line
	// `CLOSE` against the pattern `CLOSE ALL`.
	for pi < len(patternWords) {
		pw := patternWords[pi]
		switch {
		case pw == "[":
			pi = skipOptional(pi)
		case pw == "]" || isMarker(pw):
			pi++
		default:
			return nil
		}
	}

	// For #command with no markers and no optional clauses:
	// all line tokens must be consumed for a match
	if r.IsCommand && li < len(lineWords) && len(r.Markers) == 0 &&
		!strings.Contains(r.Pattern, "[") {
		return nil
	}

	return captures
}
// ppQuote turns a captured value into a PRG string literal.
//
// The delimiter is chosen so that it does not clash with characters already
// present in val: "..." when val has no double quote, otherwise '...' when it
// has no single quote, otherwise [...] when it has neither bracket. If all
// three delimiters clash, the double quotes are stripped from val and the
// remainder is wrapped in "...".
func ppQuote(val string) string {
	switch {
	case strings.IndexByte(val, '"') < 0:
		return "\"" + val + "\""
	case strings.IndexByte(val, '\'') < 0:
		return "'" + val + "'"
	case !strings.ContainsAny(val, "[]"):
		return "[" + val + "]"
	}
	// Last resort for pathological input: drop the embedded double quotes so
	// the double-quoted form becomes legal.
	stripped := strings.ReplaceAll(val, "\"", "")
	return "\"" + stripped + "\""
}
// applyResult substitutes captured values into the result template and
// returns the expanded text.
//
// For every captured name the following result-marker forms are replaced:
//
//	#<name>   dumb stringify   — value quoted with ppQuote
//	<"name">  stringify        — value quoted with ppQuote
//	<(name)>  smart stringify  — kept as-is if already a "..", '..' or [..]
//	                             literal, otherwise quoted
//	<.name.>  logify           — .T. if the value is non-empty, else .F.
//	<name>    plain            — the value itself
//
// Order matters: the compound forms all contain the bare `<name>` text, so
// the bare substitution has to run LAST; doing it first would leave a stray
// `#` / `(` / `.` / `"` behind.
//
// Markers that the pattern declares but that were not captured (for example
// those inside an optional clause, which matchPattern skips) logify to .F.
// All other leftover marker references are removed by
// cleanUnreferencedMarkers.
func (r *Rule) applyResult(captures map[string]string) string {
	result := r.ResultTmpl
	for name, val := range captures {
		quoted := ppQuote(val)

		// #<name> — dumb stringify (always quote).
		result = strings.ReplaceAll(result, "#<"+name+">", quoted)
		// <"name"> — explicit stringify.
		result = strings.ReplaceAll(result, `<"`+name+`">`, quoted)

		// <(name)> — smart stringify: a value that is already a string
		// literal is passed through, anything else is quoted.
		trim := strings.TrimSpace(val)
		smart := quoted
		if n := len(trim); n >= 2 &&
			((trim[0] == '"' && trim[n-1] == '"') ||
				(trim[0] == '\'' && trim[n-1] == '\'') ||
				(trim[0] == '[' && trim[n-1] == ']')) {
			smart = trim
		}
		result = strings.ReplaceAll(result, "<("+name+")>", smart)

		// <.name.> — logify (empty → .F., else .T.)
		logical := ".T."
		if val == "" {
			logical = ".F."
		}
		result = strings.ReplaceAll(result, "<."+name+".>", logical)

		// <name> — bare substitution (must be LAST, after all wrappers).
		result = strings.ReplaceAll(result, "<"+name+">", val)
	}

	// A marker that never matched is "not present", which logify must
	// report as .F. rather than silently disappearing in the cleanup below
	// (that would turn `IF <.new.>` into a bare `IF`).
	for _, m := range r.Markers {
		if _, captured := captures[m.Name]; !captured {
			result = strings.ReplaceAll(result, "<."+m.Name+".>", ".F.")
		}
	}

	// Clean up unreferenced markers: <name>, <(name)>, <.name.>, #<name>, <"name">
	return cleanUnreferencedMarkers(result)
}
// cleanUnreferencedMarkers removes any remaining <name>, <(name)>, <.name.>,
// <"name"> and #<name> references from s and returns the cleaned text.
//
// Only text that findMarkerEnd recognises as a well-formed marker is removed,
// so comparison operators such as `a < b` are left alone. Complete "..." and
// '...' string literals are copied through untouched: by the time this runs
// the captured source text has already been substituted in, and marker-shaped
// text inside a user's string (`"<Enter> to continue"`, `"<b>bold</b>"`)
// must not be eaten. A quote with no matching closing quote is treated as an
// ordinary character.
func cleanUnreferencedMarkers(s string) string {
	var out strings.Builder
	out.Grow(len(s))

	i := 0
	for i < len(s) {
		c := s[i]

		// Quoted string literal: copy verbatim, including both quotes.
		if c == '"' || c == '\'' {
			if rel := strings.IndexByte(s[i+1:], c); rel >= 0 {
				end := i + 1 + rel + 1
				out.WriteString(s[i:end])
				i = end
				continue
			}
		}

		// #<name> — the '#' is dropped together with the marker.
		if c == '#' && i+1 < len(s) && s[i+1] == '<' {
			if end := findMarkerEnd(s, i+1); end > 0 {
				i = end
				continue
			}
		}

		// <name>, <(name)>, <.name.>, <"name">
		if c == '<' {
			if end := findMarkerEnd(s, i); end > 0 {
				i = end
				continue
			}
		}

		out.WriteByte(c)
		i++
	}
	return out.String()
}
// findMarkerEnd reports whether a PP marker reference begins at s[start].
//
// A marker is '<', an optional single '(' / '.' / '"', an identifier
// (letter or underscore followed by letters, digits, underscores), any run of
// ')' '.' '"' ',' or space characters, and a closing '>'. When s[start:] has
// that shape the index just past the '>' is returned; otherwise 0.
func findMarkerEnd(s string, start int) int {
	n := len(s)
	if start >= n || s[start] != '<' {
		return 0
	}

	isLetter := func(b byte) bool {
		return b == '_' || ('a' <= b && b <= 'z') || ('A' <= b && b <= 'Z')
	}
	isDigit := func(b byte) bool {
		return '0' <= b && b <= '9'
	}

	pos := start + 1

	// Optional one-character wrapper prefix.
	if pos < n {
		switch s[pos] {
		case '(', '.', '"':
			pos++
		}
	}

	// The name must begin like an identifier; this is what keeps `<=` and
	// `< 5` from being mistaken for markers.
	if pos >= n || !isLetter(s[pos]) {
		return 0
	}
	for pos < n && (isLetter(s[pos]) || isDigit(s[pos])) {
		pos++
	}

	// Optional wrapper suffix characters, in any number and order.
suffix:
	for pos < n {
		switch s[pos] {
		case ')', '.', '"', ',', ' ':
			pos++
		default:
			break suffix
		}
	}

	if pos < n && s[pos] == '>' {
		return pos + 1
	}
	return 0
}
// --- Helpers ---
// firstToken returns the leading part of s up to, but not including, the
// first space, tab or '('. If s contains none of those, s is returned whole.
func firstToken(s string) string {
	for i := 0; i < len(s); i++ {
		switch s[i] {
		case ' ', '\t', '(':
			return s[:i]
		}
	}
	return s
}
// matchWord reports whether a line token equals a pattern word. The
// comparison is exact when caseSens is true and case-insensitive
// (strings.EqualFold) otherwise.
func matchWord(lineWord, patternWord string, caseSens bool) bool {
	if !caseSens {
		return strings.EqualFold(lineWord, patternWord)
	}
	return lineWord == patternWord
}
// tokenizePattern splits a pattern into tokens.
//
// Whitespace separates tokens. A "<...>" marker is kept as one token, and
// each of `[ ] ( ) ,` is emitted as a token of its own, so `DUMB(<z>)` and
// `DUMB( <z> )` tokenise identically — matching what tokenizeLine produces
// for the call site. Everything else accumulates into words.
//
// A '<' only starts a marker when a '>' occurs somewhere after it. A '<'
// with no closing '>' (an operator such as `<` or `<=`) is ordinary word
// text, which is also how tokenizeLine treats it; this guarantees the
// scanner always advances.
func tokenizePattern(pattern string) []string {
	var tokens []string

	// Every '<' positioned before the last '>' has a closing '>' after it,
	// so one lookup answers "does this '<' open a marker?" for all of them.
	lastClose := strings.LastIndexByte(pattern, '>')
	opensMarker := func(at int) bool {
		return pattern[at] == '<' && lastClose > at
	}

	i := 0
	for i < len(pattern) {
		c := pattern[i]
		switch {
		case c == ' ' || c == '\t':
			i++

		case opensMarker(i):
			end := i + strings.IndexByte(pattern[i:], '>')
			tokens = append(tokens, pattern[i:end+1])
			i = end + 1

		case c == '[' || c == ']' || c == '(' || c == ')' || c == ',':
			tokens = append(tokens, pattern[i:i+1])
			i++

		default:
			// Regular word. The first byte always belongs to it (it is
			// none of the cases above); the word then runs to the next
			// space/tab/bracket/paren/comma or marker start.
			start := i
			i++
		word:
			for i < len(pattern) {
				switch pattern[i] {
				case ' ', '\t', '[', ']', '(', ')', ',':
					break word
				}
				if opensMarker(i) {
					break
				}
				i++
			}
			tokens = append(tokens, pattern[start:i])
		}
	}
	return tokens
}
// tokenizeLine splits a source line into tokens compatible with the ones
// tokenizePattern produces for patterns.
//
// Spaces and tabs separate tokens. A "..." or '...' string literal is one
// token including its quotes; an unterminated literal runs to the end of the
// line. Each of `, ( ) [ ]` is a token of its own, so `DUMB(hello)` yields
// `DUMB`, `(`, `hello`, `)`. Any other run of characters is a word, ended by
// whitespace, one of those punctuation characters, or a quote.
func tokenizeLine(line string) []string {
	var out []string
	n := len(line)

	// endsWord reports whether b terminates a word in progress.
	endsWord := func(b byte) bool {
		switch b {
		case ' ', '\t', ',', '(', ')', '[', ']', '"', '\'':
			return true
		}
		return false
	}

	for pos := 0; pos < n; {
		ch := line[pos]
		switch ch {
		case ' ', '\t':
			pos++

		case '"', '\'':
			// Scan to the matching quote; keep it if there is one.
			end := pos + 1
			for end < n && line[end] != ch {
				end++
			}
			if end < n {
				end++
			}
			out = append(out, line[pos:end])
			pos = end

		case ',', '(', ')', '[', ']':
			out = append(out, line[pos:pos+1])
			pos++

		default:
			// ch itself cannot end a word (every such byte was handled
			// above), so the word is at least one byte long.
			end := pos + 1
			for end < n && !endsWord(line[end]) {
				end++
			}
			out = append(out, line[pos:end])
			pos = end
		}
	}
	return out
}
// captureExpression captures the text for a regular or restricted marker
// from lineWords, starting at *li, and advances *li past what it consumed.
//
// patternWords[nextPi:] is the part of the pattern that follows the marker.
// The first literal word found there (markers and the "[" / "]" of optional
// clauses are ignored) acts as the delimiter:
//
//   - with a delimiter, tokens are captured up to, but not including, the
//     first top-level occurrence of it, or to the end of the line;
//   - without one, and when the marker is the last pattern token, all
//     remaining tokens are captured;
//   - otherwise exactly one token is captured.
//
// Captured tokens are joined with single spaces. An exhausted line yields "".
func captureExpression(lineWords []string, li *int, patternWords []string, nextPi int, caseSens bool) string {
	if *li >= len(lineWords) {
		return ""
	}

	// Find next literal keyword in pattern to use as delimiter
	delimWord := ""
	for pi := nextPi; pi < len(patternWords); pi++ {
		pw := patternWords[pi]
		if !strings.HasPrefix(pw, "<") && pw != "[" && pw != "]" {
			delimWord = pw
			break
		}
	}

	if delimWord == "" {
		// No delimiter: if last marker, capture all remaining tokens
		if nextPi >= len(patternWords) {
			rest := strings.Join(lineWords[*li:], " ")
			*li = len(lineWords)
			return rest
		}
		// Single token capture (between markers)
		tok := lineWords[*li]
		*li++
		return tok
	}

	// Capture until the delimiter, balancing (), [] and {} so that a
	// delimiter nested inside the expression does not end the capture:
	// `_REGULAR_(&(a))` must capture `&(a)` and leave the outer `)` for the
	// pattern, and `f({1,2}, x)` must capture `{1,2}` rather than stop at
	// the comma inside the array literal.
	var parts []string
	depth := 0
	for ; *li < len(lineWords); *li++ {
		w := lineWords[*li]
		if depth == 0 && matchWord(w, delimWord, caseSens) {
			break
		}
		// Parens and square brackets arrive as standalone tokens, but
		// braces stay glued to neighbouring text ("{1", "2}"), so every
		// byte of the token is inspected. String-literal tokens are
		// skipped: brackets inside quotes are text, not grouping.
		if w != "" && w[0] != '"' && w[0] != '\'' {
			for k := 0; k < len(w); k++ {
				switch w[k] {
				case '(', '[', '{':
					depth++
				case ')', ']', '}':
					if depth > 0 {
						depth--
					}
				}
			}
		}
		parts = append(parts, w)
	}
	return strings.Join(parts, " ")
}