fix(pp): stringify markers + paren-attached calls — pp.prg 26→2 errors

Three cumulative fixes for Harbour's preprocessor stringify forms
surfaced by harbour-core/tests/pp.prg:

1. Token alignment — tokenizePattern and tokenizeLine now both
   split on parens and brackets, so `DUMB(a)` (no space) tokenises
   as `DUMB`, `(`, `a`, `)` on both sides. Previously the line
   tokenizer kept `DUMB(a)` as one token while the pattern split
   it three ways, and the match never engaged. Fixes `_DUMB_(a)`-
   style calls in pp.prg line 57+.

2. Substitution order — applyResult was replacing the bare `<z>`
   marker first, eating the inner `<z>` of `#<z>`, `<"z">`, `<(z)>`
   and `<.z.>` and leaving stray `#` / `<` / `.` characters that
   the lexer reported as ILLEGAL tokens. Run all compound forms
   first, bare `<z>` last.

3. Quote delimiter picker — ppQuote wraps a captured value in a
   legal PRG string literal by trying `"..."` first, then `'...'`,
   then `[...]`. Harbour's #<z> dumb-stringify needs this because
   the capture may already contain `"`, and Five was producing
   malformed `""world""` literals.

Bonus: smart-stringify `<(z)>` now recognises input that's already
a string literal (`"x"` / `'x'` / `[x]`) and keeps it verbatim
instead of double-quoting.

pp.prg 26 parse errors → 2 (remaining: `USE &b ALIAS &a.1` macro-
inside-command at line 21 and one related line, unrelated to this
fix). FiveSql2 43/43, Harbour compat 56/56, Go test ALL PASS.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-18 17:26:16 +09:00
parent 385a4ec6a2
commit e9522772a7

View File

@@ -304,23 +304,62 @@ func (r *Rule) matchPattern(line string) map[string]string {
return captures
}
// ppQuote returns val wrapped as a PRG string literal, choosing the first
// delimiter pair whose characters do not already occur inside val. The
// candidates are tried in a fixed order: "...", then '...', then [...].
// The bracket form is rejected if val contains either '[' or ']'.
//
// When val contains a double quote, a single quote and a bracket, no
// delimiter is collision-free. In that case every double quote is removed
// from val and the remainder is wrapped in "...", so the result is lossy.
func ppQuote(val string) string {
	switch {
	case !strings.Contains(val, `"`):
		return `"` + val + `"`
	case !strings.Contains(val, "'"):
		return "'" + val + "'"
	case !strings.ContainsAny(val, "[]"):
		return "[" + val + "]"
	}
	// Last resort: strip the embedded double quotes so the literal closes.
	stripped := strings.ReplaceAll(val, `"`, "")
	return `"` + stripped + `"`
}
// applyResult substitutes captured values into the result template.
// Order matters — the dumb-stringify form `#<z>` literally contains the
// bare `<z>` token, so the bare substitution has to run LAST. Previously
// `<z>` was replaced first, which turned `#<z>` into `#` followed by the
// value and produced bogus lines like `? #hello` that the lexer then
// choked on with ILLEGAL token errors. The other compound forms
// (`<(z)>`, `<.z.>`, `<"z">`) are also handled before the bare form, so
// every wrapper is resolved before plain substitution.
func (r *Rule) applyResult(captures map[string]string) string {
result := r.ResultTmpl
for name, val := range captures {
// <name> — direct substitution
result = strings.ReplaceAll(result, "<"+name+">", val)
// <(name)> — stringify
result = strings.ReplaceAll(result, "<("+name+")>", `"`+val+`"`)
// <.name.> — logify
quoted := ppQuote(val)
// #<name> — dumb stringify (always quote).
result = strings.ReplaceAll(result, "#<"+name+">", quoted)
// <"name"> — explicit stringify.
result = strings.ReplaceAll(result, `<"`+name+`">`, quoted)
// <(name)> — smart stringify: already a string literal → keep;
// otherwise quote. `val` comes straight from the capture, so
// trim and check for surrounding quotes.
trim := strings.TrimSpace(val)
smart := quoted
if n := len(trim); n >= 2 &&
((trim[0] == '"' && trim[n-1] == '"') ||
(trim[0] == '\'' && trim[n-1] == '\'') ||
(trim[0] == '[' && trim[n-1] == ']')) {
smart = trim
}
result = strings.ReplaceAll(result, "<("+name+")>", smart)
// <.name.> — logify (empty → .F., else .T.)
if val != "" {
result = strings.ReplaceAll(result, "<."+name+".>", ".T.")
} else {
result = strings.ReplaceAll(result, "<."+name+".>", ".F.")
}
// #<name> — dumb stringify
result = strings.ReplaceAll(result, "#<"+name+">", `"`+val+`"`)
// <name> — bare substitution (must be LAST, after all wrappers).
result = strings.ReplaceAll(result, "<"+name+">", val)
}
// Clean up unreferenced markers: <name>, <(name)>, <.name.>, #<name>, <"name">
@@ -406,11 +445,15 @@ func matchWord(lineWord, patternWord string, caseSens bool) bool {
}
// tokenizePattern splits a pattern into words, keeping markers as single tokens.
// Parens, brackets and commas are emitted as their own tokens so
// `DUMB(<z>)` and `DUMB( <z> )` tokenise identically — matching what
// tokenizeLine does on call sites. Without this, a call like `_DUMB_(a)`
// (no space) kept as a single word would never align with the pattern's
// separate name, `(`, `<z>`, `)` tokens.
func tokenizePattern(pattern string) []string {
var tokens []string
i := 0
for i < len(pattern) {
// Skip whitespace
for i < len(pattern) && (pattern[i] == ' ' || pattern[i] == '\t') {
i++
}
@@ -419,7 +462,6 @@ func tokenizePattern(pattern string) []string {
}
if pattern[i] == '<' {
// Find matching >
end := strings.IndexByte(pattern[i:], '>')
if end >= 0 {
tokens = append(tokens, pattern[i:i+end+1])
@@ -428,21 +470,21 @@ func tokenizePattern(pattern string) []string {
}
}
if pattern[i] == '[' {
tokens = append(tokens, "[")
i++
continue
}
if pattern[i] == ']' {
tokens = append(tokens, "]")
switch pattern[i] {
case '[', ']', '(', ')', ',':
tokens = append(tokens, string(pattern[i]))
i++
continue
}
// Regular word
// Regular word — stop at space/tab/marker/bracket/paren/comma.
start := i
for i < len(pattern) && pattern[i] != ' ' && pattern[i] != '\t' &&
pattern[i] != '<' && pattern[i] != '[' && pattern[i] != ']' {
for i < len(pattern) {
c := pattern[i]
if c == ' ' || c == '\t' || c == '<' || c == '[' || c == ']' ||
c == '(' || c == ')' || c == ',' {
break
}
i++
}
if i > start {
@@ -452,7 +494,10 @@ func tokenizePattern(pattern string) []string {
return tokens
}
// tokenizeLine splits a source line into words (keeping strings and parens together).
// tokenizeLine splits a source line into words matching the rules used
// by tokenizePattern: string literals stay intact, commas/parens/brackets
// emit as standalone tokens so a call like `DUMB(hello)` tokenises as
// `DUMB`, `(`, `hello`, `)` — aligning with the pattern side.
func tokenizeLine(line string) []string {
var tokens []string
i := 0
@@ -479,17 +524,19 @@ func tokenizeLine(line string) []string {
continue
}
// Comma (standalone token)
if line[i] == ',' {
tokens = append(tokens, ",")
switch line[i] {
case ',', '(', ')', '[', ']':
tokens = append(tokens, string(line[i]))
i++
continue
}
// Word
// Word — stop at whitespace, brackets, parens, comma, quotes.
start := i
for i < len(line) && line[i] != ' ' && line[i] != '\t' && line[i] != ',' {
if line[i] == '"' || line[i] == '\'' {
for i < len(line) {
c := line[i]
if c == ' ' || c == '\t' || c == ',' || c == '(' || c == ')' ||
c == '[' || c == ']' || c == '"' || c == '\'' {
break
}
i++