fix(pp): stringify markers + paren-attached calls — pp.prg 26→2 errors
Three cumulative fixes for Harbour's preprocessor stringify forms surfaced by harbour-core/tests/pp.prg: 1. Token alignment — tokenizePattern and tokenizeLine now both split on parens and brackets, so `DUMB(a)` (no space) tokenises as `DUMB`, `(`, `a`, `)` on both sides. Previously the line tokenizer kept `DUMB(a)` as one token while the pattern split it three ways, and the match never engaged. Fixes `_DUMB_(a)`-style calls in pp.prg line 57+. 2. Substitution order — applyResult was replacing the bare `<z>` marker first, eating the inner `<z>` of `#<z>`, `<"z">`, `<(z)>` and `<.z.>` and leaving stray `#` / `<` / `.` characters that the lexer reported as ILLEGAL tokens. Run all compound forms first, bare `<z>` last. 3. Quote delimiter picker — ppQuote wraps a captured value in a legal PRG string literal by trying `"..."` first, then `'...'`, then `[...]`. Harbour's `#<z>` dumb-stringify needs this because the capture may already contain `"`, and Five was producing malformed `""world""` literals. Bonus: smart-stringify `<(z)>` now recognises input that's already a string literal (`"x"` / `'x'` / `[x]`) and keeps it verbatim instead of double-quoting. pp.prg 26 parse errors → 2 (remaining: `USE &b ALIAS &a.1` macro-inside-command at line 21 and one related line, unrelated to this fix). FiveSql2 43/43, Harbour compat 56/56, Go test ALL PASS. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -304,23 +304,62 @@ func (r *Rule) matchPattern(line string) map[string]string {
|
||||
return captures
|
||||
}
|
||||
|
||||
// ppQuote turns a captured value into a PRG string literal. It chooses the
// first delimiter style whose delimiter characters do not occur in val:
//
//  1. "..." when val has no double quote,
//  2. '...' when val has no single quote,
//  3. [...] when val has neither '[' nor ']'.
//
// This is what #<name> stringify needs: the capture is raw source text and
// may itself be a quoted literal such as "world", which must not come out
// as ""world"".
//
// If every delimiter style collides, the double quotes are removed from val
// and the remainder is wrapped in double quotes. That last resort is lossy
// and is only expected for pathological input.
func ppQuote(val string) string {
	switch {
	case !strings.Contains(val, `"`):
		return `"` + val + `"`
	case !strings.Contains(val, "'"):
		return "'" + val + "'"
	case !strings.ContainsAny(val, "[]"):
		return "[" + val + "]"
	}

	// No delimiter is free: strip the embedded double quotes so the result
	// is at least a well-formed double-quoted literal.
	stripped := strings.ReplaceAll(val, `"`, "")
	return `"` + stripped + `"`
}
|
||||
|
||||
// applyResult substitutes captured values into the result template.
|
||||
// Order matters — the compound forms (`#<z>`, `<(z)>`, `<.z.>`, `<"z">`)
|
||||
// all contain the bare `<z>` token, so the bare substitution has to run
|
||||
// LAST. Previously `<z>` was replaced first and left a stray `#` / `(` /
|
||||
// `.` / `"` behind, producing bogus lines like `? #hello` that the
|
||||
// lexer then choked on with ILLEGAL token errors.
|
||||
func (r *Rule) applyResult(captures map[string]string) string {
|
||||
result := r.ResultTmpl
|
||||
|
||||
for name, val := range captures {
|
||||
// <name> — direct substitution
|
||||
result = strings.ReplaceAll(result, "<"+name+">", val)
|
||||
// <(name)> — stringify
|
||||
result = strings.ReplaceAll(result, "<("+name+")>", `"`+val+`"`)
|
||||
// <.name.> — logify
|
||||
quoted := ppQuote(val)
|
||||
// #<name> — dumb stringify (always quote).
|
||||
result = strings.ReplaceAll(result, "#<"+name+">", quoted)
|
||||
// <"name"> — explicit stringify.
|
||||
result = strings.ReplaceAll(result, `<"`+name+`">`, quoted)
|
||||
// <(name)> — smart stringify: already a string literal → keep;
|
||||
// otherwise quote. `val` comes straight from the capture, so
|
||||
// trim and check for surrounding quotes.
|
||||
trim := strings.TrimSpace(val)
|
||||
smart := quoted
|
||||
if n := len(trim); n >= 2 &&
|
||||
((trim[0] == '"' && trim[n-1] == '"') ||
|
||||
(trim[0] == '\'' && trim[n-1] == '\'') ||
|
||||
(trim[0] == '[' && trim[n-1] == ']')) {
|
||||
smart = trim
|
||||
}
|
||||
result = strings.ReplaceAll(result, "<("+name+")>", smart)
|
||||
// <.name.> — logify (empty → .F., else .T.)
|
||||
if val != "" {
|
||||
result = strings.ReplaceAll(result, "<."+name+".>", ".T.")
|
||||
} else {
|
||||
result = strings.ReplaceAll(result, "<."+name+".>", ".F.")
|
||||
}
|
||||
// #<name> — dumb stringify
|
||||
result = strings.ReplaceAll(result, "#<"+name+">", `"`+val+`"`)
|
||||
// <name> — bare substitution (must be LAST, after all wrappers).
|
||||
result = strings.ReplaceAll(result, "<"+name+">", val)
|
||||
}
|
||||
|
||||
// Clean up unreferenced markers: <name>, <(name)>, <.name.>, #<name>, <"name">
|
||||
@@ -406,11 +445,15 @@ func matchWord(lineWord, patternWord string, caseSens bool) bool {
|
||||
}
|
||||
|
||||
// tokenizePattern splits a pattern into words, keeping markers as single tokens.
|
||||
// Parens and commas are emitted as their own tokens so `DUMB(<z>)` and
|
||||
// `DUMB( <z> )` tokenise identically — matching what tokenizeLine does
|
||||
// on call sites. Without this, `_DUMB_(a)` (no space) stored as a
|
||||
// single word would never align with the pattern's `DUMB`, `(`, `<z>`, `)`
|
||||
// tokens.
|
||||
func tokenizePattern(pattern string) []string {
|
||||
var tokens []string
|
||||
i := 0
|
||||
for i < len(pattern) {
|
||||
// Skip whitespace
|
||||
for i < len(pattern) && (pattern[i] == ' ' || pattern[i] == '\t') {
|
||||
i++
|
||||
}
|
||||
@@ -419,7 +462,6 @@ func tokenizePattern(pattern string) []string {
|
||||
}
|
||||
|
||||
if pattern[i] == '<' {
|
||||
// Find matching >
|
||||
end := strings.IndexByte(pattern[i:], '>')
|
||||
if end >= 0 {
|
||||
tokens = append(tokens, pattern[i:i+end+1])
|
||||
@@ -428,21 +470,21 @@ func tokenizePattern(pattern string) []string {
|
||||
}
|
||||
}
|
||||
|
||||
if pattern[i] == '[' {
|
||||
tokens = append(tokens, "[")
|
||||
i++
|
||||
continue
|
||||
}
|
||||
if pattern[i] == ']' {
|
||||
tokens = append(tokens, "]")
|
||||
switch pattern[i] {
|
||||
case '[', ']', '(', ')', ',':
|
||||
tokens = append(tokens, string(pattern[i]))
|
||||
i++
|
||||
continue
|
||||
}
|
||||
|
||||
// Regular word
|
||||
// Regular word — stop at space/tab/marker/bracket/paren/comma.
|
||||
start := i
|
||||
for i < len(pattern) && pattern[i] != ' ' && pattern[i] != '\t' &&
|
||||
pattern[i] != '<' && pattern[i] != '[' && pattern[i] != ']' {
|
||||
for i < len(pattern) {
|
||||
c := pattern[i]
|
||||
if c == ' ' || c == '\t' || c == '<' || c == '[' || c == ']' ||
|
||||
c == '(' || c == ')' || c == ',' {
|
||||
break
|
||||
}
|
||||
i++
|
||||
}
|
||||
if i > start {
|
||||
@@ -452,7 +494,10 @@ func tokenizePattern(pattern string) []string {
|
||||
return tokens
|
||||
}
|
||||
|
||||
// tokenizeLine splits a source line into words (keeping strings and parens together).
|
||||
// tokenizeLine splits a source line into words matching the rules used
|
||||
// by tokenizePattern: string literals stay intact, commas/parens/brackets
|
||||
// emit as standalone tokens so a call like `DUMB(hello)` tokenises as
|
||||
// `DUMB`, `(`, `hello`, `)` — aligning with the pattern side.
|
||||
func tokenizeLine(line string) []string {
|
||||
var tokens []string
|
||||
i := 0
|
||||
@@ -479,17 +524,19 @@ func tokenizeLine(line string) []string {
|
||||
continue
|
||||
}
|
||||
|
||||
// Comma (standalone token)
|
||||
if line[i] == ',' {
|
||||
tokens = append(tokens, ",")
|
||||
switch line[i] {
|
||||
case ',', '(', ')', '[', ']':
|
||||
tokens = append(tokens, string(line[i]))
|
||||
i++
|
||||
continue
|
||||
}
|
||||
|
||||
// Word
|
||||
// Word — stop at whitespace, brackets, parens, comma, quotes.
|
||||
start := i
|
||||
for i < len(line) && line[i] != ' ' && line[i] != '\t' && line[i] != ',' {
|
||||
if line[i] == '"' || line[i] == '\'' {
|
||||
for i < len(line) {
|
||||
c := line[i]
|
||||
if c == ' ' || c == '\t' || c == ',' || c == '(' || c == ')' ||
|
||||
c == '[' || c == ']' || c == '"' || c == '\'' {
|
||||
break
|
||||
}
|
||||
i++
|
||||
|
||||
Reference in New Issue
Block a user