fix(pp): stringify markers + paren-attached calls — pp.prg 26→2 errors

Three cumulative fixes for Harbour's preprocessor stringify forms surfaced by harbour-core/tests/pp.prg: 1. Token alignment — tokenizePattern and tokenizeLine now both split on parens and brackets, so `DUMB(a)` (no space) tokenises as `DUMB`, `(`, `a`, `)` on both sides. Previously the line tokenizer kept `DUMB(a)` as one token while the pattern split it three ways, and the match never engaged. Fixes `_DUMB_(a)`-style calls in pp.prg line 57+. 2. Substitution order — applyResult was replacing the bare `<z>` marker first, eating the inner `<z>` of `#<z>` and leaving a stray `#` in front of the value (e.g. `? #hello`) that the lexer reported as an ILLEGAL token. The wrapped forms `#<z>`, `<"z">`, `<(z)>` and `<.z.>` now all run first and bare `<z>` runs last (the `<"z">` form is newly substituted). 3. Quote delimiter picker — ppQuote wraps a captured value in a legal PRG string literal by trying `"..."` first, then `'...'`, then `[...]`. Harbour's #<z> dumb-stringify needs this because the capture may already contain `"`, and Five was producing malformed `""world""` literals. Bonus: smart-stringify `<(z)>` now recognises input that's already a string literal (`"x"` / `'x'` / `[x]`) and keeps it verbatim instead of double-quoting. pp.prg 26 parse errors → 2 (remaining: `USE &b ALIAS &a.1` macro-inside-command at line 21 and one related line, unrelated to this fix). FiveSql2 43/43, Harbour compat 56/56, Go test ALL PASS. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:26:16 +09:00
parent 385a4ec6a2
commit e9522772a7
1 changed files with 73 additions and 26 deletions
--- a/compiler/pp/command.go
+++ b/compiler/pp/command.go
@@ -304,23 +304,62 @@ func (r *Rule) matchPattern(line string) map[string]string {
 	return captures
 }

+// ppQuote wraps a captured value in a PRG string literal, picking a
+// delimiter that doesn't collide with characters already inside. Harbour
+// #<name> stringify takes the raw source text of the argument and must
+// produce a legal PRG string — if the capture is `"world"`, the result
+// can't just be `""world""`. Preference order matches Harbour:
+// double-quotes first, then single-quotes, then bracket literals.
+func ppQuote(val string) string {
+	if !strings.ContainsRune(val, '"') {
+		return `"` + val + `"`
+	}
+	if !strings.ContainsRune(val, '\'') {
+		return "'" + val + "'"
+	}
+	if !strings.ContainsRune(val, '[') && !strings.ContainsRune(val, ']') {
+		return "[" + val + "]"
+	}
+	// Fallback: double-quote with embedded quotes dropped. Pathological
+	// input only; Harbour itself refuses to handle this cleanly.
+	return `"` + strings.ReplaceAll(val, `"`, "") + `"`
+}
+
 // applyResult substitutes captured values into the result template.
+// Order matters — `#<z>` contains the bare `<z>` marker as a substring,
+// so the bare substitution has to run LAST. Previously `<z>` was replaced
+// first, turning `#<z>` into `#value` and producing bogus lines like
+// `? #hello` that the lexer rejected with ILLEGAL token errors. The other
+// wrapped forms (`<(z)>`, `<.z.>`, `<"z">`) do not contain `<z>` itself.
 func (r *Rule) applyResult(captures map[string]string) string {
 	result := r.ResultTmpl

 	for name, val := range captures {
-		// <name> — direct substitution
-		result = strings.ReplaceAll(result, "<"+name+">", val)
-		// <(name)> — stringify
-		result = strings.ReplaceAll(result, "<("+name+")>", `"`+val+`"`)
-		// <.name.> — logify
+		quoted := ppQuote(val)
+		// #<name> — dumb stringify (always quote).
+		result = strings.ReplaceAll(result, "#<"+name+">", quoted)
+		// <"name"> — explicit stringify.
+		result = strings.ReplaceAll(result, `<"`+name+`">`, quoted)
+		// <(name)> — smart stringify: already a string literal → keep;
+		// otherwise quote. `val` comes straight from the capture, so
+		// trim and check for surrounding quotes.
+		trim := strings.TrimSpace(val)
+		smart := quoted
+		if n := len(trim); n >= 2 &&
+			((trim[0] == '"' && trim[n-1] == '"') ||
+				(trim[0] == '\'' && trim[n-1] == '\'') ||
+				(trim[0] == '[' && trim[n-1] == ']')) {
+			smart = trim
+		}
+		result = strings.ReplaceAll(result, "<("+name+")>", smart)
+		// <.name.> — logify (empty → .F., else .T.)
 		if val != "" {
 			result = strings.ReplaceAll(result, "<."+name+".>", ".T.")
 		} else {
 			result = strings.ReplaceAll(result, "<."+name+".>", ".F.")
 		}
-		// #<name> — dumb stringify
-		result = strings.ReplaceAll(result, "#<"+name+">", `"`+val+`"`)
+		// <name> — bare substitution (must be LAST, after all wrappers).
+		result = strings.ReplaceAll(result, "<"+name+">", val)
 	}

 	// Clean up unreferenced markers: <name>, <(name)>, <.name.>, #<name>, <"name">
@@ -406,11 +445,15 @@ func matchWord(lineWord, patternWord string, caseSens bool) bool {
 }

 // tokenizePattern splits a pattern into words, keeping markers as single tokens.
+// Parens and commas are emitted as their own tokens so `DUMB(<z>)` and
+// `DUMB( <z> )` tokenise identically — matching what tokenizeLine does
+// on call sites. Without this, a call like `DUMB(a)` (no space) kept as
+// a single word on the line side would never align with the pattern's
+// `DUMB(`, `<z>`, `)` tokens.
 func tokenizePattern(pattern string) []string {
 	var tokens []string
 	i := 0
 	for i < len(pattern) {
-		// Skip whitespace
 		for i < len(pattern) && (pattern[i] == ' ' || pattern[i] == '\t') {
 			i++
 		}
@@ -419,7 +462,6 @@ func tokenizePattern(pattern string) []string {
 		}

 		if pattern[i] == '<' {
-			// Find matching >
 			end := strings.IndexByte(pattern[i:], '>')
 			if end >= 0 {
 				tokens = append(tokens, pattern[i:i+end+1])
@@ -428,21 +470,21 @@ func tokenizePattern(pattern string) []string {
 			}
 		}

-		if pattern[i] == '[' {
-			tokens = append(tokens, "[")
-			i++
-			continue
-		}
-		if pattern[i] == ']' {
-			tokens = append(tokens, "]")
+		switch pattern[i] {
+		case '[', ']', '(', ')', ',':
+			tokens = append(tokens, string(pattern[i]))
 			i++
 			continue
 		}

-		// Regular word
+		// Regular word — stop at space/tab/marker/bracket/paren/comma.
 		start := i
-		for i < len(pattern) && pattern[i] != ' ' && pattern[i] != '\t' &&
-			pattern[i] != '<' && pattern[i] != '[' && pattern[i] != ']' {
+		for i < len(pattern) {
+			c := pattern[i]
+			if c == ' ' || c == '\t' || c == '<' || c == '[' || c == ']' ||
+				c == '(' || c == ')' || c == ',' {
+				break
+			}
 			i++
 		}
 		if i > start {
@@ -452,7 +494,10 @@ func tokenizePattern(pattern string) []string {
 	return tokens
 }

-// tokenizeLine splits a source line into words (keeping strings and parens together).
+// tokenizeLine splits a source line into words matching the rules used
+// by tokenizePattern: string literals stay intact, commas/parens/brackets
+// emit as standalone tokens so a call like `DUMB(hello)` tokenises as
+// `DUMB`, `(`, `hello`, `)` — aligning with the pattern side.
 func tokenizeLine(line string) []string {
 	var tokens []string
 	i := 0
@@ -479,17 +524,19 @@ func tokenizeLine(line string) []string {
 			continue
 		}

-		// Comma (standalone token)
-		if line[i] == ',' {
-			tokens = append(tokens, ",")
+		switch line[i] {
+		case ',', '(', ')', '[', ']':
+			tokens = append(tokens, string(line[i]))
 			i++
 			continue
 		}

-		// Word
+		// Word — stop at whitespace, brackets, parens, comma, quotes.
 		start := i
-		for i < len(line) && line[i] != ' ' && line[i] != '\t' && line[i] != ',' {
-			if line[i] == '"' || line[i] == '\'' {
+		for i < len(line) {
+			c := line[i]
+			if c == ' ' || c == '\t' || c == ',' || c == '(' || c == ')' ||
+				c == '[' || c == ']' || c == '"' || c == '\'' {
 				break
 			}
 			i++