From e9522772a7a982c88230ec5c1aa0c32903b17148 Mon Sep 17 00:00:00 2001 From: CharlesKWON Date: Sat, 18 Apr 2026 17:26:16 +0900 Subject: [PATCH] =?UTF-8?q?fix(pp):=20stringify=20markers=20+=20paren-atta?= =?UTF-8?q?ched=20calls=20=E2=80=94=20pp.prg=2026=E2=86=922=20errors?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three cumulative fixes for Harbour's preprocessor stringify forms surfaced by harbour-core/tests/pp.prg: 1. Token alignment — tokenizePattern and tokenizeLine now both split on parens and brackets, so `DUMB(a)` (no space) tokenises as `DUMB`, `(`, `a`, `)` on both sides. Previously the line tokenizer kept `DUMB(a)` as one token while the pattern split it three ways, and the match never engaged. Fixes `_DUMB_(a)`-style calls in pp.prg line 57+. 2. Substitution order — applyResult was replacing the bare `<z>` marker first, eating the inner `<z>` of `#<z>`, `<"z">`, `<(z)>` and `<.z.>` and leaving stray `#` / `<` / `.` characters that the lexer reported as ILLEGAL tokens. Run all compound forms first, bare `<z>` last. 3. Quote delimiter picker — ppQuote wraps a captured value in a legal PRG string literal by trying `"..."` first, then `'...'`, then `[...]`. Harbour's #<z> dumb-stringify needs this because the capture may already contain `"`, and Five was producing malformed `""world""` literals. Bonus: smart-stringify `<(z)>` now recognises input that's already a string literal (`"x"` / `'x'` / `[x]`) and keeps it verbatim instead of double-quoting. pp.prg 26 parse errors → 2 (remaining: `USE &b ALIAS &a.1` macro-inside-command at line 21 and one related line, unrelated to this fix). FiveSql2 43/43, Harbour compat 56/56, Go test ALL PASS. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- compiler/pp/command.go | 99 +++++++++++++++++++++++++++++++----------- 1 file changed, 73 insertions(+), 26 deletions(-) diff --git a/compiler/pp/command.go b/compiler/pp/command.go index 80b3a15..9360df6 100644 --- a/compiler/pp/command.go +++ b/compiler/pp/command.go @@ -304,23 +304,62 @@ func (r *Rule) matchPattern(line string) map[string]string { return captures } +// ppQuote wraps a captured value in a PRG string literal, picking a +// delimiter that doesn't collide with characters already inside. Harbour +// #<z> stringify takes the raw source text of the argument and must +// produce a legal PRG string — if the capture is `"world"`, the result +// can't just be `""world""`. Preference order matches Harbour: +// double-quotes first, then single-quotes, then bracket literals. +func ppQuote(val string) string { + if !strings.ContainsRune(val, '"') { + return `"` + val + `"` + } + if !strings.ContainsRune(val, '\'') { + return "'" + val + "'" + } + if !strings.ContainsRune(val, '[') && !strings.ContainsRune(val, ']') { + return "[" + val + "]" + } + // Fallback: double-quote with embedded quotes dropped. Pathological + // input only; Harbour itself refuses to handle this cleanly. + return `"` + strings.ReplaceAll(val, `"`, "") + `"` +} + // applyResult substitutes captured values into the result template. +// Order matters — the compound forms (`#<z>`, `<(z)>`, `<.z.>`, `<"z">`) +// all contain the bare `<z>` token, so the bare substitution has to run +// LAST. Previously `<z>` was replaced first and left a stray `#` / `(` / +// `.` / `"` behind, producing bogus lines like `? #hello` that the +// lexer then choked on with ILLEGAL token errors. 
func (r *Rule) applyResult(captures map[string]string) string { result := r.ResultTmpl for name, val := range captures { - // <name> — direct substitution - result = strings.ReplaceAll(result, "<"+name+">", val) - // <(name)> — stringify - result = strings.ReplaceAll(result, "<("+name+")>", `"`+val+`"`) - // <.name.> — logify + quoted := ppQuote(val) + // #<name> — dumb stringify (always quote). + result = strings.ReplaceAll(result, "#<"+name+">", quoted) + // <"name"> — explicit stringify. + result = strings.ReplaceAll(result, `<"`+name+`">`, quoted) + // <(name)> — smart stringify: already a string literal → keep; + // otherwise quote. `val` comes straight from the capture, so + // trim and check for surrounding quotes. + trim := strings.TrimSpace(val) + smart := quoted + if n := len(trim); n >= 2 && + ((trim[0] == '"' && trim[n-1] == '"') || + (trim[0] == '\'' && trim[n-1] == '\'') || + (trim[0] == '[' && trim[n-1] == ']')) { + smart = trim + } + result = strings.ReplaceAll(result, "<("+name+")>", smart) + // <.name.> — logify (empty → .F., else .T.) if val != "" { result = strings.ReplaceAll(result, "<."+name+".>", ".T.") } else { result = strings.ReplaceAll(result, "<."+name+".>", ".F.") } - // #<name> — dumb stringify - result = strings.ReplaceAll(result, "#<"+name+">", `"`+val+`"`) + // <name> — bare substitution (must be LAST, after all wrappers). + result = strings.ReplaceAll(result, "<"+name+">", val) } // Clean up unreferenced markers: <name>, <(name)>, <.name.>, #<name>, <"name"> @@ -406,11 +445,15 @@ func matchWord(lineWord, patternWord string, caseSens bool) bool { } // tokenizePattern splits a pattern into words, keeping markers as single tokens. +// Parens and commas are emitted as their own tokens so `DUMB(<z>)` and +// `DUMB( <z> )` tokenise identically — matching what tokenizeLine does +// on call sites. Without this, `_DUMB_(a)` (no space) stored as a +// single word would never align with the pattern's `DUMB( , , )` +// tokens. 
func tokenizePattern(pattern string) []string { var tokens []string i := 0 for i < len(pattern) { - // Skip whitespace for i < len(pattern) && (pattern[i] == ' ' || pattern[i] == '\t') { i++ } @@ -419,7 +462,6 @@ func tokenizePattern(pattern string) []string { } if pattern[i] == '<' { - // Find matching > end := strings.IndexByte(pattern[i:], '>') if end >= 0 { tokens = append(tokens, pattern[i:i+end+1]) @@ -428,21 +470,21 @@ func tokenizePattern(pattern string) []string { } } - if pattern[i] == '[' { - tokens = append(tokens, "[") - i++ - continue - } - if pattern[i] == ']' { - tokens = append(tokens, "]") + switch pattern[i] { + case '[', ']', '(', ')', ',': + tokens = append(tokens, string(pattern[i])) i++ continue } - // Regular word + // Regular word — stop at space/tab/marker/bracket/paren/comma. start := i - for i < len(pattern) && pattern[i] != ' ' && pattern[i] != '\t' && - pattern[i] != '<' && pattern[i] != '[' && pattern[i] != ']' { + for i < len(pattern) { + c := pattern[i] + if c == ' ' || c == '\t' || c == '<' || c == '[' || c == ']' || + c == '(' || c == ')' || c == ',' { + break + } i++ } if i > start { @@ -452,7 +494,10 @@ func tokenizePattern(pattern string) []string { return tokens } -// tokenizeLine splits a source line into words (keeping strings and parens together). +// tokenizeLine splits a source line into words matching the rules used +// by tokenizePattern: string literals stay intact, commas/parens/brackets +// emit as standalone tokens so a call like `DUMB(hello)` tokenises as +// `DUMB`, `(`, `hello`, `)` — aligning with the pattern side. 
func tokenizeLine(line string) []string { var tokens []string i := 0 @@ -479,17 +524,19 @@ func tokenizeLine(line string) []string { continue } - // Comma (standalone token) - if line[i] == ',' { - tokens = append(tokens, ",") + switch line[i] { + case ',', '(', ')', '[', ']': + tokens = append(tokens, string(line[i])) i++ continue } - // Word + // Word — stop at whitespace, brackets, parens, comma, quotes. start := i - for i < len(line) && line[i] != ' ' && line[i] != '\t' && line[i] != ',' { - if line[i] == '"' || line[i] == '\'' { + for i < len(line) { + c := line[i] + if c == ' ' || c == '\t' || c == ',' || c == '(' || c == ')' || + c == '[' || c == ']' || c == '"' || c == '\'' { break } i++