Three silent-miscompile fixes in the preprocessor that were
masking real bugs in Harbour-style PRG.
1. Brace tokenizer (compiler/pp/command.go)
`{` and `}` now tokenize as standalone separator tokens. The
matcher previously only split on `,()[]"'` etc., so a codeblock
literal `{|| ... }` in a macro argument became the tokens `{||`,
`""`, `}`. The capture-depth tracker only matched exact `{`/`}`,
so `{||` was invisible as an opener while the standalone `}`
wrongly decremented depth — `TEST_LINE( o:VarPut({|| "" }) )`
truncated mid-argument and the parser later choked at the inner
`}` with `expected ), got } "}"`.
Fix: add `{` and `}` to tokenizeLine's separator set. Now
`{|| ... }` lexes as `{`, `||`, `""`, `}` and balances cleanly.
2. ;-continuation join for non-`#` lines (compiler/pp/pp.go)
The existing line-joiner only collapsed trailing `;` continuations
on `#`-prefixed directives. Plain source code using the same
convention — e.g. Harbour's TEST macro:
TEST t004 STATIC s_once := NIL, S_C ;
INIT hb_threadOnce( @s_once, {|| ... } ) ;
CODE x := S_C
was processed one physical line at a time, so the TEST pattern
never matched the full logical statement. The first row passed
through unrewritten, fell through to the parser as an expression,
and gengo silently absorbed it as part of the *previous*
function's body. Six TEST macros' STATIC declarations all ended
up tagged with t003's function name, producing duplicate
`static_T003_S_ONCE` decls and a Go compile failure.
Fix: add the same trailing-`;` join logic to user code, with
blank-line fillers inserted post-join so source line numbers in
parser errors still align with the original file.
3. Block-comment-aware continuation join
Inline `/* ... */` at the end of a continuation row hid the
trailing `;` from the joiner's HasSuffix check. The fix calls
stripBlockComments on the next-line peek before testing for `;`,
so chains like
AAdd( aResult, { cChildBase, ;
aRefs[ "fk" ][ j ][ 1 ], ; /* child col */
aRefs[ "fk" ][ j ][ 3 ], ; /* parent col */
...
keep folding instead of stopping after one row and leaving a
dangling `,` at end of line.
Results
-------
Harbour-core compat sweep: 25/30 → 28/30 (remaining lnlenli1 +
keywords are //NOTEST stress files, intentionally unbalanced).
All 6 release gates green: go test ./..., FiveSql2 43/43,
Harbour compat 56/56, std.ch 17/17, FRB 7/7, examples 65/71.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1430 lines
41 KiB
Go
1430 lines
41 KiB
Go
// Copyright (c) 2026 Charles KWON OhJun (charleskwonohjun@gmail.com)
|
|
// All rights reserved.
|
|
|
|
// #command / #translate implementation for Five preprocessor.
|
|
//
|
|
// Harbour PP syntax:
|
|
// #command PATTERN => RESULT
|
|
// #translate PATTERN => RESULT
|
|
// #xcommand PATTERN => RESULT (case-sensitive)
|
|
// #xtranslate PATTERN => RESULT (case-sensitive)
|
|
//
|
|
// Pattern markers:
|
|
// <x> — match any expression (regular match)
|
|
// <!x!> — match single identifier only (restricted match)
|
|
// <x,...> — match comma-separated list
|
|
// <*x*> — match rest of line (wild match)
|
|
// <x:a,b,c> — match one of listed words (list match)
|
|
// [...] — optional clause
|
|
//
|
|
// Result markers:
|
|
// <x> — substitute matched text
|
|
// <(x)> — stringify (wrap in quotes)
|
|
// <{x}> — blockify (wrap in {|| })
|
|
// #<x> — dumb stringify
|
|
// <.x.> — logify (.T. if matched, .F. if not)
|
|
//
|
|
// Reference: /mnt/d/harbour-core/src/pp/ppcore.c
|
|
package pp
|
|
|
|
import (
|
|
"fmt"
|
|
"strings"
|
|
)
|
|
|
|
// Rule represents a single #command or #translate rule.
|
|
type Rule struct {
|
|
Pattern string // raw pattern text
|
|
Result string // raw result text
|
|
IsCommand bool // #command vs #translate
|
|
CaseSens bool // #xcommand/#xtranslate = case sensitive
|
|
Keyword string // first keyword (for fast matching)
|
|
Markers []Marker // parsed pattern markers
|
|
ResultTmpl string // result template with marker references
|
|
|
|
// Warnings collected during ParseRule. Currently only one source:
|
|
// result-template markers that reference a name absent from the
|
|
// pattern. Caller can surface these to the user — a typo'd
|
|
// `<For>` instead of `<for>` used to silently produce broken
|
|
// expansion output.
|
|
Warnings []string
|
|
}
|
|
|
|
// Marker describes a single pattern marker such as <x>, <!x!>,
// <x,...>, <*x*> or <x:a,b,c>.
type Marker struct {
	// Name is the marker's identifier.
	Name string

	// Type says which marker form this is.
	Type MarkerType

	// ListValues holds the allowed words of a <x:a,b,c> marker.
	ListValues []string
}

// MarkerType enumerates the supported pattern marker forms.
type MarkerType int

const (
	// MarkerRegular is <x>: matches any expression.
	MarkerRegular MarkerType = iota

	// MarkerRestricted is <!x!>: matches a single identifier only.
	MarkerRestricted

	// MarkerList is <x,...>: matches a comma-separated list.
	MarkerList

	// MarkerWild is <*x*>: matches the rest of the line.
	MarkerWild

	// MarkerWordList is <x:a,b,c>: matches one of the listed words.
	MarkerWordList
)
|
|
|
|
// ParseRule parses a #command/#translate directive into a Rule.
|
|
func ParseRule(directive string, isCommand, caseSens bool) *Rule {
|
|
// Split on =>
|
|
parts := strings.SplitN(directive, "=>", 2)
|
|
if len(parts) != 2 {
|
|
return nil
|
|
}
|
|
|
|
pattern := strings.TrimSpace(parts[0])
|
|
result := strings.TrimSpace(parts[1])
|
|
|
|
// Earlier versions stripped every ` ;` as Harbour line-continuation.
|
|
// That also destroyed in-line PRG statement separators — `IF x ==
|
|
// NIL ; x := y ; ENDIF` lost all its semicolons. Line-continuation
|
|
// joining is the preprocessor's job (processLines), not this rule
|
|
// parser's. Keep the semicolons as-is.
|
|
|
|
rule := &Rule{
|
|
Pattern: pattern,
|
|
Result: result,
|
|
IsCommand: isCommand,
|
|
CaseSens: caseSens,
|
|
ResultTmpl: result,
|
|
}
|
|
|
|
// Extract first keyword for fast matching. The first whitespace-
|
|
// delimited token of the pattern becomes the dispatch key; we
|
|
// strip marker wrappers and any trailing `(` so a pattern like
|
|
// `MAKE_TEST( <obj>, <v> )` hashes on `MAKE_TEST`, matching how
|
|
// firstToken normalises source lines.
|
|
words := strings.Fields(pattern)
|
|
if len(words) > 0 {
|
|
kw := words[0]
|
|
kw = strings.TrimLeft(kw, "<[")
|
|
kw = strings.TrimRight(kw, ">]")
|
|
if idx := strings.IndexByte(kw, '('); idx >= 0 {
|
|
kw = kw[:idx]
|
|
}
|
|
if !strings.ContainsAny(kw, "!*,:") {
|
|
rule.Keyword = kw
|
|
}
|
|
}
|
|
|
|
// Parse markers from pattern
|
|
rule.Markers = parseMarkers(pattern)
|
|
|
|
// Validate result-template marker references. Each `<name>`
|
|
// (and its smart-stringify / blockify / logify / dumb-stringify
|
|
// variants) must reference a name declared in the pattern.
|
|
// Catches typos like `<For>` vs `<for>` (case-sensitive
|
|
// xcommand) before they silently produce broken output at
|
|
// expansion time.
|
|
rule.Warnings = validateResultMarkers(pattern, result, rule.Markers, caseSens)
|
|
|
|
return rule
|
|
}
|
|
|
|
// validateResultMarkers scans the result template for marker
|
|
// references and reports any name not declared in the pattern.
|
|
// Result returned as a slice of human-readable warning strings —
|
|
// caller decides whether to surface or ignore.
|
|
func validateResultMarkers(pattern, result string, markers []Marker, caseSens bool) []string {
|
|
declared := make(map[string]bool, len(markers))
|
|
for _, m := range markers {
|
|
key := m.Name
|
|
if !caseSens {
|
|
key = strings.ToUpper(key)
|
|
}
|
|
declared[key] = true
|
|
}
|
|
if len(declared) == 0 {
|
|
// Nothing to validate against — rule is keyword-only.
|
|
return nil
|
|
}
|
|
|
|
var warnings []string
|
|
seen := map[string]bool{}
|
|
i := 0
|
|
for i < len(result) {
|
|
// Marker shapes recognised here mirror applyResult's loop:
|
|
// <name>, <(name)>, <.name.>, <{name}>, <"name">, #<name>.
|
|
// findMarkerEnd already understands all of them — we just
|
|
// need the inner identifier.
|
|
if result[i] != '<' && !(result[i] == '#' && i+1 < len(result) && result[i+1] == '<') {
|
|
i++
|
|
continue
|
|
}
|
|
start := i
|
|
if result[i] == '#' {
|
|
start = i + 1
|
|
}
|
|
end := findMarkerEnd(result, start)
|
|
if end == 0 {
|
|
i++
|
|
continue
|
|
}
|
|
// Extract identifier between the wrappers.
|
|
inner := result[start+1 : end-1]
|
|
// Strip prefix `(`, `.`, `"`, `{`
|
|
for len(inner) > 0 && (inner[0] == '(' || inner[0] == '.' || inner[0] == '"' || inner[0] == '{') {
|
|
inner = inner[1:]
|
|
}
|
|
// Strip suffix `)`, `.`, `"`, `}`
|
|
for len(inner) > 0 {
|
|
c := inner[len(inner)-1]
|
|
if c == ')' || c == '.' || c == '"' || c == '}' || c == ' ' {
|
|
inner = inner[:len(inner)-1]
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
key := inner
|
|
if !caseSens {
|
|
key = strings.ToUpper(key)
|
|
}
|
|
if key != "" && !declared[key] && !seen[key] {
|
|
seen[key] = true
|
|
warnings = append(warnings,
|
|
fmt.Sprintf("result-template marker <%s> not declared in pattern: %q",
|
|
inner, pattern))
|
|
}
|
|
i = end
|
|
}
|
|
return warnings
|
|
}
|
|
|
|
// parseMarkers extracts all <...> markers from a pattern.
|
|
func parseMarkers(pattern string) []Marker {
|
|
var markers []Marker
|
|
i := 0
|
|
for i < len(pattern) {
|
|
if pattern[i] == '<' {
|
|
end := strings.IndexByte(pattern[i:], '>')
|
|
if end < 0 {
|
|
break
|
|
}
|
|
inner := pattern[i+1 : i+end]
|
|
m := parseOneMarker(inner)
|
|
if m.Name != "" {
|
|
markers = append(markers, m)
|
|
}
|
|
i += end + 1
|
|
} else {
|
|
i++
|
|
}
|
|
}
|
|
return markers
|
|
}
|
|
|
|
func parseOneMarker(inner string) Marker {
|
|
inner = strings.TrimSpace(inner)
|
|
|
|
// <!name!> — restricted
|
|
if strings.HasPrefix(inner, "!") && strings.HasSuffix(inner, "!") {
|
|
return Marker{Name: inner[1 : len(inner)-1], Type: MarkerRestricted}
|
|
}
|
|
|
|
// <*name*> — wild
|
|
if strings.HasPrefix(inner, "*") && strings.HasSuffix(inner, "*") {
|
|
return Marker{Name: inner[1 : len(inner)-1], Type: MarkerWild}
|
|
}
|
|
|
|
// <name,...> — comma list
|
|
if strings.HasSuffix(inner, ",...") {
|
|
return Marker{Name: inner[:len(inner)-4], Type: MarkerList}
|
|
}
|
|
|
|
// <name:a,b,c> — word list
|
|
if idx := strings.IndexByte(inner, ':'); idx > 0 {
|
|
name := inner[:idx]
|
|
vals := strings.Split(inner[idx+1:], ",")
|
|
for i := range vals {
|
|
vals[i] = strings.TrimSpace(vals[i])
|
|
}
|
|
return Marker{Name: name, Type: MarkerWordList, ListValues: vals}
|
|
}
|
|
|
|
// <(name)> — extended-expression marker. In Harbour PP this captures
|
|
// a file-name-like extended expression and the matching result token
|
|
// `<(name)>` smart-stringifies it (already-quoted → keep, identifier
|
|
// → quote). Strip the parens so captures are stored under the bare
|
|
// name; result substitution then matches both `<(name)>` and `<name>`
|
|
// via the existing path.
|
|
if strings.HasPrefix(inner, "(") && strings.HasSuffix(inner, ")") {
|
|
return Marker{Name: inner[1 : len(inner)-1], Type: MarkerRegular}
|
|
}
|
|
|
|
// <name> — regular
|
|
return Marker{Name: inner, Type: MarkerRegular}
|
|
}
|
|
|
|
// --- Rule matching and application ---
|
|
|
|
// MatchLine checks if a source line matches this rule and returns the substituted result.
|
|
// Returns ("", false) if no match.
|
|
func (r *Rule) MatchLine(line string) (string, bool) {
|
|
trimmed := strings.TrimSpace(line)
|
|
if trimmed == "" {
|
|
return "", false
|
|
}
|
|
|
|
// Fast keyword check
|
|
if r.Keyword != "" {
|
|
firstWord := firstToken(trimmed)
|
|
if r.CaseSens {
|
|
if firstWord != r.Keyword {
|
|
return "", false
|
|
}
|
|
} else {
|
|
if !strings.EqualFold(firstWord, r.Keyword) {
|
|
return "", false
|
|
}
|
|
}
|
|
}
|
|
|
|
// Try to match pattern against line
|
|
captures := r.matchPattern(trimmed)
|
|
if captures == nil {
|
|
return "", false
|
|
}
|
|
|
|
// Apply result template
|
|
result := r.applyResult(captures)
|
|
return result, true
|
|
}
|
|
|
|
// matchPattern attempts to match the pattern against a line.
|
|
// Returns captured values map, or nil if no match.
|
|
func (r *Rule) matchPattern(line string) map[string]string {
|
|
captures := make(map[string]string)
|
|
|
|
patternWords := tokenizePattern(r.Pattern)
|
|
lineWords := tokenizeLine(line)
|
|
|
|
pi, li := 0, 0
|
|
for pi < len(patternWords) && li < len(lineWords) {
|
|
pw := patternWords[pi]
|
|
|
|
// Marker?
|
|
if strings.HasPrefix(pw, "<") && strings.HasSuffix(pw, ">") {
|
|
inner := pw[1 : len(pw)-1]
|
|
m := parseOneMarker(inner)
|
|
|
|
switch m.Type {
|
|
case MarkerWild:
|
|
// Capture rest of line
|
|
rest := strings.Join(lineWords[li:], " ")
|
|
captures[m.Name] = rest
|
|
li = len(lineWords)
|
|
pi++
|
|
|
|
case MarkerList:
|
|
// Capture a comma-separated list until the next literal
|
|
// pattern token. Paren-balanced so nested `(`/`[`/`{`
|
|
// don't let an inner `)` terminate the capture. Commas
|
|
// at the top level are preserved verbatim in the
|
|
// captured string so the `<z>` substitution in the
|
|
// result template reproduces the argument list as-is.
|
|
var parts []string
|
|
depth := 0
|
|
delim := ""
|
|
if pi+1 < len(patternWords) {
|
|
delim = patternWords[pi+1]
|
|
}
|
|
for li < len(lineWords) {
|
|
w := lineWords[li]
|
|
if depth == 0 && delim != "" && matchWord(w, delim, r.CaseSens) {
|
|
break
|
|
}
|
|
switch w {
|
|
case "(", "[", "{":
|
|
depth++
|
|
case ")", "]", "}":
|
|
if depth > 0 {
|
|
depth--
|
|
}
|
|
}
|
|
parts = append(parts, w)
|
|
li++
|
|
}
|
|
captures[m.Name] = strings.Join(parts, " ")
|
|
pi++
|
|
|
|
case MarkerWordList:
|
|
// Match one of listed words
|
|
matched := false
|
|
for _, allowed := range m.ListValues {
|
|
if r.CaseSens {
|
|
if lineWords[li] == allowed {
|
|
matched = true
|
|
break
|
|
}
|
|
} else if strings.EqualFold(lineWords[li], allowed) {
|
|
matched = true
|
|
break
|
|
}
|
|
}
|
|
if !matched {
|
|
return nil
|
|
}
|
|
captures[m.Name] = lineWords[li]
|
|
li++
|
|
pi++
|
|
|
|
default:
|
|
// Regular or restricted: capture one token or expression
|
|
captured := captureExpression(lineWords, &li, patternWords, pi+1, r.CaseSens)
|
|
captures[m.Name] = captured
|
|
pi++
|
|
}
|
|
} else if pw == "[" {
|
|
// Optional, possibly-repeating sub-pattern. Try matching the
|
|
// bracketed body repeatedly against the remaining line; each
|
|
// successful iteration appends its marker captures under the
|
|
// same name with a \x01 separator. Used by Harbour forms
|
|
// like `DEFAULT <v1> TO <x1> [, <vn> TO <xn> ]` where the
|
|
// trailing bracket repeats for each additional pair.
|
|
depth := 1
|
|
bodyStart := pi + 1
|
|
bodyEnd := bodyStart
|
|
for bodyEnd < len(patternWords) && depth > 0 {
|
|
if patternWords[bodyEnd] == "[" {
|
|
depth++
|
|
} else if patternWords[bodyEnd] == "]" {
|
|
depth--
|
|
if depth == 0 {
|
|
break
|
|
}
|
|
}
|
|
bodyEnd++
|
|
}
|
|
body := patternWords[bodyStart:bodyEnd]
|
|
// Outer-pattern tail (everything after the matching `]`) is
|
|
// needed so a regular marker at the end of `body` knows where
|
|
// to stop capturing. Without this, `[TO <v>] [FOR <for>]`
|
|
// against `TO n FOR age >= 30` would let `<v>` swallow the
|
|
// rest of the line because `body` itself has no literal that
|
|
// follows the marker.
|
|
outerTail := patternWords[bodyEnd+1:]
|
|
for li < len(lineWords) {
|
|
snapshotLi := li
|
|
iterCaps, newLi, ok := matchSegment(body, lineWords, li, r.CaseSens, outerTail)
|
|
if !ok {
|
|
li = snapshotLi
|
|
break
|
|
}
|
|
// No-progress matches can happen when the body is just
|
|
// a list/regular marker that immediately hits a stop
|
|
// boundary on this iteration — its captured value is
|
|
// empty. Don't merge those into captures, otherwise an
|
|
// earlier successful iteration's value gets contaminated
|
|
// with the `\x01`-separator form and the result-template
|
|
// substitution skips it as multi-capture garbage.
|
|
if newLi == snapshotLi {
|
|
break
|
|
}
|
|
for k, v := range iterCaps {
|
|
if prev, hit := captures[k]; hit && prev != "" {
|
|
captures[k] = prev + "\x01" + v
|
|
} else {
|
|
captures[k] = v
|
|
}
|
|
}
|
|
li = newLi
|
|
}
|
|
pi = bodyEnd + 1 // past ]
|
|
} else if pw == "]" {
|
|
pi++
|
|
} else {
|
|
// Literal keyword — must match
|
|
if !matchWord(lineWords[li], pw, r.CaseSens) {
|
|
return nil
|
|
}
|
|
li++
|
|
pi++
|
|
}
|
|
}
|
|
|
|
// Walk any tail of the pattern that wasn't matched against the
|
|
// line. We accept it only if everything that remains is *optional*
|
|
// — i.e. a `[...]` block (which by definition can be absent) or
|
|
// markers/literals that are nested inside one. A bare `<a>` or a
|
|
// literal token outside of brackets is required, so encountering
|
|
// one means the pattern isn't satisfied: bare `CLOSE` must not
|
|
// match rule `CLOSE <a>`.
|
|
depth := 0
|
|
for pi < len(patternWords) {
|
|
pw := patternWords[pi]
|
|
switch {
|
|
case pw == "[":
|
|
depth++
|
|
case pw == "]":
|
|
if depth > 0 {
|
|
depth--
|
|
}
|
|
default:
|
|
if depth == 0 {
|
|
return nil
|
|
}
|
|
}
|
|
pi++
|
|
}
|
|
|
|
// For #command with no markers and no optional clauses:
|
|
// all line tokens must be consumed for a match
|
|
if r.IsCommand && li < len(lineWords) && len(r.Markers) == 0 &&
|
|
!strings.Contains(r.Pattern, "[") {
|
|
return nil
|
|
}
|
|
|
|
return captures
|
|
}
|
|
|
|
// matchSegment tries to match a bracketed sub-pattern against a slice
|
|
// of the line tokens starting at startLi. Returns per-iteration
|
|
// captures and the new line position on success. The segment cannot
|
|
// contain nested `[...]` — callers of the optional-repeat logic
|
|
// flatten one level at a time.
|
|
//
|
|
// A "mini-matcher" that mirrors the main loop for MarkerRegular,
|
|
// MarkerRestricted, and MarkerList plus literal keywords. MarkerWild
|
|
// inside `[...]` is rare and still defers to the main matcher.
|
|
func matchSegment(segment, lineWords []string, startLi int, caseSens bool, outerTail []string) (map[string]string, int, bool) {
|
|
caps := make(map[string]string)
|
|
li := startLi
|
|
|
|
// When the segment starts with a literal (e.g. `,` in
|
|
// `[, <vn> TO <xn>]`), treat that literal as the natural boundary
|
|
// between iterations. Used as the delimiter for a trailing marker
|
|
// that would otherwise gobble the rest of the line.
|
|
repeatBoundary := ""
|
|
if len(segment) > 0 && !strings.HasPrefix(segment[0], "<") &&
|
|
segment[0] != "[" && segment[0] != "]" {
|
|
repeatBoundary = segment[0]
|
|
}
|
|
|
|
for pi := 0; pi < len(segment); pi++ {
|
|
pw := segment[pi]
|
|
// Nested optional clause: find the matching `]`, run the
|
|
// repeat-loop on the inner body until no progress. Mirrors
|
|
// the main matchPattern's `[` branch. Doesn't require any
|
|
// remaining input — an absent optional just doesn't iterate.
|
|
if pw == "[" {
|
|
depth := 1
|
|
bodyStart := pi + 1
|
|
bodyEnd := bodyStart
|
|
for bodyEnd < len(segment) && depth > 0 {
|
|
if segment[bodyEnd] == "[" {
|
|
depth++
|
|
} else if segment[bodyEnd] == "]" {
|
|
depth--
|
|
if depth == 0 {
|
|
break
|
|
}
|
|
}
|
|
bodyEnd++
|
|
}
|
|
innerBody := segment[bodyStart:bodyEnd]
|
|
innerOuterTail := segment[bodyEnd+1:]
|
|
for li < len(lineWords) {
|
|
snapshotLi := li
|
|
iterCaps, newLi, ok := matchSegment(innerBody, lineWords, li, caseSens, innerOuterTail)
|
|
if !ok {
|
|
li = snapshotLi
|
|
break
|
|
}
|
|
if newLi == snapshotLi {
|
|
break
|
|
}
|
|
for k, v := range iterCaps {
|
|
if prev, hit := caps[k]; hit && prev != "" {
|
|
caps[k] = prev + "\x01" + v
|
|
} else {
|
|
caps[k] = v
|
|
}
|
|
}
|
|
li = newLi
|
|
}
|
|
pi = bodyEnd
|
|
continue
|
|
}
|
|
if pw == "]" {
|
|
// Stray closer — skip.
|
|
continue
|
|
}
|
|
if li >= len(lineWords) {
|
|
return nil, startLi, false
|
|
}
|
|
if strings.HasPrefix(pw, "<") && strings.HasSuffix(pw, ">") {
|
|
inner := pw[1 : len(pw)-1]
|
|
m := parseOneMarker(inner)
|
|
switch m.Type {
|
|
case MarkerWordList:
|
|
// Match one of the listed words. If the current line
|
|
// token isn't in the allowed set, the segment fails to
|
|
// match — same behavior as the top-level matcher.
|
|
w := lineWords[li]
|
|
matched := false
|
|
for _, allowed := range m.ListValues {
|
|
if caseSens {
|
|
if w == allowed {
|
|
matched = true
|
|
break
|
|
}
|
|
} else if strings.EqualFold(w, allowed) {
|
|
matched = true
|
|
break
|
|
}
|
|
}
|
|
if !matched {
|
|
return nil, startLi, false
|
|
}
|
|
caps[m.Name] = w
|
|
li++
|
|
continue
|
|
case MarkerList:
|
|
// Capture comma-separated tokens until we hit the
|
|
// segment's next literal, an outer literal, or one of
|
|
// the limited values of a following MarkerWordList
|
|
// (e.g. `<off:OFF>` — OFF is the only token that can
|
|
// match it, so the list before it must stop at OFF).
|
|
// Paren-balanced so `f(a,b)` inside the list doesn't
|
|
// terminate prematurely. Mirrors the main matchPattern's
|
|
// MarkerList branch.
|
|
stop := map[string]struct{}{}
|
|
addStopFrom(stop, segment[pi+1:])
|
|
addStopFrom(stop, outerTail)
|
|
var parts []string
|
|
depth := 0
|
|
for li < len(lineWords) {
|
|
w := lineWords[li]
|
|
if depth == 0 {
|
|
key := w
|
|
if !caseSens {
|
|
key = strings.ToUpper(w)
|
|
}
|
|
if _, hit := stop[key]; hit {
|
|
break
|
|
}
|
|
}
|
|
switch w {
|
|
case "(", "[", "{":
|
|
depth++
|
|
case ")", "]", "}":
|
|
if depth > 0 {
|
|
depth--
|
|
}
|
|
}
|
|
parts = append(parts, w)
|
|
li++
|
|
}
|
|
caps[m.Name] = strings.Join(parts, " ")
|
|
continue
|
|
case MarkerRegular, MarkerRestricted:
|
|
// fall through to capture-one-expression below
|
|
default:
|
|
return nil, startLi, false
|
|
}
|
|
// Build a pseudo-pattern tail so captureExpression picks
|
|
// the right delimiters. Priority order (each level is
|
|
// merged, then captureExpression stops at *whichever*
|
|
// delimiter shows up first in the input):
|
|
// 1. Next literals inside the same segment.
|
|
// 2. Every literal in the outer-pattern tail — what
|
|
// stops `[TO <(f)>] [FIELDS ...] [FOR ...]` from
|
|
// letting `<(f)>` swallow a trailing FOR/WHILE/...
|
|
// 3. Repeat boundary (the segment's leading literal)
|
|
// — needed for multi-iter `[, <xN>]` so each
|
|
// iteration's `<xN>` stops at the next ',' before
|
|
// the outer-tail's TO/FOR/etc. catches it.
|
|
tail := segment[pi+1:]
|
|
if !hasLiteralAfter(tail) {
|
|
combined := []string{}
|
|
if hasLiteralAfter(outerTail) {
|
|
combined = append(combined, outerTail...)
|
|
}
|
|
if repeatBoundary != "" {
|
|
combined = append(combined, repeatBoundary)
|
|
}
|
|
if len(combined) > 0 {
|
|
tail = combined
|
|
}
|
|
}
|
|
captured := captureExpression(lineWords, &li, tail, 0, caseSens)
|
|
caps[m.Name] = captured
|
|
continue
|
|
}
|
|
if !matchWord(lineWords[li], pw, caseSens) {
|
|
return nil, startLi, false
|
|
}
|
|
li++
|
|
}
|
|
return caps, li, true
|
|
}
|
|
|
|
// addStopFrom merges into `stop` every token that could legally match
|
|
// the next position in `pw`: bare literals AND each value of any
|
|
// MarkerWordList (`<name:A,B,C>`) since those markers can match only
|
|
// their listed words. Used so a preceding list/regular capture knows
|
|
// to stop before any of them. Always uppercased — the caller decides
|
|
// whether to do a case-insensitive lookup.
|
|
func addStopFrom(stop map[string]struct{}, pw []string) {
|
|
for _, w := range pw {
|
|
if w == "" || w == "[" || w == "]" {
|
|
continue
|
|
}
|
|
if strings.HasPrefix(w, "<") && strings.HasSuffix(w, ">") {
|
|
inner := w[1 : len(w)-1]
|
|
if m := parseOneMarker(inner); m.Type == MarkerWordList {
|
|
for _, v := range m.ListValues {
|
|
stop[strings.ToUpper(v)] = struct{}{}
|
|
}
|
|
}
|
|
continue
|
|
}
|
|
stop[strings.ToUpper(w)] = struct{}{}
|
|
}
|
|
}
|
|
|
|
// firstLiteral scans pw and returns the first token that is neither a
// bracket, an empty string, nor a `<...>` marker. It returns "" when
// there is no such token. Meant to supply matchSegment with a stop
// boundary from the outer pattern when its body ends in a regular
// marker.
func firstLiteral(pw []string) string {
	for _, tok := range pw {
		switch tok {
		case "", "[", "]":
			continue
		}
		isMarker := strings.HasPrefix(tok, "<") && strings.HasSuffix(tok, ">")
		if !isMarker {
			return tok
		}
	}
	return ""
}
|
|
|
|
// hasLiteralAfter reports whether segment holds at least one literal
// keyword token, meaning a token that is not empty, not a bracket and
// not a `<...>` marker. Callers use it to decide whether a marker's
// capture already has a real delimiter or needs a synthetic one.
func hasLiteralAfter(segment []string) bool {
	for _, tok := range segment {
		switch tok {
		case "", "[", "]":
			continue
		}
		if !(strings.HasPrefix(tok, "<") && strings.HasSuffix(tok, ">")) {
			return true
		}
	}
	return false
}
|
|
|
|
// quoteListElements smart-stringifies a list-style capture: split val
|
|
// on top-level commas (paren / bracket / brace balanced) and emit each
|
|
// element quoted. Already-quoted elements are kept as-is so a literal
|
|
// like `"a", "b"` round-trips intact. Used by `<(name)>` substitution
|
|
// when `name` came from a `<name,...>` marker — Harbour's std.ch idiom
|
|
// for `{ <(fields)> }` to expand to `{ "a", "b", "c" }`.
|
|
func quoteListElements(val string) string {
|
|
parts := splitTopLevelCommas(val)
|
|
if len(parts) == 0 {
|
|
return ""
|
|
}
|
|
out := make([]string, 0, len(parts))
|
|
for _, p := range parts {
|
|
t := strings.TrimSpace(p)
|
|
if t == "" {
|
|
continue
|
|
}
|
|
// Already a string literal — keep verbatim.
|
|
if n := len(t); n >= 2 &&
|
|
((t[0] == '"' && t[n-1] == '"') ||
|
|
(t[0] == '\'' && t[n-1] == '\'') ||
|
|
(t[0] == '[' && t[n-1] == ']')) {
|
|
out = append(out, t)
|
|
continue
|
|
}
|
|
out = append(out, ppQuote(t))
|
|
}
|
|
return strings.Join(out, ", ")
|
|
}
|
|
|
|
// splitTopLevelCommas cuts s at every comma that sits outside (), [],
// {} and outside "..." / '...' strings, so captured PRG expressions are
// not torn apart. The pieces are returned unmodified (no trimming), and
// there is always at least one piece, even for an empty input.
func splitTopLevelCommas(s string) []string {
	var (
		pieces  []string
		nesting int  // current bracket depth
		from    int  // start of the piece being scanned
		quote   byte // active string delimiter, 0 when outside a string
	)
	for i := 0; i < len(s); i++ {
		ch := s[i]
		if quote != 0 {
			// Inside a string: only the matching delimiter matters.
			if ch == quote {
				quote = 0
			}
			continue
		}
		switch {
		case ch == '"' || ch == '\'':
			quote = ch
		case ch == '(' || ch == '[' || ch == '{':
			nesting++
		case ch == ')' || ch == ']' || ch == '}':
			if nesting > 0 {
				nesting--
			}
		case ch == ',' && nesting == 0:
			pieces = append(pieces, s[from:i])
			from = i + 1
		}
	}
	return append(pieces, s[from:])
}
|
|
|
|
// ppQuote turns a captured value into a PRG string literal, choosing a
// delimiter that does not clash with characters inside the value.
// Harbour's #<name> stringify works on the raw argument text and has to
// yield a legal PRG string, so a capture of `"world"` cannot simply
// become `""world""`. Delimiters are tried in Harbour's order of
// preference: double quotes, then single quotes, then square brackets.
func ppQuote(val string) string {
	switch {
	case strings.IndexByte(val, '"') < 0:
		return `"` + val + `"`
	case strings.IndexByte(val, '\'') < 0:
		return "'" + val + "'"
	case !strings.ContainsAny(val, "[]"):
		return "[" + val + "]"
	}
	// Every delimiter is taken: fall back to double quotes and drop the
	// embedded double quotes. Pathological input only; Harbour itself
	// refuses to handle this cleanly.
	return `"` + strings.ReplaceAll(val, `"`, "") + `"`
}
|
|
|
|
// applyResult substitutes captured values into the result template.
|
|
// Order matters — the compound forms (`#<z>`, `<(z)>`, `<.z.>`, `<"z">`)
|
|
// all contain the bare `<z>` token, so the bare substitution has to run
|
|
// LAST. Previously `<z>` was replaced first and left a stray `#` / `(` /
|
|
// `.` / `"` behind, producing bogus lines like `? #hello` that the
|
|
// lexer then choked on with ILLEGAL token errors.
|
|
func (r *Rule) applyResult(captures map[string]string) string {
|
|
result := r.ResultTmpl
|
|
|
|
// Expand optional-repeat `[ ... ]` segments in the template. If any
|
|
// marker inside a bracketed section was multi-captured during the
|
|
// pattern match (values joined with \x01), emit the body once per
|
|
// iteration with per-iter values. If no markers inside are multi-
|
|
// captured, the bracket body is included once with whatever single
|
|
// captures apply (the required-or-absent case).
|
|
result = expandOptionalRepeat(result, captures)
|
|
|
|
// Marker-name → list flag, so the smart-stringify branch below can
|
|
// emit per-element quoting (`{ "a", "b" }`) for list captures
|
|
// instead of treating the comma-joined string as one literal.
|
|
isList := make(map[string]bool, len(r.Markers))
|
|
for _, m := range r.Markers {
|
|
if m.Type == MarkerList {
|
|
isList[m.Name] = true
|
|
}
|
|
}
|
|
|
|
for name, val := range captures {
|
|
// Multi-capture markers are consumed by expandOptionalRepeat;
|
|
// the bare substitution for the joined form would produce
|
|
// garbage (values separated by \x01). Skip them here and let
|
|
// any remaining bare `<name>` fall through to the cleanup.
|
|
if strings.ContainsRune(val, '\x01') {
|
|
continue
|
|
}
|
|
quoted := ppQuote(val)
|
|
// #<name> — dumb stringify (always quote).
|
|
result = strings.ReplaceAll(result, "#<"+name+">", quoted)
|
|
// <"name"> — explicit stringify.
|
|
result = strings.ReplaceAll(result, `<"`+name+`">`, quoted)
|
|
// <(name)> — smart stringify: already a string literal → keep;
|
|
// list capture → quote each comma-separated element; otherwise
|
|
// quote whole. `val` comes straight from the capture, so trim
|
|
// and check for surrounding quotes.
|
|
trim := strings.TrimSpace(val)
|
|
smart := quoted
|
|
if n := len(trim); n >= 2 &&
|
|
((trim[0] == '"' && trim[n-1] == '"') ||
|
|
(trim[0] == '\'' && trim[n-1] == '\'') ||
|
|
(trim[0] == '[' && trim[n-1] == ']')) {
|
|
smart = trim
|
|
} else if isList[name] {
|
|
smart = quoteListElements(val)
|
|
}
|
|
result = strings.ReplaceAll(result, "<("+name+")>", smart)
|
|
// <.name.> — logify (empty → .F., else .T.)
|
|
if val != "" {
|
|
result = strings.ReplaceAll(result, "<."+name+".>", ".T.")
|
|
} else {
|
|
result = strings.ReplaceAll(result, "<."+name+".>", ".F.")
|
|
}
|
|
// <{name}> — blockify: wrap captured expression in {|| ... }.
|
|
// For list-typed markers (`<name,...>`) wrap *each* element so
|
|
// `{ <{v}> }` against `LIST id, name` expands to
|
|
// `{ {|| id }, {|| name } }`, matching Harbour's std.ch
|
|
// idiom for column blocks. Empty capture → NIL so the call
|
|
// site sees a nil block (missing FOR/WHILE clause).
|
|
if val == "" {
|
|
result = strings.ReplaceAll(result, "<{"+name+"}>", "NIL")
|
|
} else if isList[name] {
|
|
parts := splitTopLevelCommas(val)
|
|
out := make([]string, 0, len(parts))
|
|
for _, p := range parts {
|
|
t := strings.TrimSpace(p)
|
|
if t == "" {
|
|
continue
|
|
}
|
|
out = append(out, "{|| "+t+" }")
|
|
}
|
|
result = strings.ReplaceAll(result, "<{"+name+"}>", strings.Join(out, ", "))
|
|
} else {
|
|
result = strings.ReplaceAll(result, "<{"+name+"}>", "{|| "+val+" }")
|
|
}
|
|
// <name> — bare substitution (must be LAST, after all wrappers).
|
|
result = strings.ReplaceAll(result, "<"+name+">", val)
|
|
}
|
|
|
|
// Any `<{name}>` still in the template means `name` was never
|
|
// captured — emit NIL so call sites see a missing block argument
|
|
// (matches Harbour: empty FOR/WHILE → NIL → bypass the condition).
|
|
result = replaceUnreferencedBlockify(result)
|
|
|
|
// Same idea for `<.name.>`: a missing marker logifies to .F.,
|
|
// matching Harbour's behavior of "absent optional clause => .F."
|
|
// for OFF / ALL / REST / etc.
|
|
result = replaceUnreferencedLogify(result)
|
|
|
|
// Clean up unreferenced markers: <name>, <(name)>, <.name.>, #<name>, <"name">
|
|
result = cleanUnreferencedMarkers(result)
|
|
|
|
return result
|
|
}
|
|
|
|
// replaceUnreferencedLogify replaces each `<.ident.>` still present in
// s with `.F.`, the sentinel for an absent optional clause in Harbour's
// std.ch convention. ident must start with an ASCII letter or
// underscore and continue with ASCII letters, digits or underscores;
// anything else is copied through unchanged.
func replaceUnreferencedLogify(s string) string {
	isStart := func(c byte) bool {
		return c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
	}
	isPart := func(c byte) bool {
		return isStart(c) || (c >= '0' && c <= '9')
	}

	var b strings.Builder
	for pos := 0; pos < len(s); {
		if pos+2 < len(s) && s[pos] == '<' && s[pos+1] == '.' && isStart(s[pos+2]) {
			end := pos + 3
			for end < len(s) && isPart(s[end]) {
				end++
			}
			if end+1 < len(s) && s[end] == '.' && s[end+1] == '>' {
				b.WriteString(".F.")
				pos = end + 2
				continue
			}
		}
		b.WriteByte(s[pos])
		pos++
	}
	return b.String()
}
|
|
|
|
// replaceUnreferencedBlockify substitutes `NIL` for every blockify
// marker of the exact shape `<{ident}>` still present in s, where
// ident is a letter or underscore followed by letters, digits or
// underscores. Everything else is copied through unchanged.
func replaceUnreferencedBlockify(s string) string {
	identStart := func(c byte) bool {
		return c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
	}
	identPart := func(c byte) bool {
		return identStart(c) || (c >= '0' && c <= '9')
	}

	var out strings.Builder
	rest := s
	for len(rest) > 0 {
		// Jump straight to the next candidate opener; the text before
		// it cannot start a marker and is emitted verbatim.
		at := strings.Index(rest, "<{")
		if at < 0 {
			break
		}
		out.WriteString(rest[:at])
		rest = rest[at:]

		k := 2
		if k < len(rest) && identStart(rest[k]) {
			k++
			for k < len(rest) && identPart(rest[k]) {
				k++
			}
			if strings.HasPrefix(rest[k:], "}>") {
				out.WriteString("NIL")
				rest = rest[k+2:]
				continue
			}
		}
		// Not a well-formed marker: keep the `<` and rescan from the
		// following byte.
		out.WriteByte('<')
		rest = rest[1:]
	}
	out.WriteString(rest)
	return out.String()
}
|
|
|
|
// expandOptionalRepeat walks a result template and rewrites each top-
|
|
// level `[ ... ]` block by examining the captures referenced inside:
|
|
//
|
|
// - If any referenced marker has multiple captured iterations
|
|
// (values joined with \x01), emit the body N times, substituting
|
|
// the i-th iteration's value for each such marker and dropping
|
|
// single-valued markers into each iteration unchanged.
|
|
// - If no referenced marker is multi-captured BUT the single
|
|
// captures include non-empty values, emit the body once.
|
|
// - Otherwise drop the block.
|
|
//
|
|
// Nested brackets are not supported — Harbour uses a single level of
|
|
// `[...]` for the common repeat form. Callers that need deeper nesting
|
|
// can fall back to writing out separate #xcommand rules.
|
|
func expandOptionalRepeat(template string, captures map[string]string) string {
|
|
var out strings.Builder
|
|
i := 0
|
|
for i < len(template) {
|
|
if template[i] == '[' {
|
|
// Find matching top-level ']'. Skip over quoted strings
|
|
// and nested brackets inside PP markers like `<.x.>`.
|
|
depth := 1
|
|
j := i + 1
|
|
for j < len(template) && depth > 0 {
|
|
switch template[j] {
|
|
case '[':
|
|
// Inside a marker `<...>` the `[` is just text;
|
|
// only count top-level brackets.
|
|
if inMarker(template, j) {
|
|
j++
|
|
continue
|
|
}
|
|
depth++
|
|
case ']':
|
|
if inMarker(template, j) {
|
|
j++
|
|
continue
|
|
}
|
|
depth--
|
|
if depth == 0 {
|
|
body := template[i+1 : j]
|
|
out.WriteString(expandBracketBody(body, captures))
|
|
i = j + 1
|
|
goto next
|
|
}
|
|
}
|
|
j++
|
|
}
|
|
// Unmatched [ — copy literally.
|
|
out.WriteByte(template[i])
|
|
i++
|
|
next:
|
|
continue
|
|
}
|
|
out.WriteByte(template[i])
|
|
i++
|
|
}
|
|
return out.String()
|
|
}
|
|
|
|
// inMarker reports whether index p of s appears to sit inside a PP
// marker such as `<.x.>`, `<"x">` or `<(x)>`, where `[` and `]` are
// plain text rather than template delimiters.
//
// The test is purely textual: the nearest angle bracket before p must
// be an opening `<`, and s[p] itself must not be `>`. No attempt is
// made to tell a marker opener from a less-than operator.
func inMarker(s string, p int) bool {
	if p <= 0 {
		return false
	}
	nearest := strings.LastIndexAny(s[:p], "<>")
	if nearest < 0 || s[nearest] == '>' {
		// Either no angle bracket precedes p, or the last one closed
		// a marker.
		return false
	}
	return p >= len(s) || s[p] != '>'
}
|
|
|
|
// expandBracketBody returns the optional-repeat body expanded once per
|
|
// iteration of its multi-captured markers. See expandOptionalRepeat.
|
|
func expandBracketBody(body string, captures map[string]string) string {
|
|
// Find marker names referenced inside the body.
|
|
refs := referencedMarkers(body)
|
|
iters := 1
|
|
hasMulti := false
|
|
for _, name := range refs {
|
|
if val, ok := captures[name]; ok && strings.ContainsRune(val, '\x01') {
|
|
n := strings.Count(val, "\x01") + 1
|
|
if n > iters {
|
|
iters = n
|
|
}
|
|
hasMulti = true
|
|
}
|
|
}
|
|
if !hasMulti {
|
|
// No multi-capture — include body once if any referenced marker
|
|
// has a (single) capture; otherwise drop.
|
|
anyPresent := false
|
|
for _, name := range refs {
|
|
if _, ok := captures[name]; ok {
|
|
anyPresent = true
|
|
break
|
|
}
|
|
}
|
|
if !anyPresent {
|
|
return ""
|
|
}
|
|
return body
|
|
}
|
|
|
|
// Pre-split each multi-captured referent into a per-iteration list.
|
|
parts := make(map[string][]string, len(refs))
|
|
for _, name := range refs {
|
|
if val, ok := captures[name]; ok {
|
|
parts[name] = strings.Split(val, "\x01")
|
|
}
|
|
}
|
|
|
|
var out strings.Builder
|
|
for iter := 0; iter < iters; iter++ {
|
|
piece := body
|
|
for name, vals := range parts {
|
|
var v string
|
|
if iter < len(vals) {
|
|
v = vals[iter]
|
|
}
|
|
quoted := ppQuote(v)
|
|
piece = strings.ReplaceAll(piece, "#<"+name+">", quoted)
|
|
piece = strings.ReplaceAll(piece, `<"`+name+`">`, quoted)
|
|
piece = strings.ReplaceAll(piece, "<("+name+")>", quoted)
|
|
if v != "" {
|
|
piece = strings.ReplaceAll(piece, "<."+name+".>", ".T.")
|
|
} else {
|
|
piece = strings.ReplaceAll(piece, "<."+name+".>", ".F.")
|
|
}
|
|
piece = strings.ReplaceAll(piece, "<"+name+">", v)
|
|
}
|
|
out.WriteString(piece)
|
|
}
|
|
return out.String()
|
|
}
|
|
|
|
// referencedMarkers returns the distinct marker names referenced in a
// template fragment, in order of first appearance.
//
// A reference is any `<` followed by an optional run of wrapper
// punctuation and then an identifier, which covers `<name>`,
// `<(name)>`, `<.name.>`, `<"name">`, `<{name}>` and `#<name>`. The
// blockify form `<{name}>` is included so that an optional block whose
// only reference is a blockified marker is still recognised as
// referencing it.
//
// The scan is lenient: it does not require the closing `>` and it
// accepts names made of letters, digits and underscores in any order.
func referencedMarkers(s string) []string {
	seen := map[string]bool{}
	var out []string
	i := 0
	for i < len(s) {
		if s[i] != '<' {
			i++
			continue
		}
		j := i + 1
		// Skip the leading wrapper punctuation of (name), .name.,
		// "name" and {name}.
		for j < len(s) && (s[j] == '(' || s[j] == '.' || s[j] == '"' || s[j] == '{') {
			j++
		}
		start := j
		for j < len(s) && (s[j] == '_' || (s[j] >= 'a' && s[j] <= 'z') ||
			(s[j] >= 'A' && s[j] <= 'Z') || (s[j] >= '0' && s[j] <= '9')) {
			j++
		}
		if j > start {
			name := s[start:j]
			if !seen[name] {
				seen[name] = true
				out = append(out, name)
			}
		}
		// j is at least i+1, so the scan always advances.
		i = j
	}
	return out
}
|
|
|
|
// cleanUnreferencedMarkers removes any remaining <name>, <(name)>,
|
|
// <.name.>, #<name> references. Only removes well-formed PP marker
|
|
// references, not comparison operators. Skips over PRG string
|
|
// literals ("...", '...', [...]) so a captured value containing
|
|
// `<a>` text (e.g. "<a>http://x</a>" inside a regex/string) isn't
|
|
// gutted — that pass used to corrupt arbitrary string content.
|
|
func cleanUnreferencedMarkers(s string) string {
|
|
var out strings.Builder
|
|
i := 0
|
|
inStr := byte(0)
|
|
for i < len(s) {
|
|
c := s[i]
|
|
// Inside a string literal: copy until the matching closer.
|
|
// Bracket-strings `[...]` are PRG-specific but are also used
|
|
// as the result template's optional-repeat brackets, so we
|
|
// leave them out of this pass — only `'…'` and `"…"` are
|
|
// unambiguous strings here.
|
|
if inStr != 0 {
|
|
out.WriteByte(c)
|
|
if c == inStr {
|
|
inStr = 0
|
|
}
|
|
i++
|
|
continue
|
|
}
|
|
if c == '"' || c == '\'' {
|
|
inStr = c
|
|
out.WriteByte(c)
|
|
i++
|
|
continue
|
|
}
|
|
removed := false
|
|
// #<name>
|
|
if c == '#' && i+1 < len(s) && s[i+1] == '<' {
|
|
if end := findMarkerEnd(s, i+1); end > 0 {
|
|
i = end
|
|
removed = true
|
|
}
|
|
}
|
|
// <name>, <(name)>, <.name.>, <"name">
|
|
if !removed && c == '<' {
|
|
if end := findMarkerEnd(s, i); end > 0 {
|
|
i = end
|
|
removed = true
|
|
}
|
|
}
|
|
if !removed {
|
|
out.WriteByte(c)
|
|
i++
|
|
}
|
|
}
|
|
return out.String()
|
|
}
|
|
|
|
// findMarkerEnd tests whether a PP marker begins at s[start] and, if
// so, returns the index just past its closing `>`. It returns 0 when
// no marker starts there.
//
// Accepted shape: `<`, at most one wrapper opener out of `(`, `.`, `"`
// and `{`, an identifier (letter or underscore, then letters, digits
// or underscores), any run of the characters `)`, `.`, `"`, `}`, `,`
// and space, and finally `>`. The trailing run is not checked against
// the opener.
func findMarkerEnd(s string, start int) int {
	if start >= len(s) || s[start] != '<' {
		return 0
	}
	isLetter := func(c byte) bool {
		return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'
	}

	pos := start + 1
	if pos < len(s) {
		switch s[pos] {
		case '(', '.', '"', '{':
			pos++
		}
	}

	if pos >= len(s) || !isLetter(s[pos]) {
		return 0
	}
	for pos < len(s) && (isLetter(s[pos]) || (s[pos] >= '0' && s[pos] <= '9')) {
		pos++
	}

trailer:
	for pos < len(s) {
		switch s[pos] {
		case ')', '.', '"', '}', ',', ' ':
			pos++
		default:
			break trailer
		}
	}

	if pos < len(s) && s[pos] == '>' {
		return pos + 1
	}
	return 0
}
|
|
|
|
// --- Helpers ---
|
|
|
|
// firstToken returns the prefix of s that precedes the first space,
// tab or `(`. When s contains none of these, all of s is returned.
func firstToken(s string) string {
	if cut := strings.IndexAny(s, " \t("); cut >= 0 {
		return s[:cut]
	}
	return s
}
|
|
|
|
// matchWord reports whether lineWord equals patternWord. The
// comparison is exact when caseSens is true and uses Unicode case
// folding (strings.EqualFold) otherwise.
func matchWord(lineWord, patternWord string, caseSens bool) bool {
	if !caseSens {
		return strings.EqualFold(lineWord, patternWord)
	}
	return lineWord == patternWord
}
|
|
|
|
// tokenizePattern splits a pattern into tokens.
//
//   - A marker, meaning `<` up to the first following `>`, is kept as
//     a single token, whatever it contains.
//   - Brackets, parens, braces and commas are emitted as one-character
//     tokens, so `DUMB(<z>)` and `DUMB( <z> )` tokenise identically.
//     This mirrors tokenizeLine, which splits the same characters on
//     the call-site side; a literal `{` fused into a pattern word could
//     otherwise never line up with the line's standalone `{` token.
//   - Anything else is a word, ending at whitespace, a separator or the
//     start of a marker.
//
// A `<` with no closing `>` anywhere after it cannot be a marker. It is
// taken as the first byte of an ordinary word (so `A < B` yields `A`,
// `<`, `B`), which guarantees the scan always advances.
func tokenizePattern(pattern string) []string {
	isSep := func(c byte) bool {
		switch c {
		case '[', ']', '(', ')', ',', '{', '}':
			return true
		}
		return false
	}

	var tokens []string
	i := 0
	for i < len(pattern) {
		for i < len(pattern) && (pattern[i] == ' ' || pattern[i] == '\t') {
			i++
		}
		if i >= len(pattern) {
			break
		}

		c := pattern[i]
		if c == '<' {
			if end := strings.IndexByte(pattern[i:], '>'); end >= 0 {
				tokens = append(tokens, pattern[i:i+end+1])
				i += end + 1
				continue
			}
			// Unterminated `<`: fall through and treat it as a word.
		}

		if isSep(c) {
			tokens = append(tokens, pattern[i:i+1])
			i++
			continue
		}

		// Regular word. The first byte is consumed unconditionally: it
		// is either an ordinary character or an unterminated `<`, and
		// skipping the stop test for it is what prevents an endless
		// loop on the latter.
		start := i
		i++
		for i < len(pattern) {
			c := pattern[i]
			if c == ' ' || c == '\t' || c == '<' || isSep(c) {
				break
			}
			i++
		}
		tokens = append(tokens, pattern[start:i])
	}
	return tokens
}
|
|
|
|
// tokenizeLine splits a source line into tokens for pattern matching.
//
//   - Spaces and tabs separate tokens and are discarded.
//   - A `"` or `'` starts a string literal that runs to the next
//     identical quote (or to end of line when unterminated); the
//     literal, quotes included, is one token.
//   - Each of `,` `(` `)` `[` `]` `{` `}` is a token by itself, so
//     `DUMB(hello)` yields `DUMB`, `(`, `hello`, `)` and a codeblock
//     `{|| x }` yields `{`, `||`, `x`, `}`. Keeping braces separate is
//     what lets a capture depth counter that compares whole tokens see
//     every opener and closer.
//   - Any other run of characters is a word, ending at whitespace, one
//     of the separators above, or a quote.
func tokenizeLine(line string) []string {
	isBlank := func(c byte) bool { return c == ' ' || c == '\t' }
	isQuote := func(c byte) bool { return c == '"' || c == '\'' }
	isPunct := func(c byte) bool {
		switch c {
		case ',', '(', ')', '[', ']', '{', '}':
			return true
		}
		return false
	}

	var words []string
	pos, n := 0, len(line)
	for {
		for pos < n && isBlank(line[pos]) {
			pos++
		}
		if pos == n {
			return words
		}

		begin := pos
		first := line[pos]
		switch {
		case isQuote(first):
			pos++
			for pos < n && line[pos] != first {
				pos++
			}
			if pos < n {
				pos++ // include the closing quote
			}
		case isPunct(first):
			pos++
		default:
			for pos < n && !isBlank(line[pos]) && !isPunct(line[pos]) && !isQuote(line[pos]) {
				pos++
			}
		}
		words = append(words, line[begin:pos])
	}
}
|
|
|
|
// captureExpression captures an expression from line tokens.
|
|
// If this is the last marker in the pattern, captures all remaining tokens.
|
|
// Otherwise, captures until the next keyword in the pattern.
|
|
func captureExpression(lineWords []string, li *int, patternWords []string, nextPi int, caseSens bool) string {
|
|
if *li >= len(lineWords) {
|
|
return ""
|
|
}
|
|
|
|
// Collect every literal-keyword delimiter that follows in the
|
|
// pattern, not just the first. Optional clauses in std.ch sit
|
|
// next to one another (`[TO <(f)>] [FIELDS <fields,...>]
|
|
// [FOR <for>] [WHILE <while>] ...`), so the file-name marker
|
|
// must stop at TO's *successor* — but we don't know which
|
|
// successor will actually be present in the input. Stopping on
|
|
// any of them keeps `<(f)>` from swallowing a trailing
|
|
// `FOR x > 5` clause. MarkerWordList values count too — a
|
|
// `<off:OFF>` marker can only match the word OFF, so prior
|
|
// captures must stop at it.
|
|
stopSet := map[string]struct{}{}
|
|
addStopFrom(stopSet, patternWords[nextPi:])
|
|
var delims []string
|
|
for k := range stopSet {
|
|
delims = append(delims, k)
|
|
}
|
|
|
|
if len(delims) > 0 {
|
|
// Capture until any delimiter is hit, paren-balancing so nested
|
|
// parens/brackets/braces inside the expression don't falsely
|
|
// terminate the capture. Harbour's own PP does the same —
|
|
// `_REGULAR_(&(a))` must capture `&(a)` (incl. inner parens)
|
|
// and leave the outer `)` for the pattern's own delimiter.
|
|
var parts []string
|
|
depth := 0
|
|
for *li < len(lineWords) {
|
|
w := lineWords[*li]
|
|
if depth == 0 {
|
|
stop := false
|
|
for _, d := range delims {
|
|
if matchWord(w, d, caseSens) {
|
|
stop = true
|
|
break
|
|
}
|
|
}
|
|
if stop {
|
|
break
|
|
}
|
|
}
|
|
switch w {
|
|
case "(", "[", "{":
|
|
depth++
|
|
case ")", "]", "}":
|
|
if depth > 0 {
|
|
depth--
|
|
}
|
|
}
|
|
parts = append(parts, w)
|
|
*li++
|
|
}
|
|
return strings.Join(parts, " ")
|
|
}
|
|
|
|
// No delimiter: if last marker, capture all remaining tokens
|
|
if nextPi >= len(patternWords) {
|
|
rest := strings.Join(lineWords[*li:], " ")
|
|
*li = len(lineWords)
|
|
return rest
|
|
}
|
|
|
|
// Single token capture (between markers)
|
|
tok := lineWords[*li]
|
|
*li++
|
|
return tok
|
|
}
|