five/compiler/pp/command.go

// Copyright (c) 2026 Charles KWON OhJun (charleskwonohjun@gmail.com)
// All rights reserved.

// #command / #translate implementation for Five preprocessor.
//
// Harbour PP syntax:
//   #command PATTERN => RESULT
//   #translate PATTERN => RESULT
//   #xcommand PATTERN => RESULT   (case-sensitive)
//   #xtranslate PATTERN => RESULT (case-sensitive)
//
// Pattern markers:
//   <x>       — match any expression (regular match)
//   <!x!>     — match single identifier only (restricted match)
//   <x,...>   — match comma-separated list
//   <*x*>     — match rest of line (wild match)
//   <x:a,b,c> — match one of listed words (list match)
//   [...]     — optional clause
//
// Result markers:
//   <x>       — substitute matched text
//   <(x)>     — stringify (wrap in quotes)
//   <{x}>     — blockify (wrap in {|| })
//   #<x>      — dumb stringify
//   <.x.>     — logify (.T. if matched, .F. if not)
//
// Reference: /mnt/d/harbour-core/src/pp/ppcore.c
package pp

import (
	"strings"
)

// Rule represents a single #command or #translate rule.
//
// ParseRule populates every field: Pattern and Result hold the trimmed
// text on either side of `=>`, and ResultTmpl starts out as a copy of
// Result that applyResult substitutes captures into.
type Rule struct {
	Pattern    string   // raw pattern text (left of `=>`, trimmed)
	Result     string   // raw result text (right of `=>`, trimmed)
	IsCommand  bool     // #command vs #translate
	CaseSens   bool     // #xcommand/#xtranslate = case sensitive
	Keyword    string   // first keyword (for fast matching); "" skips the fast check in MatchLine
	Markers    []Marker // parsed pattern markers
	ResultTmpl string   // result template with marker references
}

// Marker represents a pattern marker like <x>, <!x!>, <x,...>, <*x*>.
// Values are built by parseOneMarker from the text between `<` and `>`.
type Marker struct {
	Name       string // marker name
	Type       MarkerType
	ListValues []string // for <x:a,b,c> — allowed values
}

// MarkerType identifies the kind of a pattern marker, which in turn
// selects how the matcher captures source text for it.
type MarkerType int

// Pattern marker kinds. MarkerRegular is the zero value, so a Marker
// built without an explicit Type is treated as a regular marker.
const (
	MarkerRegular    MarkerType = iota // <x> — any expression
	MarkerRestricted                    // <!x!> — identifier only
	MarkerList                          // <x,...> — comma-separated list
	MarkerWild                          // <*x*> — rest of line
	MarkerWordList                      // <x:a,b,c> — one of listed words
)

// ParseRule turns the body of a #command/#translate directive
// (`PATTERN => RESULT`) into a Rule. It returns nil when the directive
// contains no `=>` separator.
//
// isCommand distinguishes #command from #translate; caseSens is set for
// the #x… variants. Semicolons in the result are deliberately left
// alone: they may be in-line PRG statement separators, and joining
// continuation lines is the job of the preprocessor's line handling,
// not of this parser.
func ParseRule(directive string, isCommand, caseSens bool) *Rule {
	sep := strings.Index(directive, "=>")
	if sep < 0 {
		return nil
	}
	pattern := strings.TrimSpace(directive[:sep])
	result := strings.TrimSpace(directive[sep+len("=>"):])

	// Dispatch keyword: the first whitespace-delimited token of the
	// pattern with marker/optional wrappers and anything from the first
	// `(` onwards removed, so `MAKE_TEST( <obj>, <v> )` keys on
	// `MAKE_TEST` just like firstToken does for source lines. Tokens
	// that still carry marker punctuation are not usable as a keyword.
	keyword := ""
	if fields := strings.Fields(pattern); len(fields) > 0 {
		head := strings.TrimRight(strings.TrimLeft(fields[0], "<["), ">]")
		if paren := strings.IndexByte(head, '('); paren >= 0 {
			head = head[:paren]
		}
		if !strings.ContainsAny(head, "!*,:") {
			keyword = head
		}
	}

	return &Rule{
		Pattern:    pattern,
		Result:     result,
		IsCommand:  isCommand,
		CaseSens:   caseSens,
		Keyword:    keyword,
		Markers:    parseMarkers(pattern),
		ResultTmpl: result,
	}
}

// parseMarkers collects every `<...>` marker found in pattern, in order
// of appearance. Markers whose parsed name is empty are dropped, and
// scanning stops at the first `<` that has no closing `>`.
func parseMarkers(pattern string) []Marker {
	var found []Marker
	rest := pattern
	for {
		open := strings.IndexByte(rest, '<')
		if open < 0 {
			break
		}
		rest = rest[open:]
		closeAt := strings.IndexByte(rest, '>')
		if closeAt < 0 {
			break
		}
		if m := parseOneMarker(rest[1:closeAt]); m.Name != "" {
			found = append(found, m)
		}
		rest = rest[closeAt+1:]
	}
	return found
}

// parseOneMarker classifies the text found between `<` and `>` of a
// pattern marker and returns the marker with its wrapper characters
// stripped from the name. Forms are tried in this order:
//
//	!name!      restricted (identifier only)
//	*name*      wild (rest of line)
//	name,...    comma-separated list
//	name:a,b,c  one of the listed words (values are space-trimmed)
//	(name)      extended expression, stored as a regular marker
//	name        regular
//
// A wrapper needs at least two characters (opening and closing), so a
// lone `!` or `*` is not treated as an empty wrapped name; it falls
// through and becomes a regular marker named after that character.
func parseOneMarker(inner string) Marker {
	inner = strings.TrimSpace(inner)

	// wrapped reports whether inner is enclosed by the given one-byte
	// delimiters. The length check matters: for a single "!" both
	// HasPrefix and HasSuffix succeed, and slicing inner[1:len-1] would
	// then panic.
	wrapped := func(open, closing string) bool {
		return len(inner) >= 2 &&
			strings.HasPrefix(inner, open) && strings.HasSuffix(inner, closing)
	}

	// <!name!> — restricted
	if wrapped("!", "!") {
		return Marker{Name: inner[1 : len(inner)-1], Type: MarkerRestricted}
	}

	// <*name*> — wild
	if wrapped("*", "*") {
		return Marker{Name: inner[1 : len(inner)-1], Type: MarkerWild}
	}

	// <name,...> — comma list
	if strings.HasSuffix(inner, ",...") {
		return Marker{Name: inner[:len(inner)-4], Type: MarkerList}
	}

	// <name:a,b,c> — word list
	if idx := strings.IndexByte(inner, ':'); idx > 0 {
		name := inner[:idx]
		vals := strings.Split(inner[idx+1:], ",")
		for i := range vals {
			vals[i] = strings.TrimSpace(vals[i])
		}
		return Marker{Name: name, Type: MarkerWordList, ListValues: vals}
	}

	// <(name)> — extended-expression marker. In Harbour PP this captures
	// a file-name-like extended expression and the matching result token
	// `<(name)>` smart-stringifies it (already-quoted → keep, identifier
	// → quote). Strip the parens so captures are stored under the bare
	// name; result substitution then matches both `<(name)>` and `<name>`
	// via the existing path.
	if wrapped("(", ")") {
		return Marker{Name: inner[1 : len(inner)-1], Type: MarkerRegular}
	}

	// <name> — regular
	return Marker{Name: inner, Type: MarkerRegular}
}

// --- Rule matching and application ---

// MatchLine applies the rule to one source line. On a match it returns
// the expanded result text and true; otherwise it returns ("", false).
// Blank lines never match.
func (r *Rule) MatchLine(line string) (string, bool) {
	src := strings.TrimSpace(line)
	if src == "" {
		return "", false
	}

	// Cheap rejection: when the rule has a dispatch keyword, the line's
	// first token must equal it (honouring the rule's case sensitivity)
	// before the full pattern match is attempted.
	if kw := r.Keyword; kw != "" {
		head := firstToken(src)
		same := head == kw
		if !r.CaseSens {
			same = strings.EqualFold(head, kw)
		}
		if !same {
			return "", false
		}
	}

	caps := r.matchPattern(src)
	if caps == nil {
		return "", false
	}
	return r.applyResult(caps), true
}

// matchPattern attempts to match the rule's pattern against a line.
// It returns the captured values keyed by marker name, or nil if the
// line does not match.
//
// Pattern and line are tokenised and walked in lock-step:
//   - a marker token captures according to its type (wild, list, word
//     list, or a single expression for regular/restricted markers);
//   - `[` opens an optional, possibly repeating clause that is matched
//     via matchSegment; repeated captures of the same marker are joined
//     with a \x01 separator;
//   - any other token is a literal that must equal the current line
//     token (case handling per r.CaseSens).
//
// After the walk, any unmatched pattern tail must consist solely of
// optional clauses, and a marker-less, bracket-less #command must have
// consumed the whole line.
func (r *Rule) matchPattern(line string) map[string]string {
	captures := make(map[string]string)

	patternWords := tokenizePattern(r.Pattern)
	lineWords := tokenizeLine(line)

	pi, li := 0, 0
	for pi < len(patternWords) && li < len(lineWords) {
		pw := patternWords[pi]

		// Marker?
		if strings.HasPrefix(pw, "<") && strings.HasSuffix(pw, ">") {
			inner := pw[1 : len(pw)-1]
			m := parseOneMarker(inner)

			switch m.Type {
			case MarkerWild:
				// Capture rest of line
				rest := strings.Join(lineWords[li:], " ")
				captures[m.Name] = rest
				li = len(lineWords)
				pi++

			case MarkerList:
				// Capture a comma-separated list until the next literal
				// pattern token. Paren-balanced so nested `(`/`[`/`{`
				// don't let an inner `)` terminate the capture. Commas
				// at the top level are preserved verbatim in the
				// captured string so the `<z>` substitution in the
				// result template reproduces the argument list as-is.
				var parts []string
				depth := 0
				delim := ""
				if pi+1 < len(patternWords) {
					delim = patternWords[pi+1]
				}
				for li < len(lineWords) {
					w := lineWords[li]
					if depth == 0 && delim != "" && matchWord(w, delim, r.CaseSens) {
						break
					}
					switch w {
					case "(", "[", "{":
						depth++
					case ")", "]", "}":
						if depth > 0 {
							depth--
						}
					}
					parts = append(parts, w)
					li++
				}
				captures[m.Name] = strings.Join(parts, " ")
				pi++

			case MarkerWordList:
				// Match one of listed words
				matched := false
				for _, allowed := range m.ListValues {
					if r.CaseSens {
						if lineWords[li] == allowed {
							matched = true
							break
						}
					} else if strings.EqualFold(lineWords[li], allowed) {
						matched = true
						break
					}
				}
				if !matched {
					return nil
				}
				captures[m.Name] = lineWords[li]
				li++
				pi++

			default:
				// Regular or restricted: capture one token or expression
				captured := captureExpression(lineWords, &li, patternWords, pi+1, r.CaseSens)
				captures[m.Name] = captured
				pi++
			}
		} else if pw == "[" {
			// Optional, possibly-repeating sub-pattern. Try matching the
			// bracketed body repeatedly against the remaining line; each
			// successful iteration appends its marker captures under the
			// same name with a \x01 separator. Used by Harbour forms
			// like `DEFAULT <v1> TO <x1> [, <vn> TO <xn> ]` where the
			// trailing bracket repeats for each additional pair.
			depth := 1
			bodyStart := pi + 1
			bodyEnd := bodyStart
			for bodyEnd < len(patternWords) && depth > 0 {
				if patternWords[bodyEnd] == "[" {
					depth++
				} else if patternWords[bodyEnd] == "]" {
					depth--
					if depth == 0 {
						break
					}
				}
				bodyEnd++
			}
			body := patternWords[bodyStart:bodyEnd]
			// Outer-pattern tail (everything after the matching `]`) is
			// needed so a regular marker at the end of `body` knows where
			// to stop capturing. Without this, `[TO <v>] [FOR <for>]`
			// against `TO n FOR age >= 30` would let `<v>` swallow the
			// rest of the line because `body` itself has no literal that
			// follows the marker.
			//
			// When the `[` is never closed, bodyEnd has run to
			// len(patternWords) and bodyEnd+1 is past the end of the
			// slice; there is no tail in that case, so leave it nil
			// instead of slicing out of range.
			var outerTail []string
			if bodyEnd+1 <= len(patternWords) {
				outerTail = patternWords[bodyEnd+1:]
			}
			for li < len(lineWords) {
				snapshotLi := li
				iterCaps, newLi, ok := matchSegment(body, lineWords, li, r.CaseSens, outerTail)
				if !ok {
					li = snapshotLi
					break
				}
				// No-progress matches can happen when the body is just
				// a list/regular marker that immediately hits a stop
				// boundary on this iteration — its captured value is
				// empty. Don't merge those into captures, otherwise an
				// earlier successful iteration's value gets contaminated
				// with the `\x01`-separator form and the result-template
				// substitution skips it as multi-capture garbage.
				if newLi == snapshotLi {
					break
				}
				for k, v := range iterCaps {
					if prev, hit := captures[k]; hit && prev != "" {
						captures[k] = prev + "\x01" + v
					} else {
						captures[k] = v
					}
				}
				li = newLi
			}
			pi = bodyEnd + 1 // past ]
		} else if pw == "]" {
			pi++
		} else {
			// Literal keyword — must match
			if !matchWord(lineWords[li], pw, r.CaseSens) {
				return nil
			}
			li++
			pi++
		}
	}

	// Walk any tail of the pattern that wasn't matched against the
	// line. We accept it only if everything that remains is *optional*
	// — i.e. a `[...]` block (which by definition can be absent) or
	// markers/literals that are nested inside one. A bare `<a>` or a
	// literal token outside of brackets is required, so encountering
	// one means the pattern isn't satisfied: bare `CLOSE` must not
	// match rule `CLOSE <a>`.
	depth := 0
	for pi < len(patternWords) {
		pw := patternWords[pi]
		switch {
		case pw == "[":
			depth++
		case pw == "]":
			if depth > 0 {
				depth--
			}
		default:
			if depth == 0 {
				return nil
			}
		}
		pi++
	}

	// For #command with no markers and no optional clauses:
	// all line tokens must be consumed for a match
	if r.IsCommand && li < len(lineWords) && len(r.Markers) == 0 &&
		!strings.Contains(r.Pattern, "[") {
		return nil
	}

	return captures
}

// matchSegment tries to match a bracketed sub-pattern against a slice
// of the line tokens starting at startLi. Returns per-iteration
// captures and the new line position on success. The segment cannot
// contain nested `[...]` — callers of the optional-repeat logic
// flatten one level at a time.
//
// A "mini-matcher" that mirrors the main loop for MarkerRegular,
// MarkerRestricted, and MarkerList plus literal keywords. MarkerWild
// inside `[...]` is rare and still defers to the main matcher.
//
// Parameters:
//   - segment:   pattern tokens of the bracket body (without `[` / `]`).
//   - lineWords: tokenised source line.
//   - startLi:   index in lineWords where matching begins.
//   - caseSens:  whether literals and word-list values compare exactly.
//   - outerTail: pattern tokens following the closing `]`, used only to
//     find stop boundaries for markers at the end of the body.
//
// On failure it returns (nil, startLi, false). Running out of line
// tokens before the segment is fully consumed is a failure.
//
// NOTE(review): in the MarkerList branch the stop set is always built
// upper-cased by addStopFrom, but with caseSens the line token is
// looked up as-is, so only stop words that are already upper-case in
// the source line can hit — confirm this is intended.
func matchSegment(segment, lineWords []string, startLi int, caseSens bool, outerTail []string) (map[string]string, int, bool) {
	caps := make(map[string]string)
	li := startLi

	// When the segment starts with a literal (e.g. `,` in
	// `[, <vn> TO <xn>]`), treat that literal as the natural boundary
	// between iterations. Used as the delimiter for a trailing marker
	// that would otherwise gobble the rest of the line.
	repeatBoundary := ""
	if len(segment) > 0 && !strings.HasPrefix(segment[0], "<") &&
		segment[0] != "[" && segment[0] != "]" {
		repeatBoundary = segment[0]
	}

	for pi := 0; pi < len(segment); pi++ {
		pw := segment[pi]
		// Every remaining segment token needs a line token to look at.
		if li >= len(lineWords) {
			return nil, startLi, false
		}
		if strings.HasPrefix(pw, "<") && strings.HasSuffix(pw, ">") {
			inner := pw[1 : len(pw)-1]
			m := parseOneMarker(inner)
			switch m.Type {
			case MarkerWordList:
				// Match one of the listed words. If the current line
				// token isn't in the allowed set, the segment fails to
				// match — same behavior as the top-level matcher.
				w := lineWords[li]
				matched := false
				for _, allowed := range m.ListValues {
					if caseSens {
						if w == allowed {
							matched = true
							break
						}
					} else if strings.EqualFold(w, allowed) {
						matched = true
						break
					}
				}
				if !matched {
					return nil, startLi, false
				}
				caps[m.Name] = w
				li++
				continue
			case MarkerList:
				// Capture comma-separated tokens until we hit the
				// segment's next literal, an outer literal, or one of
				// the limited values of a following MarkerWordList
				// (e.g. `<off:OFF>` — OFF is the only token that can
				// match it, so the list before it must stop at OFF).
				// Paren-balanced so `f(a,b)` inside the list doesn't
				// terminate prematurely. Mirrors the main matchPattern's
				// MarkerList branch.
				stop := map[string]struct{}{}
				addStopFrom(stop, segment[pi+1:])
				addStopFrom(stop, outerTail)
				var parts []string
				depth := 0
				for li < len(lineWords) {
					w := lineWords[li]
					if depth == 0 {
						key := w
						if !caseSens {
							key = strings.ToUpper(w)
						}
						if _, hit := stop[key]; hit {
							break
						}
					}
					switch w {
					case "(", "[", "{":
						depth++
					case ")", "]", "}":
						if depth > 0 {
							depth--
						}
					}
					parts = append(parts, w)
					li++
				}
				caps[m.Name] = strings.Join(parts, " ")
				continue
			case MarkerRegular, MarkerRestricted:
				// fall through to capture-one-expression below
			default:
				// Any other marker type (e.g. wild) is not handled here.
				return nil, startLi, false
			}
			// Build a pseudo-pattern tail so captureExpression picks the
			// right delimiters. Priority:
			//   1. Next literals inside the same segment.
			//   2. Every literal in the outer-pattern tail — this is
			//      what stops `[TO <(f)>] [FIELDS ...] [FOR ...]` from
			//      letting `<(f)>` swallow a trailing FOR/WHILE/NEXT
			//      clause that happened to be present.
			//   3. Repeat boundary (the segment's leading literal) so a
			//      multi-iteration capture stops before the next iter.
			tail := segment[pi+1:]
			if !hasLiteralAfter(tail) {
				if hasLiteralAfter(outerTail) {
					tail = outerTail
				} else if repeatBoundary != "" {
					tail = []string{repeatBoundary}
				}
			}
			captured := captureExpression(lineWords, &li, tail, 0, caseSens)
			caps[m.Name] = captured
			continue
		}
		// Literal token: must equal the current line token.
		if !matchWord(lineWords[li], pw, caseSens) {
			return nil, startLi, false
		}
		li++
	}
	return caps, li, true
}

// addStopFrom adds to stop every word from the pattern tokens pw that a
// preceding list/regular capture must not run past: each bare literal,
// plus every allowed value of a word-list marker (`<name:A,B,C>`),
// because such a marker can only match one of its listed words. Empty
// tokens, `[`, `]` and all other markers contribute nothing.
//
// Keys are always stored upper-cased; the caller decides how to look
// them up.
func addStopFrom(stop map[string]struct{}, pw []string) {
	for _, tok := range pw {
		switch {
		case tok == "", tok == "[", tok == "]":
			// Structural tokens never act as stop words.
		case strings.HasPrefix(tok, "<") && strings.HasSuffix(tok, ">"):
			m := parseOneMarker(tok[1 : len(tok)-1])
			if m.Type != MarkerWordList {
				continue
			}
			for _, allowed := range m.ListValues {
				stop[strings.ToUpper(allowed)] = struct{}{}
			}
		default:
			stop[strings.ToUpper(tok)] = struct{}{}
		}
	}
}

// firstLiteral scans the pattern tokens pw and returns the first one
// that is a plain literal — not empty, not `[` or `]`, and not a
// `<...>` marker. It returns "" when there is no such token. Intended
// to supply a stop boundary taken from the outer pattern when a bracket
// body ends in a regular marker.
func firstLiteral(pw []string) string {
	for _, tok := range pw {
		switch {
		case tok == "", tok == "[", tok == "]":
			// structural, keep looking
		case strings.HasPrefix(tok, "<") && strings.HasSuffix(tok, ">"):
			// marker, keep looking
		default:
			return tok
		}
	}
	return ""
}

// hasLiteralAfter reports whether the pattern slice holds at least one
// literal keyword token, i.e. a non-empty token that is neither a
// bracket nor a `<...>` marker. The matcher uses it to decide whether a
// marker's capture has a real delimiter or needs a synthetic one.
func hasLiteralAfter(segment []string) bool {
	for _, tok := range segment {
		isBracket := tok == "[" || tok == "]"
		isMarker := strings.HasPrefix(tok, "<") && strings.HasSuffix(tok, ">")
		if tok != "" && !isBracket && !isMarker {
			return true
		}
	}
	return false
}

// quoteListElements smart-stringifies a list capture element by
// element. val is split on top-level commas (nesting-aware), each
// element is trimmed, empty elements are skipped, and the remaining
// ones are joined with ", ". An element that already looks like a
// string literal — delimited by "…", '…' or […] — is kept verbatim so
// `"a", "b"` round-trips; everything else is quoted with ppQuote. This
// is what lets `{ <(fields)> }` expand to `{ "a", "b", "c" }` when
// `fields` was captured by a `<fields,...>` marker.
func quoteListElements(val string) string {
	var quoted []string
	for _, elem := range splitTopLevelCommas(val) {
		item := strings.TrimSpace(elem)
		if item == "" {
			continue
		}
		first, last := item[0], item[len(item)-1]
		isLiteral := len(item) >= 2 &&
			((first == '"' && last == '"') ||
				(first == '\'' && last == '\'') ||
				(first == '[' && last == ']'))
		if !isLiteral {
			item = ppQuote(item)
		}
		quoted = append(quoted, item)
	}
	return strings.Join(quoted, ", ")
}

// splitTopLevelCommas cuts s at every comma that is neither inside a
// "..."/'...' string nor nested within (), [] or {}. The pieces are
// returned untrimmed and the result always has at least one element
// (the empty string yields a single empty piece). Unbalanced closing
// brackets are ignored rather than driving the nesting level negative.
func splitTopLevelCommas(s string) []string {
	var (
		pieces []string
		nest   int
		from   int
		quote  byte // active string delimiter, 0 when outside a string
	)
	for pos := 0; pos < len(s); pos++ {
		ch := s[pos]
		switch {
		case quote != 0:
			// Inside a string: only the matching delimiter matters.
			if ch == quote {
				quote = 0
			}
		case ch == '"' || ch == '\'':
			quote = ch
		case ch == '(' || ch == '[' || ch == '{':
			nest++
		case ch == ')' || ch == ']' || ch == '}':
			if nest > 0 {
				nest--
			}
		case ch == ',' && nest == 0:
			pieces = append(pieces, s[from:pos])
			from = pos + 1
		}
	}
	return append(pieces, s[from:])
}

// ppQuote turns a captured value into a PRG string literal, choosing a
// delimiter that does not already occur in the value so the result
// stays legal (stringifying `"world"` must not yield `""world""`).
// Delimiters are tried in this order: double quotes, single quotes,
// square brackets. If the value contains all of them, it is wrapped in
// double quotes with its embedded double quotes removed — a lossy
// fallback for pathological input.
func ppQuote(val string) string {
	switch {
	case strings.IndexByte(val, '"') < 0:
		return `"` + val + `"`
	case strings.IndexByte(val, '\'') < 0:
		return "'" + val + "'"
	case !strings.ContainsAny(val, "[]"):
		return "[" + val + "]"
	default:
		return `"` + strings.ReplaceAll(val, `"`, "") + `"`
	}
}

// applyResult substitutes captured values into the result template.
// Order matters — the compound forms (`#<z>`, `<(z)>`, `<.z.>`, `<"z">`)
// all contain the bare `<z>` token, so the bare substitution has to run
// LAST. Previously `<z>` was replaced first and left a stray `#` / `(` /
// `.` / `"` behind, producing bogus lines like `? #hello` that the
// lexer then choked on with ILLEGAL token errors.
//
// Processing steps:
//  1. expand `[ ... ]` optional/repeat sections of the template;
//  2. for each single-valued capture, replace its stringify, logify,
//     blockify and bare references;
//  3. rewrite marker references that are still left: `<{x}>` → NIL,
//     `<.x.>` → .F., everything else removed.
//
// NOTE(review): captures is ranged in Go's unspecified map order. If a
// captured value itself contains text that looks like a reference to
// another marker, the output can depend on iteration order — confirm
// whether that can occur with real input.
func (r *Rule) applyResult(captures map[string]string) string {
	result := r.ResultTmpl

	// Expand optional-repeat `[ ... ]` segments in the template. If any
	// marker inside a bracketed section was multi-captured during the
	// pattern match (values joined with \x01), emit the body once per
	// iteration with per-iter values. If no markers inside are multi-
	// captured, the bracket body is included once with whatever single
	// captures apply (the required-or-absent case).
	result = expandOptionalRepeat(result, captures)

	// Marker-name → list flag, so the smart-stringify branch below can
	// emit per-element quoting (`{ "a", "b" }`) for list captures
	// instead of treating the comma-joined string as one literal.
	isList := make(map[string]bool, len(r.Markers))
	for _, m := range r.Markers {
		if m.Type == MarkerList {
			isList[m.Name] = true
		}
	}

	for name, val := range captures {
		// Multi-capture markers are consumed by expandOptionalRepeat;
		// the bare substitution for the joined form would produce
		// garbage (values separated by \x01). Skip them here and let
		// any remaining bare `<name>` fall through to the cleanup.
		if strings.ContainsRune(val, '\x01') {
			continue
		}
		quoted := ppQuote(val)
		// #<name> — dumb stringify (always quote).
		result = strings.ReplaceAll(result, "#<"+name+">", quoted)
		// <"name"> — explicit stringify.
		result = strings.ReplaceAll(result, `<"`+name+`">`, quoted)
		// <(name)> — smart stringify: already a string literal → keep;
		// list capture → quote each comma-separated element; otherwise
		// quote whole. `val` comes straight from the capture, so trim
		// and check for surrounding quotes.
		trim := strings.TrimSpace(val)
		smart := quoted
		if n := len(trim); n >= 2 &&
			((trim[0] == '"' && trim[n-1] == '"') ||
				(trim[0] == '\'' && trim[n-1] == '\'') ||
				(trim[0] == '[' && trim[n-1] == ']')) {
			smart = trim
		} else if isList[name] {
			smart = quoteListElements(val)
		}
		result = strings.ReplaceAll(result, "<("+name+")>", smart)
		// <.name.> — logify (empty → .F., else .T.)
		if val != "" {
			result = strings.ReplaceAll(result, "<."+name+".>", ".T.")
		} else {
			result = strings.ReplaceAll(result, "<."+name+".>", ".F.")
		}
		// <{name}> — blockify: wrap captured expression in {|| ... }.
		// For list-typed markers (`<name,...>`) wrap *each* element so
		// `{ <{v}> }` against `LIST id, name` expands to
		// `{ {|| id }, {|| name } }`, matching Harbour's std.ch
		// idiom for column blocks. Empty capture → NIL so the call
		// site sees a nil block (missing FOR/WHILE clause).
		if val == "" {
			result = strings.ReplaceAll(result, "<{"+name+"}>", "NIL")
		} else if isList[name] {
			parts := splitTopLevelCommas(val)
			out := make([]string, 0, len(parts))
			for _, p := range parts {
				t := strings.TrimSpace(p)
				if t == "" {
					continue
				}
				out = append(out, "{|| "+t+" }")
			}
			result = strings.ReplaceAll(result, "<{"+name+"}>", strings.Join(out, ", "))
		} else {
			result = strings.ReplaceAll(result, "<{"+name+"}>", "{|| "+val+" }")
		}
		// <name> — bare substitution (must be LAST, after all wrappers).
		result = strings.ReplaceAll(result, "<"+name+">", val)
	}

	// Any `<{name}>` still in the template means `name` was never
	// captured — emit NIL so call sites see a missing block argument
	// (matches Harbour: empty FOR/WHILE → NIL → bypass the condition).
	result = replaceUnreferencedBlockify(result)

	// Same idea for `<.name.>`: a missing marker logifies to .F.,
	// matching Harbour's behavior of "absent optional clause => .F."
	// for OFF / ALL / REST / etc.
	result = replaceUnreferencedLogify(result)

	// Clean up unreferenced markers: <name>, <(name)>, <.name.>, #<name>, <"name">
	result = cleanUnreferencedMarkers(result)

	return result
}

// replaceUnreferencedLogify replaces every `<.ident.>` still present in
// s with `.F.`, the value used for an absent optional clause. `ident`
// must be an identifier: a letter or underscore followed by letters,
// digits or underscores. Anything else is copied through unchanged.
func replaceUnreferencedLogify(s string) string {
	identStart := func(c byte) bool {
		return c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
	}
	identPart := func(c byte) bool {
		return identStart(c) || (c >= '0' && c <= '9')
	}

	var b strings.Builder
	for pos := 0; pos < len(s); {
		if strings.HasPrefix(s[pos:], "<.") && pos+2 < len(s) && identStart(s[pos+2]) {
			end := pos + 3
			for end < len(s) && identPart(s[end]) {
				end++
			}
			if strings.HasPrefix(s[end:], ".>") {
				b.WriteString(".F.")
				pos = end + 2
				continue
			}
		}
		b.WriteByte(s[pos])
		pos++
	}
	return b.String()
}

// replaceUnreferencedBlockify replaces every `<{ident}>` still present
// in s with NIL. It is meant to run after the main substitution loop
// and before the generic cleanup of unreferenced markers. `ident` must
// be a letter or underscore followed by letters, digits or underscores;
// any other text is copied through unchanged.
func replaceUnreferencedBlockify(s string) string {
	identStart := func(c byte) bool {
		return c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
	}
	identPart := func(c byte) bool {
		return identStart(c) || (c >= '0' && c <= '9')
	}

	var b strings.Builder
	for pos := 0; pos < len(s); {
		if strings.HasPrefix(s[pos:], "<{") && pos+2 < len(s) && identStart(s[pos+2]) {
			end := pos + 3
			for end < len(s) && identPart(s[end]) {
				end++
			}
			if strings.HasPrefix(s[end:], "}>") {
				b.WriteString("NIL")
				pos = end + 2
				continue
			}
		}
		b.WriteByte(s[pos])
		pos++
	}
	return b.String()
}

// expandOptionalRepeat copies template to the output, replacing each
// top-level `[ ... ]` block with the result of expandBracketBody on the
// text between the brackets. That helper decides, from the captures the
// body references, whether the body is repeated, emitted once, or
// dropped.
//
// The closing bracket is found by depth counting, so brackets nested in
// the body are passed to expandBracketBody as part of it. A `[` or `]`
// that sits inside a `<...>` marker reference (per inMarker) is plain
// text and does not take part in the counting. A `[` with no matching
// `]` is copied literally.
func expandOptionalRepeat(template string, captures map[string]string) string {
	var out strings.Builder
	for i := 0; i < len(template); {
		if template[i] != '[' {
			out.WriteByte(template[i])
			i++
			continue
		}

		// Locate the `]` that balances this `[`.
		closeAt := -1
		depth := 1
		for j := i + 1; j < len(template); j++ {
			c := template[j]
			if (c != '[' && c != ']') || inMarker(template, j) {
				continue
			}
			if c == '[' {
				depth++
				continue
			}
			depth--
			if depth == 0 {
				closeAt = j
				break
			}
		}

		if closeAt < 0 {
			// Unmatched `[`: keep it as ordinary text.
			out.WriteByte(template[i])
			i++
			continue
		}
		out.WriteString(expandBracketBody(template[i+1:closeAt], captures))
		i = closeAt + 1
	}
	return out.String()
}

// inMarker reports whether position p in s lies inside a PP marker
// reference such as `<.x.>`, `<"x">` or `<(x)>`, where `[` and `]` are
// ordinary text rather than template delimiters.
//
// It walks backwards from p-1: meeting `>` first means the nearest
// marker is already closed; meeting `<` first means p is inside one,
// unless the byte at p is itself the closing `>`.
func inMarker(s string, p int) bool {
	for k := p - 1; k >= 0; k-- {
		switch s[k] {
		case '>':
			return false
		case '<':
			// No `>` lies between k and p (the loop would have returned
			// above), so only the byte at p can still close the marker.
			return p >= len(s) || s[p] != '>'
		}
	}
	return false
}

// expandBracketBody expands the body of one optional-repeat `[ ... ]`
// block of a result template.
//
//   - If any marker referenced in body was captured several times
//     (its value holds \x01-separated iterations), the body is emitted
//     once per iteration. Each multi-captured marker is replaced by its
//     value for that iteration (empty if it has fewer iterations), in
//     all its reference forms. Single-valued markers are left as
//     references in every copy, so the regular substitution pass that
//     runs afterwards fills in the same value each time.
//   - Otherwise, if at least one referenced marker has a capture, the
//     body is returned once, unchanged.
//   - Otherwise the block is dropped and "" is returned.
//
// Substitution walks the markers in the order they appear in body, so
// the output does not depend on map iteration order.
func expandBracketBody(body string, captures map[string]string) string {
	refs := referencedMarkers(body)

	// Split only the multi-captured markers into per-iteration lists.
	// Splitting a single-valued marker would give a one-element list
	// and blank the marker out on every iteration after the first.
	iters := 1
	anyPresent := false
	multi := make(map[string][]string, len(refs))
	for _, name := range refs {
		val, ok := captures[name]
		if !ok {
			continue
		}
		anyPresent = true
		if !strings.ContainsRune(val, '\x01') {
			continue
		}
		vals := strings.Split(val, "\x01")
		multi[name] = vals
		if len(vals) > iters {
			iters = len(vals)
		}
	}

	if len(multi) == 0 {
		// No multi-capture — include body once if any referenced marker
		// has a (single) capture; otherwise drop.
		if !anyPresent {
			return ""
		}
		return body
	}

	var out strings.Builder
	for iter := 0; iter < iters; iter++ {
		piece := body
		for _, name := range refs {
			vals, ok := multi[name]
			if !ok {
				continue
			}
			var v string
			if iter < len(vals) {
				v = vals[iter]
			}
			quoted := ppQuote(v)
			// Compound forms first; the bare `<name>` must go last
			// because it is the most general pattern.
			piece = strings.ReplaceAll(piece, "#<"+name+">", quoted)
			piece = strings.ReplaceAll(piece, `<"`+name+`">`, quoted)
			piece = strings.ReplaceAll(piece, "<("+name+")>", quoted)
			if v != "" {
				piece = strings.ReplaceAll(piece, "<."+name+".>", ".T.")
				piece = strings.ReplaceAll(piece, "<{"+name+"}>", "{|| "+v+" }")
			} else {
				piece = strings.ReplaceAll(piece, "<."+name+".>", ".F.")
				piece = strings.ReplaceAll(piece, "<{"+name+"}>", "NIL")
			}
			piece = strings.ReplaceAll(piece, "<"+name+">", v)
		}
		out.WriteString(piece)
	}
	return out.String()
}

// referencedMarkers returns the distinct marker names referenced in a
// result-template fragment, in order of first appearance. It recognises
// the forms `<name>`, `<(name)>`, `<.name.>`, `<"name">`, `<{name}>`
// and `#<name>`: after each `<` it skips any run of the wrapper
// characters `(`, `.`, `"`, `{` and then reads the following run of
// letters, digits and underscores as the name. A `<` not followed by
// such a run contributes nothing.
func referencedMarkers(s string) []string {
	isNameByte := func(c byte) bool {
		return c == '_' || (c >= 'a' && c <= 'z') ||
			(c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')
	}

	seen := map[string]bool{}
	var out []string
	i := 0
	for i < len(s) {
		if s[i] != '<' {
			i++
			continue
		}
		j := i + 1
		// Skip leading punctuation forms: (name), .name., "name", {name}.
		// The blockify wrapper `{` belongs here too; without it a body
		// that only uses `<{name}>` would appear to reference nothing.
		for j < len(s) && (s[j] == '(' || s[j] == '.' || s[j] == '"' || s[j] == '{') {
			j++
		}
		start := j
		for j < len(s) && isNameByte(s[j]) {
			j++
		}
		if j > start {
			name := s[start:j]
			if !seen[name] {
				seen[name] = true
				out = append(out, name)
			}
		}
		// Resume right after what was consumed; j > i always holds, so
		// the scan makes progress.
		i = j
	}
	return out
}

// cleanUnreferencedMarkers deletes every marker reference left in s:
// `<name>`, `<(name)>`, `<.name.>`, `<"name">` and `#<name>` (the `#`
// is removed together with its marker). What counts as a marker is
// decided by findMarkerEnd, so text such as a `<` comparison operator
// that does not form a well-formed reference is copied through.
func cleanUnreferencedMarkers(s string) string {
	var out strings.Builder
	for i := 0; i < len(s); {
		// Position of the `<` that may open a marker here, or -1.
		probe := -1
		switch {
		case s[i] == '#' && i+1 < len(s) && s[i+1] == '<':
			probe = i + 1
		case s[i] == '<':
			probe = i
		}
		if probe >= 0 {
			if end := findMarkerEnd(s, probe); end > 0 {
				i = end
				continue
			}
		}
		out.WriteByte(s[i])
		i++
	}
	return out.String()
}

// findMarkerEnd tests whether a PP marker reference begins at s[start]
// and, if so, returns the index just past its closing `>`; otherwise it
// returns 0.
//
// Accepted shape: `<`, at most one wrapper character out of `(` `.` `"`
// `{`, an identifier (letter or underscore, then letters, digits or
// underscores), any run of the characters `)` `.` `"` `}` `,` and
// space, and finally `>`.
func findMarkerEnd(s string, start int) int {
	if start >= len(s) || s[start] != '<' {
		return 0
	}
	isLetter := func(c byte) bool {
		return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'
	}
	isDigit := func(c byte) bool { return c >= '0' && c <= '9' }

	pos := start + 1
	// Optional wrapper opener (smart-stringify, logify, stringify,
	// blockify respectively).
	if pos < len(s) {
		switch s[pos] {
		case '(', '.', '"', '{':
			pos++
		}
	}
	// An identifier is mandatory.
	if pos >= len(s) || !isLetter(s[pos]) {
		return 0
	}
	for pos < len(s) && (isLetter(s[pos]) || isDigit(s[pos])) {
		pos++
	}
	// Trailing wrapper / list punctuation.
trailer:
	for pos < len(s) {
		switch s[pos] {
		case ')', '.', '"', '}', ',', ' ':
			pos++
		default:
			break trailer
		}
	}
	if pos < len(s) && s[pos] == '>' {
		return pos + 1
	}
	return 0
}

// --- Helpers ---

// firstToken returns the leading part of s up to, but not including, the
// first space, tab or '('. When none of those occur, s is returned whole.
func firstToken(s string) string {
	for end := 0; end < len(s); end++ {
		switch s[end] {
		case ' ', '\t', '(':
			return s[:end]
		}
	}
	return s
}

// matchWord reports whether lineWord equals patternWord. With caseSens set
// the comparison is exact; otherwise it is case-insensitive
// (strings.EqualFold).
func matchWord(lineWord, patternWord string, caseSens bool) bool {
	if !caseSens {
		return strings.EqualFold(lineWord, patternWord)
	}
	return lineWord == patternWord
}

// tokenizePattern splits a pattern into words, keeping markers as single tokens.
// Parens and commas are emitted as their own tokens so `DUMB(<z>)` and
// `DUMB( <z> )` tokenise identically — matching what tokenizeLine does
// on call sites. Without this, `_DUMB_(a)` (no space) stored as a
// single word would never align with the pattern's `DUMB( , <z>, )`
// tokens.
//
// Tokenisation rules, in order of precedence:
//   - spaces and tabs separate tokens and are dropped;
//   - `<` with a later `>` yields everything from `<` through the first
//     `>` as one marker token;
//   - each of `[ ] ( ) ,` is a one-character token;
//   - anything else is a literal word that runs until whitespace, `<`,
//     or one of the punctuation characters above.
//
// A `<` that has no closing `>` anywhere after it is not a marker; it is
// kept as the first character of a literal word (`FOO <x` → `FOO`, `<x`).
// It must be consumed here: the word scan stops on `<`, so leaving it in
// place would make no progress and loop forever.
//
// The result is nil for an empty or all-blank pattern.
func tokenizePattern(pattern string) []string {
	var tokens []string
	i := 0
	for i < len(pattern) {
		// Skip inter-token blanks.
		for i < len(pattern) && (pattern[i] == ' ' || pattern[i] == '\t') {
			i++
		}
		if i >= len(pattern) {
			break
		}

		// Marker: everything up to and including the first '>'.
		if pattern[i] == '<' {
			if end := strings.IndexByte(pattern[i:], '>'); end >= 0 {
				tokens = append(tokens, pattern[i:i+end+1])
				i += end + 1
				continue
			}
		}

		switch pattern[i] {
		case '[', ']', '(', ')', ',':
			tokens = append(tokens, string(pattern[i]))
			i++
			continue
		}

		// Regular word — stop at space/tab/marker/bracket/paren/comma.
		start := i
		if pattern[i] == '<' {
			// Unterminated '<': take it as a literal character so the
			// scan below (which breaks on '<') always advances.
			i++
		}
		for i < len(pattern) {
			c := pattern[i]
			if c == ' ' || c == '\t' || c == '<' || c == '[' || c == ']' ||
				c == '(' || c == ')' || c == ',' {
				break
			}
			i++
		}
		// At least one byte was consumed: the first byte is either the
		// stray '<' handled above or a non-delimiter.
		tokens = append(tokens, pattern[start:i])
	}
	return tokens
}

// tokenizeLine breaks a source line into the same kind of tokens that
// tokenizePattern produces for patterns, so the two sides can be compared
// position by position:
//   - spaces and tabs separate tokens and are discarded;
//   - a literal opened by " or ' is one token, running through the
//     matching closing quote (or to end of line if it is never closed);
//   - each of , ( ) [ ] is a one-character token, so `DUMB(hello)` becomes
//     `DUMB`, `(`, `hello`, `)`;
//   - any other run of characters is a word, ending at whitespace, one of
//     the punctuation characters above, or a quote.
//
// The result is nil when the line contains no tokens.
func tokenizeLine(line string) []string {
	// endsWord reports whether c terminates a plain word.
	endsWord := func(c byte) bool {
		switch c {
		case ' ', '\t', ',', '(', ')', '[', ']', '"', '\'':
			return true
		}
		return false
	}

	var words []string
	n := len(line)
	for pos := 0; pos < n; {
		ch := line[pos]
		switch ch {
		case ' ', '\t':
			// Separator only.
			pos++

		case '"', '\'':
			// String literal: scan for the same quote character.
			end := pos + 1
			for end < n && line[end] != ch {
				end++
			}
			if end < n {
				end++ // include the closing quote
			}
			words = append(words, line[pos:end])
			pos = end

		case ',', '(', ')', '[', ']':
			words = append(words, line[pos:pos+1])
			pos++

		default:
			// Plain word; the first byte is known not to be a terminator.
			end := pos + 1
			for end < n && !endsWord(line[end]) {
				end++
			}
			words = append(words, line[pos:end])
			pos = end
		}
	}
	return words
}

// captureExpression consumes line tokens starting at *li for one pattern
// marker and returns them joined with single spaces, advancing *li past
// everything it consumed. patternWords[nextPi:] is the part of the pattern
// that follows the marker; caseSens selects exact or case-insensitive
// comparison against stop words.
//
// Behaviour:
//   - no line tokens left: returns "" and leaves *li alone;
//   - the rest of the pattern yields stop words: captures up to (not
//     including) the first stop word seen outside any bracket nesting;
//   - no stop words and nothing left in the pattern: captures every
//     remaining token;
//   - no stop words but more pattern follows: captures exactly one token.
func captureExpression(lineWords []string, li *int, patternWords []string, nextPi int, caseSens bool) string {
	if *li >= len(lineWords) {
		return ""
	}

	// Gather the stop words from everything after this marker rather than
	// only the next token. Optional clauses in std.ch sit side by side
	// (`[TO <(f)>] [FIELDS <fields,...>] [FOR <for>] [WHILE <while>] ...`)
	// and there is no telling which of them the input actually uses, so
	// halting on any of them keeps e.g. `<(f)>` from swallowing a trailing
	// `FOR x > 5`. Per the original notes, values of word-list markers
	// (`<off:OFF>`) are included as well, since such a marker can only
	// match its listed words.
	stopSet := map[string]struct{}{}
	addStopFrom(stopSet, patternWords[nextPi:])

	if len(stopSet) == 0 {
		if nextPi >= len(patternWords) {
			// Last thing in the pattern: take the whole remainder.
			rest := strings.Join(lineWords[*li:], " ")
			*li = len(lineWords)
			return rest
		}
		// Sandwiched between markers: take a single token.
		tok := lineWords[*li]
		*li++
		return tok
	}

	isStop := func(word string) bool {
		for delim := range stopSet {
			if matchWord(word, delim, caseSens) {
				return true
			}
		}
		return false
	}

	// Track bracket nesting so that delimiters inside a nested
	// (...), [...] or {...} do not end the capture — Harbour's own PP does
	// the same: `_REGULAR_(&(a))` must capture `&(a)` and leave the outer
	// `)` for the pattern's own delimiter.
	var captured []string
	nesting := 0
	for ; *li < len(lineWords); *li++ {
		word := lineWords[*li]
		if nesting == 0 && isStop(word) {
			break
		}
		switch word {
		case "(", "[", "{":
			nesting++
		case ")", "]", "}":
			// Stray closers never push the depth below zero.
			if nesting > 0 {
				nesting--
			}
		}
		captured = append(captured, word)
	}
	return strings.Join(captured, " ")
}