// Copyright (c) 2026 Charles KWON OhJun (charleskwonohjun@gmail.com)
// All rights reserved.

// #command / #translate implementation for Five preprocessor.
//
// Harbour PP syntax:
//
//	#command    PATTERN => RESULT
//	#translate  PATTERN => RESULT
//	#xcommand   PATTERN => RESULT  (case-sensitive)
//	#xtranslate PATTERN => RESULT  (case-sensitive)
//
// Pattern markers:
//
//	<x>       — match any expression (regular match)
//	<!x!>     — match single identifier only (restricted match)
//	<x,...>   — match comma-separated list
//	<*x*>     — match rest of line (wild match)
//	<x:a,b>   — match one of listed words (list match)
//	[...]     — optional clause
//
// Result markers:
//
//	<x>       — substitute matched text
//	<(x)>     — stringify (wrap in quotes)
//	<{x}>     — blockify (wrap in {|| })
//	#<x>      — dumb stringify
//	<.x.>     — logify (.T. if matched, .F. if not)
//
// Reference: /mnt/d/harbour-core/src/pp/ppcore.c

package pp

import (
	"strings"
)

// Rule represents a single #command or #translate rule.
type Rule struct {
	Pattern    string   // raw pattern text
	Result     string   // raw result text
	IsCommand  bool     // #command vs #translate
	CaseSens   bool     // #xcommand/#xtranslate = case sensitive
	Keyword    string   // first keyword (for fast matching)
	Markers    []Marker // parsed pattern markers
	ResultTmpl string   // result template with marker references
}

// Marker represents a pattern marker like <x>, <!x!>, <x,...>, <*x*>.
type Marker struct {
	Name       string // marker name
	Type       MarkerType
	ListValues []string // for <x:a,b> — allowed values
}

// MarkerType identifies which kind of pattern marker a Marker is.
type MarkerType int

const (
	MarkerRegular    MarkerType = iota // <x> — any expression
	MarkerRestricted                   // <!x!> — identifier only
	MarkerList                         // <x,...> — comma-separated list
	MarkerWild                         // <*x*> — rest of line
	MarkerWordList                     // <x:a,b> — one of listed words
)

// ParseRule parses a #command/#translate directive into a Rule.
// It returns nil when the directive has no `=>` separator.
func ParseRule(directive string, isCommand, caseSens bool) *Rule { // Split on => parts := strings.SplitN(directive, "=>", 2) if len(parts) != 2 { return nil } pattern := strings.TrimSpace(parts[0]) result := strings.TrimSpace(parts[1]) // Earlier versions stripped every ` ;` as Harbour line-continuation. // That also destroyed in-line PRG statement separators — `IF x == // NIL ; x := y ; ENDIF` lost all its semicolons. Line-continuation // joining is the preprocessor's job (processLines), not this rule // parser's. Keep the semicolons as-is. rule := &Rule{ Pattern: pattern, Result: result, IsCommand: isCommand, CaseSens: caseSens, ResultTmpl: result, } // Extract first keyword for fast matching. The first whitespace- // delimited token of the pattern becomes the dispatch key; we // strip marker wrappers and any trailing `(` so a pattern like // `MAKE_TEST( , )` hashes on `MAKE_TEST`, matching how // firstToken normalises source lines. words := strings.Fields(pattern) if len(words) > 0 { kw := words[0] kw = strings.TrimLeft(kw, "<[") kw = strings.TrimRight(kw, ">]") if idx := strings.IndexByte(kw, '('); idx >= 0 { kw = kw[:idx] } if !strings.ContainsAny(kw, "!*,:") { rule.Keyword = kw } } // Parse markers from pattern rule.Markers = parseMarkers(pattern) return rule } // parseMarkers extracts all <...> markers from a pattern. 
func parseMarkers(pattern string) []Marker { var markers []Marker i := 0 for i < len(pattern) { if pattern[i] == '<' { end := strings.IndexByte(pattern[i:], '>') if end < 0 { break } inner := pattern[i+1 : i+end] m := parseOneMarker(inner) if m.Name != "" { markers = append(markers, m) } i += end + 1 } else { i++ } } return markers } func parseOneMarker(inner string) Marker { inner = strings.TrimSpace(inner) // — restricted if strings.HasPrefix(inner, "!") && strings.HasSuffix(inner, "!") { return Marker{Name: inner[1 : len(inner)-1], Type: MarkerRestricted} } // <*name*> — wild if strings.HasPrefix(inner, "*") && strings.HasSuffix(inner, "*") { return Marker{Name: inner[1 : len(inner)-1], Type: MarkerWild} } // — comma list if strings.HasSuffix(inner, ",...") { return Marker{Name: inner[:len(inner)-4], Type: MarkerList} } // — word list if idx := strings.IndexByte(inner, ':'); idx > 0 { name := inner[:idx] vals := strings.Split(inner[idx+1:], ",") for i := range vals { vals[i] = strings.TrimSpace(vals[i]) } return Marker{Name: name, Type: MarkerWordList, ListValues: vals} } // <(name)> — extended-expression marker. In Harbour PP this captures // a file-name-like extended expression and the matching result token // `<(name)>` smart-stringifies it (already-quoted → keep, identifier // → quote). Strip the parens so captures are stored under the bare // name; result substitution then matches both `<(name)>` and `` // via the existing path. if strings.HasPrefix(inner, "(") && strings.HasSuffix(inner, ")") { return Marker{Name: inner[1 : len(inner)-1], Type: MarkerRegular} } // — regular return Marker{Name: inner, Type: MarkerRegular} } // --- Rule matching and application --- // MatchLine checks if a source line matches this rule and returns the substituted result. // Returns ("", false) if no match. 
func (r *Rule) MatchLine(line string) (string, bool) { trimmed := strings.TrimSpace(line) if trimmed == "" { return "", false } // Fast keyword check if r.Keyword != "" { firstWord := firstToken(trimmed) if r.CaseSens { if firstWord != r.Keyword { return "", false } } else { if !strings.EqualFold(firstWord, r.Keyword) { return "", false } } } // Try to match pattern against line captures := r.matchPattern(trimmed) if captures == nil { return "", false } // Apply result template result := r.applyResult(captures) return result, true } // matchPattern attempts to match the pattern against a line. // Returns captured values map, or nil if no match. func (r *Rule) matchPattern(line string) map[string]string { captures := make(map[string]string) patternWords := tokenizePattern(r.Pattern) lineWords := tokenizeLine(line) pi, li := 0, 0 for pi < len(patternWords) && li < len(lineWords) { pw := patternWords[pi] // Marker? if strings.HasPrefix(pw, "<") && strings.HasSuffix(pw, ">") { inner := pw[1 : len(pw)-1] m := parseOneMarker(inner) switch m.Type { case MarkerWild: // Capture rest of line rest := strings.Join(lineWords[li:], " ") captures[m.Name] = rest li = len(lineWords) pi++ case MarkerList: // Capture a comma-separated list until the next literal // pattern token. Paren-balanced so nested `(`/`[`/`{` // don't let an inner `)` terminate the capture. Commas // at the top level are preserved verbatim in the // captured string so the `` substitution in the // result template reproduces the argument list as-is. 
var parts []string depth := 0 delim := "" if pi+1 < len(patternWords) { delim = patternWords[pi+1] } for li < len(lineWords) { w := lineWords[li] if depth == 0 && delim != "" && matchWord(w, delim, r.CaseSens) { break } switch w { case "(", "[", "{": depth++ case ")", "]", "}": if depth > 0 { depth-- } } parts = append(parts, w) li++ } captures[m.Name] = strings.Join(parts, " ") pi++ case MarkerWordList: // Match one of listed words matched := false for _, allowed := range m.ListValues { if r.CaseSens { if lineWords[li] == allowed { matched = true break } } else if strings.EqualFold(lineWords[li], allowed) { matched = true break } } if !matched { return nil } captures[m.Name] = lineWords[li] li++ pi++ default: // Regular or restricted: capture one token or expression captured := captureExpression(lineWords, &li, patternWords, pi+1, r.CaseSens) captures[m.Name] = captured pi++ } } else if pw == "[" { // Optional, possibly-repeating sub-pattern. Try matching the // bracketed body repeatedly against the remaining line; each // successful iteration appends its marker captures under the // same name with a \x01 separator. Used by Harbour forms // like `DEFAULT TO [, TO ]` where the // trailing bracket repeats for each additional pair. depth := 1 bodyStart := pi + 1 bodyEnd := bodyStart for bodyEnd < len(patternWords) && depth > 0 { if patternWords[bodyEnd] == "[" { depth++ } else if patternWords[bodyEnd] == "]" { depth-- if depth == 0 { break } } bodyEnd++ } body := patternWords[bodyStart:bodyEnd] // Outer-pattern tail (everything after the matching `]`) is // needed so a regular marker at the end of `body` knows where // to stop capturing. Without this, `[TO ] [FOR ]` // against `TO n FOR age >= 30` would let `` swallow the // rest of the line because `body` itself has no literal that // follows the marker. 
outerTail := patternWords[bodyEnd+1:] for li < len(lineWords) { snapshotLi := li iterCaps, newLi, ok := matchSegment(body, lineWords, li, r.CaseSens, outerTail) if !ok { li = snapshotLi break } // No-progress matches can happen when the body is just // a list/regular marker that immediately hits a stop // boundary on this iteration — its captured value is // empty. Don't merge those into captures, otherwise an // earlier successful iteration's value gets contaminated // with the `\x01`-separator form and the result-template // substitution skips it as multi-capture garbage. if newLi == snapshotLi { break } for k, v := range iterCaps { if prev, hit := captures[k]; hit && prev != "" { captures[k] = prev + "\x01" + v } else { captures[k] = v } } li = newLi } pi = bodyEnd + 1 // past ] } else if pw == "]" { pi++ } else { // Literal keyword — must match if !matchWord(lineWords[li], pw, r.CaseSens) { return nil } li++ pi++ } } // Walk any tail of the pattern that wasn't matched against the // line. We accept it only if everything that remains is *optional* // — i.e. a `[...]` block (which by definition can be absent) or // markers/literals that are nested inside one. A bare `` or a // literal token outside of brackets is required, so encountering // one means the pattern isn't satisfied: bare `CLOSE` must not // match rule `CLOSE `. depth := 0 for pi < len(patternWords) { pw := patternWords[pi] switch { case pw == "[": depth++ case pw == "]": if depth > 0 { depth-- } default: if depth == 0 { return nil } } pi++ } // For #command with no markers and no optional clauses: // all line tokens must be consumed for a match if r.IsCommand && li < len(lineWords) && len(r.Markers) == 0 && !strings.Contains(r.Pattern, "[") { return nil } return captures } // matchSegment tries to match a bracketed sub-pattern against a slice // of the line tokens starting at startLi. Returns per-iteration // captures and the new line position on success. 
The segment cannot // contain nested `[...]` — callers of the optional-repeat logic // flatten one level at a time. // // A "mini-matcher" that mirrors the main loop for MarkerRegular, // MarkerRestricted, and MarkerList plus literal keywords. MarkerWild // inside `[...]` is rare and still defers to the main matcher. func matchSegment(segment, lineWords []string, startLi int, caseSens bool, outerTail []string) (map[string]string, int, bool) { caps := make(map[string]string) li := startLi // When the segment starts with a literal (e.g. `,` in // `[, TO ]`), treat that literal as the natural boundary // between iterations. Used as the delimiter for a trailing marker // that would otherwise gobble the rest of the line. repeatBoundary := "" if len(segment) > 0 && !strings.HasPrefix(segment[0], "<") && segment[0] != "[" && segment[0] != "]" { repeatBoundary = segment[0] } for pi := 0; pi < len(segment); pi++ { pw := segment[pi] // Nested optional clause: find the matching `]`, run the // repeat-loop on the inner body until no progress. Mirrors // the main matchPattern's `[` branch. Doesn't require any // remaining input — an absent optional just doesn't iterate. if pw == "[" { depth := 1 bodyStart := pi + 1 bodyEnd := bodyStart for bodyEnd < len(segment) && depth > 0 { if segment[bodyEnd] == "[" { depth++ } else if segment[bodyEnd] == "]" { depth-- if depth == 0 { break } } bodyEnd++ } innerBody := segment[bodyStart:bodyEnd] innerOuterTail := segment[bodyEnd+1:] for li < len(lineWords) { snapshotLi := li iterCaps, newLi, ok := matchSegment(innerBody, lineWords, li, caseSens, innerOuterTail) if !ok { li = snapshotLi break } if newLi == snapshotLi { break } for k, v := range iterCaps { if prev, hit := caps[k]; hit && prev != "" { caps[k] = prev + "\x01" + v } else { caps[k] = v } } li = newLi } pi = bodyEnd continue } if pw == "]" { // Stray closer — skip. 
continue } if li >= len(lineWords) { return nil, startLi, false } if strings.HasPrefix(pw, "<") && strings.HasSuffix(pw, ">") { inner := pw[1 : len(pw)-1] m := parseOneMarker(inner) switch m.Type { case MarkerWordList: // Match one of the listed words. If the current line // token isn't in the allowed set, the segment fails to // match — same behavior as the top-level matcher. w := lineWords[li] matched := false for _, allowed := range m.ListValues { if caseSens { if w == allowed { matched = true break } } else if strings.EqualFold(w, allowed) { matched = true break } } if !matched { return nil, startLi, false } caps[m.Name] = w li++ continue case MarkerList: // Capture comma-separated tokens until we hit the // segment's next literal, an outer literal, or one of // the limited values of a following MarkerWordList // (e.g. `` — OFF is the only token that can // match it, so the list before it must stop at OFF). // Paren-balanced so `f(a,b)` inside the list doesn't // terminate prematurely. Mirrors the main matchPattern's // MarkerList branch. stop := map[string]struct{}{} addStopFrom(stop, segment[pi+1:]) addStopFrom(stop, outerTail) var parts []string depth := 0 for li < len(lineWords) { w := lineWords[li] if depth == 0 { key := w if !caseSens { key = strings.ToUpper(w) } if _, hit := stop[key]; hit { break } } switch w { case "(", "[", "{": depth++ case ")", "]", "}": if depth > 0 { depth-- } } parts = append(parts, w) li++ } caps[m.Name] = strings.Join(parts, " ") continue case MarkerRegular, MarkerRestricted: // fall through to capture-one-expression below default: return nil, startLi, false } // Build a pseudo-pattern tail so captureExpression picks // the right delimiters. Priority order (each level is // merged, then captureExpression stops at *whichever* // delimiter shows up first in the input): // 1. Next literals inside the same segment. // 2. Every literal in the outer-pattern tail — what // stops `[TO <(f)>] [FIELDS ...] 
[FOR ...]` from // letting `<(f)>` swallow a trailing FOR/WHILE/... // 3. Repeat boundary (the segment's leading literal) // — needed for multi-iter `[, ]` so each // iteration's `` stops at the next ',' before // the outer-tail's TO/FOR/etc. catches it. tail := segment[pi+1:] if !hasLiteralAfter(tail) { combined := []string{} if hasLiteralAfter(outerTail) { combined = append(combined, outerTail...) } if repeatBoundary != "" { combined = append(combined, repeatBoundary) } if len(combined) > 0 { tail = combined } } captured := captureExpression(lineWords, &li, tail, 0, caseSens) caps[m.Name] = captured continue } if !matchWord(lineWords[li], pw, caseSens) { return nil, startLi, false } li++ } return caps, li, true } // addStopFrom merges into `stop` every token that could legally match // the next position in `pw`: bare literals AND each value of any // MarkerWordList (``) since those markers can match only // their listed words. Used so a preceding list/regular capture knows // to stop before any of them. Always uppercased — the caller decides // whether to do a case-insensitive lookup. func addStopFrom(stop map[string]struct{}, pw []string) { for _, w := range pw { if w == "" || w == "[" || w == "]" { continue } if strings.HasPrefix(w, "<") && strings.HasSuffix(w, ">") { inner := w[1 : len(w)-1] if m := parseOneMarker(inner); m.Type == MarkerWordList { for _, v := range m.ListValues { stop[strings.ToUpper(v)] = struct{}{} } } continue } stop[strings.ToUpper(w)] = struct{}{} } } // firstLiteral returns the first non-marker, non-bracket token in pw, // or "" if none. Used to give matchSegment a stop-boundary drawn from // the outer pattern when its body ends in a regular marker. 
func firstLiteral(pw []string) string { for _, w := range pw { if w == "[" || w == "]" || w == "" { continue } if strings.HasPrefix(w, "<") && strings.HasSuffix(w, ">") { continue } return w } return "" } // hasLiteralAfter reports whether a pattern slice contains any literal // keyword token (non-marker, non-bracket) — used to decide whether a // marker's capture has a real delimiter or needs a synthetic one. func hasLiteralAfter(segment []string) bool { for _, pw := range segment { if pw == "[" || pw == "]" || pw == "" { continue } if strings.HasPrefix(pw, "<") && strings.HasSuffix(pw, ">") { continue } return true } return false } // quoteListElements smart-stringifies a list-style capture: split val // on top-level commas (paren / bracket / brace balanced) and emit each // element quoted. Already-quoted elements are kept as-is so a literal // like `"a", "b"` round-trips intact. Used by `<(name)>` substitution // when `name` came from a `` marker — Harbour's std.ch idiom // for `{ <(fields)> }` to expand to `{ "a", "b", "c" }`. func quoteListElements(val string) string { parts := splitTopLevelCommas(val) if len(parts) == 0 { return "" } out := make([]string, 0, len(parts)) for _, p := range parts { t := strings.TrimSpace(p) if t == "" { continue } // Already a string literal — keep verbatim. if n := len(t); n >= 2 && ((t[0] == '"' && t[n-1] == '"') || (t[0] == '\'' && t[n-1] == '\'') || (t[0] == '[' && t[n-1] == ']')) { out = append(out, t) continue } out = append(out, ppQuote(t)) } return strings.Join(out, ", ") } // splitTopLevelCommas splits s on commas that are not nested inside // (), [], or {}. Strings ("..." / '...') are skipped to avoid breaking // captured PRG expressions. 
func splitTopLevelCommas(s string) []string { var parts []string depth := 0 start := 0 inStr := byte(0) for i := 0; i < len(s); i++ { c := s[i] if inStr != 0 { if c == inStr { inStr = 0 } continue } switch c { case '"', '\'': inStr = c case '(', '[', '{': depth++ case ')', ']', '}': if depth > 0 { depth-- } case ',': if depth == 0 { parts = append(parts, s[start:i]) start = i + 1 } } } parts = append(parts, s[start:]) return parts } // ppQuote wraps a captured value in a PRG string literal, picking a // delimiter that doesn't collide with characters already inside. Harbour // # stringify takes the raw source text of the argument and must // produce a legal PRG string — if the capture is `"world"`, the result // can't just be `""world""`. Preference order matches Harbour: // double-quotes first, then single-quotes, then bracket literals. func ppQuote(val string) string { if !strings.ContainsRune(val, '"') { return `"` + val + `"` } if !strings.ContainsRune(val, '\'') { return "'" + val + "'" } if !strings.ContainsRune(val, '[') && !strings.ContainsRune(val, ']') { return "[" + val + "]" } // Fallback: double-quote with embedded quotes dropped. Pathological // input only; Harbour itself refuses to handle this cleanly. return `"` + strings.ReplaceAll(val, `"`, "") + `"` } // applyResult substitutes captured values into the result template. // Order matters — the compound forms (`#`, `<(z)>`, `<.z.>`, `<"z">`) // all contain the bare `` token, so the bare substitution has to run // LAST. Previously `` was replaced first and left a stray `#` / `(` / // `.` / `"` behind, producing bogus lines like `? #hello` that the // lexer then choked on with ILLEGAL token errors. func (r *Rule) applyResult(captures map[string]string) string { result := r.ResultTmpl // Expand optional-repeat `[ ... ]` segments in the template. 
If any // marker inside a bracketed section was multi-captured during the // pattern match (values joined with \x01), emit the body once per // iteration with per-iter values. If no markers inside are multi- // captured, the bracket body is included once with whatever single // captures apply (the required-or-absent case). result = expandOptionalRepeat(result, captures) // Marker-name → list flag, so the smart-stringify branch below can // emit per-element quoting (`{ "a", "b" }`) for list captures // instead of treating the comma-joined string as one literal. isList := make(map[string]bool, len(r.Markers)) for _, m := range r.Markers { if m.Type == MarkerList { isList[m.Name] = true } } for name, val := range captures { // Multi-capture markers are consumed by expandOptionalRepeat; // the bare substitution for the joined form would produce // garbage (values separated by \x01). Skip them here and let // any remaining bare `` fall through to the cleanup. if strings.ContainsRune(val, '\x01') { continue } quoted := ppQuote(val) // # — dumb stringify (always quote). result = strings.ReplaceAll(result, "#<"+name+">", quoted) // <"name"> — explicit stringify. result = strings.ReplaceAll(result, `<"`+name+`">`, quoted) // <(name)> — smart stringify: already a string literal → keep; // list capture → quote each comma-separated element; otherwise // quote whole. `val` comes straight from the capture, so trim // and check for surrounding quotes. trim := strings.TrimSpace(val) smart := quoted if n := len(trim); n >= 2 && ((trim[0] == '"' && trim[n-1] == '"') || (trim[0] == '\'' && trim[n-1] == '\'') || (trim[0] == '[' && trim[n-1] == ']')) { smart = trim } else if isList[name] { smart = quoteListElements(val) } result = strings.ReplaceAll(result, "<("+name+")>", smart) // <.name.> — logify (empty → .F., else .T.) 
if val != "" { result = strings.ReplaceAll(result, "<."+name+".>", ".T.") } else { result = strings.ReplaceAll(result, "<."+name+".>", ".F.") } // <{name}> — blockify: wrap captured expression in {|| ... }. // For list-typed markers (``) wrap *each* element so // `{ <{v}> }` against `LIST id, name` expands to // `{ {|| id }, {|| name } }`, matching Harbour's std.ch // idiom for column blocks. Empty capture → NIL so the call // site sees a nil block (missing FOR/WHILE clause). if val == "" { result = strings.ReplaceAll(result, "<{"+name+"}>", "NIL") } else if isList[name] { parts := splitTopLevelCommas(val) out := make([]string, 0, len(parts)) for _, p := range parts { t := strings.TrimSpace(p) if t == "" { continue } out = append(out, "{|| "+t+" }") } result = strings.ReplaceAll(result, "<{"+name+"}>", strings.Join(out, ", ")) } else { result = strings.ReplaceAll(result, "<{"+name+"}>", "{|| "+val+" }") } // — bare substitution (must be LAST, after all wrappers). result = strings.ReplaceAll(result, "<"+name+">", val) } // Any `<{name}>` still in the template means `name` was never // captured — emit NIL so call sites see a missing block argument // (matches Harbour: empty FOR/WHILE → NIL → bypass the condition). result = replaceUnreferencedBlockify(result) // Same idea for `<.name.>`: a missing marker logifies to .F., // matching Harbour's behavior of "absent optional clause => .F." // for OFF / ALL / REST / etc. result = replaceUnreferencedLogify(result) // Clean up unreferenced markers: , <(name)>, <.name.>, #, <"name"> result = cleanUnreferencedMarkers(result) return result } // replaceUnreferencedLogify rewrites every remaining `<.ident.>` to // `.F.` — the absent-optional-clause sentinel that matches Harbour's // std.ch convention. func replaceUnreferencedLogify(s string) string { var out strings.Builder i := 0 for i < len(s) { if i+2 < len(s) && s[i] == '<' && s[i+1] == '.' 
{ j := i + 2 if j < len(s) && (s[j] == '_' || (s[j] >= 'a' && s[j] <= 'z') || (s[j] >= 'A' && s[j] <= 'Z')) { j++ for j < len(s) && (s[j] == '_' || (s[j] >= 'a' && s[j] <= 'z') || (s[j] >= 'A' && s[j] <= 'Z') || (s[j] >= '0' && s[j] <= '9')) { j++ } if j+1 < len(s) && s[j] == '.' && s[j+1] == '>' { out.WriteString(".F.") i = j + 2 continue } } } out.WriteByte(s[i]) i++ } return out.String() } // replaceUnreferencedBlockify rewrites every remaining `<{ident}>` to // NIL. Run after the main substitution loop, before the generic // unreferenced-marker cleanup. func replaceUnreferencedBlockify(s string) string { var out strings.Builder i := 0 for i < len(s) { if i+2 < len(s) && s[i] == '<' && s[i+1] == '{' { j := i + 2 // Identifier if j < len(s) && (s[j] == '_' || (s[j] >= 'a' && s[j] <= 'z') || (s[j] >= 'A' && s[j] <= 'Z')) { j++ for j < len(s) && (s[j] == '_' || (s[j] >= 'a' && s[j] <= 'z') || (s[j] >= 'A' && s[j] <= 'Z') || (s[j] >= '0' && s[j] <= '9')) { j++ } if j+1 < len(s) && s[j] == '}' && s[j+1] == '>' { out.WriteString("NIL") i = j + 2 continue } } } out.WriteByte(s[i]) i++ } return out.String() } // expandOptionalRepeat walks a result template and rewrites each top- // level `[ ... ]` block by examining the captures referenced inside: // // - If any referenced marker has multiple captured iterations // (values joined with \x01), emit the body N times, substituting // the i-th iteration's value for each such marker and dropping // single-valued markers into each iteration unchanged. // - If no referenced marker is multi-captured BUT the single // captures include non-empty values, emit the body once. // - Otherwise drop the block. // // Nested brackets are not supported — Harbour uses a single level of // `[...]` for the common repeat form. Callers that need deeper nesting // can fall back to writing out separate #xcommand rules. 
func expandOptionalRepeat(template string, captures map[string]string) string { var out strings.Builder i := 0 for i < len(template) { if template[i] == '[' { // Find matching top-level ']'. Skip over quoted strings // and nested brackets inside PP markers like `<.x.>`. depth := 1 j := i + 1 for j < len(template) && depth > 0 { switch template[j] { case '[': // Inside a marker `<...>` the `[` is just text; // only count top-level brackets. if inMarker(template, j) { j++ continue } depth++ case ']': if inMarker(template, j) { j++ continue } depth-- if depth == 0 { body := template[i+1 : j] out.WriteString(expandBracketBody(body, captures)) i = j + 1 goto next } } j++ } // Unmatched [ — copy literally. out.WriteByte(template[i]) i++ next: continue } out.WriteByte(template[i]) i++ } return out.String() } // inMarker reports whether position `p` in s is inside a PP marker // reference like `<.x.>` / `<"x">` / `<(x)>` — where `[` and `]` are // ordinary text, not template delimiters. func inMarker(s string, p int) bool { // Look backward for `<` not preceded by a marker-terminator. for k := p - 1; k >= 0; k-- { c := s[k] if c == '>' { return false } if c == '<' { // Scan forward from `<` to see if we're still inside. for m := k + 1; m < len(s) && m <= p; m++ { if s[m] == '>' { return false } } return true } } return false } // expandBracketBody returns the optional-repeat body expanded once per // iteration of its multi-captured markers. See expandOptionalRepeat. func expandBracketBody(body string, captures map[string]string) string { // Find marker names referenced inside the body. refs := referencedMarkers(body) iters := 1 hasMulti := false for _, name := range refs { if val, ok := captures[name]; ok && strings.ContainsRune(val, '\x01') { n := strings.Count(val, "\x01") + 1 if n > iters { iters = n } hasMulti = true } } if !hasMulti { // No multi-capture — include body once if any referenced marker // has a (single) capture; otherwise drop. 
anyPresent := false for _, name := range refs { if _, ok := captures[name]; ok { anyPresent = true break } } if !anyPresent { return "" } return body } // Pre-split each multi-captured referent into a per-iteration list. parts := make(map[string][]string, len(refs)) for _, name := range refs { if val, ok := captures[name]; ok { parts[name] = strings.Split(val, "\x01") } } var out strings.Builder for iter := 0; iter < iters; iter++ { piece := body for name, vals := range parts { var v string if iter < len(vals) { v = vals[iter] } quoted := ppQuote(v) piece = strings.ReplaceAll(piece, "#<"+name+">", quoted) piece = strings.ReplaceAll(piece, `<"`+name+`">`, quoted) piece = strings.ReplaceAll(piece, "<("+name+")>", quoted) if v != "" { piece = strings.ReplaceAll(piece, "<."+name+".>", ".T.") } else { piece = strings.ReplaceAll(piece, "<."+name+".>", ".F.") } piece = strings.ReplaceAll(piece, "<"+name+">", v) } out.WriteString(piece) } return out.String() } // referencedMarkers extracts marker names referenced inside a template // fragment. Handles ``, `<(name)>`, `<.name.>`, `<"name">`, and // `#` forms. func referencedMarkers(s string) []string { seen := map[string]bool{} var out []string i := 0 for i < len(s) { if s[i] == '<' { j := i + 1 // Skip leading punctuation forms: (name), .name., "name". for j < len(s) && (s[j] == '(' || s[j] == '.' || s[j] == '"') { j++ } start := j for j < len(s) && (s[j] == '_' || (s[j] >= 'a' && s[j] <= 'z') || (s[j] >= 'A' && s[j] <= 'Z') || (s[j] >= '0' && s[j] <= '9')) { j++ } if j > start { name := s[start:j] if !seen[name] { seen[name] = true out = append(out, name) } } i = j continue } i++ } return out } // cleanUnreferencedMarkers removes any remaining , <(name)>, <.name.>, # references. // Only removes well-formed PP marker references, not comparison operators. 
func cleanUnreferencedMarkers(s string) string {
	var b strings.Builder
	for pos := 0; pos < len(s); {
		// A marker may be written `#<name>`; in that case the `<` to
		// test sits one byte further on and the `#` is removed with it.
		lt := pos
		if s[pos] == '#' {
			lt = pos + 1
		}
		if end := findMarkerEnd(s, lt); end > 0 {
			pos = end
			continue
		}
		b.WriteByte(s[pos])
		pos++
	}
	return b.String()
}

// findMarkerEnd checks whether s[start] begins a PP marker reference
// and returns the index just past its closing `>`, or 0 if it does not.
//
// Accepted shape: `<`, at most one of `(` `.` `"` `{`, an identifier
// (letter or underscore, then letters, digits or underscores), any run
// of `)` `.` `"` `}` `,` or space, and finally `>`.
func findMarkerEnd(s string, start int) int {
	if start >= len(s) || s[start] != '<' {
		return 0
	}
	isAlpha := func(c byte) bool {
		return c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
	}
	isDigit := func(c byte) bool {
		return c >= '0' && c <= '9'
	}

	pos := start + 1

	// Optional wrapper opener: smart-stringify, logify, stringify or
	// blockify.
	if pos < len(s) {
		switch s[pos] {
		case '(', '.', '"', '{':
			pos++
		}
	}

	// The identifier is mandatory — this is what keeps comparison
	// operators such as `a < 5` from being mistaken for markers.
	if pos >= len(s) || !isAlpha(s[pos]) {
		return 0
	}
	for pos < len(s) && (isAlpha(s[pos]) || isDigit(s[pos])) {
		pos++
	}

	// Optional wrapper closers and list punctuation.
trailer:
	for pos < len(s) {
		switch s[pos] {
		case ')', '.', '"', '}', ',', ' ':
			pos++
		default:
			break trailer
		}
	}

	if pos < len(s) && s[pos] == '>' {
		return pos + 1
	}
	return 0
}

// --- Helpers ---

// firstToken returns the part of s that precedes the first space, tab
// or `(`, or all of s when none of them occurs.
func firstToken(s string) string {
	if cut := strings.IndexAny(s, " \t("); cut >= 0 {
		return s[:cut]
	}
	return s
}

// matchWord compares a line token with a pattern token, exactly when
// caseSens is set and ignoring case otherwise.
func matchWord(lineWord, patternWord string, caseSens bool) bool {
	if !caseSens {
		return strings.EqualFold(lineWord, patternWord)
	}
	return lineWord == patternWord
}

// tokenizePattern splits a pattern into words, keeping markers as single tokens.
// Parens and commas are emitted as their own tokens so `DUMB(<x>)` and
// `DUMB( <x> )` tokenise identically — matching what tokenizeLine does
// on call sites.
Without this, `_DUMB_(a)` (no space) stored as a // single word would never align with the pattern's `DUMB( , , )` // tokens. func tokenizePattern(pattern string) []string { var tokens []string i := 0 for i < len(pattern) { for i < len(pattern) && (pattern[i] == ' ' || pattern[i] == '\t') { i++ } if i >= len(pattern) { break } if pattern[i] == '<' { end := strings.IndexByte(pattern[i:], '>') if end >= 0 { tokens = append(tokens, pattern[i:i+end+1]) i += end + 1 continue } } switch pattern[i] { case '[', ']', '(', ')', ',': tokens = append(tokens, string(pattern[i])) i++ continue } // Regular word — stop at space/tab/marker/bracket/paren/comma. start := i for i < len(pattern) { c := pattern[i] if c == ' ' || c == '\t' || c == '<' || c == '[' || c == ']' || c == '(' || c == ')' || c == ',' { break } i++ } if i > start { tokens = append(tokens, pattern[start:i]) } } return tokens } // tokenizeLine splits a source line into words matching the rules used // by tokenizePattern: string literals stay intact, commas/parens/brackets // emit as standalone tokens so a call like `DUMB(hello)` tokenises as // `DUMB`, `(`, `hello`, `)` — aligning with the pattern side. func tokenizeLine(line string) []string { var tokens []string i := 0 for i < len(line) { for i < len(line) && (line[i] == ' ' || line[i] == '\t') { i++ } if i >= len(line) { break } // String literal if line[i] == '"' || line[i] == '\'' { quote := line[i] start := i i++ for i < len(line) && line[i] != quote { i++ } if i < len(line) { i++ } tokens = append(tokens, line[start:i]) continue } switch line[i] { case ',', '(', ')', '[', ']': tokens = append(tokens, string(line[i])) i++ continue } // Word — stop at whitespace, brackets, parens, comma, quotes. 
start := i for i < len(line) { c := line[i] if c == ' ' || c == '\t' || c == ',' || c == '(' || c == ')' || c == '[' || c == ']' || c == '"' || c == '\'' { break } i++ } if i > start { tokens = append(tokens, line[start:i]) } } return tokens } // captureExpression captures an expression from line tokens. // If this is the last marker in the pattern, captures all remaining tokens. // Otherwise, captures until the next keyword in the pattern. func captureExpression(lineWords []string, li *int, patternWords []string, nextPi int, caseSens bool) string { if *li >= len(lineWords) { return "" } // Collect every literal-keyword delimiter that follows in the // pattern, not just the first. Optional clauses in std.ch sit // next to one another (`[TO <(f)>] [FIELDS ] // [FOR ] [WHILE ] ...`), so the file-name marker // must stop at TO's *successor* — but we don't know which // successor will actually be present in the input. Stopping on // any of them keeps `<(f)>` from swallowing a trailing // `FOR x > 5` clause. MarkerWordList values count too — a // `` marker can only match the word OFF, so prior // captures must stop at it. stopSet := map[string]struct{}{} addStopFrom(stopSet, patternWords[nextPi:]) var delims []string for k := range stopSet { delims = append(delims, k) } if len(delims) > 0 { // Capture until any delimiter is hit, paren-balancing so nested // parens/brackets/braces inside the expression don't falsely // terminate the capture. Harbour's own PP does the same — // `_REGULAR_(&(a))` must capture `&(a)` (incl. inner parens) // and leave the outer `)` for the pattern's own delimiter. 
var parts []string depth := 0 for *li < len(lineWords) { w := lineWords[*li] if depth == 0 { stop := false for _, d := range delims { if matchWord(w, d, caseSens) { stop = true break } } if stop { break } } switch w { case "(", "[", "{": depth++ case ")", "]", "}": if depth > 0 { depth-- } } parts = append(parts, w) *li++ } return strings.Join(parts, " ") } // No delimiter: if last marker, capture all remaining tokens if nextPi >= len(patternWords) { rest := strings.Join(lineWords[*li:], " ") *li = len(lineWords) return rest } // Single token capture (between markers) tok := lineWords[*li] *li++ return tok }