Files
five/compiler/pp/command.go
CharlesKWON 4a1bbdb1fe feat(pp): optional-repeat [...] blocks — DEFAULT / UPDATE from common.ch
Harbour's `#xcommand DEFAULT <v1> TO <x1> [, <vn> TO <xn>] => ...`
uses an optional, repeatable trailing `[...]` block to accept any
number of `var TO default` pairs on a single line. Five's PP
skipped bracket bodies during pattern matching and treated them
as no-ops in result templates, so

  DEFAULT a TO 10, b TO 20, c TO 30

expanded (at best) the first pair and dropped the rest — and
common.ch itself was documented as "not yet supported".

Three concrete changes:

1. matchPattern now matches the `[...]` body repeatedly against
   remaining line tokens via a new matchSegment helper. Each
   successful iteration appends captures for the interior markers
   under the same name, joined with a \x01 sentinel.

2. matchSegment, when capturing the last marker in a body with no
   following literal, uses the body's opening literal (e.g. the `,`
   in `[, <vn> TO <xn>]`) as the iteration boundary. Otherwise
   captureExpression would greedily eat the rest of the line and
   collapse every remaining pair into one capture.

3. applyResult's new expandOptionalRepeat walks the result template
   for top-level `[...]` blocks. When a referenced marker is multi-
   captured it emits the body N times (substituting per-iter value);
   when it's single-captured it emits the body once; otherwise drops
   the block. A separate referencedMarkers scanner and an inMarker
   guard keep literal `[` / `]` inside PP markers (like `<.x.>`)
   from being mistaken for bracket delimiters.

Side fix: ParseRule previously stripped every ` ;` as a Harbour
line-continuation marker, but that also destroyed in-line PRG
statement separators in result templates. Line joining is the
preprocessor's job upstream — keep semicolons intact here.

common.ch now ships real DEFAULT and UPDATE #xcommands. Verified
1-, 2-, and 3-pair DEFAULT expansion plus `common.ch` inclusion
from user code. FiveSql2 43/43, Harbour compat 56/56, Go test ALL
PASS.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 18:20:11 +09:00

926 lines
26 KiB
Go

// Copyright (c) 2026 Charles KWON OhJun (charleskwonohjun@gmail.com)
// All rights reserved.
// #command / #translate implementation for Five preprocessor.
//
// Harbour PP syntax:
// #command PATTERN => RESULT
// #translate PATTERN => RESULT
// #xcommand PATTERN => RESULT (case-sensitive)
// #xtranslate PATTERN => RESULT (case-sensitive)
//
// Pattern markers:
// <x> — match any expression (regular match)
// <!x!> — match single identifier only (restricted match)
// <x,...> — match comma-separated list
// <*x*> — match rest of line (wild match)
// <x:a,b,c> — match one of listed words (list match)
// [...] — optional clause; the bracketed body may match repeatedly
//
// Result markers:
// <x> — substitute matched text
// <(x)> — stringify (wrap in quotes)
// <{x}> — blockify (wrap in {|| })
// #<x> — dumb stringify
// <.x.> — logify (.T. if matched, .F. if not)
//
// Reference: /mnt/d/harbour-core/src/pp/ppcore.c
package pp
import (
"strings"
)
// Rule represents a single #command or #translate rule.
//
// ParseRule fills every field. MatchLine uses Keyword as a cheap
// first-token rejection test, then matches Pattern against the line
// and expands ResultTmpl with the captured values.
type Rule struct {
Pattern string // raw pattern text (left of `=>`, whitespace-trimmed)
Result string // raw result text (right of `=>`, whitespace-trimmed)
IsCommand bool // #command vs #translate
CaseSens bool // #xcommand/#xtranslate = case sensitive
Keyword string // first keyword (for fast matching); empty disables the fast check
Markers []Marker // parsed pattern markers
ResultTmpl string // result template with marker references; ParseRule sets it equal to Result
}
// Marker represents a pattern marker like <x>, <!x!>, <x,...>, <*x*>
// or <x:a,b,c>.
type Marker struct {
Name string // marker name
Type MarkerType // which marker form was written (one of the Marker* constants)
ListValues []string // for <x:a,b,c> — allowed values
}
// MarkerType identifies which pattern-marker form a Marker was
// written in. The zero value is MarkerRegular.
type MarkerType int
const (
MarkerRegular MarkerType = iota // <x> — any expression
MarkerRestricted // <!x!> — identifier only
MarkerList // <x,...> — comma-separated list
MarkerWild // <*x*> — rest of line
MarkerWordList // <x:a,b,c> — one of listed words
)
// ParseRule parses a #command/#translate directive into a Rule.
//
// directive is the text after the directive name, in the form
// `PATTERN => RESULT`; it is split on the first `=>`. isCommand and
// caseSens are stored on the rule unchanged. ParseRule returns nil when
// the directive contains no `=>`.
//
// Rule.Keyword is set only when the pattern opens with a fixed literal
// word. A pattern that opens with a marker (`<x> IS <y>`) or with an
// optional clause (`[SET] FOO`) has no fixed first word, so Keyword
// stays empty and MatchLine falls through to the full pattern match
// instead of rejecting every line on the fast keyword check.
func ParseRule(directive string, isCommand, caseSens bool) *Rule {
	// Split on =>
	parts := strings.SplitN(directive, "=>", 2)
	if len(parts) != 2 {
		return nil
	}
	pattern := strings.TrimSpace(parts[0])
	result := strings.TrimSpace(parts[1])

	// Earlier versions stripped every ` ;` as Harbour line-continuation.
	// That also destroyed in-line PRG statement separators — `IF x ==
	// NIL ; x := y ; ENDIF` lost all its semicolons. Line-continuation
	// joining is the preprocessor's job (processLines), not this rule
	// parser's. Keep the semicolons as-is.
	rule := &Rule{
		Pattern:    pattern,
		Result:     result,
		IsCommand:  isCommand,
		CaseSens:   caseSens,
		ResultTmpl: result,
	}

	// Extract the first keyword for fast matching. The first
	// whitespace-delimited word of the pattern becomes the dispatch key,
	// cut at the first `(`, `<` or `[` so that `MAKE_TEST( <obj>, <v> )`
	// and `FOO<x>` hash on `MAKE_TEST` / `FOO`, matching how firstToken
	// normalises source lines.
	if words := strings.Fields(pattern); len(words) > 0 {
		kw := words[0]
		// A leading marker or optional clause is not a literal word:
		// using its name as the keyword would make the rule unmatchable.
		if kw[0] != '<' && kw[0] != '[' {
			if idx := strings.IndexAny(kw, "(<["); idx >= 0 {
				kw = kw[:idx]
			}
			kw = strings.TrimRight(kw, ">]")
			// Words containing marker punctuation or a comma are not
			// safe dispatch keys; leave Keyword empty for those.
			if !strings.ContainsAny(kw, "!*,:") {
				rule.Keyword = kw
			}
		}
	}

	// Parse markers from pattern
	rule.Markers = parseMarkers(pattern)
	return rule
}
// parseMarkers scans a pattern left to right and returns one Marker per
// `<...>` span, in order of appearance. Spans whose parsed name is
// empty are dropped. Scanning stops at the first `<` that has no
// closing `>`.
func parseMarkers(pattern string) []Marker {
	var found []Marker
	rest := pattern
	for {
		open := strings.IndexByte(rest, '<')
		if open < 0 {
			return found
		}
		rest = rest[open:]
		closeAt := strings.IndexByte(rest, '>')
		if closeAt < 0 {
			// Unterminated marker: nothing further can be parsed.
			return found
		}
		if mk := parseOneMarker(rest[1:closeAt]); mk.Name != "" {
			found = append(found, mk)
		}
		rest = rest[closeAt+1:]
	}
}
// parseOneMarker classifies the text found between `<` and `>` of a
// pattern marker and returns the corresponding Marker.
//
// Recognised forms, checked in this order:
//
//	!name!    restricted (identifier only)
//	*name*    wild (rest of line)
//	name,...  comma-separated list
//	name:a,b  word list; ListValues holds the trimmed alternatives
//	name      regular
//
// The `!…!` and `*…*` forms need both delimiters, i.e. at least two
// characters. A lone `!` or `*` is treated as a regular marker name
// instead of being sliced out of range.
func parseOneMarker(inner string) Marker {
	inner = strings.TrimSpace(inner)

	// <!name!> — restricted
	if len(inner) >= 2 && strings.HasPrefix(inner, "!") && strings.HasSuffix(inner, "!") {
		return Marker{Name: inner[1 : len(inner)-1], Type: MarkerRestricted}
	}
	// <*name*> — wild
	if len(inner) >= 2 && strings.HasPrefix(inner, "*") && strings.HasSuffix(inner, "*") {
		return Marker{Name: inner[1 : len(inner)-1], Type: MarkerWild}
	}
	// <name,...> — comma list
	if strings.HasSuffix(inner, ",...") {
		return Marker{Name: inner[:len(inner)-4], Type: MarkerList}
	}
	// <name:a,b,c> — word list
	if idx := strings.IndexByte(inner, ':'); idx > 0 {
		name := inner[:idx]
		vals := strings.Split(inner[idx+1:], ",")
		for i := range vals {
			vals[i] = strings.TrimSpace(vals[i])
		}
		return Marker{Name: name, Type: MarkerWordList, ListValues: vals}
	}
	// <name> — regular
	return Marker{Name: inner, Type: MarkerRegular}
}
// --- Rule matching and application ---
// MatchLine tries this rule against one source line. On success it
// returns the expanded result template and true; otherwise ("", false).
//
// Blank lines never match. When the rule has a Keyword, the line's
// first token must equal it (case-insensitively unless CaseSens is
// set) before the full pattern match is attempted.
func (r *Rule) MatchLine(line string) (string, bool) {
	src := strings.TrimSpace(line)
	if src == "" {
		return "", false
	}

	// Cheap rejection on the leading keyword.
	if r.Keyword != "" {
		head := firstToken(src)
		var same bool
		if r.CaseSens {
			same = head == r.Keyword
		} else {
			same = strings.EqualFold(head, r.Keyword)
		}
		if !same {
			return "", false
		}
	}

	// Full pattern match; a nil map means "no match".
	caps := r.matchPattern(src)
	if caps == nil {
		return "", false
	}
	return r.applyResult(caps), true
}
// matchPattern attempts to match the rule's pattern against a line.
//
// It returns a map from marker name to captured text, or nil when the
// line does not match. A non-nil map may be empty (pattern without
// markers). Markers captured more than once by a repeating `[...]`
// clause hold all their values joined with a \x01 separator.
//
// Matching walks the tokenised pattern and line in lock-step:
//   - a marker captures according to its type;
//   - `[` starts an optional clause whose body is matched zero or more
//     times via matchSegment;
//   - any other token is a literal that the line must contain.
//
// If the line runs out first, whatever remains of the pattern may only
// be optional clauses and markers. A literal keyword that is still
// outstanding outside any `[...]` means the line is merely a prefix of
// the pattern and is rejected.
func (r *Rule) matchPattern(line string) map[string]string {
	captures := make(map[string]string)
	patternWords := tokenizePattern(r.Pattern)
	lineWords := tokenizeLine(line)
	pi, li := 0, 0
	for pi < len(patternWords) && li < len(lineWords) {
		pw := patternWords[pi]
		// Marker?
		if strings.HasPrefix(pw, "<") && strings.HasSuffix(pw, ">") {
			inner := pw[1 : len(pw)-1]
			m := parseOneMarker(inner)
			switch m.Type {
			case MarkerWild:
				// Capture rest of line
				rest := strings.Join(lineWords[li:], " ")
				captures[m.Name] = rest
				li = len(lineWords)
				pi++
			case MarkerList:
				// Capture a comma-separated list until the next literal
				// pattern token. Paren-balanced so nested `(`/`[`/`{`
				// don't let an inner `)` terminate the capture. Commas
				// at the top level are preserved verbatim in the
				// captured string so the `<z>` substitution in the
				// result template reproduces the argument list as-is.
				var parts []string
				depth := 0
				delim := ""
				if pi+1 < len(patternWords) {
					delim = patternWords[pi+1]
				}
				for li < len(lineWords) {
					w := lineWords[li]
					if depth == 0 && delim != "" && matchWord(w, delim, r.CaseSens) {
						break
					}
					switch w {
					case "(", "[", "{":
						depth++
					case ")", "]", "}":
						if depth > 0 {
							depth--
						}
					}
					parts = append(parts, w)
					li++
				}
				captures[m.Name] = strings.Join(parts, " ")
				pi++
			case MarkerWordList:
				// Match one of listed words
				matched := false
				for _, allowed := range m.ListValues {
					if r.CaseSens {
						if lineWords[li] == allowed {
							matched = true
							break
						}
					} else if strings.EqualFold(lineWords[li], allowed) {
						matched = true
						break
					}
				}
				if !matched {
					return nil
				}
				captures[m.Name] = lineWords[li]
				li++
				pi++
			default:
				// Regular or restricted: capture one token or expression
				captured := captureExpression(lineWords, &li, patternWords, pi+1, r.CaseSens)
				captures[m.Name] = captured
				pi++
			}
		} else if pw == "[" {
			// Optional, possibly-repeating sub-pattern. Try matching the
			// bracketed body repeatedly against the remaining line; each
			// successful iteration appends its marker captures under the
			// same name with a \x01 separator. Used by Harbour forms
			// like `DEFAULT <v1> TO <x1> [, <vn> TO <xn> ]` where the
			// trailing bracket repeats for each additional pair.
			depth := 1
			bodyStart := pi + 1
			bodyEnd := bodyStart
			for bodyEnd < len(patternWords) && depth > 0 {
				if patternWords[bodyEnd] == "[" {
					depth++
				} else if patternWords[bodyEnd] == "]" {
					depth--
					if depth == 0 {
						break
					}
				}
				bodyEnd++
			}
			body := patternWords[bodyStart:bodyEnd]
			for li < len(lineWords) {
				snapshotLi := li
				iterCaps, newLi, ok := matchSegment(body, lineWords, li, r.CaseSens)
				if !ok {
					li = snapshotLi
					break
				}
				for k, v := range iterCaps {
					if prev, hit := captures[k]; hit && prev != "" {
						captures[k] = prev + "\x01" + v
					} else {
						captures[k] = v
					}
				}
				li = newLi
				if li == snapshotLi {
					break // no progress — avoid infinite loop
				}
			}
			pi = bodyEnd + 1 // past ]
		} else if pw == "]" {
			pi++
		} else {
			// Literal keyword — must match
			if !matchWord(lineWords[li], pw, r.CaseSens) {
				return nil
			}
			li++
			pi++
		}
	}

	// The line is exhausted (or the pattern is). Whatever is left of the
	// pattern must be skippable: whole optional `[...]` clauses, stray
	// `]`, and markers. A remaining literal keyword outside brackets is
	// a required token the line never supplied, so the rule does not
	// match.
	for pi < len(patternWords) {
		pw := patternWords[pi]
		switch {
		case pw == "[":
			// Skip the entire optional clause, including any literals
			// inside it, honouring nested brackets.
			depth := 1
			pi++
			for pi < len(patternWords) && depth > 0 {
				switch patternWords[pi] {
				case "[":
					depth++
				case "]":
					depth--
				}
				pi++
			}
		case pw == "]" || (strings.HasPrefix(pw, "<") && strings.HasSuffix(pw, ">")):
			pi++
		default:
			return nil
		}
	}

	// For #command with no markers and no optional clauses:
	// all line tokens must be consumed for a match
	if r.IsCommand && li < len(lineWords) && len(r.Markers) == 0 &&
		!strings.Contains(r.Pattern, "[") {
		return nil
	}
	return captures
}
// matchSegment tries to match a bracketed sub-pattern against a slice
// of the line tokens starting at startLi. Returns per-iteration
// captures and the new line position on success. The segment cannot
// contain nested `[...]` — callers of the optional-repeat logic
// flatten one level at a time.
//
// A "mini-matcher" that mirrors the main loop for MarkerRegular and
// literal keywords. MarkerList and MarkerWild inside `[...]` would
// need additional plumbing; defer those until real patterns need them.
func matchSegment(segment, lineWords []string, startLi int, caseSens bool) (map[string]string, int, bool) {
caps := make(map[string]string)
li := startLi
// When the segment starts with a literal (e.g. `,` in
// `[, <vn> TO <xn>]`), treat that literal as the natural boundary
// between iterations. Used as the delimiter for a trailing marker
// that would otherwise gobble the rest of the line.
repeatBoundary := ""
if len(segment) > 0 && !strings.HasPrefix(segment[0], "<") &&
segment[0] != "[" && segment[0] != "]" {
repeatBoundary = segment[0]
}
for pi := 0; pi < len(segment); pi++ {
pw := segment[pi]
if li >= len(lineWords) {
return nil, startLi, false
}
if strings.HasPrefix(pw, "<") && strings.HasSuffix(pw, ">") {
inner := pw[1 : len(pw)-1]
m := parseOneMarker(inner)
if m.Type != MarkerRegular && m.Type != MarkerRestricted {
return nil, startLi, false
}
// Build a pseudo-pattern tail so captureExpression picks the
// right delimiter. If there's a next literal inside `segment`,
// use it; otherwise fall back to the repeat boundary so the
// capture stops before the next iteration starts.
tail := segment[pi+1:]
if !hasLiteralAfter(tail) && repeatBoundary != "" {
tail = []string{repeatBoundary}
}
captured := captureExpression(lineWords, &li, tail, 0, caseSens)
caps[m.Name] = captured
continue
}
if !matchWord(lineWords[li], pw, caseSens) {
return nil, startLi, false
}
li++
}
return caps, li, true
}
// hasLiteralAfter reports whether segment holds at least one literal
// token, i.e. one that is neither empty, a bracket (`[` / `]`), nor a
// `<...>` marker. Callers use it to decide whether a marker capture has
// a real delimiter or needs a synthetic one.
func hasLiteralAfter(segment []string) bool {
	for _, tok := range segment {
		switch {
		case tok == "", tok == "[", tok == "]":
			// structural or empty — not a literal
		case strings.HasPrefix(tok, "<") && strings.HasSuffix(tok, ">"):
			// marker — not a literal
		default:
			return true
		}
	}
	return false
}
// ppQuote turns a captured value into a PRG string literal. It picks
// the first delimiter style whose characters do not occur in val:
// double quotes, then single quotes, then square brackets. This keeps a
// capture such as `"world"` from becoming the illegal `""world""`.
//
// When val contains all of `"`, `'` and a square bracket, the double
// quotes are removed from val and the remainder is wrapped in double
// quotes.
func ppQuote(val string) string {
	switch {
	case strings.IndexByte(val, '"') < 0:
		return "\"" + val + "\""
	case strings.IndexByte(val, '\'') < 0:
		return "'" + val + "'"
	case !strings.ContainsAny(val, "[]"):
		return "[" + val + "]"
	}
	// Last resort for pathological input: drop the embedded double
	// quotes so the result is still a well-formed literal.
	stripped := strings.Join(strings.Split(val, "\""), "")
	return "\"" + stripped + "\""
}
// applyResult expands the rule's result template with the captured
// marker values and returns the resulting line.
//
// Steps:
//  1. Optional-repeat `[ ... ]` sections are expanded first.
//  2. For every single-valued capture the wrapper forms are replaced
//     before the bare form — `#<z>`, `<"z">`, `<(z)>`, `<.z.>` all
//     contain `<z>`, so substituting `<z>` first would leave a stray
//     `#`, `(`, `.` or `"` behind.
//  3. Any marker reference still present is removed.
//
// Captures holding a \x01 separator (multi-captures) are skipped in
// step 2; leftovers of those are removed in step 3.
func (r *Rule) applyResult(captures map[string]string) string {
	out := expandOptionalRepeat(r.ResultTmpl, captures)

	for name, val := range captures {
		if strings.IndexByte(val, '\x01') >= 0 {
			// Joined multi-capture: substituting it verbatim would
			// inject the separator bytes into the output.
			continue
		}

		literal := ppQuote(val)

		// #<name> — dumb stringify, and <"name"> — explicit stringify.
		out = strings.ReplaceAll(out, "#<"+name+">", literal)
		out = strings.ReplaceAll(out, "<\""+name+"\">", literal)

		// <(name)> — smart stringify: a capture that is already a
		// string literal is kept as written, anything else is quoted.
		smart := literal
		if t := strings.TrimSpace(val); len(t) >= 2 {
			first, last := t[0], t[len(t)-1]
			switch {
			case first == '"' && last == '"',
				first == '\'' && last == '\'',
				first == '[' && last == ']':
				smart = t
			}
		}
		out = strings.ReplaceAll(out, "<("+name+")>", smart)

		// <.name.> — logify: .T. for a non-empty capture, else .F.
		logical := ".F."
		if val != "" {
			logical = ".T."
		}
		out = strings.ReplaceAll(out, "<."+name+".>", logical)

		// <name> — bare substitution, deliberately last.
		out = strings.ReplaceAll(out, "<"+name+">", val)
	}

	// Drop whatever marker references were never filled in.
	return cleanUnreferencedMarkers(out)
}
// expandOptionalRepeat copies template to the output, replacing every
// top-level `[ ... ]` block with the result of expandBracketBody on the
// text between the brackets.
//
// The closing bracket is found by depth counting. `[` and `]`
// characters for which inMarker reports true are not counted. Text of
// nested bracket pairs is passed through as part of the outer body;
// nested blocks are not expanded separately. A `[` with no matching
// `]` is copied literally.
func expandOptionalRepeat(template string, captures map[string]string) string {
	// closeOf returns the index of the `]` that balances the `[` at
	// open, or -1 when the bracket is never closed.
	closeOf := func(open int) int {
		level := 1
		for k := open + 1; k < len(template); k++ {
			c := template[k]
			if c != '[' && c != ']' {
				continue
			}
			if inMarker(template, k) {
				// Bracket characters inside a marker are plain text.
				continue
			}
			if c == '[' {
				level++
				continue
			}
			level--
			if level == 0 {
				return k
			}
		}
		return -1
	}

	var sb strings.Builder
	for pos := 0; pos < len(template); {
		if template[pos] == '[' {
			if end := closeOf(pos); end >= 0 {
				sb.WriteString(expandBracketBody(template[pos+1:end], captures))
				pos = end + 1
				continue
			}
			// Unmatched `[` falls through and is emitted as-is.
		}
		sb.WriteByte(template[pos])
		pos++
	}
	return sb.String()
}
// inMarker reports whether position p of s lies inside an unclosed
// `<...` span: the nearest angle bracket before p is a `<`, and the
// character at p itself is not the closing `>`. Callers use it to treat
// `[` / `]` that appear inside a marker reference as ordinary text.
//
// Any `<` counts, including a comparison operator, so `a < b[1]`
// reports true for the `[`.
func inMarker(s string, p int) bool {
	// Walk left to the nearest angle bracket before p.
	k := p - 1
	for k >= 0 && s[k] != '<' && s[k] != '>' {
		k--
	}
	if k < 0 || s[k] == '>' {
		// No bracket at all, or the last one already closed a marker.
		return false
	}
	// s[k] is an opening `<` with no `>` between it and p; only s[p]
	// itself could still be the terminator.
	return p >= len(s) || s[p] != '>'
}
// expandBracketBody expands the text found between one pair of
// top-level `[` `]` in a result template.
//
// captures maps marker names to captured text; a value containing the
// \x01 separator holds several iterations of a repeating clause.
//
//   - If at least one marker referenced in body is multi-captured, the
//     body is emitted once per iteration (the largest iteration count
//     wins). Multi-captured markers receive their i-th value, or the
//     empty string when they have fewer values. Single-valued markers
//     receive their one value in every iteration.
//   - Otherwise, if any referenced marker has a capture, body is
//     returned unchanged so the caller substitutes it once.
//   - Otherwise the block is dropped and "" is returned.
//
// Inside an iteration the wrapper forms are replaced before the bare
// `<name>` form. `<(name)>` uses smart stringify: a value that is
// already a string literal is kept as written. Markers are processed in
// order of first appearance, so the output is deterministic.
func expandBracketBody(body string, captures map[string]string) string {
	// Find marker names referenced inside the body.
	refs := referencedMarkers(body)

	// iters stays 0 when no referenced marker is multi-captured.
	iters := 0
	for _, name := range refs {
		if val, ok := captures[name]; ok && strings.ContainsRune(val, '\x01') {
			if n := strings.Count(val, "\x01") + 1; n > iters {
				iters = n
			}
		}
	}
	if iters == 0 {
		// No multi-capture — include body once if any referenced marker
		// has a (single) capture; otherwise drop.
		for _, name := range refs {
			if _, ok := captures[name]; ok {
				return body
			}
		}
		return ""
	}

	// Pre-split every captured referent into its per-iteration values.
	parts := make(map[string][]string, len(refs))
	for _, name := range refs {
		if val, ok := captures[name]; ok {
			parts[name] = strings.Split(val, "\x01")
		}
	}

	var out strings.Builder
	for iter := 0; iter < iters; iter++ {
		piece := body
		for _, name := range refs {
			vals, ok := parts[name]
			if !ok {
				continue
			}
			v := ""
			switch {
			case len(vals) == 1:
				// Single-valued marker: same value in every iteration.
				v = vals[0]
			case iter < len(vals):
				v = vals[iter]
			}

			quoted := ppQuote(v)
			piece = strings.ReplaceAll(piece, "#<"+name+">", quoted)
			piece = strings.ReplaceAll(piece, `<"`+name+`">`, quoted)

			// Smart stringify: keep an existing string literal as-is.
			smart := quoted
			if t := strings.TrimSpace(v); len(t) >= 2 &&
				((t[0] == '"' && t[len(t)-1] == '"') ||
					(t[0] == '\'' && t[len(t)-1] == '\'') ||
					(t[0] == '[' && t[len(t)-1] == ']')) {
				smart = t
			}
			piece = strings.ReplaceAll(piece, "<("+name+")>", smart)

			if v != "" {
				piece = strings.ReplaceAll(piece, "<."+name+".>", ".T.")
			} else {
				piece = strings.ReplaceAll(piece, "<."+name+".>", ".F.")
			}
			// Bare form last, after all wrapper forms.
			piece = strings.ReplaceAll(piece, "<"+name+">", v)
		}
		out.WriteString(piece)
	}
	return out.String()
}
// referencedMarkers lists the marker names referenced in a template
// fragment, each once, in order of first appearance.
//
// After every `<` it skips any run of `(`, `.` or `"` and then reads
// the following run of ASCII letters, digits and underscores as the
// name. This covers `<name>`, `<(name)>`, `<.name.>`, `<"name">` and
// `#<name>`. A `<` that is not followed by such a run yields nothing.
func referencedMarkers(s string) []string {
	isWrapper := func(c byte) bool {
		return c == '(' || c == '.' || c == '"'
	}
	isNameChar := func(c byte) bool {
		switch {
		case c == '_':
			return true
		case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z', '0' <= c && c <= '9':
			return true
		}
		return false
	}

	var names []string
	known := make(map[string]struct{})
	for pos := 0; pos < len(s); {
		if s[pos] != '<' {
			pos++
			continue
		}
		pos++ // step over `<`
		for pos < len(s) && isWrapper(s[pos]) {
			pos++
		}
		begin := pos
		for pos < len(s) && isNameChar(s[pos]) {
			pos++
		}
		if pos == begin {
			continue
		}
		name := s[begin:pos]
		if _, dup := known[name]; !dup {
			known[name] = struct{}{}
			names = append(names, name)
		}
	}
	return names
}
// cleanUnreferencedMarkers strips every marker reference that is still
// present in s — `<name>`, `<(name)>`, `<.name.>`, `<"name">` and
// `#<name>` — and returns the remaining text. Only spans accepted by
// findMarkerEnd are removed, so a `<` that does not start a well-formed
// marker (for example a comparison operator) is kept.
func cleanUnreferencedMarkers(s string) string {
	var kept strings.Builder
	for pos := 0; pos < len(s); {
		end := 0
		switch {
		case s[pos] == '#' && pos+1 < len(s) && s[pos+1] == '<':
			// #<name>: the `#` disappears together with the marker.
			end = findMarkerEnd(s, pos+1)
		case s[pos] == '<':
			end = findMarkerEnd(s, pos)
		}
		if end > 0 {
			pos = end
			continue
		}
		kept.WriteByte(s[pos])
		pos++
	}
	return kept.String()
}
// findMarkerEnd reports where the PP marker starting at s[start] ends.
// It returns the index just past the closing `>`, or 0 when s[start]
// does not begin a well-formed marker.
//
// Accepted shape: `<`, at most one of `(` `.` `"`, an identifier
// (ASCII letter or underscore, then letters, digits, underscores), any
// run of `)` `.` `"` `,` or space, and finally `>`.
func findMarkerEnd(s string, start int) int {
	n := len(s)
	if start >= n || s[start] != '<' {
		return 0
	}

	isAlpha := func(c byte) bool {
		return c == '_' || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
	}
	isTail := func(c byte) bool {
		switch c {
		case ')', '.', '"', ',', ' ':
			return true
		}
		return false
	}

	pos := start + 1
	// Optional single wrapper character in front of the name.
	if pos < n {
		switch s[pos] {
		case '(', '.', '"':
			pos++
		}
	}
	// The name has to open with a letter or underscore.
	if pos >= n || !isAlpha(s[pos]) {
		return 0
	}
	for pos < n && (isAlpha(s[pos]) || ('0' <= s[pos] && s[pos] <= '9')) {
		pos++
	}
	// Closing wrapper characters and the `,...` list suffix.
	for pos < n && isTail(s[pos]) {
		pos++
	}
	if pos < n && s[pos] == '>' {
		return pos + 1
	}
	return 0
}
// --- Helpers ---
// firstToken returns the part of s that precedes the first space, tab
// or `(`. When s contains none of them, all of s is returned.
func firstToken(s string) string {
	for i := 0; i < len(s); i++ {
		switch s[i] {
		case ' ', '\t', '(':
			return s[:i]
		}
	}
	return s
}
// matchWord reports whether a line token equals a pattern token. The
// comparison is exact when caseSens is true and uses Unicode case
// folding (strings.EqualFold) otherwise.
func matchWord(lineWord, patternWord string, caseSens bool) bool {
	if !caseSens {
		return strings.EqualFold(lineWord, patternWord)
	}
	return lineWord == patternWord
}
// tokenizePattern splits a pattern into tokens.
//
//   - A `<...>` span (from `<` to the next `>`) is kept as one marker
//     token, including any spaces or commas inside it.
//   - `[`, `]`, `(`, `)` and `,` are emitted as single-character tokens,
//     so `DUMB(<z>)` and `DUMB( <z> )` tokenise identically — matching
//     what tokenizeLine does on call sites.
//   - Everything else is split into words at spaces, tabs, `<` and the
//     punctuation above.
//
// A `<` that has no closing `>` anywhere after it (for example the
// operator in `a <= b`) is not a marker: it starts an ordinary word.
// The first character of a word is always consumed, which guarantees
// the scan makes progress on such input instead of looping forever.
func tokenizePattern(pattern string) []string {
	isBreak := func(c byte) bool {
		switch c {
		case ' ', '\t', '<', '[', ']', '(', ')', ',':
			return true
		}
		return false
	}

	var tokens []string
	i := 0
	for i < len(pattern) {
		c := pattern[i]
		if c == ' ' || c == '\t' {
			i++
			continue
		}
		if c == '<' {
			if end := strings.IndexByte(pattern[i:], '>'); end >= 0 {
				tokens = append(tokens, pattern[i:i+end+1])
				i += end + 1
				continue
			}
			// No closing `>`: fall through and treat `<` as word text.
		}
		switch c {
		case '[', ']', '(', ')', ',':
			tokens = append(tokens, string(c))
			i++
			continue
		}
		// Regular word. The current character is neither whitespace nor
		// standalone punctuation, so it always belongs to the word.
		start := i
		i++
		for i < len(pattern) && !isBreak(pattern[i]) {
			i++
		}
		tokens = append(tokens, pattern[start:i])
	}
	return tokens
}
// tokenizeLine splits a source line into tokens using the same rules as
// the pattern tokenizer, so both sides line up during matching:
//
//   - spaces and tabs separate tokens and are discarded;
//   - a string literal opened by `"` or `'` is one token, running to
//     the matching quote or, if unterminated, to the end of the line;
//   - `,`, `(`, `)`, `[` and `]` are single-character tokens;
//   - any other run of characters up to whitespace, one of the
//     punctuation characters above, or a quote is a word.
func tokenizeLine(line string) []string {
	n := len(line)
	endsWord := func(c byte) bool {
		switch c {
		case ' ', '\t', ',', '(', ')', '[', ']', '"', '\'':
			return true
		}
		return false
	}

	var words []string
	for pos := 0; pos < n; {
		ch := line[pos]
		switch ch {
		case ' ', '\t':
			pos++
		case '"', '\'':
			// String literal: scan to the matching quote, inclusive.
			end := pos + 1
			for end < n && line[end] != ch {
				end++
			}
			if end < n {
				end++
			}
			words = append(words, line[pos:end])
			pos = end
		case ',', '(', ')', '[', ']':
			words = append(words, line[pos:pos+1])
			pos++
		default:
			end := pos + 1
			for end < n && !endsWord(line[end]) {
				end++
			}
			words = append(words, line[pos:end])
			pos = end
		}
	}
	return words
}
// captureExpression captures the text for one regular/restricted marker
// from lineWords, starting at *li, and advances *li past what it
// consumed. Captured tokens are joined with single spaces.
//
// The delimiter is the first token in patternWords[nextPi:] that is not
// a bracket and does not start with `<`:
//   - with a delimiter, tokens are taken until it appears at nesting
//     depth zero (depth tracks `(`/`[`/`{` against `)`/`]`/`}`), so an
//     inner `)` cannot end the capture early; the delimiter itself is
//     left unconsumed;
//   - with no delimiter and nothing left in the pattern, the rest of
//     the line is taken;
//   - with no delimiter but more pattern tokens (markers/brackets
//     only), exactly one token is taken.
//
// It returns "" without moving *li when the line is already exhausted.
func captureExpression(lineWords []string, li *int, patternWords []string, nextPi int, caseSens bool) string {
	begin := *li
	if begin >= len(lineWords) {
		return ""
	}

	// Locate the next literal in the pattern to act as delimiter.
	var stop string
	for k := nextPi; k < len(patternWords); k++ {
		tok := patternWords[k]
		if tok == "[" || tok == "]" || strings.HasPrefix(tok, "<") {
			continue
		}
		stop = tok
		break
	}

	if stop == "" {
		if nextPi < len(patternWords) {
			// Only markers/brackets follow: take a single token.
			*li = begin + 1
			return lineWords[begin]
		}
		// Last marker in the pattern: take everything that is left.
		*li = len(lineWords)
		return strings.Join(lineWords[begin:], " ")
	}

	// Take tokens up to the delimiter, ignoring delimiters that sit
	// inside nested parens/brackets/braces.
	nesting := 0
	end := begin
	for ; end < len(lineWords); end++ {
		w := lineWords[end]
		if nesting == 0 && matchWord(w, stop, caseSens) {
			break
		}
		switch w {
		case "(", "[", "{":
			nesting++
		case ")", "]", "}":
			if nesting > 0 {
				nesting--
			}
		}
	}
	*li = end
	return strings.Join(lineWords[begin:end], " ")
}