Three medium-priority audit items in one commit, each independently
revertible.
* **#18 JOIN hash-join fast path.** New std.ch shape:
JOIN WITH <alias> TO <file> [FIELDS ...] ON <mfield> = <dfield>
expands to a 6-arg __dbJoin call with the master/detail key
field names. Runtime detects the extra args, builds an O(M)
hash over the detail's key column, then probes per master row
for O(N+M) total — vs the FOR form's O(N*M). For 1k×1k that's
2k vs 1M operations; the gap widens with N. The original FOR
form is unchanged and stays the fallback for arbitrary
predicates. New helper dbHashKey type-tags the key string so
`1` (numeric), `"1"` (string), and `.T.` (logical) don't
collide in the bucket map.
* **#38 PP rule result-marker validation.** ParseRule now walks
the result template after parseMarkers and warns about every
`<name>` (or `<(name)>` / `<.name.>` / `<{name}>` / `#<name>`
/ `<"name">`) that doesn't match a pattern marker. Warnings
flow into pp.errors via handleDirective with the directive's
filename:line, so a typo'd `<NaMe>` in an `#xcommand`
case-sensitive rule fails the build with a clear diagnostic
instead of silently producing broken expansions.
* **#44 looksLikeInlineC heuristic strengthened.** Catches more
of the common Harbour-PRG-with-C-inline-block shapes that
used to fall through and produce cryptic Go-side errors:
function-like #define, `extern "C"` linkage blocks, C return-
type declarations (`int foo(`, `static char* bar(`), and the
hb_ret*() helper family used by Harbour's C FFI return
setters. Two small predicate helpers (allLetters,
allIdentChars) keep the C-vs-Go disambiguation tight enough
that legit Go code (`func name() int { ... }`) doesn't trip.
* **#28 LIST/DISPLAY pagination** — explicitly deferred. Proper
pagination requires interactive terminal handling (Inkey(0)
for the keypress) which would hang in CI / batch mode. Will
revisit when an interactive terminal layer needs it for
other reasons.
Test fixtures: tests/std_ch/test_join_hash.prg verifies the new
ON-form path produces the same output as the FOR form would.
std.ch runner now stands at 16/16.
Other gates green:
go test ./... : PASS
FiveSql2 SQL:1999 : 43/43
Harbour compat : 56/56
std.ch suite : 16/16
FRB suite : 7/7
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1425 lines
41 KiB
Go
1425 lines
41 KiB
Go
// Copyright (c) 2026 Charles KWON OhJun (charleskwonohjun@gmail.com)
|
|
// All rights reserved.
|
|
|
|
// #command / #translate implementation for Five preprocessor.
|
|
//
|
|
// Harbour PP syntax:
|
|
// #command PATTERN => RESULT
|
|
// #translate PATTERN => RESULT
|
|
// #xcommand PATTERN => RESULT (case-sensitive)
|
|
// #xtranslate PATTERN => RESULT (case-sensitive)
|
|
//
|
|
// Pattern markers:
|
|
// <x> — match any expression (regular match)
|
|
// <!x!> — match single identifier only (restricted match)
|
|
// <x,...> — match comma-separated list
|
|
// <*x*> — match rest of line (wild match)
|
|
// <x:a,b,c> — match one of listed words (list match)
|
|
// [...] — optional clause
|
|
//
|
|
// Result markers:
|
|
// <x> — substitute matched text
|
|
// <(x)> — stringify (wrap in quotes)
|
|
// <{x}> — blockify (wrap in {|| })
|
|
// #<x> — dumb stringify
|
|
// <.x.> — logify (.T. if matched, .F. if not)
|
|
//
|
|
// Reference: /mnt/d/harbour-core/src/pp/ppcore.c
|
|
package pp
|
|
|
|
import (
|
|
"fmt"
|
|
"strings"
|
|
)
|
|
|
|
// Rule represents a single #command or #translate rule.
|
|
type Rule struct {
|
|
Pattern string // raw pattern text
|
|
Result string // raw result text
|
|
IsCommand bool // #command vs #translate
|
|
CaseSens bool // #xcommand/#xtranslate = case sensitive
|
|
Keyword string // first keyword (for fast matching)
|
|
Markers []Marker // parsed pattern markers
|
|
ResultTmpl string // result template with marker references
|
|
|
|
// Warnings collected during ParseRule. Currently only one source:
|
|
// result-template markers that reference a name absent from the
|
|
// pattern. Caller can surface these to the user — a typo'd
|
|
// `<For>` instead of `<for>` used to silently produce broken
|
|
// expansion output.
|
|
Warnings []string
|
|
}
|
|
|
|
// Marker represents a pattern marker like <x>, <!x!>, <x,...>, <*x*>.
|
|
type Marker struct {
|
|
Name string // marker name
|
|
Type MarkerType
|
|
ListValues []string // for <x:a,b,c> — allowed values
|
|
}
|
|
|
|
// MarkerType enumerates the kinds of pattern marker the matcher knows.
type MarkerType int

const (
	// MarkerRegular is <x>: any expression.
	MarkerRegular MarkerType = iota
	// MarkerRestricted is <!x!>: a single identifier only.
	MarkerRestricted
	// MarkerList is <x,...>: a comma-separated list.
	MarkerList
	// MarkerWild is <*x*>: the rest of the line.
	MarkerWild
	// MarkerWordList is <x:a,b,c>: exactly one of the listed words.
	MarkerWordList
)
|
|
|
|
// ParseRule parses a #command/#translate directive into a Rule.
|
|
func ParseRule(directive string, isCommand, caseSens bool) *Rule {
|
|
// Split on =>
|
|
parts := strings.SplitN(directive, "=>", 2)
|
|
if len(parts) != 2 {
|
|
return nil
|
|
}
|
|
|
|
pattern := strings.TrimSpace(parts[0])
|
|
result := strings.TrimSpace(parts[1])
|
|
|
|
// Earlier versions stripped every ` ;` as Harbour line-continuation.
|
|
// That also destroyed in-line PRG statement separators — `IF x ==
|
|
// NIL ; x := y ; ENDIF` lost all its semicolons. Line-continuation
|
|
// joining is the preprocessor's job (processLines), not this rule
|
|
// parser's. Keep the semicolons as-is.
|
|
|
|
rule := &Rule{
|
|
Pattern: pattern,
|
|
Result: result,
|
|
IsCommand: isCommand,
|
|
CaseSens: caseSens,
|
|
ResultTmpl: result,
|
|
}
|
|
|
|
// Extract first keyword for fast matching. The first whitespace-
|
|
// delimited token of the pattern becomes the dispatch key; we
|
|
// strip marker wrappers and any trailing `(` so a pattern like
|
|
// `MAKE_TEST( <obj>, <v> )` hashes on `MAKE_TEST`, matching how
|
|
// firstToken normalises source lines.
|
|
words := strings.Fields(pattern)
|
|
if len(words) > 0 {
|
|
kw := words[0]
|
|
kw = strings.TrimLeft(kw, "<[")
|
|
kw = strings.TrimRight(kw, ">]")
|
|
if idx := strings.IndexByte(kw, '('); idx >= 0 {
|
|
kw = kw[:idx]
|
|
}
|
|
if !strings.ContainsAny(kw, "!*,:") {
|
|
rule.Keyword = kw
|
|
}
|
|
}
|
|
|
|
// Parse markers from pattern
|
|
rule.Markers = parseMarkers(pattern)
|
|
|
|
// Validate result-template marker references. Each `<name>`
|
|
// (and its smart-stringify / blockify / logify / dumb-stringify
|
|
// variants) must reference a name declared in the pattern.
|
|
// Catches typos like `<For>` vs `<for>` (case-sensitive
|
|
// xcommand) before they silently produce broken output at
|
|
// expansion time.
|
|
rule.Warnings = validateResultMarkers(pattern, result, rule.Markers, caseSens)
|
|
|
|
return rule
|
|
}
|
|
|
|
// validateResultMarkers scans the result template for marker
|
|
// references and reports any name not declared in the pattern.
|
|
// Result returned as a slice of human-readable warning strings —
|
|
// caller decides whether to surface or ignore.
|
|
func validateResultMarkers(pattern, result string, markers []Marker, caseSens bool) []string {
|
|
declared := make(map[string]bool, len(markers))
|
|
for _, m := range markers {
|
|
key := m.Name
|
|
if !caseSens {
|
|
key = strings.ToUpper(key)
|
|
}
|
|
declared[key] = true
|
|
}
|
|
if len(declared) == 0 {
|
|
// Nothing to validate against — rule is keyword-only.
|
|
return nil
|
|
}
|
|
|
|
var warnings []string
|
|
seen := map[string]bool{}
|
|
i := 0
|
|
for i < len(result) {
|
|
// Marker shapes recognised here mirror applyResult's loop:
|
|
// <name>, <(name)>, <.name.>, <{name}>, <"name">, #<name>.
|
|
// findMarkerEnd already understands all of them — we just
|
|
// need the inner identifier.
|
|
if result[i] != '<' && !(result[i] == '#' && i+1 < len(result) && result[i+1] == '<') {
|
|
i++
|
|
continue
|
|
}
|
|
start := i
|
|
if result[i] == '#' {
|
|
start = i + 1
|
|
}
|
|
end := findMarkerEnd(result, start)
|
|
if end == 0 {
|
|
i++
|
|
continue
|
|
}
|
|
// Extract identifier between the wrappers.
|
|
inner := result[start+1 : end-1]
|
|
// Strip prefix `(`, `.`, `"`, `{`
|
|
for len(inner) > 0 && (inner[0] == '(' || inner[0] == '.' || inner[0] == '"' || inner[0] == '{') {
|
|
inner = inner[1:]
|
|
}
|
|
// Strip suffix `)`, `.`, `"`, `}`
|
|
for len(inner) > 0 {
|
|
c := inner[len(inner)-1]
|
|
if c == ')' || c == '.' || c == '"' || c == '}' || c == ' ' {
|
|
inner = inner[:len(inner)-1]
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
key := inner
|
|
if !caseSens {
|
|
key = strings.ToUpper(key)
|
|
}
|
|
if key != "" && !declared[key] && !seen[key] {
|
|
seen[key] = true
|
|
warnings = append(warnings,
|
|
fmt.Sprintf("result-template marker <%s> not declared in pattern: %q",
|
|
inner, pattern))
|
|
}
|
|
i = end
|
|
}
|
|
return warnings
|
|
}
|
|
|
|
// parseMarkers extracts all <...> markers from a pattern.
|
|
func parseMarkers(pattern string) []Marker {
|
|
var markers []Marker
|
|
i := 0
|
|
for i < len(pattern) {
|
|
if pattern[i] == '<' {
|
|
end := strings.IndexByte(pattern[i:], '>')
|
|
if end < 0 {
|
|
break
|
|
}
|
|
inner := pattern[i+1 : i+end]
|
|
m := parseOneMarker(inner)
|
|
if m.Name != "" {
|
|
markers = append(markers, m)
|
|
}
|
|
i += end + 1
|
|
} else {
|
|
i++
|
|
}
|
|
}
|
|
return markers
|
|
}
|
|
|
|
func parseOneMarker(inner string) Marker {
|
|
inner = strings.TrimSpace(inner)
|
|
|
|
// <!name!> — restricted
|
|
if strings.HasPrefix(inner, "!") && strings.HasSuffix(inner, "!") {
|
|
return Marker{Name: inner[1 : len(inner)-1], Type: MarkerRestricted}
|
|
}
|
|
|
|
// <*name*> — wild
|
|
if strings.HasPrefix(inner, "*") && strings.HasSuffix(inner, "*") {
|
|
return Marker{Name: inner[1 : len(inner)-1], Type: MarkerWild}
|
|
}
|
|
|
|
// <name,...> — comma list
|
|
if strings.HasSuffix(inner, ",...") {
|
|
return Marker{Name: inner[:len(inner)-4], Type: MarkerList}
|
|
}
|
|
|
|
// <name:a,b,c> — word list
|
|
if idx := strings.IndexByte(inner, ':'); idx > 0 {
|
|
name := inner[:idx]
|
|
vals := strings.Split(inner[idx+1:], ",")
|
|
for i := range vals {
|
|
vals[i] = strings.TrimSpace(vals[i])
|
|
}
|
|
return Marker{Name: name, Type: MarkerWordList, ListValues: vals}
|
|
}
|
|
|
|
// <(name)> — extended-expression marker. In Harbour PP this captures
|
|
// a file-name-like extended expression and the matching result token
|
|
// `<(name)>` smart-stringifies it (already-quoted → keep, identifier
|
|
// → quote). Strip the parens so captures are stored under the bare
|
|
// name; result substitution then matches both `<(name)>` and `<name>`
|
|
// via the existing path.
|
|
if strings.HasPrefix(inner, "(") && strings.HasSuffix(inner, ")") {
|
|
return Marker{Name: inner[1 : len(inner)-1], Type: MarkerRegular}
|
|
}
|
|
|
|
// <name> — regular
|
|
return Marker{Name: inner, Type: MarkerRegular}
|
|
}
|
|
|
|
// --- Rule matching and application ---
|
|
|
|
// MatchLine checks if a source line matches this rule and returns the substituted result.
|
|
// Returns ("", false) if no match.
|
|
func (r *Rule) MatchLine(line string) (string, bool) {
|
|
trimmed := strings.TrimSpace(line)
|
|
if trimmed == "" {
|
|
return "", false
|
|
}
|
|
|
|
// Fast keyword check
|
|
if r.Keyword != "" {
|
|
firstWord := firstToken(trimmed)
|
|
if r.CaseSens {
|
|
if firstWord != r.Keyword {
|
|
return "", false
|
|
}
|
|
} else {
|
|
if !strings.EqualFold(firstWord, r.Keyword) {
|
|
return "", false
|
|
}
|
|
}
|
|
}
|
|
|
|
// Try to match pattern against line
|
|
captures := r.matchPattern(trimmed)
|
|
if captures == nil {
|
|
return "", false
|
|
}
|
|
|
|
// Apply result template
|
|
result := r.applyResult(captures)
|
|
return result, true
|
|
}
|
|
|
|
// matchPattern attempts to match the pattern against a line.
|
|
// Returns captured values map, or nil if no match.
|
|
func (r *Rule) matchPattern(line string) map[string]string {
|
|
captures := make(map[string]string)
|
|
|
|
patternWords := tokenizePattern(r.Pattern)
|
|
lineWords := tokenizeLine(line)
|
|
|
|
pi, li := 0, 0
|
|
for pi < len(patternWords) && li < len(lineWords) {
|
|
pw := patternWords[pi]
|
|
|
|
// Marker?
|
|
if strings.HasPrefix(pw, "<") && strings.HasSuffix(pw, ">") {
|
|
inner := pw[1 : len(pw)-1]
|
|
m := parseOneMarker(inner)
|
|
|
|
switch m.Type {
|
|
case MarkerWild:
|
|
// Capture rest of line
|
|
rest := strings.Join(lineWords[li:], " ")
|
|
captures[m.Name] = rest
|
|
li = len(lineWords)
|
|
pi++
|
|
|
|
case MarkerList:
|
|
// Capture a comma-separated list until the next literal
|
|
// pattern token. Paren-balanced so nested `(`/`[`/`{`
|
|
// don't let an inner `)` terminate the capture. Commas
|
|
// at the top level are preserved verbatim in the
|
|
// captured string so the `<z>` substitution in the
|
|
// result template reproduces the argument list as-is.
|
|
var parts []string
|
|
depth := 0
|
|
delim := ""
|
|
if pi+1 < len(patternWords) {
|
|
delim = patternWords[pi+1]
|
|
}
|
|
for li < len(lineWords) {
|
|
w := lineWords[li]
|
|
if depth == 0 && delim != "" && matchWord(w, delim, r.CaseSens) {
|
|
break
|
|
}
|
|
switch w {
|
|
case "(", "[", "{":
|
|
depth++
|
|
case ")", "]", "}":
|
|
if depth > 0 {
|
|
depth--
|
|
}
|
|
}
|
|
parts = append(parts, w)
|
|
li++
|
|
}
|
|
captures[m.Name] = strings.Join(parts, " ")
|
|
pi++
|
|
|
|
case MarkerWordList:
|
|
// Match one of listed words
|
|
matched := false
|
|
for _, allowed := range m.ListValues {
|
|
if r.CaseSens {
|
|
if lineWords[li] == allowed {
|
|
matched = true
|
|
break
|
|
}
|
|
} else if strings.EqualFold(lineWords[li], allowed) {
|
|
matched = true
|
|
break
|
|
}
|
|
}
|
|
if !matched {
|
|
return nil
|
|
}
|
|
captures[m.Name] = lineWords[li]
|
|
li++
|
|
pi++
|
|
|
|
default:
|
|
// Regular or restricted: capture one token or expression
|
|
captured := captureExpression(lineWords, &li, patternWords, pi+1, r.CaseSens)
|
|
captures[m.Name] = captured
|
|
pi++
|
|
}
|
|
} else if pw == "[" {
|
|
// Optional, possibly-repeating sub-pattern. Try matching the
|
|
// bracketed body repeatedly against the remaining line; each
|
|
// successful iteration appends its marker captures under the
|
|
// same name with a \x01 separator. Used by Harbour forms
|
|
// like `DEFAULT <v1> TO <x1> [, <vn> TO <xn> ]` where the
|
|
// trailing bracket repeats for each additional pair.
|
|
depth := 1
|
|
bodyStart := pi + 1
|
|
bodyEnd := bodyStart
|
|
for bodyEnd < len(patternWords) && depth > 0 {
|
|
if patternWords[bodyEnd] == "[" {
|
|
depth++
|
|
} else if patternWords[bodyEnd] == "]" {
|
|
depth--
|
|
if depth == 0 {
|
|
break
|
|
}
|
|
}
|
|
bodyEnd++
|
|
}
|
|
body := patternWords[bodyStart:bodyEnd]
|
|
// Outer-pattern tail (everything after the matching `]`) is
|
|
// needed so a regular marker at the end of `body` knows where
|
|
// to stop capturing. Without this, `[TO <v>] [FOR <for>]`
|
|
// against `TO n FOR age >= 30` would let `<v>` swallow the
|
|
// rest of the line because `body` itself has no literal that
|
|
// follows the marker.
|
|
outerTail := patternWords[bodyEnd+1:]
|
|
for li < len(lineWords) {
|
|
snapshotLi := li
|
|
iterCaps, newLi, ok := matchSegment(body, lineWords, li, r.CaseSens, outerTail)
|
|
if !ok {
|
|
li = snapshotLi
|
|
break
|
|
}
|
|
// No-progress matches can happen when the body is just
|
|
// a list/regular marker that immediately hits a stop
|
|
// boundary on this iteration — its captured value is
|
|
// empty. Don't merge those into captures, otherwise an
|
|
// earlier successful iteration's value gets contaminated
|
|
// with the `\x01`-separator form and the result-template
|
|
// substitution skips it as multi-capture garbage.
|
|
if newLi == snapshotLi {
|
|
break
|
|
}
|
|
for k, v := range iterCaps {
|
|
if prev, hit := captures[k]; hit && prev != "" {
|
|
captures[k] = prev + "\x01" + v
|
|
} else {
|
|
captures[k] = v
|
|
}
|
|
}
|
|
li = newLi
|
|
}
|
|
pi = bodyEnd + 1 // past ]
|
|
} else if pw == "]" {
|
|
pi++
|
|
} else {
|
|
// Literal keyword — must match
|
|
if !matchWord(lineWords[li], pw, r.CaseSens) {
|
|
return nil
|
|
}
|
|
li++
|
|
pi++
|
|
}
|
|
}
|
|
|
|
// Walk any tail of the pattern that wasn't matched against the
|
|
// line. We accept it only if everything that remains is *optional*
|
|
// — i.e. a `[...]` block (which by definition can be absent) or
|
|
// markers/literals that are nested inside one. A bare `<a>` or a
|
|
// literal token outside of brackets is required, so encountering
|
|
// one means the pattern isn't satisfied: bare `CLOSE` must not
|
|
// match rule `CLOSE <a>`.
|
|
depth := 0
|
|
for pi < len(patternWords) {
|
|
pw := patternWords[pi]
|
|
switch {
|
|
case pw == "[":
|
|
depth++
|
|
case pw == "]":
|
|
if depth > 0 {
|
|
depth--
|
|
}
|
|
default:
|
|
if depth == 0 {
|
|
return nil
|
|
}
|
|
}
|
|
pi++
|
|
}
|
|
|
|
// For #command with no markers and no optional clauses:
|
|
// all line tokens must be consumed for a match
|
|
if r.IsCommand && li < len(lineWords) && len(r.Markers) == 0 &&
|
|
!strings.Contains(r.Pattern, "[") {
|
|
return nil
|
|
}
|
|
|
|
return captures
|
|
}
|
|
|
|
// matchSegment tries to match a bracketed sub-pattern against a slice
|
|
// of the line tokens starting at startLi. Returns per-iteration
|
|
// captures and the new line position on success. The segment cannot
|
|
// contain nested `[...]` — callers of the optional-repeat logic
|
|
// flatten one level at a time.
|
|
//
|
|
// A "mini-matcher" that mirrors the main loop for MarkerRegular,
|
|
// MarkerRestricted, and MarkerList plus literal keywords. MarkerWild
|
|
// inside `[...]` is rare and still defers to the main matcher.
|
|
func matchSegment(segment, lineWords []string, startLi int, caseSens bool, outerTail []string) (map[string]string, int, bool) {
|
|
caps := make(map[string]string)
|
|
li := startLi
|
|
|
|
// When the segment starts with a literal (e.g. `,` in
|
|
// `[, <vn> TO <xn>]`), treat that literal as the natural boundary
|
|
// between iterations. Used as the delimiter for a trailing marker
|
|
// that would otherwise gobble the rest of the line.
|
|
repeatBoundary := ""
|
|
if len(segment) > 0 && !strings.HasPrefix(segment[0], "<") &&
|
|
segment[0] != "[" && segment[0] != "]" {
|
|
repeatBoundary = segment[0]
|
|
}
|
|
|
|
for pi := 0; pi < len(segment); pi++ {
|
|
pw := segment[pi]
|
|
// Nested optional clause: find the matching `]`, run the
|
|
// repeat-loop on the inner body until no progress. Mirrors
|
|
// the main matchPattern's `[` branch. Doesn't require any
|
|
// remaining input — an absent optional just doesn't iterate.
|
|
if pw == "[" {
|
|
depth := 1
|
|
bodyStart := pi + 1
|
|
bodyEnd := bodyStart
|
|
for bodyEnd < len(segment) && depth > 0 {
|
|
if segment[bodyEnd] == "[" {
|
|
depth++
|
|
} else if segment[bodyEnd] == "]" {
|
|
depth--
|
|
if depth == 0 {
|
|
break
|
|
}
|
|
}
|
|
bodyEnd++
|
|
}
|
|
innerBody := segment[bodyStart:bodyEnd]
|
|
innerOuterTail := segment[bodyEnd+1:]
|
|
for li < len(lineWords) {
|
|
snapshotLi := li
|
|
iterCaps, newLi, ok := matchSegment(innerBody, lineWords, li, caseSens, innerOuterTail)
|
|
if !ok {
|
|
li = snapshotLi
|
|
break
|
|
}
|
|
if newLi == snapshotLi {
|
|
break
|
|
}
|
|
for k, v := range iterCaps {
|
|
if prev, hit := caps[k]; hit && prev != "" {
|
|
caps[k] = prev + "\x01" + v
|
|
} else {
|
|
caps[k] = v
|
|
}
|
|
}
|
|
li = newLi
|
|
}
|
|
pi = bodyEnd
|
|
continue
|
|
}
|
|
if pw == "]" {
|
|
// Stray closer — skip.
|
|
continue
|
|
}
|
|
if li >= len(lineWords) {
|
|
return nil, startLi, false
|
|
}
|
|
if strings.HasPrefix(pw, "<") && strings.HasSuffix(pw, ">") {
|
|
inner := pw[1 : len(pw)-1]
|
|
m := parseOneMarker(inner)
|
|
switch m.Type {
|
|
case MarkerWordList:
|
|
// Match one of the listed words. If the current line
|
|
// token isn't in the allowed set, the segment fails to
|
|
// match — same behavior as the top-level matcher.
|
|
w := lineWords[li]
|
|
matched := false
|
|
for _, allowed := range m.ListValues {
|
|
if caseSens {
|
|
if w == allowed {
|
|
matched = true
|
|
break
|
|
}
|
|
} else if strings.EqualFold(w, allowed) {
|
|
matched = true
|
|
break
|
|
}
|
|
}
|
|
if !matched {
|
|
return nil, startLi, false
|
|
}
|
|
caps[m.Name] = w
|
|
li++
|
|
continue
|
|
case MarkerList:
|
|
// Capture comma-separated tokens until we hit the
|
|
// segment's next literal, an outer literal, or one of
|
|
// the limited values of a following MarkerWordList
|
|
// (e.g. `<off:OFF>` — OFF is the only token that can
|
|
// match it, so the list before it must stop at OFF).
|
|
// Paren-balanced so `f(a,b)` inside the list doesn't
|
|
// terminate prematurely. Mirrors the main matchPattern's
|
|
// MarkerList branch.
|
|
stop := map[string]struct{}{}
|
|
addStopFrom(stop, segment[pi+1:])
|
|
addStopFrom(stop, outerTail)
|
|
var parts []string
|
|
depth := 0
|
|
for li < len(lineWords) {
|
|
w := lineWords[li]
|
|
if depth == 0 {
|
|
key := w
|
|
if !caseSens {
|
|
key = strings.ToUpper(w)
|
|
}
|
|
if _, hit := stop[key]; hit {
|
|
break
|
|
}
|
|
}
|
|
switch w {
|
|
case "(", "[", "{":
|
|
depth++
|
|
case ")", "]", "}":
|
|
if depth > 0 {
|
|
depth--
|
|
}
|
|
}
|
|
parts = append(parts, w)
|
|
li++
|
|
}
|
|
caps[m.Name] = strings.Join(parts, " ")
|
|
continue
|
|
case MarkerRegular, MarkerRestricted:
|
|
// fall through to capture-one-expression below
|
|
default:
|
|
return nil, startLi, false
|
|
}
|
|
// Build a pseudo-pattern tail so captureExpression picks
|
|
// the right delimiters. Priority order (each level is
|
|
// merged, then captureExpression stops at *whichever*
|
|
// delimiter shows up first in the input):
|
|
// 1. Next literals inside the same segment.
|
|
// 2. Every literal in the outer-pattern tail — what
|
|
// stops `[TO <(f)>] [FIELDS ...] [FOR ...]` from
|
|
// letting `<(f)>` swallow a trailing FOR/WHILE/...
|
|
// 3. Repeat boundary (the segment's leading literal)
|
|
// — needed for multi-iter `[, <xN>]` so each
|
|
// iteration's `<xN>` stops at the next ',' before
|
|
// the outer-tail's TO/FOR/etc. catches it.
|
|
tail := segment[pi+1:]
|
|
if !hasLiteralAfter(tail) {
|
|
combined := []string{}
|
|
if hasLiteralAfter(outerTail) {
|
|
combined = append(combined, outerTail...)
|
|
}
|
|
if repeatBoundary != "" {
|
|
combined = append(combined, repeatBoundary)
|
|
}
|
|
if len(combined) > 0 {
|
|
tail = combined
|
|
}
|
|
}
|
|
captured := captureExpression(lineWords, &li, tail, 0, caseSens)
|
|
caps[m.Name] = captured
|
|
continue
|
|
}
|
|
if !matchWord(lineWords[li], pw, caseSens) {
|
|
return nil, startLi, false
|
|
}
|
|
li++
|
|
}
|
|
return caps, li, true
|
|
}
|
|
|
|
// addStopFrom merges into `stop` every token that could legally match
|
|
// the next position in `pw`: bare literals AND each value of any
|
|
// MarkerWordList (`<name:A,B,C>`) since those markers can match only
|
|
// their listed words. Used so a preceding list/regular capture knows
|
|
// to stop before any of them. Always uppercased — the caller decides
|
|
// whether to do a case-insensitive lookup.
|
|
func addStopFrom(stop map[string]struct{}, pw []string) {
|
|
for _, w := range pw {
|
|
if w == "" || w == "[" || w == "]" {
|
|
continue
|
|
}
|
|
if strings.HasPrefix(w, "<") && strings.HasSuffix(w, ">") {
|
|
inner := w[1 : len(w)-1]
|
|
if m := parseOneMarker(inner); m.Type == MarkerWordList {
|
|
for _, v := range m.ListValues {
|
|
stop[strings.ToUpper(v)] = struct{}{}
|
|
}
|
|
}
|
|
continue
|
|
}
|
|
stop[strings.ToUpper(w)] = struct{}{}
|
|
}
|
|
}
|
|
|
|
// firstLiteral scans pw and returns the first token that is neither
// empty, a `[` / `]` bracket, nor a `<...>` marker. It returns "" when
// no such token exists. Intended to hand matchSegment a stop boundary
// taken from the outer pattern when a segment body ends in a regular
// marker.
func firstLiteral(pw []string) string {
	for _, tok := range pw {
		switch {
		case tok == "", tok == "[", tok == "]":
			// Structural or blank token.
		case strings.HasPrefix(tok, "<") && strings.HasSuffix(tok, ">"):
			// Marker.
		default:
			return tok
		}
	}
	return ""
}
|
|
|
|
// hasLiteralAfter reports whether segment holds at least one literal
// keyword token, i.e. a token that is not empty, not a `[` / `]`
// bracket and not a `<...>` marker. Callers use it to decide whether a
// marker's capture already has a real delimiter or needs a synthetic
// one.
func hasLiteralAfter(segment []string) bool {
	for i := range segment {
		tok := segment[i]
		structural := tok == "" || tok == "[" || tok == "]"
		marker := strings.HasPrefix(tok, "<") && strings.HasSuffix(tok, ">")
		if !structural && !marker {
			return true
		}
	}
	return false
}
|
|
|
|
// quoteListElements smart-stringifies a list-style capture: split val
|
|
// on top-level commas (paren / bracket / brace balanced) and emit each
|
|
// element quoted. Already-quoted elements are kept as-is so a literal
|
|
// like `"a", "b"` round-trips intact. Used by `<(name)>` substitution
|
|
// when `name` came from a `<name,...>` marker — Harbour's std.ch idiom
|
|
// for `{ <(fields)> }` to expand to `{ "a", "b", "c" }`.
|
|
func quoteListElements(val string) string {
|
|
parts := splitTopLevelCommas(val)
|
|
if len(parts) == 0 {
|
|
return ""
|
|
}
|
|
out := make([]string, 0, len(parts))
|
|
for _, p := range parts {
|
|
t := strings.TrimSpace(p)
|
|
if t == "" {
|
|
continue
|
|
}
|
|
// Already a string literal — keep verbatim.
|
|
if n := len(t); n >= 2 &&
|
|
((t[0] == '"' && t[n-1] == '"') ||
|
|
(t[0] == '\'' && t[n-1] == '\'') ||
|
|
(t[0] == '[' && t[n-1] == ']')) {
|
|
out = append(out, t)
|
|
continue
|
|
}
|
|
out = append(out, ppQuote(t))
|
|
}
|
|
return strings.Join(out, ", ")
|
|
}
|
|
|
|
// splitTopLevelCommas cuts s at every comma that sits outside (), [] and
// {} nesting and outside "..." / '...' string literals, so captured PRG
// expressions are not broken apart. The pieces are returned untrimmed
// and the result always has at least one element (s itself when there
// is no top-level comma). Unbalanced closers never drive the nesting
// count below zero.
func splitTopLevelCommas(s string) []string {
	var (
		pieces   []string
		nesting  int
		segStart int
		quote    byte // active string delimiter, 0 when outside a string
	)
	for pos := 0; pos < len(s); pos++ {
		ch := s[pos]
		if quote != 0 {
			// Inside a string: only its own delimiter ends it.
			if ch == quote {
				quote = 0
			}
			continue
		}
		switch {
		case ch == '"' || ch == '\'':
			quote = ch
		case strings.IndexByte("([{", ch) >= 0:
			nesting++
		case strings.IndexByte(")]}", ch) >= 0:
			if nesting > 0 {
				nesting--
			}
		case ch == ',' && nesting == 0:
			pieces = append(pieces, s[segStart:pos])
			segStart = pos + 1
		}
	}
	return append(pieces, s[segStart:])
}
|
|
|
|
// ppQuote turns a captured value into a PRG string literal, choosing a
// delimiter that does not clash with the text inside it. Stringify
// receives the raw source text of the argument and has to yield a legal
// PRG string: a capture of `"world"` cannot simply become `""world""`.
// Delimiters are tried in this order: double quotes, single quotes,
// then square brackets. When none of them is free the value is wrapped
// in double quotes with its embedded double quotes removed — a lossy
// last resort for pathological input.
func ppQuote(val string) string {
	switch {
	case strings.IndexByte(val, '"') < 0:
		return `"` + val + `"`
	case strings.IndexByte(val, '\'') < 0:
		return "'" + val + "'"
	case !strings.ContainsAny(val, "[]"):
		return "[" + val + "]"
	}
	return `"` + strings.ReplaceAll(val, `"`, "") + `"`
}
|
|
|
|
// applyResult substitutes captured values into the result template.
|
|
// Order matters — the compound forms (`#<z>`, `<(z)>`, `<.z.>`, `<"z">`)
|
|
// all contain the bare `<z>` token, so the bare substitution has to run
|
|
// LAST. Previously `<z>` was replaced first and left a stray `#` / `(` /
|
|
// `.` / `"` behind, producing bogus lines like `? #hello` that the
|
|
// lexer then choked on with ILLEGAL token errors.
|
|
func (r *Rule) applyResult(captures map[string]string) string {
|
|
result := r.ResultTmpl
|
|
|
|
// Expand optional-repeat `[ ... ]` segments in the template. If any
|
|
// marker inside a bracketed section was multi-captured during the
|
|
// pattern match (values joined with \x01), emit the body once per
|
|
// iteration with per-iter values. If no markers inside are multi-
|
|
// captured, the bracket body is included once with whatever single
|
|
// captures apply (the required-or-absent case).
|
|
result = expandOptionalRepeat(result, captures)
|
|
|
|
// Marker-name → list flag, so the smart-stringify branch below can
|
|
// emit per-element quoting (`{ "a", "b" }`) for list captures
|
|
// instead of treating the comma-joined string as one literal.
|
|
isList := make(map[string]bool, len(r.Markers))
|
|
for _, m := range r.Markers {
|
|
if m.Type == MarkerList {
|
|
isList[m.Name] = true
|
|
}
|
|
}
|
|
|
|
for name, val := range captures {
|
|
// Multi-capture markers are consumed by expandOptionalRepeat;
|
|
// the bare substitution for the joined form would produce
|
|
// garbage (values separated by \x01). Skip them here and let
|
|
// any remaining bare `<name>` fall through to the cleanup.
|
|
if strings.ContainsRune(val, '\x01') {
|
|
continue
|
|
}
|
|
quoted := ppQuote(val)
|
|
// #<name> — dumb stringify (always quote).
|
|
result = strings.ReplaceAll(result, "#<"+name+">", quoted)
|
|
// <"name"> — explicit stringify.
|
|
result = strings.ReplaceAll(result, `<"`+name+`">`, quoted)
|
|
// <(name)> — smart stringify: already a string literal → keep;
|
|
// list capture → quote each comma-separated element; otherwise
|
|
// quote whole. `val` comes straight from the capture, so trim
|
|
// and check for surrounding quotes.
|
|
trim := strings.TrimSpace(val)
|
|
smart := quoted
|
|
if n := len(trim); n >= 2 &&
|
|
((trim[0] == '"' && trim[n-1] == '"') ||
|
|
(trim[0] == '\'' && trim[n-1] == '\'') ||
|
|
(trim[0] == '[' && trim[n-1] == ']')) {
|
|
smart = trim
|
|
} else if isList[name] {
|
|
smart = quoteListElements(val)
|
|
}
|
|
result = strings.ReplaceAll(result, "<("+name+")>", smart)
|
|
// <.name.> — logify (empty → .F., else .T.)
|
|
if val != "" {
|
|
result = strings.ReplaceAll(result, "<."+name+".>", ".T.")
|
|
} else {
|
|
result = strings.ReplaceAll(result, "<."+name+".>", ".F.")
|
|
}
|
|
// <{name}> — blockify: wrap captured expression in {|| ... }.
|
|
// For list-typed markers (`<name,...>`) wrap *each* element so
|
|
// `{ <{v}> }` against `LIST id, name` expands to
|
|
// `{ {|| id }, {|| name } }`, matching Harbour's std.ch
|
|
// idiom for column blocks. Empty capture → NIL so the call
|
|
// site sees a nil block (missing FOR/WHILE clause).
|
|
if val == "" {
|
|
result = strings.ReplaceAll(result, "<{"+name+"}>", "NIL")
|
|
} else if isList[name] {
|
|
parts := splitTopLevelCommas(val)
|
|
out := make([]string, 0, len(parts))
|
|
for _, p := range parts {
|
|
t := strings.TrimSpace(p)
|
|
if t == "" {
|
|
continue
|
|
}
|
|
out = append(out, "{|| "+t+" }")
|
|
}
|
|
result = strings.ReplaceAll(result, "<{"+name+"}>", strings.Join(out, ", "))
|
|
} else {
|
|
result = strings.ReplaceAll(result, "<{"+name+"}>", "{|| "+val+" }")
|
|
}
|
|
// <name> — bare substitution (must be LAST, after all wrappers).
|
|
result = strings.ReplaceAll(result, "<"+name+">", val)
|
|
}
|
|
|
|
// Any `<{name}>` still in the template means `name` was never
|
|
// captured — emit NIL so call sites see a missing block argument
|
|
// (matches Harbour: empty FOR/WHILE → NIL → bypass the condition).
|
|
result = replaceUnreferencedBlockify(result)
|
|
|
|
// Same idea for `<.name.>`: a missing marker logifies to .F.,
|
|
// matching Harbour's behavior of "absent optional clause => .F."
|
|
// for OFF / ALL / REST / etc.
|
|
result = replaceUnreferencedLogify(result)
|
|
|
|
// Clean up unreferenced markers: <name>, <(name)>, <.name.>, #<name>, <"name">
|
|
result = cleanUnreferencedMarkers(result)
|
|
|
|
return result
|
|
}
|
|
|
|
// replaceUnreferencedLogify replaces every `<.ident.>` left in s with
// `.F.`, the value used for an absent optional clause. ident must start
// with a letter or underscore and continue with letters, digits or
// underscores; anything else is copied through untouched.
func replaceUnreferencedLogify(s string) string {
	identStart := func(c byte) bool {
		return c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
	}
	identPart := func(c byte) bool {
		return identStart(c) || (c >= '0' && c <= '9')
	}
	// markerEnd returns the index just past a `<.ident.>` that begins at
	// position at, or -1 when there is none there.
	markerEnd := func(at int) int {
		if at+2 >= len(s) || s[at] != '<' || s[at+1] != '.' {
			return -1
		}
		j := at + 2
		if !identStart(s[j]) {
			return -1
		}
		j++
		for j < len(s) && identPart(s[j]) {
			j++
		}
		if j+1 < len(s) && s[j] == '.' && s[j+1] == '>' {
			return j + 2
		}
		return -1
	}

	var b strings.Builder
	for i := 0; i < len(s); {
		if next := markerEnd(i); next >= 0 {
			b.WriteString(".F.")
			i = next
			continue
		}
		b.WriteByte(s[i])
		i++
	}
	return b.String()
}
|
|
|
|
// replaceUnreferencedBlockify replaces every leftover blockify marker
// `<{ident}>` in s with NIL. ident must start with an ASCII letter or
// underscore followed by ASCII letters, digits or underscores; text that
// does not have exactly that shape is copied through untouched. It is
// meant to run after the main substitution loop and before the generic
// unreferenced-marker cleanup.
func replaceUnreferencedBlockify(s string) string {
	const (
		opener = "<{"
		closer = "}>"
	)
	isLetter := func(c byte) bool {
		return c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
	}
	isDigit := func(c byte) bool { return c >= '0' && c <= '9' }

	var sb strings.Builder
	rest := s
	for len(rest) > 0 {
		if strings.HasPrefix(rest, opener) {
			n := len(opener)
			if n < len(rest) && isLetter(rest[n]) {
				n++
				for n < len(rest) && (isLetter(rest[n]) || isDigit(rest[n])) {
					n++
				}
				if strings.HasPrefix(rest[n:], closer) {
					sb.WriteString("NIL")
					rest = rest[n+len(closer):]
					continue
				}
			}
		}
		// Not a well-formed marker at this position: emit one byte and
		// retry from the next one.
		sb.WriteByte(rest[0])
		rest = rest[1:]
	}
	return sb.String()
}
|
|
|
|
// expandOptionalRepeat walks a result template and rewrites each top-
|
|
// level `[ ... ]` block by examining the captures referenced inside:
|
|
//
|
|
// - If any referenced marker has multiple captured iterations
|
|
// (values joined with \x01), emit the body N times, substituting
|
|
// the i-th iteration's value for each such marker and dropping
|
|
// single-valued markers into each iteration unchanged.
|
|
// - If no referenced marker is multi-captured BUT the single
|
|
// captures include non-empty values, emit the body once.
|
|
// - Otherwise drop the block.
|
|
//
|
|
// Nested brackets are not supported — Harbour uses a single level of
|
|
// `[...]` for the common repeat form. Callers that need deeper nesting
|
|
// can fall back to writing out separate #xcommand rules.
|
|
func expandOptionalRepeat(template string, captures map[string]string) string {
|
|
var out strings.Builder
|
|
i := 0
|
|
for i < len(template) {
|
|
if template[i] == '[' {
|
|
// Find matching top-level ']'. Skip over quoted strings
|
|
// and nested brackets inside PP markers like `<.x.>`.
|
|
depth := 1
|
|
j := i + 1
|
|
for j < len(template) && depth > 0 {
|
|
switch template[j] {
|
|
case '[':
|
|
// Inside a marker `<...>` the `[` is just text;
|
|
// only count top-level brackets.
|
|
if inMarker(template, j) {
|
|
j++
|
|
continue
|
|
}
|
|
depth++
|
|
case ']':
|
|
if inMarker(template, j) {
|
|
j++
|
|
continue
|
|
}
|
|
depth--
|
|
if depth == 0 {
|
|
body := template[i+1 : j]
|
|
out.WriteString(expandBracketBody(body, captures))
|
|
i = j + 1
|
|
goto next
|
|
}
|
|
}
|
|
j++
|
|
}
|
|
// Unmatched [ — copy literally.
|
|
out.WriteByte(template[i])
|
|
i++
|
|
next:
|
|
continue
|
|
}
|
|
out.WriteByte(template[i])
|
|
i++
|
|
}
|
|
return out.String()
|
|
}
|
|
|
|
// inMarker reports whether position p in s appears to lie inside a PP
// marker reference such as `<.x.>`, `<"x">` or `<(x)>`, where `[` and
// `]` are ordinary text rather than template delimiters.
//
// It walks backward from p-1: meeting `>` first means any earlier marker
// is already closed (false); meeting `<` first means a marker is open
// (true), unless s[p] is itself the closing `>`. With neither character
// before p the answer is false. Nothing to the right of p is examined,
// so a bare `<` (for example a comparison operator) earlier in s also
// counts as an opener.
func inMarker(s string, p int) bool {
	for k := p - 1; k >= 0; k-- {
		switch s[k] {
		case '>':
			return false
		case '<':
			// Every byte strictly between k and p was already visited
			// above and is neither `<` nor `>`, so only s[p] can still
			// close the marker.
			return p >= len(s) || s[p] != '>'
		}
	}
	return false
}
|
|
|
|
// expandBracketBody returns the optional-repeat body expanded once per
|
|
// iteration of its multi-captured markers. See expandOptionalRepeat.
|
|
func expandBracketBody(body string, captures map[string]string) string {
|
|
// Find marker names referenced inside the body.
|
|
refs := referencedMarkers(body)
|
|
iters := 1
|
|
hasMulti := false
|
|
for _, name := range refs {
|
|
if val, ok := captures[name]; ok && strings.ContainsRune(val, '\x01') {
|
|
n := strings.Count(val, "\x01") + 1
|
|
if n > iters {
|
|
iters = n
|
|
}
|
|
hasMulti = true
|
|
}
|
|
}
|
|
if !hasMulti {
|
|
// No multi-capture — include body once if any referenced marker
|
|
// has a (single) capture; otherwise drop.
|
|
anyPresent := false
|
|
for _, name := range refs {
|
|
if _, ok := captures[name]; ok {
|
|
anyPresent = true
|
|
break
|
|
}
|
|
}
|
|
if !anyPresent {
|
|
return ""
|
|
}
|
|
return body
|
|
}
|
|
|
|
// Pre-split each multi-captured referent into a per-iteration list.
|
|
parts := make(map[string][]string, len(refs))
|
|
for _, name := range refs {
|
|
if val, ok := captures[name]; ok {
|
|
parts[name] = strings.Split(val, "\x01")
|
|
}
|
|
}
|
|
|
|
var out strings.Builder
|
|
for iter := 0; iter < iters; iter++ {
|
|
piece := body
|
|
for name, vals := range parts {
|
|
var v string
|
|
if iter < len(vals) {
|
|
v = vals[iter]
|
|
}
|
|
quoted := ppQuote(v)
|
|
piece = strings.ReplaceAll(piece, "#<"+name+">", quoted)
|
|
piece = strings.ReplaceAll(piece, `<"`+name+`">`, quoted)
|
|
piece = strings.ReplaceAll(piece, "<("+name+")>", quoted)
|
|
if v != "" {
|
|
piece = strings.ReplaceAll(piece, "<."+name+".>", ".T.")
|
|
} else {
|
|
piece = strings.ReplaceAll(piece, "<."+name+".>", ".F.")
|
|
}
|
|
piece = strings.ReplaceAll(piece, "<"+name+">", v)
|
|
}
|
|
out.WriteString(piece)
|
|
}
|
|
return out.String()
|
|
}
|
|
|
|
// referencedMarkers returns the distinct marker names referenced in the
// template fragment s, in order of first appearance. It recognises the
// `<name>`, `<(name)>`, `<.name.>`, `<"name">` and `#<name>` forms.
//
// The scan is deliberately loose: after each `<` it skips any run of
// `(`, `.` and `"` and then takes the following run of ASCII letters,
// digits and underscores as the name. It does not check for the closing
// `>`, and a `<` not followed by such a run contributes nothing.
func referencedMarkers(s string) []string {
	isWrapper := func(c byte) bool { return c == '(' || c == '.' || c == '"' }
	isNameByte := func(c byte) bool {
		return c == '_' || (c >= 'a' && c <= 'z') ||
			(c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')
	}

	var names []string
	known := map[string]bool{}
	for pos := 0; pos < len(s); {
		if s[pos] != '<' {
			pos++
			continue
		}
		pos++ // step past '<'
		for pos < len(s) && isWrapper(s[pos]) {
			pos++
		}
		begin := pos
		for pos < len(s) && isNameByte(s[pos]) {
			pos++
		}
		if pos == begin {
			continue
		}
		if id := s[begin:pos]; !known[id] {
			known[id] = true
			names = append(names, id)
		}
	}
	return names
}
|
|
|
|
// cleanUnreferencedMarkers removes any remaining <name>, <(name)>,
|
|
// <.name.>, #<name> references. Only removes well-formed PP marker
|
|
// references, not comparison operators. Skips over PRG string
|
|
// literals ("...", '...', [...]) so a captured value containing
|
|
// `<a>` text (e.g. "<a>http://x</a>" inside a regex/string) isn't
|
|
// gutted — that pass used to corrupt arbitrary string content.
|
|
func cleanUnreferencedMarkers(s string) string {
|
|
var out strings.Builder
|
|
i := 0
|
|
inStr := byte(0)
|
|
for i < len(s) {
|
|
c := s[i]
|
|
// Inside a string literal: copy until the matching closer.
|
|
// Bracket-strings `[...]` are PRG-specific but are also used
|
|
// as the result template's optional-repeat brackets, so we
|
|
// leave them out of this pass — only `'…'` and `"…"` are
|
|
// unambiguous strings here.
|
|
if inStr != 0 {
|
|
out.WriteByte(c)
|
|
if c == inStr {
|
|
inStr = 0
|
|
}
|
|
i++
|
|
continue
|
|
}
|
|
if c == '"' || c == '\'' {
|
|
inStr = c
|
|
out.WriteByte(c)
|
|
i++
|
|
continue
|
|
}
|
|
removed := false
|
|
// #<name>
|
|
if c == '#' && i+1 < len(s) && s[i+1] == '<' {
|
|
if end := findMarkerEnd(s, i+1); end > 0 {
|
|
i = end
|
|
removed = true
|
|
}
|
|
}
|
|
// <name>, <(name)>, <.name.>, <"name">
|
|
if !removed && c == '<' {
|
|
if end := findMarkerEnd(s, i); end > 0 {
|
|
i = end
|
|
removed = true
|
|
}
|
|
}
|
|
if !removed {
|
|
out.WriteByte(c)
|
|
i++
|
|
}
|
|
}
|
|
return out.String()
|
|
}
|
|
|
|
// findMarkerEnd reports where the PP marker beginning at s[start] ends.
// It returns the index just past the closing `>`, or 0 when s[start]
// does not begin a marker.
//
// Accepted shape: `<`, at most one wrapper opener out of `(` `.` `"` `{`
// (smart-stringify, logify, stringify, blockify), an identifier made of
// ASCII letters, digits and underscores that does not start with a
// digit, any run of the characters `)` `.` `"` `}` `,` and space, and
// finally `>`.
func findMarkerEnd(s string, start int) int {
	if start >= len(s) || s[start] != '<' {
		return 0
	}
	isAlpha := func(c byte) bool {
		return c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
	}
	isTrailer := func(c byte) bool {
		switch c {
		case ')', '.', '"', '}', ',', ' ':
			return true
		}
		return false
	}

	pos := start + 1
	if pos < len(s) {
		switch s[pos] {
		case '(', '.', '"', '{':
			pos++
		}
	}

	// The name itself is mandatory.
	if pos >= len(s) || !isAlpha(s[pos]) {
		return 0
	}
	for pos < len(s) && (isAlpha(s[pos]) || (s[pos] >= '0' && s[pos] <= '9')) {
		pos++
	}

	// Wrapper closers and list decoration such as `,...`.
	for pos < len(s) && isTrailer(s[pos]) {
		pos++
	}

	if pos >= len(s) || s[pos] != '>' {
		return 0
	}
	return pos + 1
}
|
|
|
|
// --- Helpers ---
|
|
|
|
// firstToken returns the prefix of s that precedes the first space, tab
// or `(`. When s contains none of those it is returned whole; when s
// starts with one of them the result is empty.
func firstToken(s string) string {
	for i := 0; i < len(s); i++ {
		switch s[i] {
		case ' ', '\t', '(':
			return s[:i]
		}
	}
	return s
}
|
|
|
|
// matchWord reports whether a word taken from a source line equals a
// literal word of a pattern. The comparison is exact when caseSens is
// true and uses strings.EqualFold (case-insensitive) otherwise.
func matchWord(lineWord, patternWord string, caseSens bool) bool {
	if !caseSens {
		return strings.EqualFold(lineWord, patternWord)
	}
	return lineWord == patternWord
}
|
|
|
|
// tokenizePattern splits a rule pattern into tokens.
//
//   - A marker, i.e. everything from a `<` up to the next `>`, is kept
//     as a single token.
//   - `[`, `]`, `(`, `)` and `,` are each emitted as their own token, so
//     `DUMB(<z>)` and `DUMB( <z> )` tokenise identically. This mirrors
//     tokenizeLine on the call-site side; without it a call written
//     without spaces could never line up with the pattern's tokens.
//   - Anything else is a word, which ends at a space, tab, one of the
//     punctuation characters above, or the start of the next `<`.
//
// A `<` with no `>` anywhere after it cannot be a marker. It is treated
// as the first character of an ordinary word (so `A <= B` yields `A`,
// `<=`, `B`). Previously such a `<` consumed no input and the function
// never returned.
func tokenizePattern(pattern string) []string {
	var tokens []string
	i := 0
	for i < len(pattern) {
		for i < len(pattern) && (pattern[i] == ' ' || pattern[i] == '\t') {
			i++
		}
		if i >= len(pattern) {
			break
		}

		if pattern[i] == '<' {
			if end := strings.IndexByte(pattern[i:], '>'); end >= 0 {
				tokens = append(tokens, pattern[i:i+end+1])
				i += end + 1
				continue
			}
		}

		switch pattern[i] {
		case '[', ']', '(', ')', ',':
			tokens = append(tokens, string(pattern[i]))
			i++
			continue
		}

		// Regular word. The first byte is always consumed: it is not
		// whitespace or punctuation (handled above), and a leading `<`
		// reaching this point is known to be unmatched. That guarantees
		// the outer loop makes progress on every iteration.
		start := i
		for i < len(pattern) {
			c := pattern[i]
			if c == ' ' || c == '\t' || c == '[' || c == ']' ||
				c == '(' || c == ')' || c == ',' {
				break
			}
			if c == '<' && i > start {
				break
			}
			i++
		}
		tokens = append(tokens, pattern[start:i])
	}
	return tokens
}
|
|
|
|
// tokenizeLine splits a source line into tokens using the same rules as
// tokenizePattern so call sites align with patterns:
//
//   - spaces and tabs separate tokens and are discarded;
//   - a string literal opened by `"` or `'` runs to the matching quote
//     (or to end of line when unterminated) and is one token, quotes
//     included;
//   - `,`, `(`, `)`, `[` and `]` are each a token of their own, so
//     `DUMB(hello)` becomes `DUMB`, `(`, `hello`, `)`;
//   - any other run of characters up to whitespace, one of those
//     punctuation characters, or a quote is a word.
func tokenizeLine(line string) []string {
	endsWord := func(c byte) bool {
		switch c {
		case ' ', '\t', ',', '(', ')', '[', ']', '"', '\'':
			return true
		}
		return false
	}

	var toks []string
	n := len(line)
	for pos := 0; pos < n; {
		ch := line[pos]
		switch ch {
		case ' ', '\t':
			pos++

		case '"', '\'':
			end := pos + 1
			for end < n && line[end] != ch {
				end++
			}
			if end < n {
				end++ // include the closing quote
			}
			toks = append(toks, line[pos:end])
			pos = end

		case ',', '(', ')', '[', ']':
			toks = append(toks, string(ch))
			pos++

		default:
			end := pos + 1
			for end < n && !endsWord(line[end]) {
				end++
			}
			toks = append(toks, line[pos:end])
			pos = end
		}
	}
	return toks
}
|
|
|
|
// captureExpression captures an expression from line tokens.
|
|
// If this is the last marker in the pattern, captures all remaining tokens.
|
|
// Otherwise, captures until the next keyword in the pattern.
|
|
func captureExpression(lineWords []string, li *int, patternWords []string, nextPi int, caseSens bool) string {
|
|
if *li >= len(lineWords) {
|
|
return ""
|
|
}
|
|
|
|
// Collect every literal-keyword delimiter that follows in the
|
|
// pattern, not just the first. Optional clauses in std.ch sit
|
|
// next to one another (`[TO <(f)>] [FIELDS <fields,...>]
|
|
// [FOR <for>] [WHILE <while>] ...`), so the file-name marker
|
|
// must stop at TO's *successor* — but we don't know which
|
|
// successor will actually be present in the input. Stopping on
|
|
// any of them keeps `<(f)>` from swallowing a trailing
|
|
// `FOR x > 5` clause. MarkerWordList values count too — a
|
|
// `<off:OFF>` marker can only match the word OFF, so prior
|
|
// captures must stop at it.
|
|
stopSet := map[string]struct{}{}
|
|
addStopFrom(stopSet, patternWords[nextPi:])
|
|
var delims []string
|
|
for k := range stopSet {
|
|
delims = append(delims, k)
|
|
}
|
|
|
|
if len(delims) > 0 {
|
|
// Capture until any delimiter is hit, paren-balancing so nested
|
|
// parens/brackets/braces inside the expression don't falsely
|
|
// terminate the capture. Harbour's own PP does the same —
|
|
// `_REGULAR_(&(a))` must capture `&(a)` (incl. inner parens)
|
|
// and leave the outer `)` for the pattern's own delimiter.
|
|
var parts []string
|
|
depth := 0
|
|
for *li < len(lineWords) {
|
|
w := lineWords[*li]
|
|
if depth == 0 {
|
|
stop := false
|
|
for _, d := range delims {
|
|
if matchWord(w, d, caseSens) {
|
|
stop = true
|
|
break
|
|
}
|
|
}
|
|
if stop {
|
|
break
|
|
}
|
|
}
|
|
switch w {
|
|
case "(", "[", "{":
|
|
depth++
|
|
case ")", "]", "}":
|
|
if depth > 0 {
|
|
depth--
|
|
}
|
|
}
|
|
parts = append(parts, w)
|
|
*li++
|
|
}
|
|
return strings.Join(parts, " ")
|
|
}
|
|
|
|
// No delimiter: if last marker, capture all remaining tokens
|
|
if nextPi >= len(patternWords) {
|
|
rest := strings.Join(lineWords[*li:], " ")
|
|
*li = len(lineWords)
|
|
return rest
|
|
}
|
|
|
|
// Single token capture (between markers)
|
|
tok := lineWords[*li]
|
|
*li++
|
|
return tok
|
|
}
|