// five/compiler/pp/command.go

// Copyright (c) 2026 Charles KWON OhJun (charleskwonohjun@gmail.com)
// All rights reserved.

// #command / #translate implementation for Five preprocessor.
//
// Harbour PP syntax:
//   #command PATTERN => RESULT
//   #translate PATTERN => RESULT
//   #xcommand PATTERN => RESULT   (case-sensitive)
//   #xtranslate PATTERN => RESULT (case-sensitive)
//
// Pattern markers:
//   <x>       — match any expression (regular match)
//   <!x!>     — match single identifier only (restricted match)
//   <x,...>   — match comma-separated list
//   <*x*>     — match rest of line (wild match)
//   <x:a,b,c> — match one of listed words (list match)
//   [...]     — optional clause
//
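// Result markers:
//   <x>       — substitute matched text
//   <(x)>     — smart stringify (quote unless already a string literal)
//   <"x">     — normal stringify (always wrap in quotes)
//   <{x}>     — blockify (wrap in {|| })
//   #<x>      — dumb stringify (always wrap in quotes)
//   <.x.>     — logify (.T. if matched, .F. if not)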
//
// Reference: /mnt/d/harbour-core/src/pp/ppcore.c
package pp

import (
	"strings"
)

// Rule represents a single #command or #translate rule.
//
// ParseRule builds a Rule from the directive text: Pattern and Result are
// the trimmed halves on either side of `=>`, ResultTmpl starts out equal to
// Result, and IsCommand / CaseSens are copied from ParseRule's arguments.
// MatchLine consults Keyword first as a cheap rejection test and only then
// runs the full pattern match.
type Rule struct {
	Pattern    string   // raw pattern text
	Result     string   // raw result text
	IsCommand  bool     // #command vs #translate
	CaseSens   bool     // #xcommand/#xtranslate = case sensitive
	Keyword    string   // first keyword (for fast matching)
	Markers    []Marker // parsed pattern markers
	ResultTmpl string   // result template with marker references
}

// Marker represents a pattern marker like <x>, <!x!>, <x,...>, <*x*>.
//
// Values are produced by parseOneMarker from the text between `<` and `>`.
// Type records which marker form was used; ListValues is only populated
// for the word-list form.
type Marker struct {
	Name       string // marker name
	Type       MarkerType
	ListValues []string // for <x:a,b,c> — allowed values
}

// MarkerType identifies which pattern-marker form a Marker was written in.
type MarkerType int

// Pattern marker kinds. MarkerRegular is the zero value, so a Marker built
// without an explicit Type is a regular marker.
const (
	MarkerRegular    MarkerType = iota // <x> — any expression
	MarkerRestricted                    // <!x!> — identifier only
	MarkerList                          // <x,...> — comma-separated list
	MarkerWild                          // <*x*> — rest of line
	MarkerWordList                      // <x:a,b,c> — one of listed words
)

// ParseRule parses the body of a #command/#translate directive into a Rule.
//
// directive must have the form `PATTERN => RESULT` and is split on the
// first `=>`. isCommand distinguishes #command from #translate and caseSens
// selects the case-sensitive #x… variants; both are stored on the returned
// Rule unchanged.
//
// It returns nil when the directive contains no `=>` or when the pattern
// side is empty (there would be nothing to match a source line against).
func ParseRule(directive string, isCommand, caseSens bool) *Rule {
	parts := strings.SplitN(directive, "=>", 2)
	if len(parts) != 2 {
		return nil
	}

	pattern := strings.TrimSpace(parts[0])
	if pattern == "" {
		return nil
	}
	result := strings.TrimSpace(parts[1])

	// Handle line continuation (;)
	// NOTE(review): this removes every " ;" in the result, not only a
	// trailing continuation — confirm that multi-statement results which
	// use `;` as a separator are not expected here.
	result = strings.ReplaceAll(result, " ;", "")

	rule := &Rule{
		Pattern:    pattern,
		Result:     result,
		IsCommand:  isCommand,
		CaseSens:   caseSens,
		ResultTmpl: result,
	}

	// Extract the dispatch keyword used by MatchLine's fast check. It is
	// the first whitespace-delimited word of the pattern, cut at the first
	// `(` so `MAKE_TEST( <obj>, <v> )` keys on `MAKE_TEST`, matching how
	// firstToken normalises source lines.
	//
	// Only a plain literal qualifies. When the pattern opens with a marker
	// (`<x> ...`) or an optional clause (`[FOO] ...`) the first word of a
	// matching line is arbitrary, so Keyword stays empty and MatchLine
	// skips the fast check instead of comparing against the marker name.
	if words := strings.Fields(pattern); len(words) > 0 {
		kw := words[0]
		if idx := strings.IndexByte(kw, '('); idx >= 0 {
			kw = kw[:idx]
		}
		if !strings.ContainsAny(kw, "<>[]!*,:") {
			rule.Keyword = kw
		}
	}

	// Parse markers from pattern
	rule.Markers = parseMarkers(pattern)

	return rule
}

// parseMarkers collects every `<...>` marker found in pattern, in order of
// appearance. Each span from a `<` to the next `>` is handed to
// parseOneMarker; markers that come back with an empty name are dropped.
// Scanning stops at the first `<` that has no closing `>`.
func parseMarkers(pattern string) []Marker {
	var found []Marker
	rest := pattern
	for {
		open := strings.IndexByte(rest, '<')
		if open < 0 {
			return found
		}
		rest = rest[open:]

		closeAt := strings.IndexByte(rest, '>')
		if closeAt < 0 {
			return found
		}
		if mk := parseOneMarker(rest[1:closeAt]); mk.Name != "" {
			found = append(found, mk)
		}
		rest = rest[closeAt+1:]
	}
}

// parseOneMarker classifies the text found between `<` and `>` of a pattern
// marker and returns the corresponding Marker. Surrounding whitespace is
// ignored, both around inner and around the extracted name.
//
// Recognised forms, checked in this order:
//
//	!name!     restricted  (MarkerRestricted)
//	*name*     wild        (MarkerWild)
//	name,...   comma list  (MarkerList)
//	name:a,b   word list   (MarkerWordList, ListValues = trimmed words)
//	name       regular     (MarkerRegular)
//
// Anything that fits none of the wrapped forms is returned as a regular
// marker whose name is the trimmed text itself.
func parseOneMarker(inner string) Marker {
	inner = strings.TrimSpace(inner)

	// The wrapped forms need both wrapper bytes. A lone `!` or `*` satisfies
	// a prefix and a suffix test at once, and slicing inner[1:0] would panic,
	// so require at least two bytes before stripping the wrappers.
	if n := len(inner); n >= 2 {
		switch {
		case inner[0] == '!' && inner[n-1] == '!':
			// <!name!> — restricted
			return Marker{Name: strings.TrimSpace(inner[1 : n-1]), Type: MarkerRestricted}
		case inner[0] == '*' && inner[n-1] == '*':
			// <*name*> — wild
			return Marker{Name: strings.TrimSpace(inner[1 : n-1]), Type: MarkerWild}
		}
	}

	// <name,...> — comma list
	if strings.HasSuffix(inner, ",...") {
		name := strings.TrimSpace(inner[:len(inner)-len(",...")])
		return Marker{Name: name, Type: MarkerList}
	}

	// <name:a,b,c> — word list (a leading ':' is not a word list)
	if idx := strings.IndexByte(inner, ':'); idx > 0 {
		vals := strings.Split(inner[idx+1:], ",")
		for i := range vals {
			vals[i] = strings.TrimSpace(vals[i])
		}
		return Marker{
			Name:       strings.TrimSpace(inner[:idx]),
			Type:       MarkerWordList,
			ListValues: vals,
		}
	}

	// <name> — regular
	return Marker{Name: inner, Type: MarkerRegular}
}

// --- Rule matching and application ---

// MatchLine tries to apply the rule to one source line.
//
// The line is trimmed first; a blank line never matches. When the rule has
// a Keyword, the line's first token must equal it (case-insensitively
// unless the rule is case sensitive) before the full pattern match is
// attempted. On success the expanded result text is returned with true;
// otherwise ("", false).
func (r *Rule) MatchLine(line string) (string, bool) {
	src := strings.TrimSpace(line)
	if src == "" {
		return "", false
	}

	// Cheap rejection before tokenising the whole line.
	if r.Keyword != "" && !matchWord(firstToken(src), r.Keyword, r.CaseSens) {
		return "", false
	}

	captures := r.matchPattern(src)
	if captures == nil {
		return "", false
	}
	return r.applyResult(captures), true
}

// matchPattern attempts to match the rule's pattern against a line.
//
// Both sides are tokenised (tokenizePattern / tokenizeLine) and walked in
// parallel. Literal pattern tokens must equal the corresponding line token;
// markers capture line tokens into the returned map, keyed by marker name.
// Optional clauses (`[...]`) are skipped as a whole and never matched.
//
// It returns the captures (possibly empty, never nil) on a match and nil
// otherwise. A match requires that every literal pattern token outside an
// optional clause was found on the line; markers left over when the line
// runs out are tolerated and simply stay uncaptured. Extra line tokens
// after the pattern are only rejected for a #command that has neither
// markers nor optional clauses.
func (r *Rule) matchPattern(line string) map[string]string {
	captures := make(map[string]string)

	patternWords := tokenizePattern(r.Pattern)
	lineWords := tokenizeLine(line)

	isMarker := func(w string) bool {
		return strings.HasPrefix(w, "<") && strings.HasSuffix(w, ">")
	}

	// skipClause returns the index just past the `]` closing the optional
	// clause that opens at patternWords[open], honouring nesting.
	skipClause := func(open int) int {
		depth := 1
		p := open + 1
		for p < len(patternWords) && depth > 0 {
			switch patternWords[p] {
			case "[":
				depth++
			case "]":
				depth--
			}
			p++
		}
		return p
	}

	// nextLiteral returns the first literal pattern token at or after from
	// (markers and clause brackets are skipped), or "" when there is none.
	// This is the same delimiter search captureExpression performs.
	nextLiteral := func(from int) string {
		for p := from; p < len(patternWords); p++ {
			w := patternWords[p]
			if !strings.HasPrefix(w, "<") && w != "[" && w != "]" {
				return w
			}
		}
		return ""
	}

	pi, li := 0, 0
	for pi < len(patternWords) && li < len(lineWords) {
		pw := patternWords[pi]

		switch {
		case isMarker(pw):
			m := parseOneMarker(pw[1 : len(pw)-1])

			switch m.Type {
			case MarkerWild:
				// Capture rest of line.
				captures[m.Name] = strings.Join(lineWords[li:], " ")
				li = len(lineWords)

			case MarkerList:
				// Capture a comma-separated list up to the next literal
				// pattern token. The delimiter must be a literal: using a
				// following `[` or marker token would end the list at an
				// array index such as `a[1]`. Nesting is tracked so an
				// inner `)`/`]`/`}` cannot terminate the capture, and
				// top-level commas stay in the captured text verbatim.
				delim := nextLiteral(pi + 1)
				var parts []string
				depth := 0
				for li < len(lineWords) {
					w := lineWords[li]
					if depth == 0 && delim != "" && matchWord(w, delim, r.CaseSens) {
						break
					}
					switch w {
					case "(", "[", "{":
						depth++
					case ")", "]", "}":
						if depth > 0 {
							depth--
						}
					}
					parts = append(parts, w)
					li++
				}
				captures[m.Name] = strings.Join(parts, " ")

			case MarkerWordList:
				// The current line token must be one of the listed words.
				word := lineWords[li]
				matched := false
				for _, allowed := range m.ListValues {
					if matchWord(word, allowed, r.CaseSens) {
						matched = true
						break
					}
				}
				if !matched {
					return nil
				}
				captures[m.Name] = word
				li++

			default:
				// Regular or restricted: capture one token or expression.
				captures[m.Name] = captureExpression(lineWords, &li, patternWords, pi+1, r.CaseSens)
			}
			pi++

		case pw == "[":
			// Optional clause — skipped entirely, never matched.
			pi = skipClause(pi)

		case pw == "]":
			pi++

		default:
			// Literal keyword — must match.
			if !matchWord(lineWords[li], pw, r.CaseSens) {
				return nil
			}
			li++
			pi++
		}
	}

	// The line is exhausted (or the pattern is). Whatever pattern tokens
	// remain must all be skippable: whole optional clauses and markers.
	// A literal outside an optional clause was required but never seen, so
	// the line does not match — otherwise `FOO` would match `FOO BAR`.
	for pi < len(patternWords) {
		pw := patternWords[pi]
		switch {
		case pw == "[":
			pi = skipClause(pi)
		case pw == "]" || isMarker(pw):
			pi++
		default:
			return nil
		}
	}

	// For #command with no markers and no optional clauses:
	// all line tokens must be consumed for a match.
	if r.IsCommand && li < len(lineWords) && len(r.Markers) == 0 &&
		!strings.Contains(r.Pattern, "[") {
		return nil
	}

	return captures
}

// ppQuote turns a captured value into a PRG string literal, choosing a
// delimiter that does not occur inside the value so the literal stays
// legal (a capture of `"world"` must not become `""world""`).
//
// Delimiters are tried in order: double quotes, single quotes, then
// square brackets. If the value contains all of them, it is wrapped in
// double quotes with its embedded double quotes removed.
func ppQuote(val string) string {
	switch {
	case !strings.Contains(val, `"`):
		return `"` + val + `"`
	case !strings.Contains(val, "'"):
		return "'" + val + "'"
	case !strings.ContainsAny(val, "[]"):
		return "[" + val + "]"
	}
	// Pathological input: every delimiter is taken. Drop the embedded
	// double quotes rather than emit an unterminated literal.
	stripped := strings.ReplaceAll(val, `"`, "")
	return `"` + stripped + `"`
}

// applyResult expands the rule's result template with the captured values
// and returns the resulting text.
//
// The template is scanned once, left to right. Each marker reference is
// resolved against captures (keyed by marker name):
//
//	<name>    the captured text, verbatim
//	#<name>   the captured text as a string literal (ppQuote)
//	<"name">  the captured text as a string literal (ppQuote)
//	<(name)>  the captured text if it already is a string literal
//	          ("…", '…' or […]), otherwise quoted with ppQuote
//	<.name.>  .T. when the marker captured non-empty text, else .F. —
//	          including when the marker was never matched at all
//
// Remaining well-formed references to markers that captured nothing are
// removed from the literal template text by cleanUnreferencedMarkers.
//
// Captured text is written to the output as-is and never rescanned. This
// keeps the result independent of map iteration order when a capture
// itself contains something shaped like `<name>`, and stops the cleanup
// from deleting marker-looking text (e.g. "<b>") out of user strings.
func (r *Rule) applyResult(captures map[string]string) string {
	tmpl := r.ResultTmpl

	// isIdent reports whether s is a non-empty [A-Za-z_][A-Za-z0-9_]* name.
	isIdent := func(s string) bool {
		for i := 0; i < len(s); i++ {
			c := s[i]
			switch {
			case c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'):
			case i > 0 && c >= '0' && c <= '9':
			default:
				return false
			}
		}
		return s != ""
	}

	// isStringLiteral reports whether s is already delimited as "…", '…' or […].
	isStringLiteral := func(s string) bool {
		n := len(s)
		if n < 2 {
			return false
		}
		first, last := s[0], s[n-1]
		return (first == '"' && last == '"') ||
			(first == '\'' && last == '\'') ||
			(first == '[' && last == ']')
	}

	// expand resolves the text between `<` and `>`. ok is false when inner
	// is not a reference this function substitutes.
	expand := func(inner string) (text string, ok bool) {
		if inner == "" {
			return "", false // `<>` is the not-equal operator, not a marker
		}
		if val, found := captures[inner]; found {
			return val, true // <name>
		}
		n := len(inner)
		if n < 3 {
			return "", false
		}
		name := inner[1 : n-1]
		val, matched := captures[name]
		switch {
		case inner[0] == '.' && inner[n-1] == '.':
			// <.name.> — logify.
			if matched && val != "" {
				return ".T.", true
			}
			if matched || isIdent(name) {
				return ".F.", true
			}
		case matched && inner[0] == '"' && inner[n-1] == '"':
			// <"name"> — explicit stringify.
			return ppQuote(val), true
		case matched && inner[0] == '(' && inner[n-1] == ')':
			// <(name)> — smart stringify.
			if trim := strings.TrimSpace(val); isStringLiteral(trim) {
				return trim, true
			}
			return ppQuote(val), true
		}
		return "", false
	}

	var out, lit strings.Builder
	out.Grow(len(tmpl))

	// flush moves the pending literal template text to the output, with
	// unreferenced markers stripped. Only template text passes through the
	// cleanup; substituted values bypass it.
	flush := func() {
		if lit.Len() > 0 {
			out.WriteString(cleanUnreferencedMarkers(lit.String()))
			lit.Reset()
		}
	}

	for i := 0; i < len(tmpl); {
		// A `#` directly in front of `<` may belong to a #<name> marker.
		open := i
		if tmpl[i] == '#' && i+1 < len(tmpl) && tmpl[i+1] == '<' {
			open = i + 1
		}

		if tmpl[open] == '<' {
			if rel := strings.IndexByte(tmpl[open:], '>'); rel > 0 {
				inner := tmpl[open+1 : open+rel]

				text, ok := "", false
				if open > i {
					// #<name> — dumb stringify. Any other `#<…>` form is
					// not a marker: the `#` stays literal and the `<…>`
					// part is handled on the next iteration.
					if val, found := captures[inner]; found && inner != "" {
						text, ok = ppQuote(val), true
					}
				} else {
					text, ok = expand(inner)
				}

				if ok {
					flush()
					out.WriteString(text)
					i = open + rel + 1
					continue
				}
			}
		}

		lit.WriteByte(tmpl[i])
		i++
	}
	flush()

	return out.String()
}

// cleanUnreferencedMarkers returns s with every leftover marker reference
// deleted: <name>, <(name)>, <.name.>, <"name"> and #<name> (the leading
// `#` is removed together with the marker). What counts as a marker is
// decided by findMarkerEnd, so text such as a comparison operator that
// merely contains `<` is copied through unchanged.
func cleanUnreferencedMarkers(s string) string {
	var kept strings.Builder
	for pos := 0; pos < len(s); {
		// A `#` immediately followed by `<` is probed as part of the marker.
		open := pos
		if s[pos] == '#' && pos+1 < len(s) && s[pos+1] == '<' {
			open = pos + 1
		}
		if s[open] == '<' {
			if end := findMarkerEnd(s, open); end > 0 {
				pos = end
				continue
			}
		}
		kept.WriteByte(s[pos])
		pos++
	}
	return kept.String()
}

// findMarkerEnd reports where the PP marker that begins at s[start] ends.
//
// A marker is `<`, at most one of `(`, `.` or `"`, an identifier
// ([A-Za-z_][A-Za-z0-9_]*), any run of `)`, `.`, `"`, `,` or space, and a
// closing `>`. The return value is the index just past that `>`, or 0 when
// s[start] does not begin such a marker (including start past the end).
func findMarkerEnd(s string, start int) int {
	if start >= len(s) || s[start] != '<' {
		return 0
	}

	isLetter := func(c byte) bool {
		return c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
	}
	isDigit := func(c byte) bool {
		return c >= '0' && c <= '9'
	}

	pos := start + 1

	// Optional single wrapper character in front of the name.
	if pos < len(s) {
		switch s[pos] {
		case '(', '.', '"':
			pos++
		}
	}

	// The name itself: must open with a letter or underscore.
	if pos >= len(s) || !isLetter(s[pos]) {
		return 0
	}
	for pos < len(s) && (isLetter(s[pos]) || isDigit(s[pos])) {
		pos++
	}

	// Closing wrapper characters, `,...` and blanks before the `>`.
suffix:
	for pos < len(s) {
		switch s[pos] {
		case ')', '.', '"', ',', ' ':
			pos++
		default:
			break suffix
		}
	}

	if pos < len(s) && s[pos] == '>' {
		return pos + 1
	}
	return 0
}

// --- Helpers ---

// firstToken returns the leading part of s up to, but not including, the
// first space, tab or `(`. When s contains none of them, s is returned
// whole.
func firstToken(s string) string {
	for end := 0; end < len(s); end++ {
		switch s[end] {
		case ' ', '\t', '(':
			return s[:end]
		}
	}
	return s
}

// matchWord reports whether a line token equals a pattern token. The
// comparison is exact when caseSens is true and uses Unicode case folding
// (strings.EqualFold) otherwise.
func matchWord(lineWord, patternWord string, caseSens bool) bool {
	if !caseSens {
		return strings.EqualFold(lineWord, patternWord)
	}
	return lineWord == patternWord
}

// tokenizePattern splits a rule pattern into tokens.
//
// A marker (`<` up to the next `>`) is kept as one token. Brackets, parens,
// braces and commas are each emitted as a token of their own, so `DUMB(<z>)`
// and `DUMB( <z> )` tokenise identically and line up with what tokenizeLine
// produces for the call site. Everything else is split on blanks and on
// those punctuation characters.
//
// A `<` that has no closing `>` is not a marker; it is kept as the first
// byte of an ordinary word so the scan always advances.
func tokenizePattern(pattern string) []string {
	var tokens []string
	i := 0
	for i < len(pattern) {
		for i < len(pattern) && (pattern[i] == ' ' || pattern[i] == '\t') {
			i++
		}
		if i >= len(pattern) {
			break
		}

		// Marker: taken whole, including anything between `<` and `>`.
		if pattern[i] == '<' {
			if end := strings.IndexByte(pattern[i:], '>'); end >= 0 {
				tokens = append(tokens, pattern[i:i+end+1])
				i += end + 1
				continue
			}
		}

		// Punctuation stands alone. Braces are included so the nesting
		// depth tracked while capturing expressions sees `{` and `}`.
		switch pattern[i] {
		case '[', ']', '(', ')', '{', '}', ',':
			tokens = append(tokens, pattern[i:i+1])
			i++
			continue
		}

		// Regular word. The first byte is consumed unconditionally: it is
		// either an ordinary character or an unterminated `<`, and breaking
		// on it without advancing would loop forever.
		start := i
		i++
		for i < len(pattern) {
			c := pattern[i]
			if c == ' ' || c == '\t' || c == '<' || c == '[' || c == ']' ||
				c == '(' || c == ')' || c == '{' || c == '}' || c == ',' {
				break
			}
			i++
		}
		tokens = append(tokens, pattern[start:i])
	}
	return tokens
}

// tokenizeLine splits a source line into tokens using the same rules as
// tokenizePattern, minus markers: string literals in double or single
// quotes stay intact as one token (an unterminated literal runs to the end
// of the line), and commas, parens, brackets and braces are emitted as
// standalone tokens. A call like `DUMB({1,2})` therefore tokenises as
// `DUMB`, `(`, `{`, `1`, `,`, `2`, `}`, `)`, which lets the matcher keep
// the comma inside the braces out of argument splitting.
func tokenizeLine(line string) []string {
	var tokens []string
	i := 0
	for i < len(line) {
		for i < len(line) && (line[i] == ' ' || line[i] == '\t') {
			i++
		}
		if i >= len(line) {
			break
		}

		// String literal: everything up to and including the closing quote.
		if line[i] == '"' || line[i] == '\'' {
			quote := line[i]
			start := i
			i++
			for i < len(line) && line[i] != quote {
				i++
			}
			if i < len(line) {
				i++
			}
			tokens = append(tokens, line[start:i])
			continue
		}

		switch line[i] {
		case ',', '(', ')', '[', ']', '{', '}':
			tokens = append(tokens, line[i:i+1])
			i++
			continue
		}

		// Word — stop at whitespace, punctuation or the start of a string.
		// The first byte is none of those, so the word is never empty.
		start := i
		for i < len(line) {
			c := line[i]
			if c == ' ' || c == '\t' || c == ',' || c == '(' || c == ')' ||
				c == '[' || c == ']' || c == '{' || c == '}' ||
				c == '"' || c == '\'' {
				break
			}
			i++
		}
		tokens = append(tokens, line[start:i])
	}
	return tokens
}

// captureExpression consumes line tokens for one regular/restricted marker
// and returns them joined with single spaces. *li is the index of the next
// unread line token on entry and is advanced past whatever was captured.
//
// How much is taken depends on what follows the marker in the pattern
// (patternWords from nextPi on):
//   - a literal token exists: everything up to, but excluding, the first
//     line token equal to that literal at nesting depth zero;
//   - nothing follows at all: every remaining line token;
//   - only markers/brackets follow: exactly one line token.
//
// With no line tokens left it returns "" and leaves *li untouched.
func captureExpression(lineWords []string, li *int, patternWords []string, nextPi int, caseSens bool) string {
	begin := *li
	if begin >= len(lineWords) {
		return ""
	}

	// The first literal after this marker acts as the stop word; markers
	// and optional-clause brackets cannot delimit anything.
	stop := ""
	for k := nextPi; k < len(patternWords); k++ {
		tok := patternWords[k]
		if tok == "[" || tok == "]" || strings.HasPrefix(tok, "<") {
			continue
		}
		stop = tok
		break
	}

	switch {
	case stop != "":
		// Nesting is tracked so that a `)`, `]` or `}` closing something
		// opened inside the expression is not mistaken for the pattern's
		// own delimiter: `_REGULAR_(&(a))` must capture `&(a)` whole.
		nesting := 0
		cur := begin
		for ; cur < len(lineWords); cur++ {
			tok := lineWords[cur]
			if nesting == 0 && matchWord(tok, stop, caseSens) {
				break
			}
			switch tok {
			case "(", "[", "{":
				nesting++
			case ")", "]", "}":
				if nesting > 0 {
					nesting--
				}
			}
		}
		*li = cur
		return strings.Join(lineWords[begin:cur], " ")

	case nextPi >= len(patternWords):
		// Last token of the pattern: take the rest of the line.
		*li = len(lineWords)
		return strings.Join(lineWords[begin:], " ")

	default:
		// Sandwiched between markers: take a single token.
		*li = begin + 1
		return lineWords[begin]
	}
}