five/compiler/pp/pp.go

// Copyright (c) 2026 Charles KWON OhJun (charleskwonohjun@gmail.com)
// All rights reserved.

// Preprocessor for Five — handles #include, #define, #ifdef/#endif.
// Harbour: /mnt/d/harbour-core/src/pp/ppcore.c (6383 lines)
//
// Five PP is simplified but covers the essential directives:
//   #include "file.ch"   — file inclusion
//   #define NAME VALUE    — simple text substitution
//   #undef NAME           — remove definition
//   #ifdef NAME / #ifndef NAME / #else / #endif — conditional compilation
//   #pragma               — compiler hints
//
// #command/#translate is supported via command.go (pattern matching + substitution).
// Five also handles CLASS syntax natively in the parser.
package pp

import (
	_ "embed"
	"fmt"
	"os"
	"path/filepath"
	"strings"
)

// embeddedStdCh is include/std.ch baked into the compiler binary so
// xBase commands like ERASE, RENAME, COMMIT, LOCATE, ... reach the
// parser already rewritten as plain function calls. Equivalent to
// Harbour's auto-included std.ch.
//
//go:embed std.ch
var embeddedStdCh string

// Preprocessor processes source code before lexing.
//
// It carries the state that directives build up while files are processed:
// macro definitions, #command/#translate rules, include search paths, the
// accumulated error messages and the inline Go blocks captured from
// #pragma BEGINDUMP sections. Use New to obtain a ready-to-use instance.
type Preprocessor struct {
	defines     map[string]string // #define name → value; an empty value marks a flag-only define
	includeDirs []string          // search paths for #include, tried in the order added
	included    map[string]bool   // include names currently being resolved (circular-inclusion guard)
	commands    []*Rule           // rules registered by #command and #xcommand
	translates  []*Rule           // rules registered by #translate and #xtranslate
	errors      []string          // error messages accumulated during processing; reset by Process
	GoDumps     []string          // collected #pragma BEGINDUMP Go code blocks, indexed by the FIVE_GODUMP__ marker
}

// New returns a Preprocessor whose define and include-tracking tables are
// allocated and whose built-in std.ch rules are already registered.
func New() *Preprocessor {
	pp := new(Preprocessor)
	pp.defines = map[string]string{}
	pp.included = map[string]bool{}
	pp.addStdRules()
	return pp
}

// addStdRules loads the rules of the embedded std.ch file into the
// preprocessor. The file is run through processLines purely for its
// side effects (directives such as #command are stored on pp as they
// are encountered); the preprocessed text it returns is thrown away.
//
// Constructs that cannot be expressed safely as a #command (e.g.
// @ SAY/GET, READ, TRY/CATCH, WITH TIMEOUT) are not covered here and
// are left to the parser.
func (pp *Preprocessor) addStdRules() {
	const stdName = "std.ch"
	_ = pp.processLines(stdName, embeddedStdCh, 0)
}

// AddIncludeDir registers dir as an additional location to look in when
// resolving #include files. Directories are consulted in the order in
// which they were added.
func (pp *Preprocessor) AddIncludeDir(dir string) {
	dirs := append(pp.includeDirs, dir)
	pp.includeDirs = dirs
}

// Define registers a macro programmatically, as if the source contained
// "#define name value".
//
// An empty value creates a flag-only define: it is visible to
// #ifdef/#ifndef but is never substituted into the source. An empty name
// is ignored because it can never match an identifier. The define table
// is allocated on first use, so Define is also safe on a Preprocessor
// that was not built with New.
func (pp *Preprocessor) Define(name, value string) {
	if name == "" {
		return
	}
	if pp.defines == nil {
		pp.defines = make(map[string]string)
	}
	pp.defines[name] = value
}

// Process runs the preprocessor over source and returns the rewritten
// text together with the error messages produced during this call.
//
// filename is used in error messages and as the base for resolving
// relative #include paths. Only the error list is reset on entry; all
// other state held by the Preprocessor carries over from earlier calls.
func (pp *Preprocessor) Process(filename, source string) (string, []string) {
	pp.errors = nil
	// The call must complete before pp.errors is read for the return.
	output := pp.processLines(filename, source, 0)
	return output, pp.errors
}

// maxIncludeDepth bounds #include nesting; processLines records an error
// instead of descending any further once depth exceeds it.
const maxIncludeDepth = 20

// processLines preprocesses source line by line and returns the resulting
// text. filename is used for error messages and relative #include lookup;
// depth is the current #include nesting level (0 for a top-level file).
//
// For every physical line it, in order:
//  1. joins `;`-continued directive lines into one logical line,
//  2. collects #pragma BEGINDUMP ... ENDDUMP bodies into pp.GoDumps,
//  3. removes /* */ block comments (including multi-line ones),
//  4. evaluates conditional directives and drops inactive sections,
//  5. dispatches the remaining directives to handleDirective,
//  6. applies #command/#translate rules and #define substitutions.
//
// Problems are appended to pp.errors; the function itself never fails.
// When depth exceeds maxIncludeDepth the source is returned unprocessed.
func (pp *Preprocessor) processLines(filename, source string, depth int) string {
	if depth > maxIncludeDepth {
		pp.errors = append(pp.errors, fmt.Sprintf("%s: #include depth exceeded (max %d)", filename, maxIncludeDepth))
		return source
	}

	lines := strings.Split(source, "\n")
	var result []string
	var ifStack []bool      // true = active section, false = skipping
	inBlockComment := false // inside a multi-line /* */ comment
	inPragmaDump := false   // inside #pragma BEGINDUMP ... ENDDUMP
	dumpActive := false     // whether that dump started in an active section
	dumpStartLine := 0      // 1-based line where BEGINDUMP appeared
	var dumpLines []string  // accumulated Go code lines of the current dump

	for i := 0; i < len(lines); i++ {
		line := lines[i]
		// `#command`/`#translate` directives that end with a trailing `;`
		// continue on the next physical line — this is how harbour-core
		// formats its std.ch rules. Join the continuation here so the
		// directive parser sees one logical line. Only `#`-directives
		// participate; user code uses `;` differently. Dump bodies are
		// kept verbatim, and a trailing `\r` (CRLF files) must not hide
		// the `;`.
		if t := strings.TrimSpace(line); !inPragmaDump && strings.HasPrefix(t, "#") {
			for strings.HasSuffix(strings.TrimRight(line, " \t\r"), ";") && i+1 < len(lines) {
				line = strings.TrimRight(line, " \t\r;") + " " + strings.TrimSpace(lines[i+1])
				i++
			}
		}

		// Handle #pragma BEGINDUMP ... ENDDUMP (inline Go code blocks)
		if inPragmaDump {
			trimCheck := strings.TrimSpace(line)
			if strings.HasPrefix(trimCheck, "#") {
				dir := strings.ToUpper(strings.TrimSpace(strings.TrimPrefix(trimCheck, "#")))
				if strings.HasPrefix(dir, "PRAGMA ") && strings.Contains(dir, "ENDDUMP") {
					inPragmaDump = false
					body := strings.Join(dumpLines, "\n")
					dumpLines = nil
					if !dumpActive {
						// The dump sat in a skipped #ifdef branch: discard it
						// like any other inactive line.
						continue
					}
					// Five's inline dumps are Go, not C. Harbour's own
					// #pragma BEGINDUMP convention is C (hb_ret*, HB_FUNC,
					// #include <stdio.h> etc.), so `.prg` files ported
					// from Harbour will attempt to shove C through Five's
					// Go-emit pipeline and fail with cryptic errors like
					// "invalid character U+0023 '#'". Detect the C shape
					// and report a clear, actionable error up front.
					if looksLikeInlineC(body) {
						pp.errors = append(pp.errors, fmt.Sprintf(
							"%s:%d: #pragma BEGINDUMP contains C code — Five accepts inline Go only. Port the block to Go (or use an RTL function), then wrap in #pragma BEGINDUMP ... #pragma ENDDUMP.",
							filename, dumpStartLine))
						// Emit a syntactically invalid line so the parser
						// also fails at the expected position rather than
						// the build silently continuing.
						result = append(result, "__FIVE_INLINE_C_ERROR__")
						continue
					}
					pp.GoDumps = append(pp.GoDumps, body)
					result = append(result, fmt.Sprintf("FIVE_GODUMP__ %d", len(pp.GoDumps)-1))
					continue
				}
			}
			if dumpActive {
				dumpLines = append(dumpLines, line)
				result = append(result, "") // blank out for line counting
			}
			continue
		}

		// Handle multi-line block comments
		if inBlockComment {
			idx := strings.Index(line, "*/")
			if idx < 0 {
				result = append(result, "") // blank out comment lines
				continue
			}
			inBlockComment = false
			line = line[idx+2:] // keep content after */
			if strings.TrimSpace(line) == "" {
				result = append(result, "")
				continue
			}
		}
		// Strip block comments within a single line and detect opening /*
		line = stripBlockComments(line, &inBlockComment)
		trimmed := strings.TrimSpace(line)

		// The innermost conditional decides whether this line is live.
		active := len(ifStack) == 0 || ifStack[len(ifStack)-1]

		// Preprocessor directives (conditionals are always processed,
		// regardless of active state, so nesting stays balanced)
		if strings.HasPrefix(trimmed, "#") {
			directive := strings.TrimSpace(strings.TrimPrefix(trimmed, "#"))

			// Detect #pragma BEGINDUMP. The dump is entered even in an
			// inactive section so its body is never mistaken for
			// directives, but its content is then thrown away.
			upperDir := strings.ToUpper(directive)
			if strings.HasPrefix(upperDir, "PRAGMA ") && strings.Contains(upperDir, "BEGINDUMP") {
				inPragmaDump = true
				dumpActive = active
				dumpStartLine = i + 1 // 1-based for error reporting
				dumpLines = nil
				if active {
					result = append(result, "")
				}
				continue
			}

			if pp.handleConditional(directive, &ifStack, active) {
				continue
			}

			if !active {
				continue // skip non-conditional directives in inactive sections
			}

			if pp.handleDirective(filename, directive, depth, &result, i+1) {
				continue
			}
		}

		if !active {
			continue // skip lines in inactive #ifdef sections
		}

		// Apply #command/#translate rules
		if len(pp.commands) > 0 || len(pp.translates) > 0 {
			line = pp.applyRules(line)
		}

		// Apply #define substitutions
		if len(pp.defines) > 0 {
			line = pp.applyDefines(line)
		}

		result = append(result, line)
	}

	if inPragmaDump {
		pp.errors = append(pp.errors, fmt.Sprintf("%s:%d: #pragma BEGINDUMP without matching #pragma ENDDUMP", filename, dumpStartLine))
	}
	if len(ifStack) > 0 {
		pp.errors = append(pp.errors, fmt.Sprintf("%s: unterminated #ifdef/#ifndef", filename))
	}

	return strings.Join(result, "\n")
}

// handleConditional processes #ifdef, #ifndef, #if, #else and #endif.
//
// directive is the directive text without the leading '#'. ifStack holds
// one entry per open conditional; each entry is true only when that
// branch AND every enclosing branch is live, which is why new entries
// are combined with active (the caller's view of the current top of the
// stack). It reports whether the line was a conditional directive.
//
// The keyword may be followed by spaces or tabs, and a trailing //
// comment is ignored. #if understands only the literals 1/.T. (true)
// and 0/.F. (false); any other expression is treated as false. An
// unmatched #else or #endif is consumed without effect.
func (pp *Preprocessor) handleConditional(directive string, ifStack *[]bool, active bool) bool {
	// A trailing // comment is not part of the directive.
	if idx := strings.Index(directive, "//"); idx >= 0 {
		directive = directive[:idx]
	}
	fields := strings.Fields(directive)
	if len(fields) == 0 {
		return false
	}
	// First argument, if any: the macro name for #ifdef/#ifndef.
	arg := ""
	if len(fields) > 1 {
		arg = fields[1]
	}

	switch strings.ToUpper(fields[0]) {
	case "IFDEF":
		_, defined := pp.defines[arg]
		*ifStack = append(*ifStack, defined && active)

	case "IFNDEF":
		_, defined := pp.defines[arg]
		*ifStack = append(*ifStack, !defined && active)

	case "IF":
		// Simplified: only literal truth values are evaluated; unknown
		// expressions (e.g. __pragma(...)) conservatively count as false.
		expr := strings.Join(fields[1:], " ")
		val := expr == "1" || strings.EqualFold(expr, ".T.")
		*ifStack = append(*ifStack, val && active)

	case "ELSE":
		if n := len(*ifStack); n > 0 {
			// The #else branch is live only when the #if branch was not
			// and the enclosing section (if any) is itself live.
			parentActive := true
			if n > 1 {
				parentActive = (*ifStack)[n-2]
			}
			(*ifStack)[n-1] = !(*ifStack)[n-1] && parentActive
		}

	case "ENDIF":
		if n := len(*ifStack); n > 0 {
			*ifStack = (*ifStack)[:n-1]
		}

	default:
		return false
	}
	return true
}

// handleDirective processes the non-conditional directives: #include,
// #define, #undef, #pragma, #warning/#error/#stdout and the
// #command/#translate family.
//
// directive is the directive text without the leading '#'; filename and
// lineNo locate it for error messages; depth is the current #include
// nesting level. Lines produced by the directive (included content,
// pass-through comments) are appended to *result. It reports whether the
// directive was recognised; an unrecognised one is left for the caller
// to treat as ordinary source text.
func (pp *Preprocessor) handleDirective(filename, directive string, depth int, result *[]string, lineNo int) bool {
	// Accept a tab between the keyword and its argument by normalising it
	// to the single space the prefix checks below expect.
	if idx := strings.IndexAny(directive, " \t"); idx >= 0 && directive[idx] == '\t' {
		directive = directive[:idx] + " " + directive[idx+1:]
	}
	upper := strings.ToUpper(directive)

	// #include "file" or #include <file>
	if strings.HasPrefix(upper, "INCLUDE ") {
		inclFile := pp.extractIncludeFile(strings.TrimSpace(directive[8:]))
		if inclFile == "" {
			pp.errors = append(pp.errors, fmt.Sprintf("%s:%d: invalid #include", filename, lineNo))
			return true
		}

		// A file that is still being expanded further up the include
		// chain must not be expanded again, or inclusion would recurse
		// until maxIncludeDepth is hit.
		if pp.included[inclFile] {
			*result = append(*result, fmt.Sprintf("// #include %q — circular (skipped)", inclFile))
			return true
		}

		content := pp.resolveInclude(filename, inclFile)
		if content == "" {
			// Not found — not an error for Five (some .ch files are optional)
			*result = append(*result, fmt.Sprintf("// #include %q — not found (skipped)", inclFile))
			return true
		}

		// Process included content recursively, keeping the file marked
		// for exactly as long as its content is being expanded.
		// NOTE(review): the nested call receives the include name, not the
		// path it was found at, so relative includes inside it resolve
		// against that name — confirm this is intended.
		pp.included[inclFile] = true
		processed := pp.processLines(inclFile, content, depth+1)
		delete(pp.included, inclFile)
		*result = append(*result, strings.Split(processed, "\n")...)
		return true
	}

	// #define NAME [VALUE]
	if strings.HasPrefix(upper, "DEFINE ") {
		rest := strings.TrimSpace(directive[7:])
		// Detect function-like macro: #define NAME( params ) body
		// For now, skip these (don't register as simple text substitution)
		if idx := strings.IndexByte(rest, '('); idx > 0 && idx < strings.IndexAny(rest+" ", " \t") {
			// Function-like macro — not yet supported, skip
			return true
		}
		// The name ends at the first space or tab; the rest is the value.
		name, value := rest, ""
		if idx := strings.IndexAny(rest, " \t"); idx >= 0 {
			name, value = rest[:idx], strings.TrimSpace(rest[idx+1:])
		}
		pp.defines[name] = stripDefineComment(value)
		return true
	}

	// #undef NAME — anything after the name (e.g. a comment) is ignored
	if strings.HasPrefix(upper, "UNDEF ") {
		if fields := strings.Fields(directive[6:]); len(fields) > 0 {
			delete(pp.defines, fields[0])
		}
		return true
	}

	// #pragma — just pass through as comment
	if strings.HasPrefix(upper, "PRAGMA ") {
		*result = append(*result, "// "+directive)
		return true
	}
	// #warning, #error, #stdout — skip (emit as comment)
	if strings.HasPrefix(upper, "WARNING") || strings.HasPrefix(upper, "ERROR") || strings.HasPrefix(upper, "STDOUT") {
		*result = append(*result, "// #"+directive)
		return true
	}

	// #command / #translate — parse and store rules. A rule that fails
	// to parse (ParseRule returns nil) is dropped silently.
	if strings.HasPrefix(upper, "COMMAND ") {
		if rule := ParseRule(directive[8:], true, false); rule != nil {
			pp.commands = append(pp.commands, rule)
		}
		return true
	}
	if strings.HasPrefix(upper, "TRANSLATE ") {
		if rule := ParseRule(directive[10:], false, false); rule != nil {
			pp.translates = append(pp.translates, rule)
		}
		return true
	}
	if strings.HasPrefix(upper, "XCOMMAND ") {
		if rule := ParseRule(directive[9:], true, true); rule != nil {
			pp.commands = append(pp.commands, rule)
		}
		return true
	}
	if strings.HasPrefix(upper, "XTRANSLATE ") {
		if rule := ParseRule(directive[11:], false, true); rule != nil {
			pp.translates = append(pp.translates, rule)
		}
		return true
	}

	return false
}

// stripDefineComment cuts a trailing // or /* comment off a #define
// value. Comment markers inside a quoted string ('...' or "...") are
// part of the value and are left alone.
func stripDefineComment(value string) string {
	quote := byte(0)
	for i := 0; i < len(value); i++ {
		c := value[i]
		switch {
		case quote != 0:
			if c == quote {
				quote = 0
			}
		case c == '"' || c == '\'':
			quote = c
		case c == '/' && i+1 < len(value) && (value[i+1] == '/' || value[i+1] == '*'):
			return strings.TrimSpace(value[:i])
		}
	}
	return value
}

// extractIncludeFile returns the file name from the argument of an
// #include directive: the text between "..." or <...>, or the whole
// (trimmed) argument when it is a bare name.
//
// Anything after the closing delimiter, such as a trailing // comment,
// is ignored. An argument whose opening delimiter is never closed is
// returned unchanged.
func (pp *Preprocessor) extractIncludeFile(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return s
	}

	var closer byte
	switch s[0] {
	case '"':
		closer = '"'
	case '<':
		closer = '>'
	default:
		return s // bare filename
	}

	if end := strings.IndexByte(s[1:], closer); end >= 0 {
		return s[1 : 1+end]
	}
	return s
}

// resolveInclude locates inclFile and returns its content, or "" when it
// cannot be read from any candidate location (an empty file is therefore
// indistinguishable from a missing one).
//
// Candidates are tried in this order:
//  1. inclFile itself, when it is an absolute path
//  2. relative to the directory of currentFile
//  3. each directory registered with AddIncludeDir, in order
func (pp *Preprocessor) resolveInclude(currentFile, inclFile string) string {
	// Re-entrancy marker, keyed by the include name as written. It is set
	// only while this call runs and is cleared again on return.
	// NOTE(review): the included content is processed by the caller after
	// this function has returned, so this marker alone does not span
	// nested includes.
	absKey := inclFile
	if pp.included[absKey] {
		return ""
	}
	pp.included[absKey] = true
	defer func() { delete(pp.included, absKey) }()

	var searchPaths []string

	// An absolute path must be used as-is: joining it onto a search
	// directory would produce a path that does not exist.
	if filepath.IsAbs(inclFile) {
		searchPaths = append(searchPaths, inclFile)
	}

	// Relative to current file
	if currentFile != "" {
		searchPaths = append(searchPaths, filepath.Join(filepath.Dir(currentFile), inclFile))
	}

	// Include directories
	for _, dir := range pp.includeDirs {
		searchPaths = append(searchPaths, filepath.Join(dir, inclFile))
	}

	// The first candidate that can be read wins; read errors simply move
	// the search on to the next candidate.
	for _, path := range searchPaths {
		if data, err := os.ReadFile(path); err == nil {
			return string(data)
		}
	}

	return ""
}

// applyRules rewrites line using the first matching rule, if any.
//
// Blank lines and lines starting with // are returned untouched.
// Otherwise every #command rule is tried in registration order, then
// every #translate rule; matching is done against the whitespace-trimmed
// line via Rule.MatchLine. The first rule that matches supplies the
// replacement, which is re-indented with the line's original leading
// spaces/tabs. When nothing matches the line is returned unchanged.
func (pp *Preprocessor) applyRules(line string) string {
	stmt := strings.TrimSpace(line)
	if stmt == "" || strings.HasPrefix(stmt, "//") {
		return line
	}

	// Leading spaces and tabs of the original line, kept for the output.
	indent := line[:len(line)-len(strings.TrimLeft(line, " \t"))]

	// Commands take precedence over translates.
	for _, ruleSet := range [][]*Rule{pp.commands, pp.translates} {
		for _, rule := range ruleSet {
			if rewritten, ok := rule.MatchLine(stmt); ok {
				return indent + rewritten
			}
		}
	}

	return line
}

// stripBlockComments removes /* ... */ comments from a single line.
//
// Each complete comment is replaced by one space. If a comment opens
// but does not close on this line, the text before it is returned and
// *inBlock is set to true so the caller can skip the following lines;
// *inBlock is never cleared here. Comment markers inside '...' or "..."
// string literals are left alone, and so is everything after a //
// line-comment marker: a "/*" there is commentary, not the start of a
// block comment.
func stripBlockComments(line string, inBlock *bool) string {
	// Nothing to strip: without "/*" the line is returned as it is.
	if !strings.Contains(line, "/*") {
		return line
	}

	var out strings.Builder
	quote := byte(0) // active string delimiter, 0 when outside a string
	for i := 0; i < len(line); {
		c := line[i]
		switch {
		case quote != 0:
			if c == quote {
				quote = 0
			}
		case c == '"' || c == '\'':
			quote = c
		case c == '/' && i+1 < len(line) && line[i+1] == '/':
			// Line comment: keep the remainder verbatim.
			out.WriteString(line[i:])
			return out.String()
		case c == '/' && i+1 < len(line) && line[i+1] == '*':
			end := strings.Index(line[i+2:], "*/")
			if end < 0 {
				*inBlock = true
				return out.String() // rest of line is comment
			}
			out.WriteByte(' ') // replace comment with space
			i += 2 + end + 2   // skip past */
			continue
		}
		out.WriteByte(c)
		i++
	}
	return out.String()
}

// applyDefines substitutes #define macros in a line.
//
// The line is scanned once, token by token. Every identifier outside a
// '...' or "..." string literal that names a define with a non-empty
// value is replaced by that value; flag-only defines (empty value) are
// left as written. A replacement is itself scanned, so a define whose
// value mentions another define is expanded fully, but a name is never
// expanded again inside its own expansion, which keeps self-referential
// or mutually recursive defines from looping. The result does not
// depend on map iteration order.
func (pp *Preprocessor) applyDefines(line string) string {
	// Non-ASCII bytes count as identifier bytes so that a multi-byte
	// name is looked up as a single token.
	isIdent := func(c byte) bool { return c >= 0x80 || isWordChar(c) }

	// expand rewrites text; expanding lists the names currently being
	// expanded on the path from the original line to text.
	var expand func(text string, expanding []string) string
	expand = func(text string, expanding []string) string {
		var out strings.Builder
		out.Grow(len(text))
		quote := byte(0) // active string delimiter, 0 when outside a string
		for i := 0; i < len(text); {
			c := text[i]
			switch {
			case quote != 0:
				if c == quote {
					quote = 0
				}
			case c == '"' || c == '\'':
				quote = c
			case isIdent(c):
				j := i + 1
				for j < len(text) && isIdent(text[j]) {
					j++
				}
				word := text[i:j]
				i = j

				value := pp.defines[word] // "" when undefined or flag-only
				busy := false
				for _, name := range expanding {
					if name == word {
						busy = true
						break
					}
				}
				if value == "" || busy {
					out.WriteString(word)
				} else {
					// Full slice expression: the append must not write
					// into a backing array shared with sibling calls.
					out.WriteString(expand(value, append(expanding[:len(expanding):len(expanding)], word)))
				}
				continue
			}
			out.WriteByte(c)
			i++
		}
		return out.String()
	}

	return expand(line, nil)
}

// replaceWord replaces whole-word occurrences of old with new, avoiding
// replacements inside string literals.
//
// An occurrence counts as a whole word when the bytes directly before
// and after it are not word characters (see isWordChar) or it touches
// the start/end of the line. Text between matching ' or " delimiters is
// copied through untouched. An empty old never matches, so the line is
// returned unchanged.
func replaceWord(line, old, new string) string {
	// An empty search word would match at word boundaries without
	// consuming any input, so the scan below would never advance.
	if old == "" || !strings.Contains(line, old) {
		return line
	}

	var result strings.Builder
	inString := byte(0) // active string delimiter, 0 when outside a string

	for i := 0; i < len(line); {
		c := line[i]
		switch {
		case inString != 0:
			if c == inString {
				inString = 0
			}
		case c == '"' || c == '\'':
			inString = c
		case strings.HasPrefix(line[i:], old):
			// Check word boundaries on both sides of the candidate.
			before := i == 0 || !isWordChar(line[i-1])
			after := i+len(old) >= len(line) || !isWordChar(line[i+len(old)])
			if before && after {
				result.WriteString(new)
				i += len(old)
				continue
			}
		}
		result.WriteByte(c)
		i++
	}

	return result.String()
}

// isWordChar reports whether c is an ASCII letter, digit or underscore.
func isWordChar(c byte) bool {
	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_'
}

// looksLikeInlineC heuristically decides whether the body of a
// #pragma BEGINDUMP block is Harbour-style inline C rather than Go, so
// the user can be given an early, readable error instead of an obscure
// Go syntax complaint far downstream.
//
// A body is flagged as C as soon as any line, after trimming surrounding
// whitespace, starts with one of these markers:
//   - `#include <` or `#include "` — C preprocessor include
//   - `HB_FUNC(`, `HB_FUNC_STATIC(`, `HB_FUNC_TRANSLATE(` — Harbour's C
//     FFI macros in their bare form (the Go-side call is written as
//     `hbrt.HB_FUNC("NAME", fn)` and therefore does not start a line
//     with `HB_FUNC(`)
//   - `typedef ` or `struct ` — C declarations
//   - `int main(` or `void main(` — C entry point
//
// A body that is empty or only whitespace is never flagged.
func looksLikeInlineC(body string) bool {
	if strings.TrimSpace(body) == "" {
		return false
	}

	// Line prefixes that only make sense in C source.
	cMarkers := [...]string{
		"#include <",
		`#include "`,
		"HB_FUNC(",
		"HB_FUNC_STATIC(",
		"HB_FUNC_TRANSLATE(",
		"typedef ",
		"struct ",
		"int main(",
		"void main(",
	}

	for _, raw := range strings.Split(body, "\n") {
		stmt := strings.TrimSpace(raw)
		for _, marker := range cMarkers {
			if strings.HasPrefix(stmt, marker) {
				return true
			}
		}
	}
	return false
}