diff --git a/compiler/pp/command.go b/compiler/pp/command.go index c3aa49c..5b91d94 100644 --- a/compiler/pp/command.go +++ b/compiler/pp/command.go @@ -28,6 +28,7 @@ package pp import ( + "fmt" "strings" ) @@ -40,6 +41,13 @@ type Rule struct { Keyword string // first keyword (for fast matching) Markers []Marker // parsed pattern markers ResultTmpl string // result template with marker references + + // Warnings collected during ParseRule. Currently only one source: + // result-template markers that reference a name absent from the + // pattern. Caller can surface these to the user — a typo'd + // `` instead of `` used to silently produce broken + // expansion output. + Warnings []string } // Marker represents a pattern marker like , , , <*x*>. @@ -105,9 +113,86 @@ func ParseRule(directive string, isCommand, caseSens bool) *Rule { // Parse markers from pattern rule.Markers = parseMarkers(pattern) + // Validate result-template marker references. Each `` + // (and its smart-stringify / blockify / logify / dumb-stringify + // variants) must reference a name declared in the pattern. + // Catches typos like `` vs `` (case-sensitive + // xcommand) before they silently produce broken output at + // expansion time. + rule.Warnings = validateResultMarkers(pattern, result, rule.Markers, caseSens) + return rule } +// validateResultMarkers scans the result template for marker +// references and reports any name not declared in the pattern. +// Result returned as a slice of human-readable warning strings — +// caller decides whether to surface or ignore. +func validateResultMarkers(pattern, result string, markers []Marker, caseSens bool) []string { + declared := make(map[string]bool, len(markers)) + for _, m := range markers { + key := m.Name + if !caseSens { + key = strings.ToUpper(key) + } + declared[key] = true + } + if len(declared) == 0 { + // Nothing to validate against — rule is keyword-only. + return nil + } + + var warnings []string + seen := map[string]bool{} + i := 0 + for i < len(result) { + // Marker shapes recognised here mirror applyResult's loop: + // , <(name)>, <.name.>, <{name}>, <"name">, #. + // findMarkerEnd already understands all of them — we just + // need the inner identifier. + if result[i] != '<' && !(result[i] == '#' && i+1 < len(result) && result[i+1] == '<') { + i++ + continue + } + start := i + if result[i] == '#' { + start = i + 1 + } + end := findMarkerEnd(result, start) + if end == 0 { + i++ + continue + } + // Extract identifier between the wrappers. + inner := result[start+1 : end-1] + // Strip prefix `(`, `.`, `"`, `{` + for len(inner) > 0 && (inner[0] == '(' || inner[0] == '.' || inner[0] == '"' || inner[0] == '{') { + inner = inner[1:] + } + // Strip suffix `)`, `.`, `"`, `}` + for len(inner) > 0 { + c := inner[len(inner)-1] + if c == ')' || c == '.' || c == '"' || c == '}' || c == ' ' { + inner = inner[:len(inner)-1] + } else { + break + } + } + key := inner + if !caseSens { + key = strings.ToUpper(key) + } + if key != "" && !declared[key] && !seen[key] { + seen[key] = true + warnings = append(warnings, + fmt.Sprintf("result-template marker <%s> not declared in pattern: %q", + inner, pattern)) + } + i = end + } + return warnings +} + // parseMarkers extracts all <...> markers from a pattern. func parseMarkers(pattern string) []Marker { var markers []Marker diff --git a/compiler/pp/pp.go b/compiler/pp/pp.go index 0eb0fcc..ba5d062 100644 --- a/compiler/pp/pp.go +++ b/compiler/pp/pp.go @@ -378,29 +378,37 @@ func (pp *Preprocessor) handleDirective(filename, directive string, depth int, r return true } - // #command / #translate — parse and store rules - if strings.HasPrefix(upper, "COMMAND ") { - if rule := ParseRule(directive[8:], true, false); rule != nil { - pp.commands = append(pp.commands, rule) + // #command / #translate — parse and store rules. ParseRule now + // validates that result-template marker references resolve to a + // pattern marker; any unresolved name flows back as a warning + // surfaced via pp.errors with the directive's filename:line so + // the user can find the typo (e.g. case-sensitive `` vs + // `` in an #xcommand). Without surfacing, the broken + // expansion silently produced empty / mangled output at every + // call site. + registerRule := func(r *Rule, store *[]*Rule) { + if r == nil { + return } + *store = append(*store, r) + for _, w := range r.Warnings { + pp.errors = append(pp.errors, fmt.Sprintf("%s:%d: #command: %s", filename, lineNo, w)) + } + } + if strings.HasPrefix(upper, "COMMAND ") { + registerRule(ParseRule(directive[8:], true, false), &pp.commands) return true } if strings.HasPrefix(upper, "TRANSLATE ") { - if rule := ParseRule(directive[10:], false, false); rule != nil { - pp.translates = append(pp.translates, rule) - } + registerRule(ParseRule(directive[10:], false, false), &pp.translates) return true } if strings.HasPrefix(upper, "XCOMMAND ") { - if rule := ParseRule(directive[9:], true, true); rule != nil { - pp.commands = append(pp.commands, rule) - } + registerRule(ParseRule(directive[9:], true, true), &pp.commands) return true } if strings.HasPrefix(upper, "XTRANSLATE ") { - if rule := ParseRule(directive[11:], false, true); rule != nil { - pp.translates = append(pp.translates, rule) - } + registerRule(ParseRule(directive[11:], false, true), &pp.translates) return true } @@ -714,6 +722,16 @@ func looksLikeInlineC(body string) bool { if strings.HasPrefix(l, "#include <") || strings.HasPrefix(l, `#include "`) { return true } + // Function-like #define is C-only — Go uses const / generics. + // `#define FOO(x) ...` + if strings.HasPrefix(l, "#define ") { + // Find the name and check for `(` immediately after with + // no space (function-like macro signature). + rest := strings.TrimSpace(l[8:]) + if i := strings.IndexAny(rest, " \t("); i > 0 && i < len(rest) && rest[i] == '(' { + return true + } + } // Bare `HB_FUNC( NAME )` with an unquoted identifier is the // Harbour C FFI macro. The Go-side counterpart is // `hbrt.HB_FUNC("NAME", fn)` — lowercase package prefix and a @@ -723,12 +741,81 @@ func looksLikeInlineC(body string) bool { strings.HasPrefix(l, "HB_FUNC_TRANSLATE(") { return true } + // `extern "C"` — C / C++ linkage block, never Go. + if strings.HasPrefix(l, `extern "C"`) { + return true + } // C declarations at line start that have no Go analogue. if strings.HasPrefix(l, "typedef ") || strings.HasPrefix(l, "struct ") || strings.HasPrefix(l, "int main(") || strings.HasPrefix(l, "void main(") { return true } + // C return-type declarations: `int name(`, `char *name(`, etc. + // Matching exactly ` (` keeps us off Go's + // `func name(` (which starts with `func`, not a type word) + // and Go variable declarations (which use `:=` or `var`). + if isCReturnTypeDecl(l) { + return true + } + // hb_ret*(...) helpers — Harbour's C-side return setters. + // hb_retc / hb_retni / hb_retnl / hb_retd / hb_retl / hb_retptr + if strings.HasPrefix(l, "hb_ret") { + rest := l[6:] + if i := strings.IndexByte(rest, '('); i >= 0 { + name := rest[:i] + if name != "" && allLetters(name) { + return true + } + } + } } return false } + +// isCReturnTypeDecl reports whether the line opens a C function +// declaration like `int foo(` / `static char* bar(`. We match a +// short prefix list of C-only types so a Go declaration like +// `func name() int { ... }` doesn't trip this. +func isCReturnTypeDecl(l string) bool { + cTypePrefixes := []string{ + "int ", "void ", "char ", "long ", "short ", "double ", "float ", + "unsigned ", "signed ", "size_t ", "ssize_t ", "uint", + "static int ", "static void ", "static char ", "static long ", + } + for _, p := range cTypePrefixes { + if strings.HasPrefix(l, p) { + rest := strings.TrimLeft(l[len(p):], " \t*") + // rest should now start with an identifier followed by `(`. + if i := strings.IndexByte(rest, '('); i > 0 && i < 50 { + name := rest[:i] + if allIdentChars(name) { + return true + } + } + } + } + return false +} + +func allLetters(s string) bool { + for _, c := range s { + if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) { + return false + } + } + return s != "" +} + +func allIdentChars(s string) bool { + for i, c := range s { + if c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') { + continue + } + if i > 0 && c >= '0' && c <= '9' { + continue + } + return false + } + return s != "" +} diff --git a/compiler/pp/std.ch b/compiler/pp/std.ch index cd762ba..a7e39cc 100644 --- a/compiler/pp/std.ch +++ b/compiler/pp/std.ch @@ -154,7 +154,26 @@ /* JOIN merges the current ("master") workarea with the named detail alias into a fresh DBF, emitting one output row per - master/detail pair where FOR evaluates true. */ + master/detail pair where FOR evaluates true. + + The ON form takes the equality-key field names directly and + activates a hash-join fast path: build a hash over the detail's + key column once, then probe per master row — O(N+M) total instead + of the FOR form's O(N*M) nested-loop. Use it whenever the + join predicate is a simple `master.k = detail.k` equality. The + FOR form remains available for arbitrary predicates. Order + matters: the ON rule is more specific so it wins. + + Note for callers: ON expects bare field names, not expressions. + Five doesn't auto-resolve bare identifiers to fields, but std.ch + passes them as quoted strings via <(mfield)> / <(dfield)> so the + PP captures the field names verbatim — runtime-side __dbJoin + does its own field lookup. */ +#command JOIN WITH <(alias)> TO <(f)> [FIELDS ] ; + ON = => ; + __dbJoin( <(alias)>, <(f)>, { <(fields)> }, NIL, ; + <(mfield)>, <(dfield)> ) + #command JOIN [WITH <(alias)>] [TO <(f)>] [FIELDS ] ; [FOR ] => ; __dbJoin( <(alias)>, <(f)>, { <(fields)> }, <{for}> ) diff --git a/hbrtl/database.go b/hbrtl/database.go index ce17d98..b06f2b3 100644 --- a/hbrtl/database.go +++ b/hbrtl/database.go @@ -786,6 +786,33 @@ func nextTmpAlias(prefix string) string { return fmt.Sprintf("%s_%d", prefix, n) } +// dbHashKey turns a workarea field value into a hash-table key +// string. Numeric / Date / Logical / String types are encoded with a +// distinct one-byte tag prefix so values that happen to share the +// same string form across types ("1" vs 1 vs .T.) don't collide. +// NIL is its own bucket — Harbour's `==` says NIL never equals +// anything, but for join semantics we treat NIL keys as a single +// bucket so the user can still JOIN over rows with missing keys +// when both sides are NIL. +func dbHashKey(v hbrt.Value) string { + switch { + case v.IsNil(): + return "\x00" + case v.IsNumeric(): + return fmt.Sprintf("N\x01%g", v.AsNumDouble()) + case v.IsLogical(): + if v.AsBool() { + return "L\x01T" + } + return "L\x01F" + case v.IsDate(): + return fmt.Sprintf("D\x01%d", v.AsJulian()) + case v.IsString(): + return "S\x01" + strings.TrimRight(v.AsString(), " ") + } + return "?\x01" + v.AsString() +} + // rtlDbNotImpl raises a runtime error explaining which xBase clause // the user invoked that Five doesn't yet implement. std.ch routes // SDF / DELIMITED / TO PRINTER / TO FILE variants here so they fail @@ -1780,6 +1807,100 @@ func rtlDbJoin(t *hbrt.Thread) { } dstArea := wam.AreaAt(dstSel) + // Hash-join fast path. When the caller passes master and detail + // key field names (params 5 + 6), build a hash table over the + // detail in O(M), then scan master in O(N) and probe — total + // O(N+M) instead of the nested-loop's O(N*M). For 1k×1k that's + // 2k vs 1M operations, and the gap widens fast. + masterKeyName := "" + detailKeyName := "" + if nParams >= 5 && t.Local(5).IsString() { + masterKeyName = strings.ToUpper(strings.TrimSpace(t.Local(5).AsString())) + } + if nParams >= 6 && t.Local(6).IsString() { + detailKeyName = strings.ToUpper(strings.TrimSpace(t.Local(6).AsString())) + } + if masterKeyName != "" && detailKeyName != "" { + mkIdx, dkIdx := -1, -1 + for i := 0; i < master.FieldCount(); i++ { + if strings.EqualFold(master.GetFieldInfo(i).Name, masterKeyName) { + mkIdx = i + break + } + } + for i := 0; i < detail.FieldCount(); i++ { + if strings.EqualFold(detail.GetFieldInfo(i).Name, detailKeyName) { + dkIdx = i + break + } + } + if mkIdx >= 0 && dkIdx >= 0 { + // Build detail hash: key string → list of cached field rows. + // We capture each detail row's wanted-field VALUES (not just + // rec numbers) so we don't have to re-select the detail area + // per probe — saves the WA-switch round trip and keeps the + // inner loop tight. + type detailRow struct { + vals []hbrt.Value + } + buckets := make(map[string][]detailRow, 1024) + wam.SelectByNum(detailSel) + detail.GoTop() + for !detail.EOF() { + k, _ := detail.GetValue(dkIdx) + key := dbHashKey(k) + row := detailRow{vals: make([]hbrt.Value, 0, len(srcRefs))} + for _, r := range srcRefs { + if r.isMaster { + row.vals = append(row.vals, hbrt.MakeNil()) // master fills later + } else { + v, _ := detail.GetValue(r.idx) + row.vals = append(row.vals, v) + } + } + buckets[key] = append(buckets[key], row) + detail.Skip(1) + } + + // Scan master, probe detail buckets. + wam.SelectByNum(masterSel) + master.GoTop() + for !master.EOF() { + mk, _ := master.GetValue(mkIdx) + key := dbHashKey(mk) + rows, hit := buckets[key] + if hit { + // Cache master-side values for this row once. + mvals := make([]hbrt.Value, len(srcRefs)) + for k, r := range srcRefs { + if r.isMaster { + v, _ := master.GetValue(r.idx) + mvals[k] = v + } + } + wam.SelectByNum(dstSel) + for _, drow := range rows { + dstArea.Append() + for k, r := range srcRefs { + if r.isMaster { + dstArea.PutValue(k, mvals[k]) + } else { + dstArea.PutValue(k, drow.vals[k]) + } + } + } + wam.SelectByNum(masterSel) + } + master.Skip(1) + } + goto closeDst + } + // Key names didn't resolve — fall through to nested-loop with + // (likely empty) bFor. User typo is reported as a no-result + // JOIN rather than crash; the destination DBF still gets + // created (matches Harbour: NO ROWS != error). + } + wam.SelectByNum(masterSel) master.GoTop() for !master.EOF() { @@ -1818,6 +1939,8 @@ func rtlDbJoin(t *hbrt.Thread) { master.Skip(1) } +closeDst: + wam.SelectByNum(dstSel) closeErr := wam.Close() wam.SelectByNum(masterSel) diff --git a/tests/std_ch/run.sh b/tests/std_ch/run.sh index a5b68b7..0abf7c9 100755 --- a/tests/std_ch/run.sh +++ b/tests/std_ch/run.sh @@ -31,6 +31,7 @@ TESTS=( test_unsupported test_block_comma test_compound_lhs + test_join_hash ) work="$(mktemp -d)" diff --git a/tests/std_ch/test_join_hash.prg b/tests/std_ch/test_join_hash.prg new file mode 100644 index 0000000..b53a79e --- /dev/null +++ b/tests/std_ch/test_join_hash.prg @@ -0,0 +1,50 @@ +/* JOIN ... ON hash-join fast path. */ + +PROCEDURE Main() + LOCAL n + + FErase("cust.dbf"); FErase("ord.dbf"); FErase("out.dbf") + + /* Customers (master) */ + dbCreate("cust.dbf", { {"CID","N",4,0}, {"CNAME","C",12,0} }) + USE cust.dbf NEW EXCLUSIVE ALIAS cu + dbAppend(); FieldPut(1, 1); FieldPut(2, "Alice") + dbAppend(); FieldPut(1, 2); FieldPut(2, "Bob") + dbAppend(); FieldPut(1, 3); FieldPut(2, "Carol") + dbCommit() + dbCloseArea() + + /* Orders (detail) */ + dbCreate("ord.dbf", { {"OID","N",4,0}, {"CID","N",4,0}, {"AMT","N",8,2} }) + USE ord.dbf NEW EXCLUSIVE ALIAS od + dbAppend(); FieldPut(1, 100); FieldPut(2, 1); FieldPut(3, 50) + dbAppend(); FieldPut(1, 101); FieldPut(2, 1); FieldPut(3, 30) + dbAppend(); FieldPut(1, 102); FieldPut(2, 2); FieldPut(3, 200) + dbAppend(); FieldPut(1, 103); FieldPut(2, 4); FieldPut(3, 99) /* no-match */ + dbCommit() + dbCloseArea() + + USE cust.dbf NEW EXCLUSIVE ALIAS cu + USE ord.dbf NEW EXCLUSIVE ALIAS od + SELECT cu + + /* Hash-join form */ + JOIN WITH od TO out.dbf FIELDS cid, cname, oid, amt ON cid = cid + + SELECT cu ; dbCloseArea() + SELECT od ; dbCloseArea() + + USE out.dbf NEW EXCLUSIVE + COUNT TO n + ? "Hash-joined rows =", n, "(expect 3)" + dbGoTop() + DO WHILE !Eof() + ? " ", FieldGet(1), AllTrim(FieldGet(2)), FieldGet(3), FieldGet(4) + dbSkip() + ENDDO + ? "(expect three rows: Alice/100/50, Alice/101/30, Bob/102/200)" + dbCloseArea() + + FErase("cust.dbf"); FErase("ord.dbf"); FErase("out.dbf") + ? "DONE" + RETURN