feat(pp,rtl): Tier 2 audit followups — JOIN hash + PP validation + C heuristic
Three medium-priority audit items in one commit, each independently
revertible.
* **#18 JOIN hash-join fast path.** New std.ch shape:
JOIN WITH <alias> TO <file> [FIELDS ...] ON <mfield> = <dfield>
expands to a 6-arg __dbJoin call with the master/detail key
field names. Runtime detects the extra args, builds a hash
table over the detail's key column in O(M), then probes it once
per master row for O(N+M) total — vs the FOR form's O(N*M). For
1k×1k that's 2k vs 1M operations; the gap widens with N. The original FOR
form is unchanged and stays the fallback for arbitrary
predicates. New helper dbHashKey type-tags the key string so
`1` (numeric), `"1"` (string), and `.T.` (logical) don't
collide in the bucket map.
* **#38 PP rule result-marker validation.** ParseRule now walks
the result template after parseMarkers and warns about every
`<name>` (or `<(name)>` / `<.name.>` / `<{name}>` / `#<name>`
/ `<"name">`) that doesn't match a pattern marker. Warnings
flow into pp.errors via handleDirective with the directive's
filename:line, so a typo'd `<NaMe>` in a case-sensitive
`#xcommand` rule fails the build with a clear diagnostic
instead of silently producing broken expansions.
* **#44 looksLikeInlineC heuristic strengthened.** Catches more
of the common Harbour-PRG-with-C-inline-block shapes that
used to fall through and produce cryptic Go-side errors:
function-like #define, `extern "C"` linkage blocks, C return-
type declarations (`int foo(`, `static char* bar(`), and the
hb_ret*() family of return-value setters in Harbour's C
FFI. Two small predicate helpers (allLetters,
allIdentChars) keep the C-vs-Go disambiguation tight enough
that legit Go code (`func name() int { ... }`) doesn't trip.
* **#28 LIST/DISPLAY pagination** — explicitly deferred. Proper
pagination requires interactive terminal handling (Inkey(0)
for the keypress) which would hang in CI / batch mode. Will
revisit when an interactive terminal layer needs it for
other reasons.
Test fixtures: tests/std_ch/test_join_hash.prg verifies the new
ON-form path produces the same output as the FOR form would.
std.ch runner now stands at 16/16.
Other gates green:
go test ./... : PASS
FiveSql2 SQL:1999 : 43/43
Harbour compat : 56/56
std.ch suite : 16/16
FRB suite : 7/7
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -786,6 +786,33 @@ func nextTmpAlias(prefix string) string {
|
||||
return fmt.Sprintf("%s_%d", prefix, n)
|
||||
}
|
||||
|
||||
// dbHashKey turns a workarea field value into a hash-table key
|
||||
// string. Numeric / Date / Logical / String types are encoded with a
|
||||
// distinct one-byte tag prefix so values that happen to share the
|
||||
// same string form across types ("1" vs 1 vs .T.) don't collide.
|
||||
// NIL is its own bucket — Harbour's `==` says NIL never equals
|
||||
// anything, but for join semantics we treat NIL keys as a single
|
||||
// bucket so the user can still JOIN over rows with missing keys
|
||||
// when both sides are NIL.
|
||||
func dbHashKey(v hbrt.Value) string {
|
||||
switch {
|
||||
case v.IsNil():
|
||||
return "\x00"
|
||||
case v.IsNumeric():
|
||||
return fmt.Sprintf("N\x01%g", v.AsNumDouble())
|
||||
case v.IsLogical():
|
||||
if v.AsBool() {
|
||||
return "L\x01T"
|
||||
}
|
||||
return "L\x01F"
|
||||
case v.IsDate():
|
||||
return fmt.Sprintf("D\x01%d", v.AsJulian())
|
||||
case v.IsString():
|
||||
return "S\x01" + strings.TrimRight(v.AsString(), " ")
|
||||
}
|
||||
return "?\x01" + v.AsString()
|
||||
}
|
||||
|
||||
// rtlDbNotImpl raises a runtime error explaining which xBase clause
|
||||
// the user invoked that Five doesn't yet implement. std.ch routes
|
||||
// SDF / DELIMITED / TO PRINTER / TO FILE variants here so they fail
|
||||
@@ -1780,6 +1807,100 @@ func rtlDbJoin(t *hbrt.Thread) {
|
||||
}
|
||||
dstArea := wam.AreaAt(dstSel)
|
||||
|
||||
// Hash-join fast path. When the caller passes master and detail
|
||||
// key field names (params 5 + 6), build a hash table over the
|
||||
// detail in O(M), then scan master in O(N) and probe — total
|
||||
// O(N+M) instead of the nested-loop's O(N*M). For 1k×1k that's
|
||||
// 2k vs 1M operations, and the gap widens fast.
|
||||
masterKeyName := ""
|
||||
detailKeyName := ""
|
||||
if nParams >= 5 && t.Local(5).IsString() {
|
||||
masterKeyName = strings.ToUpper(strings.TrimSpace(t.Local(5).AsString()))
|
||||
}
|
||||
if nParams >= 6 && t.Local(6).IsString() {
|
||||
detailKeyName = strings.ToUpper(strings.TrimSpace(t.Local(6).AsString()))
|
||||
}
|
||||
if masterKeyName != "" && detailKeyName != "" {
|
||||
mkIdx, dkIdx := -1, -1
|
||||
for i := 0; i < master.FieldCount(); i++ {
|
||||
if strings.EqualFold(master.GetFieldInfo(i).Name, masterKeyName) {
|
||||
mkIdx = i
|
||||
break
|
||||
}
|
||||
}
|
||||
for i := 0; i < detail.FieldCount(); i++ {
|
||||
if strings.EqualFold(detail.GetFieldInfo(i).Name, detailKeyName) {
|
||||
dkIdx = i
|
||||
break
|
||||
}
|
||||
}
|
||||
if mkIdx >= 0 && dkIdx >= 0 {
|
||||
// Build detail hash: key string → list of cached field rows.
|
||||
// We capture each detail row's wanted-field VALUES (not just
|
||||
// rec numbers) so we don't have to re-select the detail area
|
||||
// per probe — saves the WA-switch round trip and keeps the
|
||||
// inner loop tight.
|
||||
type detailRow struct {
|
||||
vals []hbrt.Value
|
||||
}
|
||||
buckets := make(map[string][]detailRow, 1024)
|
||||
wam.SelectByNum(detailSel)
|
||||
detail.GoTop()
|
||||
for !detail.EOF() {
|
||||
k, _ := detail.GetValue(dkIdx)
|
||||
key := dbHashKey(k)
|
||||
row := detailRow{vals: make([]hbrt.Value, 0, len(srcRefs))}
|
||||
for _, r := range srcRefs {
|
||||
if r.isMaster {
|
||||
row.vals = append(row.vals, hbrt.MakeNil()) // master fills later
|
||||
} else {
|
||||
v, _ := detail.GetValue(r.idx)
|
||||
row.vals = append(row.vals, v)
|
||||
}
|
||||
}
|
||||
buckets[key] = append(buckets[key], row)
|
||||
detail.Skip(1)
|
||||
}
|
||||
|
||||
// Scan master, probe detail buckets.
|
||||
wam.SelectByNum(masterSel)
|
||||
master.GoTop()
|
||||
for !master.EOF() {
|
||||
mk, _ := master.GetValue(mkIdx)
|
||||
key := dbHashKey(mk)
|
||||
rows, hit := buckets[key]
|
||||
if hit {
|
||||
// Cache master-side values for this row once.
|
||||
mvals := make([]hbrt.Value, len(srcRefs))
|
||||
for k, r := range srcRefs {
|
||||
if r.isMaster {
|
||||
v, _ := master.GetValue(r.idx)
|
||||
mvals[k] = v
|
||||
}
|
||||
}
|
||||
wam.SelectByNum(dstSel)
|
||||
for _, drow := range rows {
|
||||
dstArea.Append()
|
||||
for k, r := range srcRefs {
|
||||
if r.isMaster {
|
||||
dstArea.PutValue(k, mvals[k])
|
||||
} else {
|
||||
dstArea.PutValue(k, drow.vals[k])
|
||||
}
|
||||
}
|
||||
}
|
||||
wam.SelectByNum(masterSel)
|
||||
}
|
||||
master.Skip(1)
|
||||
}
|
||||
goto closeDst
|
||||
}
|
||||
// Key names didn't resolve — fall through to nested-loop with
|
||||
// (likely empty) bFor. User typo is reported as a no-result
|
||||
// JOIN rather than crash; the destination DBF still gets
|
||||
// created (matches Harbour: NO ROWS != error).
|
||||
}
|
||||
|
||||
wam.SelectByNum(masterSel)
|
||||
master.GoTop()
|
||||
for !master.EOF() {
|
||||
@@ -1818,6 +1939,8 @@ func rtlDbJoin(t *hbrt.Thread) {
|
||||
master.Skip(1)
|
||||
}
|
||||
|
||||
closeDst:
|
||||
|
||||
wam.SelectByNum(dstSel)
|
||||
closeErr := wam.Close()
|
||||
wam.SelectByNum(masterSel)
|
||||
|
||||
Reference in New Issue
Block a user