feat(pp,rtl): Tier 2 audit followups — JOIN hash + PP validation + C heuristic

Three medium-priority audit items in one commit, each independently
revertible; a fourth (#28) is explicitly deferred and noted below.

  * **#18 JOIN hash-join fast path.** New std.ch shape:
        JOIN WITH <alias> TO <file> [FIELDS ...] ON <mfield> = <dfield>
    expands to a 6-arg __dbJoin call with the master/detail key
    field names. Runtime detects the extra args, builds an O(M)
    hash over the detail's key column, then probes per master row
    for O(N+M) total — vs the FOR form's O(N*M). For 1k×1k that's
    2k vs 1M operations; the gap widens with N. The original FOR
    form is unchanged and stays the fallback for arbitrary
    predicates. New helper dbHashKey type-tags the key string so
    `1` (numeric), `"1"` (string), and `.T.` (logical) don't
    collide in the bucket map.

  * **#38 PP rule result-marker validation.** ParseRule now walks
    the result template after parseMarkers and warns about every
    `<name>` (or `<(name)>` / `<.name.>` / `<{name}>` / `#<name>`
    / `<"name">`) that doesn't match a pattern marker. Warnings
    flow into pp.errors via handleDirective with the directive's
    filename:line, so a typo'd `<NaMe>` in a case-sensitive
    `#xcommand` rule fails the build with a clear diagnostic
    instead of silently producing broken expansions.

  * **#44 looksLikeInlineC heuristic strengthened.** Catches more
    of the common Harbour-PRG-with-C-inline-block shapes that
    used to fall through and produce cryptic Go-side errors:
    function-like #define, `extern "C"` linkage blocks, C return-
    type declarations (`int foo(`, `static char* bar(`), and the
    hb_ret*() helper family used by Harbour's C FFI return
    setters. Two small predicate helpers (allLetters,
    allIdentChars) keep the C-vs-Go disambiguation tight enough
    that legit Go code (`func name() int { ... }`) doesn't trip.

  * **#28 LIST/DISPLAY pagination** — explicitly deferred. Proper
    pagination requires interactive terminal handling (Inkey(0)
    for the keypress) which would hang in CI / batch mode. Will
    revisit when an interactive terminal layer needs it for
    other reasons.

Test fixtures: tests/std_ch/test_join_hash.prg verifies the new
ON-form path produces the same output as the FOR form would.
std.ch runner now stands at 16/16.

Other gates green:
  go test ./...      : PASS
  FiveSql2 SQL:1999  : 43/43
  Harbour compat     : 56/56
  std.ch suite       : 16/16
  FRB suite          : 7/7

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-04 19:21:19 +09:00
parent 29ca02e1bc
commit 2008266da7
6 changed files with 379 additions and 14 deletions

View File

@@ -786,6 +786,33 @@ func nextTmpAlias(prefix string) string {
return fmt.Sprintf("%s_%d", prefix, n)
}
// dbHashKey turns a workarea field value into the string used as its
// hash-join bucket key.
//
// Each non-NIL type is encoded as a tag letter followed by a \x01
// separator and the value's text, so values that share the same
// string form across types ("1" vs 1 vs .T.) don't collide:
//
//	Numeric  "N\x01" + %g rendering of the value
//	Logical  "L\x01T" or "L\x01F"
//	Date     "D\x01" + Julian day number
//	String   "S\x01" + the text with trailing spaces removed
//	other    "?\x01" + AsString()
//
// NIL is its own bucket ("\x00") — Harbour's `==` says NIL never
// equals anything, but for join semantics we treat NIL keys as a
// single bucket so the user can still JOIN over rows with missing
// keys when both sides are NIL.
func dbHashKey(v hbrt.Value) string {
	switch {
	case v.IsNil():
		return "\x00"
	case v.IsNumeric():
		n := v.AsNumDouble()
		if n == 0 {
			// -0 and +0 compare equal, but %g prints them as "-0"
			// and "0"; collapse both to +0 so they share a bucket.
			n = 0
		}
		return fmt.Sprintf("N\x01%g", n)
	case v.IsLogical():
		if v.AsBool() {
			return "L\x01T"
		}
		return "L\x01F"
	case v.IsDate():
		return fmt.Sprintf("D\x01%d", v.AsJulian())
	case v.IsString():
		// Trailing spaces are ignored so padded and unpadded values
		// match (presumably because character fields are stored
		// space-padded to their declared width — confirm against
		// the RDD layer).
		return "S\x01" + strings.TrimRight(v.AsString(), " ")
	}
	// Any type not handled above falls back to its string form under
	// a separate tag.
	return "?\x01" + v.AsString()
}
// rtlDbNotImpl raises a runtime error explaining which xBase clause
// the user invoked that Five doesn't yet implement. std.ch routes
// SDF / DELIMITED / TO PRINTER / TO FILE variants here so they fail
@@ -1780,6 +1807,100 @@ func rtlDbJoin(t *hbrt.Thread) {
}
dstArea := wam.AreaAt(dstSel)
// Hash-join fast path. When the caller passes master and detail
// key field names (params 5 + 6), build a hash table over the
// detail in O(M), then scan master in O(N) and probe — total
// O(N+M) instead of the nested-loop's O(N*M). For 1k×1k that's
// 2k vs 1M operations, and the gap widens fast.
masterKeyName := ""
detailKeyName := ""
if nParams >= 5 && t.Local(5).IsString() {
masterKeyName = strings.ToUpper(strings.TrimSpace(t.Local(5).AsString()))
}
if nParams >= 6 && t.Local(6).IsString() {
detailKeyName = strings.ToUpper(strings.TrimSpace(t.Local(6).AsString()))
}
if masterKeyName != "" && detailKeyName != "" {
mkIdx, dkIdx := -1, -1
for i := 0; i < master.FieldCount(); i++ {
if strings.EqualFold(master.GetFieldInfo(i).Name, masterKeyName) {
mkIdx = i
break
}
}
for i := 0; i < detail.FieldCount(); i++ {
if strings.EqualFold(detail.GetFieldInfo(i).Name, detailKeyName) {
dkIdx = i
break
}
}
if mkIdx >= 0 && dkIdx >= 0 {
// Build detail hash: key string → list of cached field rows.
// We capture each detail row's wanted-field VALUES (not just
// rec numbers) so we don't have to re-select the detail area
// per probe — saves the WA-switch round trip and keeps the
// inner loop tight.
type detailRow struct {
vals []hbrt.Value
}
buckets := make(map[string][]detailRow, 1024)
wam.SelectByNum(detailSel)
detail.GoTop()
for !detail.EOF() {
k, _ := detail.GetValue(dkIdx)
key := dbHashKey(k)
row := detailRow{vals: make([]hbrt.Value, 0, len(srcRefs))}
for _, r := range srcRefs {
if r.isMaster {
row.vals = append(row.vals, hbrt.MakeNil()) // master fills later
} else {
v, _ := detail.GetValue(r.idx)
row.vals = append(row.vals, v)
}
}
buckets[key] = append(buckets[key], row)
detail.Skip(1)
}
// Scan master, probe detail buckets.
wam.SelectByNum(masterSel)
master.GoTop()
for !master.EOF() {
mk, _ := master.GetValue(mkIdx)
key := dbHashKey(mk)
rows, hit := buckets[key]
if hit {
// Cache master-side values for this row once.
mvals := make([]hbrt.Value, len(srcRefs))
for k, r := range srcRefs {
if r.isMaster {
v, _ := master.GetValue(r.idx)
mvals[k] = v
}
}
wam.SelectByNum(dstSel)
for _, drow := range rows {
dstArea.Append()
for k, r := range srcRefs {
if r.isMaster {
dstArea.PutValue(k, mvals[k])
} else {
dstArea.PutValue(k, drow.vals[k])
}
}
}
wam.SelectByNum(masterSel)
}
master.Skip(1)
}
goto closeDst
}
// Key names didn't resolve — fall through to nested-loop with
// (likely empty) bFor. User typo is reported as a no-result
// JOIN rather than crash; the destination DBF still gets
// created (matches Harbour: NO ROWS != error).
}
wam.SelectByNum(masterSel)
master.GoTop()
for !master.EOF() {
@@ -1818,6 +1939,8 @@ func rtlDbJoin(t *hbrt.Thread) {
master.Skip(1)
}
closeDst:
wam.SelectByNum(dstSel)
closeErr := wam.Close()
wam.SelectByNum(masterSel)