perf(FiveSql2): SqlHashBuild + FetchRow column binding — 3-way JOIN 3.4–4.3x faster

Complex-query benchmarking turned up two hot paths that the earlier
SqlScan/SqlEach work didn't touch: multi-table JOIN and nested-scan
row fetching. This commit hits both.

--- Part 1: SqlHashBuild — Go-native hash-join build ---

FiveSql2's HashJoin previously built the inner-side hash in PRG:

    WHILE !Eof()
      xVal := FieldGet(nFPos)
      cKey := SqlValToStr(xVal)
      IF !hb_HHasKey(hHash, cKey) ; hHash[cKey] := {} ; ENDIF
      AAdd(hHash[cKey], RecNo())
      dbSkip()
    ENDDO

That loop costs ~40μs per row, spent on class dispatch, hb_HHasKey
lookups, AAdd growth, and SqlValToStr formatting. On a 50k-row inner
table that adds up to ~2 seconds for what should be a sub-50ms
housekeeping op.

New hbrtl.SqlHashBuild does the same thing in one Go-native pass:

  - Direct *dbf.DBFArea loop (no interface dispatch, same devirt as
    SqlScan)
  - Go `map[string][]int64` accumulates RecNos by key — one
    allocation per distinct key
  - Inline ASCII-only digit formatter for numeric keys (strconv.Itoa
    is allocation-heavy for small ints)
  - CHAR keys are right-trimmed to match SqlCmpEq semantics so the
    hash probe matches what EvalExpr would compute
  - Final Five hash is built once from Keys/Values/Order slices
    directly, skipping the per-key hb_HSet path

HashJoin now calls `SqlHashBuild(nFPos)` instead of running the
PRG loop.

--- Part 2: TSqlExecutor:BuildFetchCache ---

The JOIN fallback loop calls FetchRow per row. FetchRow was already
column-ref-aware but did the string parse (`At + SubStr + Upper`)
and `::FindWA` linear scan every single invocation. For a 50k-row
join emitting 50k result rows, that's ~200k redundant resolutions.

New BuildFetchCache walks the SELECT list once before the scan and
pre-binds each plain-column expression to `{nWA, nFPos}`. FetchRow's
new fast path checks ::aFetchCache and jumps straight to
`dbSelectArea + FieldGet` when bound. Complex exprs (functions,
CASE, subqueries) still fall through to EvalExpr.

::aFetchCache is set right before the join WHILE loop and cleared
after — no cross-query bleed.

--- Bench (50k ord × 10k emp × 100 dept, 3-run steady state) ---

  Query                        Before      After     Speedup
  ────────────────────────────────────────────────────────────
  2-way INNER JOIN, 10k rows   91ms        68ms      1.34x
  2-way JOIN + GROUP BY        110ms       94ms      1.17x
  3-way INNER JOIN COUNT       2610ms      610ms     4.28x
  3-way JOIN + GROUP BY        2860ms      830ms     3.45x

The 3-way speedup is almost entirely SqlHashBuild. The 2-way case
benefits from the fetch cache because its per-row cost is dominated
by FetchRow (no second hash build to amortize).

--- Limits still standing ---

CTE + JOIN queries (Q7 in bench_complex: ~4.5s) aren't affected by
either optimization — CTE materialization goes through a different
path that writes/reads a temp DBF. Follow-up target.

Validation:
  - FiveSql2 43/43
  - Harbour compat 51/51
  - go test ./... ALL PASS

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-14 18:47:20 +09:00
parent e75167c2e9
commit bfc6ded8cb
4 changed files with 270 additions and 21 deletions

View File

@@ -34,6 +34,7 @@ CLASS TSqlExecutor
DATA aTables INIT {}
DATA aCompileStruct
DATA bRowBlock /* optional code block — receives SELECT cols as params */
DATA aFetchCache /* pre-bound {nWA, nFPos} per SELECT expression, or NIL */
CLASSDATA hSubCache INIT { => } SHARED
@@ -67,6 +68,7 @@ CLASS TSqlExecutor
METHOD TryBuildFieldPositions( aExprs )
METHOD TryCompileWhere( xWhere )
METHOD SqlExprToPrg( xNode )
METHOD BuildFetchCache( aExprs )
ENDCLASS
@@ -621,10 +623,86 @@ METHOD EvalExpr( xNode ) CLASS TSqlExecutor
RETURN NIL
/* BuildFetchCache( aExprs ) -> aBindings
 *
 * Resolves every SELECT expression once, ahead of a join/scan loop.
 * The returned array has exactly one slot per element of aExprs:
 *
 *   { nWA, nFPos }  the expression is a plain column reference whose
 *                   workarea and field position could be resolved
 *   NIL             anything else (non-column node, "*", unknown table
 *                   alias, unknown field name)
 *
 * "alias.field" references are resolved through ::FindWA( alias ).
 * Unqualified references are looked up in the first entry of ::aTables
 * only, using its alias ( [ 2 ] ) or, when that is empty, its name
 * ( [ 1 ] ).
 *
 * The workarea that was selected on entry is re-selected before
 * returning.
 */
METHOD BuildFetchCache( aExprs ) CLASS TSqlExecutor
LOCAL aBindings := {}
LOCAL nPrevArea := Select()
LOCAL nIdx, xNode, xBinding, cName, nSep, cColumn, nArea, nPos, cOwner
FOR nIdx := 1 TO Len( aExprs )
/* Each iteration appends exactly one entry; NIL unless a bind succeeds */
xBinding := NIL
xNode := aExprs[ nIdx ][ 1 ]
IF ! ( xNode == NIL .OR. xNode[ 1 ] != ND_COL .OR. xNode[ 2 ] == "*" )
cName := xNode[ 2 ]
nSep := At( ".", cName )
IF nSep > 0
/* Qualified reference: "alias.field" */
cOwner := Upper( Left( cName, nSep - 1 ) )
cColumn := Upper( SubStr( cName, nSep + 1 ) )
nArea := ::FindWA( cOwner )
ELSE
/* Unqualified reference: only the first table is consulted */
cColumn := Upper( cName )
nArea := 0
IF Len( ::aTables ) > 0
cOwner := iif( Empty( ::aTables[ 1 ][ 2 ] ), ::aTables[ 1 ][ 1 ], ::aTables[ 1 ][ 2 ] )
nArea := Select( cOwner )
ENDIF
ENDIF
IF nArea > 0
/* FieldPos() works on the current workarea, so select it first */
dbSelectArea( nArea )
nPos := FieldPos( cColumn )
IF nPos > 0
xBinding := { nArea, nPos }
ENDIF
ENDIF
ENDIF
AAdd( aBindings, xBinding )
NEXT
dbSelectArea( nPrevArea )
RETURN aBindings
METHOD FetchRow( aExprs ) CLASS TSqlExecutor
LOCAL aRow := {}, i, xVal
LOCAL xE, cRef, nDot, nWA, nFPos, cField, cTblAlias, cA
LOCAL xE, cRef, nDot, nWA, nFPos, cField, cTblAlias, cA, aBound
/* Fastest path: pre-bound columns (built once per join by RunSelect) */
IF ::aFetchCache != NIL .AND. Len( ::aFetchCache ) == Len( aExprs )
FOR i := 1 TO Len( aExprs )
aBound := ::aFetchCache[ i ]
IF aBound != NIL
dbSelectArea( aBound[ 1 ] )
xVal := FieldGet( aBound[ 2 ] )
IF ValType( xVal ) == "C"
xVal := AllTrim( xVal )
ENDIF
AAdd( aRow, xVal )
ELSE
xVal := ::EvalExpr( aExprs[ i ][ 1 ] )
IF ValType( xVal ) == "C"
xVal := AllTrim( xVal )
ENDIF
AAdd( aRow, xVal )
ENDIF
NEXT
RETURN aRow
ENDIF
FOR i := 1 TO Len( aExprs )
xE := aExprs[ i ][ 1 ]
@@ -1236,6 +1314,11 @@ METHOD RunSelect() CLASS TSqlExecutor
/* Fallback: PRG interpreter loop */
IF aGoRows == NIL
/* Pre-bind SELECT columns to {nWA, nFPos} so FetchRow
* can skip the per-row string parse + FindWA on every
* join recursion. Huge win for multi-table scans. */
::aFetchCache := ::BuildFetchCache( aResultExprs )
dbSelectArea( nWA )
WHILE ! Eof()
IF Len( aJoins ) > 0
::JoinRecurse( aJoins, 1, xWhere, aResultExprs, @aRows, hJoinHash )
@@ -1249,6 +1332,7 @@ METHOD RunSelect() CLASS TSqlExecutor
dbSelectArea( nWA )
dbSkip()
ENDDO
::aFetchCache := NIL
ENDIF
ENDIF
ENDIF
@@ -1381,23 +1465,19 @@ METHOD HashJoin( nInnerWA, cInnerField, cOuterCol, xOnCond, aJoins, nIdx, xWhere
lHadMatch := .F.
/* Build hash table once per join (keyed by join index) */
/* Build hash table once per join (keyed by join index).
* Delegates to the Go-native SqlHashBuild RTL which scans the
* inner workarea and returns the populated hash in one pass —
* roughly 40x faster than the PRG hash-build loop because it
* avoids per-row class dispatch, hb_HHasKey, and AAdd growth. */
cHashKey := "HJ_" + hb_ntos( nIdx ) + "_" + cInnerField
IF ! hb_HHasKey( hHashTbl, cHashKey )
hHashTbl[ cHashKey ] := { => }
dbSelectArea( nInnerWA )
nFPos := FieldPos( cInnerField )
IF nFPos > 0
dbGoTop()
WHILE ! Eof()
xInnerVal := FieldGet( nFPos )
cValKey := SqlValToStr( xInnerVal )
IF ! hb_HHasKey( hHashTbl[ cHashKey ], cValKey )
hHashTbl[ cHashKey ][ cValKey ] := {}
ENDIF
AAdd( hHashTbl[ cHashKey ][ cValKey ], RecNo() )
dbSkip()
ENDDO
hHashTbl[ cHashKey ] := SqlHashBuild( nFPos )
ELSE
hHashTbl[ cHashKey ] := { => }
ENDIF
ENDIF
@@ -1407,14 +1487,28 @@ METHOD HashJoin( nInnerWA, cInnerField, cOuterCol, xOnCond, aJoins, nIdx, xWhere
IF hb_HHasKey( hHashTbl[ cHashKey ], cValKey )
aMatches := hHashTbl[ cHashKey ][ cValKey ]
FOR i := 1 TO Len( aMatches )
dbSelectArea( nInnerWA )
dbGoto( aMatches[ i ] )
/* Hash key already matched — skip redundant ON re-evaluation for
* simple equi-joins (SQLite: ephemeral table probe is sufficient). */
lHadMatch := .T.
::JoinRecurse( aJoins, nIdx + 1, xWhere, aRE, @aRows, hHashTbl )
NEXT
/* Base-case inline: if the next recursion would just be FetchRow,
* skip the method-dispatch overhead and build the row directly.
* 50k inner matches × Send() dispatch was the 3-way join bottleneck. */
IF nIdx + 1 > Len( aJoins )
FOR i := 1 TO Len( aMatches )
dbSelectArea( nInnerWA )
dbGoto( aMatches[ i ] )
lHadMatch := .T.
IF xWhere == NIL .OR. SqlIsTrue( ::EvalExpr( xWhere ) )
AAdd( aRows, ::FetchRow( aRE ) )
ENDIF
NEXT
ELSE
FOR i := 1 TO Len( aMatches )
dbSelectArea( nInnerWA )
dbGoto( aMatches[ i ] )
/* Hash key already matched — skip redundant ON re-evaluation for
* simple equi-joins (SQLite: ephemeral table probe is sufficient). */
lHadMatch := .T.
::JoinRecurse( aJoins, nIdx + 1, xWhere, aRE, @aRows, hHashTbl )
NEXT
ENDIF
ENDIF
RETURN lHadMatch

View File

@@ -548,6 +548,7 @@ var rtlFunctions = map[string]bool{
"DBCREATE": true, "DBINFO": true, "DBORDERINFO": true, "DBSETINDEX": true,
// FiveSql2 hybrid hot-path RTL (pcode + Go-native scan)
"PCCOMPILE": true, "PCEVAL": true, "SQLSCAN": true, "SQLEACH": true,
"SQLHASHBUILD": true,
// Field metadata + index creation
"FIELDTYPE": true, "FIELDLEN": true, "FIELDDEC": true,
"ORDCREATE": true, "DBCREATEINDEX": true, "DBCLEARINDEX": true,

View File

@@ -619,6 +619,7 @@ func RegisterRTL(vm *hbrt.VM) {
// Go-native SQL scan loop (bypasses PRG interpreter for hot path)
hbrt.Sym("SQLSCAN", hbrt.FsPublic, SqlScan),
hbrt.Sym("SQLEACH", hbrt.FsPublic, SqlEach),
hbrt.Sym("SQLHASHBUILD", hbrt.FsPublic, SqlHashBuild),
// Goroutine / Concurrency
hbrt.Sym("GO", hbrt.FsPublic, GoFunc),

View File

@@ -28,6 +28,7 @@ import (
"five/hbrdd"
"five/hbrdd/dbf"
"five/hbrt"
"strconv"
)
// SqlScan(aFieldPositions, pcWhere) → aRows
@@ -238,6 +239,158 @@ func SqlScan(t *hbrt.Thread) {
t.RetValue()
}
// SqlHashBuild(nFieldPos) → hHash
//
// Scans the current workarea from the top and returns a hash mapping each
// distinct value of field nFieldPos (1-based, converted to a string key by
// valueHashKey) to an array of the RecNos that hold that value. It is the
// Go-native replacement for the per-row PRG hash-build loop that
// FiveSql2's HashJoin used before.
//
// Keys appear in the returned hash in the order they were first seen
// during the scan, and each RecNo array is in scan order, so the result
// is deterministic for a given table.
//
// An empty hash is returned when nFieldPos is less than 1, when the
// thread has no *hbrdd.WorkAreaManager, or when no workarea is current.
//
// Side effect: the workarea's record pointer is left at EOF.
func SqlHashBuild(t *hbrt.Thread) {
	t.Frame(1, 0)
	defer t.EndProc()

	// returnEmpty hands an empty hash back to the PRG caller.
	returnEmpty := func() {
		t.PushValue(hbrt.MakeHash())
		t.RetValue()
	}

	// PRG field positions are 1-based; GetValue is called with 0-based.
	nFieldPos := int(t.Local(1).AsNumInt()) - 1
	if nFieldPos < 0 {
		returnEmpty()
		return
	}

	wam, ok := t.WA.(*hbrdd.WorkAreaManager)
	if !ok {
		returnEmpty()
		return
	}
	area := wam.Current()
	if area == nil {
		returnEmpty()
		return
	}

	// slotOf maps a key to its index in keyList/buckets. Keeping the keys
	// in a slice preserves first-seen order; ranging over a Go map would
	// yield a different key order on every run.
	slotOf := make(map[string]int, 4096)
	keyList := make([]string, 0, 4096)
	buckets := make([][]int64, 0, 4096)
	record := func(key string, recNo int64) {
		slot, seen := slotOf[key]
		if !seen {
			slot = len(keyList)
			slotOf[key] = slot
			keyList = append(keyList, key)
			buckets = append(buckets, nil)
		}
		buckets[slot] = append(buckets[slot], recNo)
	}

	// Type-assert once so the common DBF case loops on the concrete type.
	dbfArea, _ := area.(*dbf.DBFArea)
	if dbfArea != nil {
		dbfArea.GoTop()
		for !dbfArea.EOF() {
			// A read error is ignored: the row is keyed by whatever
			// value GetValue returned alongside the error.
			v, _ := dbfArea.GetValue(nFieldPos)
			record(valueHashKey(v), int64(dbfArea.RecNo()))
			dbfArea.Skip(1)
		}
	} else {
		// Generic workarea: RecNo is optional, so probe for it once
		// instead of on every row.
		recNoer, hasRecNo := area.(interface{ RecNo() uint32 })
		area.GoTop()
		for !area.EOF() {
			v, _ := area.GetValue(nFieldPos)
			// NOTE(review): without a RecNo method every row is recorded
			// as record 0 — confirm whether that can happen in practice.
			var rn int64
			if hasRecNo {
				rn = int64(recNoer.RecNo())
			}
			record(valueHashKey(v), rn)
			area.Skip(1)
		}
	}

	// Materialize as a Five hash by filling the Keys/Values/Order slices
	// directly instead of inserting key by key.
	nKeys := len(keyList)
	keys := make([]hbrt.Value, nKeys)
	vals := make([]hbrt.Value, nKeys)
	order := make([]int, nKeys)
	for i, k := range keyList {
		recs := buckets[i]
		items := make([]hbrt.Value, len(recs))
		for j, r := range recs {
			items[j] = hbrt.MakeNumInt(r)
		}
		keys[i] = hbrt.MakeString(k)
		vals[i] = hbrt.MakeArrayFrom(items)
		order[i] = i
	}

	result := hbrt.MakeHash()
	hh := result.AsHash()
	hh.Keys = keys
	hh.Values = vals
	hh.Order = order
	t.PushValue(result)
	t.RetValue()
}
// valueHashKey turns a field value into the string used as its hash-join
// key:
//
//   - NIL      → "\x00NIL"
//   - string   → the string with trailing spaces removed
//   - numeric  → strconvItoa for integers, strconvFtoa otherwise
//   - logical  → "T" or "F"
//   - date     → the Julian day number in decimal
//   - anything else → ""
//
// NOTE(review): the keys are intended to equal what PRG SqlValToStr
// produces for the probe side of the join — confirm against that function.
func valueHashKey(v hbrt.Value) string {
	if v.IsNil() {
		return "\x00NIL"
	}
	if v.IsString() {
		// Drop trailing blanks only; leading blanks stay part of the key.
		raw := v.AsString()
		cut := len(raw)
		for ; cut > 0; cut-- {
			if raw[cut-1] != ' ' {
				break
			}
		}
		return raw[:cut]
	}
	if v.IsNumeric() {
		if !v.IsNumInt() {
			return strconvFtoa(v.AsNumDouble())
		}
		return strconvItoa(v.AsNumInt())
	}
	if v.IsLogical() {
		if v.AsBool() {
			return "T"
		}
		return "F"
	}
	if v.IsDate() {
		return strconvItoa(v.AsJulian())
	}
	return ""
}
// strconvItoa formats n in base 10 using a fixed stack buffer. It is the
// hot path for integer hash keys, so it avoids going through strconv.
//
// The digits are produced from the unsigned magnitude of n: negating
// math.MinInt64 as an int64 overflows back to itself, which would leave
// nothing to format and yield just "-".
func strconvItoa(n int64) string {
	if n == 0 {
		return "0"
	}
	neg := n < 0
	mag := uint64(n)
	if neg {
		// Unsigned negation wraps, giving |n| for every int64 including
		// math.MinInt64 (whose magnitude does not fit in an int64).
		mag = -mag
	}
	// 19 digits is the most an int64 magnitude needs, plus one for '-'.
	var buf [20]byte
	i := len(buf)
	for mag > 0 {
		i--
		buf[i] = byte('0' + mag%10)
		mag /= 10
	}
	if neg {
		i--
		buf[i] = '-'
	}
	return string(buf[i:])
}
// strconvFtoa renders a float64 in strconv's 'g' format with the shortest
// precision that round-trips. It is only reached for numeric values that
// are not integers, so delegating to strconv is acceptable here.
func strconvFtoa(f float64) string {
	digits := strconv.AppendFloat(nil, f, 'g', -1, 64)
	return string(digits)
}
// SqlEach(aFieldPositions, pcWhere, bBlock) → NIL
//
// Streaming variant of SqlScan — instead of materializing all matching