perf(fivesql2): Go-native FetchRow fast path — 1.3-1.7x on agg/window

TSqlExecutor:FetchRow was the per-row workhorse for aggregation,
HAVING, and window queries. Even with the pre-built aFetchCache
binding columns to (nWA, nFPos), the PRG FOR loop still paid several
interpreted calls per column per row (dbSelectArea, FieldGet,
AllTrim, AAdd) — profile pinned it at ~30% of B4 CPU.

SqlFetchRowFast collapses the cache-path loop into a single Go
call:
  - bound entry: SelectByNum + area.GetValue directly
  - unbound (aggregate/expression): self:EvalExpr via Send
  - character values: TrimSpace inline
The PRG FetchRow keeps its original cache-miss fallback path
unchanged for rare queries where aFetchCache isn't built.

Bench deltas (median of 3 steady runs, 1000 iters):
  B4_GROUP_HAVING 418 → 327 us  -22% (1.28x)
  B9_ROW_NUMBER   191 → 120 us  -37% (1.59x)
  B10_RANK_PART   228 → 135 us  -41% (1.69x)
  B11_SUM_OVER    249 → 156 us  -37% (1.60x)
  B14_COUNT       235 → 219 us  -7%
  B15_CTE_WIN_JOIN 1577 → 1452 us  -8%
Single-table SELECT (B1-B3, B5-B7, B8) stays flat — those already
hit the column-binding fast path and don't need aggregate dispatch.

FiveSql2 43/43, Harbour compat 56/56.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-18 13:50:02 +09:00
parent c84cde6175
commit 935883bb88
3 changed files with 94 additions and 20 deletions

View File

@@ -753,28 +753,14 @@ RETURN aCache
METHOD FetchRow( aExprs ) CLASS TSqlExecutor
LOCAL aRow := {}, i, xVal
LOCAL xE, cRef, nDot, nWA, nFPos, cField, cTblAlias, cA, aBound
LOCAL xE, cRef, nDot, nWA, nFPos, cField, cTblAlias, cA
/* Fastest path: pre-bound columns (built once per join by RunSelect) */
/* Fastest path: pre-bound columns (built once per join by RunSelect).
* Go-native: SqlFetchRowFast collapses the per-row Harbour FOR loop
* into a single Go call, saving ~30% of GROUP BY CPU spent in PRG
* method dispatch. Falls back to self:EvalExpr for unbound entries. */
IF ::aFetchCache != NIL .AND. Len( ::aFetchCache ) == Len( aExprs )
FOR i := 1 TO Len( aExprs )
aBound := ::aFetchCache[ i ]
IF aBound != NIL
dbSelectArea( aBound[ 1 ] )
xVal := FieldGet( aBound[ 2 ] )
IF ValType( xVal ) == "C"
xVal := AllTrim( xVal )
ENDIF
AAdd( aRow, xVal )
ELSE
xVal := ::EvalExpr( aExprs[ i ][ 1 ] )
IF ValType( xVal ) == "C"
xVal := AllTrim( xVal )
ENDIF
AAdd( aRow, xVal )
ENDIF
NEXT
RETURN aRow
RETURN SqlFetchRowFast( Self, aExprs, ::aFetchCache )
ENDIF
FOR i := 1 TO Len( aExprs )

View File

@@ -638,6 +638,7 @@ func RegisterRTL(vm *hbrt.VM) {
hbrt.Sym("SQLCOERCEFORCMP", hbrt.FsPublic, SqlCoerceForCmp),
hbrt.Sym("SQLISTRUE", hbrt.FsPublic, SqlIsTrue),
hbrt.Sym("SQLISAGGNAME", hbrt.FsPublic, SqlIsAggName),
hbrt.Sym("SQLFETCHROWFAST", hbrt.FsPublic, SqlFetchRowFast),
hbrt.Sym("SQLCMPEQ", hbrt.FsPublic, SqlCmpEq),
hbrt.Sym("SQLCMPLT", hbrt.FsPublic, SqlCmpLt),
hbrt.Sym("SQLEXTRACTTEMPLATE", hbrt.FsPublic, SqlExtractTemplate),

View File

@@ -2405,3 +2405,90 @@ func SqlEach(t *hbrt.Thread) {
t.RetNil()
}
// SqlFetchRowFast(oSelf, aExprs, aFetchCache) → aRow
//
// Builds one result row for TSqlExecutor:FetchRow in a single Go call
// instead of a per-column PRG loop (the profile cited for this change
// put FetchRow at ~30% of B4 GROUP+HAVING CPU).
//
// Parameters (PRG locals 1..3):
//   - oSelf:       executor object; receives EVALEXPR for columns that
//     cannot be read straight from a work area.
//   - aExprs:      array of select expressions; element [1] of each entry
//     is the node handed to EvalExpr.
//   - aFetchCache: optional array parallel to aExprs. An entry shaped
//     `{nWA, nFPos}` binds the column to a work-area field;
//     any other entry means "evaluate the expression".
//
// Returns an array with one value per aExprs element, or an empty array
// when aExprs is not an array. String values are passed through
// strings.TrimSpace (standing in for PRG AllTrim on space-padded DBF data).
//
// The cache is honoured only when its length equals Len(aExprs). Reading
// a bound column selects its work area via SelectByNum; nothing in this
// function restores the previous selection afterwards.
func SqlFetchRowFast(t *hbrt.Thread) {
	t.Frame(3, 0)
	defer t.EndProc()

	executor := t.Local(1)
	exprsArg := t.Local(2)
	cacheArg := t.Local(3)

	// Guard: without an expression list there is nothing to fetch.
	if !exprsArg.IsArray() {
		t.PushValue(hbrt.MakeArrayFrom(nil))
		t.RetValue()
		return
	}
	exprList := exprsArg.AsArray().Items

	// bindings stays nil unless the cache lines up 1:1 with exprList, so
	// `i < len(bindings)` below doubles as the "cache is usable" test.
	var bindings []hbrt.Value
	if cacheArg.IsArray() {
		if candidate := cacheArg.AsArray().Items; len(candidate) == len(exprList) {
			bindings = candidate
		}
	}

	wa := getWA(t)
	row := make([]hbrt.Value, len(exprList))

	for i := range exprList {
		var cell hbrt.Value
		resolved := false

		// Bound column: read the field directly from its work area.
		if wa != nil && i < len(bindings) {
			binding := bindings[i]
			if !binding.IsNil() && binding.IsArray() {
				if pair := binding.AsArray().Items; len(pair) >= 2 {
					areaNum := uint16(pair[0].AsNumInt())
					fieldPos := int(pair[1].AsNumInt())
					wa.SelectByNum(areaNum)
					if area := wa.Current(); area != nil {
						// Cache stores nFPos; GetValue is called with nFPos-1
						// (presumably a 0-based field index).
						if fieldVal, err := area.GetValue(fieldPos - 1); err == nil {
							cell, resolved = fieldVal, true
						}
					}
				}
			}
		}

		// Unbound column, or the direct read failed: self:EvalExpr(exprs[i][1]).
		if !resolved {
			var node hbrt.Value
			if exprList[i].IsArray() {
				if parts := exprList[i].AsArray().Items; len(parts) > 0 {
					node = parts[0]
				}
			}
			t.PushValue(executor)
			t.PushValue(node)
			t.Send("EVALEXPR", 1)
			cell = t.Pop2()
		}

		// Trim character values; only build a new string when trimming
		// actually removed something.
		if cell.IsString() {
			raw := cell.AsString()
			if trimmed := strings.TrimSpace(raw); len(trimmed) != len(raw) {
				cell = hbrt.MakeString(trimmed)
			}
		}

		row[i] = cell
	}

	t.PushValue(hbrt.MakeArrayFrom(row))
	t.RetValue()
}