Files
five/hbrtl/sqlscan.go
CharlesKWON f9ffd4050e perf(FiveSql2): FieldGet peephole + DBFArea devirt — WHERE at ~1.15x raw RDD
Two stacked optimizations land on the SqlScan hot path. Combined
effect on the 50k-row benchmark:

                       Before    After   vs raw
  Numeric WHERE        10.2ms    7.8ms   1.15x
  String WHERE         10.5ms    7.9ms   1.15x
  No WHERE              9.2ms   10.0ms   1.45x
  Raw RDD baseline      6.8ms    6.8ms   1.00x

WHERE-predicate paths are now within 15% of the raw Harbour-style
RDD scan loop. The no-WHERE path is unchanged (slight jitter from
the added devirt branch); FieldGet peephole doesn't apply there.

--- Optimization 1: PcOpFieldGet peephole ---

Adds a new pcode opcode `PcOpFieldGet <fieldIdx>` (0x46) that skips
the usual PushSymbol+Function+Frame+FieldGet-RTL+EndProc chain and
calls a direct field getter closure instead. genpc recognizes the
shape `FieldGet(<int-literal>)` during emitCall and emits the
specialized opcode automatically — no SQL-side API change.

Integration:
  * hbrt.Thread.FastFieldGetter  — hot-path closure set by scan loops.
                                   Non-nil → pcode bypasses dispatch.
                                   Nil → pcode resolves FIELDGET via
                                   the RTL symbol table (correctness
                                   fallback for any other callers).
  * compiler/genpc/genpc.go      — peephole in emitCall.
  * hbrt/pcinterp.go             — PcOpFieldGet handler.

This alone cut numeric WHERE from 10.2 → 7.9ms: eliminated roughly
one full Frame/EndProc + RTL dispatch per row × 50k rows.

--- Optimization 2: DBFArea devirtualization ---

SqlScan type-asserts the workarea to *dbf.DBFArea once and runs a
dedicated loop that calls GoTop/EOF/Skip/GetValue directly on the
concrete type. Go's compiler inlines these, skipping the interface
vtable per row. Non-DBF drivers still work via the generic Area
branch.

The FastFieldGetter closure also captures *DBFArea directly in the
DBF branch, so the WHERE predicate side of the hot loop is now
entirely devirtualized: no interface dispatch between the pcode
dispatch loop and the DBF record buffer.

Validation:
  - FiveSql2 43/43
  - Harbour compat 51/51
  - go test ./... ALL PASS

Remaining gap to raw RDD on no-WHERE (~1.45x) is dominated by the
two-column row construction + ArraySlab + flat backing bookkeeping
that the raw loop doesn't do. Going below that requires changing
the SQL engine's result shape — out of scope here.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-14 12:23:31 +09:00

204 lines
5.9 KiB
Go

// Copyright (c) 2026 Charles KWON OhJun (charleskwonohjun@gmail.com)
// All rights reserved.
// Go-native SQL scan loop for FiveSql2 hot path.
//
// Motivation: FiveSql2 is a PRG-based SQL interpreter. For simple
// "SELECT cols FROM table WHERE cond" queries, the per-row cost is
// dominated by PRG interpreter overhead (AST tree walk, field name
// lookup, workarea switching). Moving just the inner scan loop to Go
// bypasses all that overhead and gets us ~15x speedup for the common
// case while keeping the rest of FiveSql2 untouched.
//
// The SQL engine remains responsible for:
// - Parsing SQL and building AST
// - Resolving field names to positions (column binding)
// - Compiling WHERE expression to pcode (via PcCompile)
// - GROUP BY, ORDER BY, aggregates (not per-row)
//
// This helper only handles the hot loop:
// - Full table scan of the currently selected workarea (rewound via GoTop first)
// - Per-row WHERE evaluation via ExecPcode
// - Column extraction via cached field positions
// - Result array construction
package hbrtl
import (
"five/hbrdd"
"five/hbrdd/dbf"
"five/hbrt"
)
// SqlScan implements the PRG-callable SqlScan(aFieldPositions, pcWhere) → aRows.
//
// It rewinds the current workarea with GoTop, walks it record by record
// until EOF, evaluates pcWhere for each record (NIL = no filter) and
// collects the selected field values of every accepted record into a row.
//
// Arguments (PRG locals 1 and 2):
//   - aFieldPositions: array of 1-based field positions to extract per
//     row. Resolve them once before calling (the FieldPos cache is O(1)
//     but still has PRG → Go call overhead). Positions below 1 are
//     clamped to 1.
//   - pcWhere: pcode function pointer from PcCompile, or NIL. A non-NIL
//     value that does not carry a *hbrt.PcodeFunc is treated like NIL,
//     so the scan runs unfiltered.
//
// Returns an array of rows, each row being an array of field values. An
// empty array is returned when aFieldPositions is not an array, when the
// thread's workarea manager is not a *hbrdd.WorkAreaManager, or when no
// workarea is current.
//
// Side effects: the workarea is left wherever the scan stops, and
// t.FastFieldGetter is replaced for the duration of the scan and restored
// on return. Errors reported by GetValue are ignored; whatever value came
// back alongside the error is stored as-is.
//
// Notes on CHAR trimming: DBF character fields are space-padded. The
// caller decides whether to trim (via a SELECT-list AllTrim wrapper).
// We don't trim here — that's a semantic choice, and callers who need
// raw bytes shouldn't pay for a strings.TrimSpace().
func SqlScan(t *hbrt.Thread) {
	t.Frame(2, 0)
	defer t.EndProc()

	// Upper bound on the number of rows pre-allocated up front and on the
	// size (in rows) of any backing chunk allocated later.
	const maxChunkRows = 1 << 20

	// returnEmpty pushes an empty array as the PRG return value.
	returnEmpty := func() {
		t.PushValue(hbrt.MakeArray(0))
		t.RetValue()
	}

	// Parse arguments.
	fieldsVal := t.Local(1)
	if !fieldsVal.IsArray() {
		returnEmpty()
		return
	}
	fieldsArr := fieldsVal.AsArray().Items
	nFields := len(fieldsArr)

	whereVal := t.Local(2)
	var whereFn *hbrt.PcodeFunc
	if !whereVal.IsNil() {
		if p := whereVal.AsPointer(); p != nil {
			// A pointer of any other type leaves whereFn nil (no filter).
			whereFn, _ = p.(*hbrt.PcodeFunc)
		}
	}

	// Pre-convert field positions to []int (avoid Value->int per row).
	fieldPos := make([]int, nFields)
	for i := range fieldsArr {
		fieldPos[i] = int(fieldsArr[i].AsNumInt())
		if fieldPos[i] < 1 {
			fieldPos[i] = 1
		}
	}

	wam, ok := t.WA.(*hbrdd.WorkAreaManager)
	if !ok {
		returnEmpty()
		return
	}
	area := wam.Current()
	if area == nil {
		returnEmpty()
		return
	}

	// Type-assert to concrete DBFArea once so the hot loop calls
	// GoTop/EOF/Skip/GetValue directly on *dbf.DBFArea without paying
	// the interface dispatch on every row. Falls back to the generic
	// Area path for non-DBF drivers (rare in FiveSql2 context).
	dbfArea, _ := area.(*dbf.DBFArea)

	// SQLite-inspired: instead of one slice allocation per row, maintain
	// a flat backing buffer and hand each row a sub-slice into it. This
	// halves allocations (row header + backing → just row header) and
	// keeps row data contiguous in memory for better cache locality.
	//
	// Safety: we cap each sub-slice to exactly nFields via the 3-index
	// slice form (flat[off:end:end]). Any later `append` on an individual
	// row will then trigger a reallocation of that row's backing, so we
	// don't clobber neighboring rows if PRG code mutates via AAdd.
	//
	// Size the initial backing based on the workarea's record count —
	// even if WHERE filters most rows out, over-allocating beats
	// repeatedly running out of backing mid-scan.
	estRows := 1024
	if rc, err := area.RecCount(); err == nil && rc > 0 {
		estRows = int(rc)
		if estRows > maxChunkRows {
			estRows = maxChunkRows
		}
	}
	rows := make([]hbrt.Value, 0, estRows)
	flat := make([]hbrt.Value, 0, estRows*nFields)
	slab := hbrt.NewArraySlab(estRows)

	// newChunk returns a fresh backing chunk whose first nFields slots are
	// already in use by the row being emitted. It is only reached when the
	// estimate was too low (RecCount failed, the table is larger than
	// maxChunkRows, or records appeared mid-scan).
	//
	// A fresh chunk is used instead of append-growing `flat`: rows already
	// handed out keep pointing at the old backing array, so copying their
	// values into a bigger array would only create a dead duplicate and
	// roughly double peak memory. Chunk size doubles (up to maxChunkRows)
	// so the number of chunks stays logarithmic in the row count.
	chunkRows := estRows
	newChunk := func() []hbrt.Value {
		if chunkRows < maxChunkRows {
			chunkRows *= 2
			if chunkRows > maxChunkRows {
				chunkRows = maxChunkRows
			}
		}
		return make([]hbrt.Value, nFields, chunkRows*nFields)
	}

	// Install the hot-path field getter so PcOpFieldGet in the compiled
	// WHERE predicate bypasses PushSymbol + Function dispatch + the
	// FieldGet RTL's own Frame. The closure captures the concrete
	// DBFArea directly so there's no interface dispatch per access.
	// The previous getter is restored on return so nested or subsequent
	// callers see the state they installed.
	prevFG := t.FastFieldGetter
	if dbfArea != nil {
		t.FastFieldGetter = func(idx int) hbrt.Value {
			v, _ := dbfArea.GetValue(idx - 1) // idx is 1-based
			return v
		}
	} else {
		t.FastFieldGetter = func(idx int) hbrt.Value {
			v, _ := area.GetValue(idx - 1) // idx is 1-based
			return v
		}
	}
	defer func() { t.FastFieldGetter = prevFG }()

	// Scan — dispatch two nearly-identical loops for devirtualization.
	// The DBF branch is the common case; it calls methods on the concrete
	// type, whereas the generic Area branch pays one interface dispatch
	// per call as before. The duplication is deliberate: keep both loop
	// bodies in sync when changing either.
	if dbfArea != nil {
		dbfArea.GoTop()
		for !dbfArea.EOF() {
			keep := true
			if whereFn != nil {
				hbrt.ExecPcodeFast(t, whereFn, nil)
				keep = t.GetRetValue().AsBool()
			}
			if keep {
				off := len(flat)
				end := off + nFields
				if end > cap(flat) {
					flat = newChunk()
					off, end = 0, nFields
				} else {
					flat = flat[:end]
				}
				row := flat[off:end:end]
				for i := 0; i < nFields; i++ {
					v, _ := dbfArea.GetValue(fieldPos[i] - 1)
					row[i] = v
				}
				rows = append(rows, slab.WrapNext(row))
			}
			dbfArea.Skip(1)
		}
	} else {
		area.GoTop()
		for !area.EOF() {
			keep := true
			if whereFn != nil {
				hbrt.ExecPcodeFast(t, whereFn, nil)
				keep = t.GetRetValue().AsBool()
			}
			if keep {
				off := len(flat)
				end := off + nFields
				if end > cap(flat) {
					flat = newChunk()
					off, end = 0, nFields
				} else {
					flat = flat[:end]
				}
				row := flat[off:end:end]
				for i := 0; i < nFields; i++ {
					v, _ := area.GetValue(fieldPos[i] - 1)
					row[i] = v
				}
				rows = append(rows, slab.WrapNext(row))
			}
			area.Skip(1)
		}
	}

	t.PushValue(hbrt.MakeArrayFrom(rows))
	t.RetValue()
}