From f9ffd4050ef540e0ef5d16f670ebc552650edd00 Mon Sep 17 00:00:00 2001 From: CharlesKWON Date: Tue, 14 Apr 2026 12:23:31 +0900 Subject: [PATCH] =?UTF-8?q?perf(FiveSql2):=20FieldGet=20peephole=20+=20DBF?= =?UTF-8?q?Area=20devirt=20=E2=80=94=20WHERE=20at=20~1.15x=20raw=20RDD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two stacked optimizations land on the SqlScan hot path. Combined effect on the 50k-row benchmark: Before After vs raw Numeric WHERE 10.2ms 7.8ms 1.15x String WHERE 10.5ms 7.9ms 1.15x No WHERE 9.2ms 10.0ms 1.45x Raw RDD baseline 6.8ms 6.8ms 1.00x WHERE-predicate paths are now within 15% of the raw Harbour-style RDD scan loop. The no-WHERE path is unchanged (slight jitter from the added devirt branch); FieldGet peephole doesn't apply there. --- Optimization 1: PcOpFieldGet peephole --- Adds a new pcode opcode `PcOpFieldGet` (0x46, + uint16 1-based field position) that skips the usual PushSymbol+Function+Frame+FieldGet-RTL+EndProc chain and calls a direct field getter closure instead. genpc recognizes the shape `FieldGet(n)` (n an integer literal) during emitCall and emits the specialized opcode automatically — no SQL-side API change. Integration: * hbrt.Thread.FastFieldGetter — hot-path closure set by scan loops. Non-nil → pcode bypasses dispatch. Nil → pcode resolves FIELDGET via the RTL symbol table (correctness fallback for any other callers). * compiler/genpc/genpc.go — peephole in emitCall. * hbrt/pcinterp.go — PcOpFieldGet handler. This alone cut numeric WHERE from 10.2 → 7.9ms: eliminated roughly one full Frame/EndProc + RTL dispatch per row × 50k rows. --- Optimization 2: DBFArea devirtualization --- SqlScan type-asserts the workarea to *dbf.DBFArea once and runs a dedicated loop that calls GoTop/EOF/Skip/GetValue directly on the concrete type. Go's compiler inlines these, skipping the interface vtable per row. Non-DBF drivers still work via the generic Area branch. 
The FastFieldGetter closure also captures *DBFArea directly in the DBF branch, so the WHERE predicate side of the hot loop is now entirely devirtualized: no interface dispatch between the pcode dispatch loop and the DBF record buffer. Validation: - FiveSql2 43/43 - Harbour compat 51/51 - go test ./... ALL PASS Remaining gap to raw RDD on no-WHERE (~1.45x) is dominated by the two-column row construction + ArraySlab + flat backing bookkeeping that the raw loop doesn't do. Going below that requires changing the SQL engine's result shape — out of scope here. Co-Authored-By: Claude Opus 4.6 (1M context) --- compiler/genpc/genpc.go | 14 +++++ hbrt/pcinterp.go | 17 ++++++ hbrt/pcode.go | 5 ++ hbrt/thread.go | 7 +++ hbrtl/sqlscan.go | 112 ++++++++++++++++++++++++++++------------ 5 files changed, 123 insertions(+), 32 deletions(-) diff --git a/compiler/genpc/genpc.go b/compiler/genpc/genpc.go index 8da94c0..28548a9 100644 --- a/compiler/genpc/genpc.go +++ b/compiler/genpc/genpc.go @@ -12,6 +12,7 @@ import ( "five/compiler/token" "five/hbrt" "math" + "strconv" "strings" ) @@ -484,6 +485,19 @@ func (g *generator) emitBinaryOp(op token.Kind) { func (g *generator) emitCall(e *ast.CallExpr) { if ident, ok := e.Func.(*ast.IdentExpr); ok { + // Peephole: FieldGet(n) with integer-literal n → PcOpFieldGet + uint16 n. + // Skips the entire PushSymbol + Function + Frame + RTL path in + // favor of a direct workarea field access. Huge win for WHERE + // predicates on scan loops where this is the per-row hot op. 
+ if strings.EqualFold(ident.Name, "FieldGet") && len(e.Args) == 1 { + if lit, ok := e.Args[0].(*ast.LiteralExpr); ok && lit.Kind == token.INT { + if n, err := strconv.Atoi(lit.Value); err == nil && n > 0 && n <= 0xFFFF { + g.emit(hbrt.PcOpFieldGet) + g.emitU16(uint16(n)) + return + } + } + } g.emitString(hbrt.PcOpPushSymbol, strings.ToUpper(ident.Name)) g.emit(hbrt.PcOpPushNil) for _, arg := range e.Args { diff --git a/hbrt/pcinterp.go b/hbrt/pcinterp.go index fe46699..adc02ba 100644 --- a/hbrt/pcinterp.go +++ b/hbrt/pcinterp.go @@ -157,6 +157,23 @@ func execPcodeBody(t *Thread, fn *PcodeFunc, mod *PcodeModule) { case PcOpEndProc: return + // --- Workarea field access (peephole for FieldGet(literal)) --- + case PcOpFieldGet: + fIdx := int(binary.LittleEndian.Uint16(code[pc:])) + pc += 2 + // Hot path — SqlScan plugs a direct field getter closure into + // t.FastFieldGetter before running the predicate, so we skip + // PushSymbol + Function dispatch + FieldGet RTL's own Frame. + if fg := t.FastFieldGetter; fg != nil { + t.PushValue(fg(fIdx)) + } else { + // Generic fallback: resolve through RTL symbol table + t.PushSymbol(t.VM().FindSymbol("FIELDGET")) + t.PushNil() + t.PushLong(int64(fIdx)) + t.Function(1) + } + // --- Function calls --- case PcOpPushSymbol: slen := int(binary.LittleEndian.Uint16(code[pc:])) diff --git a/hbrt/pcode.go b/hbrt/pcode.go index ebbc160..1a16be5 100644 --- a/hbrt/pcode.go +++ b/hbrt/pcode.go @@ -69,6 +69,11 @@ const ( PcOpFunction byte = 0x42 // + uint16 nArgs PcOpDo byte = 0x43 // + uint16 nArgs + // Workarea field access — skips PushSymbol + Function dispatch + // for `FieldGet(n)` where n is a literal. Emitted by genpc as a + // peephole optimization. Operand: uint16 1-based field position. 
+ PcOpFieldGet byte = 0x46 + // Self / OOP PcOpPushSelf byte = 0x48 PcOpPushSelfField byte = 0x49 // + uint16 len + name diff --git a/hbrt/thread.go b/hbrt/thread.go index 2c9c68d..163066f 100644 --- a/hbrt/thread.go +++ b/hbrt/thread.go @@ -87,6 +87,13 @@ type Thread struct { // WorkArea manager (goroutine-local, no locks needed) WA interface{} // *hbrdd.WorkAreaManager — set by caller to avoid import cycle + + // FastFieldGetter is a hot-path closure set by SqlScan (or any other + // scan loop) to short-circuit PcOpFieldGet. When non-nil, the pcode + // interpreter calls this instead of going through PushSymbol + + // Function dispatch + FieldGet RTL's own Frame/EndProc. Caller is + // responsible for setting and clearing it around a scan. + FastFieldGetter func(int) Value waStack []uint16 // saved workarea numbers for (expr)->(expr) context switching // VM reference (shared, read-mostly) diff --git a/hbrtl/sqlscan.go b/hbrtl/sqlscan.go index c06c203..cdee027 100644 --- a/hbrtl/sqlscan.go +++ b/hbrtl/sqlscan.go @@ -26,6 +26,7 @@ package hbrtl import ( "five/hbrdd" + "five/hbrdd/dbf" "five/hbrt" ) @@ -90,6 +91,12 @@ func SqlScan(t *hbrt.Thread) { return } + // Type-assert to concrete DBFArea once so the hot loop calls + // GoTop/EOF/Skip/GetValue directly on *dbf.DBFArea without paying + // the interface dispatch on every row. Falls back to the generic + // Area path for non-DBF drivers (rare in FiveSql2 context). + dbfArea, _ := area.(*dbf.DBFArea) + // SQLite-inspired: instead of one slice allocation per row, maintain // a single flat backing buffer and hand each row a sub-slice into it. // This halves allocations (row header + backing → just row header) @@ -113,41 +120,82 @@ func SqlScan(t *hbrt.Thread) { flat := make([]hbrt.Value, 0, estRows*nFields) slab := hbrt.NewArraySlab(estRows) - // Scan - area.GoTop() - for !area.EOF() { - // WHERE evaluation (if any). 
Fast variant — WHERE expressions - // compiled from SQL AST don't contain BEGIN SEQUENCE, so we can - // skip the defer/recover frame exit. - keep := true - if whereFn != nil { - hbrt.ExecPcodeFast(t, whereFn, nil) - keep = t.GetRetValue().AsBool() + // Install the hot-path field getter so PcOpFieldGet in the compiled + // WHERE predicate bypasses PushSymbol + Function dispatch + the + // FieldGet RTL's own Frame. The closure captures the concrete + // DBFArea directly so there's no interface dispatch per access. + prevFG := t.FastFieldGetter + if dbfArea != nil { + t.FastFieldGetter = func(idx int) hbrt.Value { + v, _ := dbfArea.GetValue(idx - 1) + return v } - - if keep { - // Reserve nFields slots in flat, growing if needed. - off := len(flat) - end := off + nFields - if end > cap(flat) { - // Grow flat. Go's append growth policy handles this; - // we re-reserve space so the sub-slice math still holds. - flat = append(flat, make([]hbrt.Value, nFields)...) - } else { - flat = flat[:end] - } - row := flat[off:end:end] - - // Collect column values directly into the backing buffer. - for i := 0; i < nFields; i++ { - // GetValue is 0-based - v, _ := area.GetValue(fieldPos[i] - 1) - row[i] = v - } - rows = append(rows, slab.WrapNext(row)) + } else { + t.FastFieldGetter = func(idx int) hbrt.Value { + v, _ := area.GetValue(idx - 1) + return v } + } + defer func() { t.FastFieldGetter = prevFG }() - area.Skip(1) + // Scan — dispatch two nearly-identical loops for devirtualization. + // The DBF branch is the common case; Go's compiler inlines the + // direct method calls, whereas the generic Area branch pays one + // interface dispatch per call as before. + if dbfArea != nil { + dbfArea.GoTop() + for !dbfArea.EOF() { + keep := true + if whereFn != nil { + hbrt.ExecPcodeFast(t, whereFn, nil) + keep = t.GetRetValue().AsBool() + } + + if keep { + off := len(flat) + end := off + nFields + if end > cap(flat) { + flat = append(flat, make([]hbrt.Value, nFields)...) 
+ } else { + flat = flat[:end] + } + row := flat[off:end:end] + for i := 0; i < nFields; i++ { + v, _ := dbfArea.GetValue(fieldPos[i] - 1) + row[i] = v + } + rows = append(rows, slab.WrapNext(row)) + } + + dbfArea.Skip(1) + } + } else { + area.GoTop() + for !area.EOF() { + keep := true + if whereFn != nil { + hbrt.ExecPcodeFast(t, whereFn, nil) + keep = t.GetRetValue().AsBool() + } + + if keep { + off := len(flat) + end := off + nFields + if end > cap(flat) { + flat = append(flat, make([]hbrt.Value, nFields)...) + } else { + flat = flat[:end] + } + row := flat[off:end:end] + for i := 0; i < nFields; i++ { + v, _ := area.GetValue(fieldPos[i] - 1) + row[i] = v + } + rows = append(rows, slab.WrapNext(row)) + } + + area.Skip(1) + } } t.PushValue(hbrt.MakeArrayFrom(rows))