perf(vm): symbol hoist + Function() stack shift — global 3-15%

The VM call path (PushSymbol → Function → Frame) is traversed by every
PRG function call. Three changes together cut per-call overhead across
the entire bench suite.

Changes
 - hbrt/call.go Function(): replace pop-push dance with a single slice
   shift (N+2 pops + N pushes → 1 copy of N slots + sp adjust). Kills
   the per-call `make([]Value, nArgs)` heap alloc. Resolved function
   pointer is cached back into sym.Func so subsequent calls on the
   same Symbol skip the VM lookup entirely.
 - hbrt/vm.go GetSym(): new helper. Generated code calls it with a
   pointer to a package-level `*Symbol` slot so FindSymbol (which takes
   the VM RWMutex + map lookup) runs at most once per symbol per
   process. Nil results are intentionally NOT cached — an init-order
   miss becomes a retry on the next call instead of a permanent sticky
   failure.
 - hbrt/thread.go pushPendingSym(): scalar fast slot for depth=1 call
   nesting (common case). Nil syms still go through the slice so the
   "empty vs stored nil" ambiguity can't produce a false pop.
 - compiler/gengo/gengo.go: emit `t.PushSymbol(t.GetSym(&_sym_<file>_<NAME>, "NAME"))`
   for every function call site, with a per-file prefix so multi-PRG
   builds don't collide on identical symbol names.

Bugs fixed during bring-up
 - pendingSymFast == nil was ambiguous ("unused" vs "nil stored"). Nil
   syms now spill to the slice, preserving distinguishability.
 - The old varName-reuse branch at the PushSymbol emit site skipped
   the GetSym wrapper, emitting a raw `t.PushSymbol(varName)` against
   an uninitialized package-level *Symbol. Every call path now funnels
   through emitPushSymbol.

bench_sql deltas vs prior build
 - B1  SELECT *          114 →  97 µs   (15%)
 - B4  GROUP_HAVING      584 → 554 µs   (5%)
 - B8  RECURSIVE CTE     150 → 141 µs   (6%)
 - B10 RANK PARTITION    310 → 296 µs   (5%)
 - B11 SUM OVER          335 → 320 µs   (4%)
 - B14 COUNT             295 → 281 µs   (5%)
 - B15 CTE+WIN+JOIN     1891 → 1826 µs  (3%)

Verification
 - go test ./...               ALL PASS
 - FiveSql2 test_sql1999       43/43
 - tests/compat_harbour        56/56

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-17 20:41:48 +09:00
parent dd270d5d9d
commit 1f63c7fe63
5 changed files with 198 additions and 36 deletions

View File

@@ -22,6 +22,7 @@ import (
"five/compiler/token"
"fmt"
"path/filepath"
"sort"
"strings"
)
@@ -120,6 +121,8 @@ func doGenerate(file *ast.File, debug, library bool) string {
g.importAlias["five/hbrdd/mem"] = "_"
}
g.symCache = map[string]string{}
g.emitHeader()
g.emitSymbols()
for _, d := range file.Decls {
@@ -131,6 +134,7 @@ func doGenerate(file *ast.File, debug, library bool) string {
} else {
g.emitMain()
}
g.emitSymCache()
// Patch deferred imports: inline RTL may add "fmt"/"strings" after header was emitted.
result := g.buf.String()
@@ -166,6 +170,89 @@ func (g *Generator) writef(format string, args ...interface{}) {
fmt.Fprintf(&g.buf, format, args...)
}
// symVar returns the package-level cache variable name for the given
// symbol, registering it if first seen. The name embeds a per-file
// prefix so multi-PRG builds don't collide on identical symbol names
// across files. Characters that can't appear in a Go identifier are
// replaced with underscores.
func (g *Generator) symVar(name string) string {
if g.symCache == nil {
g.symCache = map[string]string{}
}
if v, ok := g.symCache[name]; ok {
return v
}
var sb strings.Builder
sb.WriteString("_sym_")
sb.WriteString(g.fileKey())
sb.WriteByte('_')
for i := 0; i < len(name); i++ {
c := name[i]
switch {
case c >= 'A' && c <= 'Z', c >= 'a' && c <= 'z', c >= '0' && c <= '9', c == '_':
sb.WriteByte(c)
default:
sb.WriteByte('_')
}
}
v := sb.String()
g.symCache[name] = v
return v
}
// fileKey derives a short identifier-safe prefix from g.file.Name so
// package-level symbol caches don't collide across PRG files.
func (g *Generator) fileKey() string {
base := g.file.Name
if idx := strings.LastIndex(base, "/"); idx >= 0 {
base = base[idx+1:]
}
if idx := strings.LastIndexByte(base, '.'); idx >= 0 {
base = base[:idx]
}
var sb strings.Builder
for i := 0; i < len(base); i++ {
c := base[i]
switch {
case c >= 'A' && c <= 'Z', c >= 'a' && c <= 'z', c >= '0' && c <= '9':
sb.WriteByte(c)
default:
sb.WriteByte('_')
}
}
return sb.String()
}
// emitPushSymbol writes PushSymbol using the lazy-cached package-level
// variable. First call resolves via VM; subsequent calls skip the
// RWMutex + map lookup.
func (g *Generator) emitPushSymbol(name string) {
v := g.symVar(name)
g.writeln(fmt.Sprintf("t.PushSymbol(t.GetSym(&%s, %q))", v, name))
}
// emitSymCache writes the package-level `var _sym_NAME *hbrt.Symbol`
// declarations discovered during body emission. Called after all
// function bodies are emitted so every PushSymbol call site has had
// a chance to register its target name.
func (g *Generator) emitSymCache() {
if len(g.symCache) == 0 {
return
}
// Deterministic order so diffs are stable.
names := make([]string, 0, len(g.symCache))
for k := range g.symCache {
names = append(names, k)
}
sort.Strings(names)
g.writeln("")
g.writeln("// Cached symbol pointers — populated lazily on first use.")
for _, n := range names {
g.writeln(fmt.Sprintf("var %s *hbrt.Symbol", g.symCache[n]))
}
g.writeln("")
}
func (g *Generator) writeln(s string) {
g.writeIndent()
g.buf.WriteString(s)
@@ -689,7 +776,7 @@ func (g *Generator) emitStmt(stmt ast.Stmt, locals localMap) {
if onOff == "OFF" {
val = "false"
}
g.writeln(fmt.Sprintf(`t.PushSymbol(t.VM().FindSymbol(%q))`, funcName))
g.emitPushSymbol(funcName)
g.writeln("t.PushNil()")
g.writeln(fmt.Sprintf("t.PushBool(%s)", val))
g.writeln("t.Do(1)")
@@ -704,7 +791,7 @@ func (g *Generator) emitStmt(stmt ast.Stmt, locals localMap) {
"EPOCH": "SETEPOCH",
}
if funcName, ok := valueFuncMap[upper]; ok && s.Expr != nil {
g.writeln(fmt.Sprintf(`t.PushSymbol(t.VM().FindSymbol(%q))`, funcName))
g.emitPushSymbol(funcName)
g.writeln("t.PushNil()")
g.emitExpr(s.Expr)
g.writeln("t.Do(1)")
@@ -813,7 +900,7 @@ func (g *Generator) emitQOut(s *ast.QOutStmt, locals localMap) {
if s.IsQQ {
sym = "QQOUT"
}
g.writeln(fmt.Sprintf("t.PushSymbol(t.VM().FindSymbol(%q))", sym))
g.emitPushSymbol(sym)
g.writeln("t.PushNil()")
for _, expr := range s.Exprs {
g.emitExpr(expr)
@@ -835,7 +922,7 @@ func (g *Generator) emitExprStmt(s *ast.ExprStmt, locals localMap) {
// Bare identifier as statement (e.g., CLS, CLEAR) — treat as zero-arg function call
if ident, ok := s.X.(*ast.IdentExpr); ok {
if _, found := locals[strings.ToUpper(ident.Name)]; !found {
g.writeln(fmt.Sprintf("t.PushSymbol(t.VM().FindSymbol(%q))", strings.ToUpper(ident.Name)))
g.emitPushSymbol(strings.ToUpper(ident.Name))
g.writeln("t.PushNil()")
g.writeln("t.Do(0)")
return
@@ -1022,7 +1109,7 @@ func (g *Generator) emitAssign(a *ast.AssignExpr, locals localMap) {
func (g *Generator) emitCallAsStmt(call *ast.CallExpr, locals localMap) {
if ident, ok := call.Func.(*ast.IdentExpr); ok {
g.writeln(fmt.Sprintf("t.PushSymbol(t.VM().FindSymbol(%q))", strings.ToUpper(ident.Name)))
g.emitPushSymbol(strings.ToUpper(ident.Name))
} else {
g.emitExpr(call.Func)
}
@@ -1783,7 +1870,7 @@ func (g *Generator) emitIndexKeyExpr(expr ast.Expr) {
}
return
}
g.writeln(fmt.Sprintf("t.PushSymbol(t.VM().FindSymbol(%q))", upper))
g.emitPushSymbol(upper)
} else {
g.emitExpr(e.Func)
}
@@ -1953,15 +2040,7 @@ func (g *Generator) emitCall(e *ast.CallExpr) {
g.writeln("t.PushNil()")
return
}
if g.symCache != nil {
if varName, ok := g.symCache[upper]; ok {
g.writeln(fmt.Sprintf("t.PushSymbol(%s)", varName))
} else {
g.writeln(fmt.Sprintf("t.PushSymbol(t.VM().FindSymbol(%q))", upper))
}
} else {
g.writeln(fmt.Sprintf("t.PushSymbol(t.VM().FindSymbol(%q))", upper))
}
g.emitPushSymbol(upper)
} else {
g.emitExpr(e.Func)
}

View File

@@ -62,6 +62,7 @@ FiveSql2 성능 개선 흐름(`3caadb2 SqlOrderBy+SqlGroupBy Go RTL`, `5fc9c3b S
| 25 | Plan pcode 캐시 + SqlBulkUpdate flush 지연 | [_FiveSql2/src/TSqlExecutor.prg s_hDmlPcodeCache + cCacheKey](../_FiveSql2/src/TSqlExecutor.prg), [hbrtl/sqlscan.go SqlBulkUpdate](../hbrtl/sqlscan.go) | 플랜 키별 컴파일된 pcode(aFPos/where/set_pc) 캐시 + WA cache 활성 시 Go RTL 내부 `Flush()` 스킵 → **B13 UPDATE 48x** | **완료 (2026-04-17)** |
| 26 | SELECT 경로 plan pcode 캐시 | [_FiveSql2/src/TSqlExecutor.prg RunSelect fast path](../_FiveSql2/src/TSqlExecutor.prg) | #25의 패턴을 SELECT fast-path에도 적용. `TryBuildFieldPositions` + `TryCompileWhere` 결과를 `cCacheKey#sel`로 캐시. 반복 SELECT의 PRG AST walk 제거 | **완료 (2026-04-17)** |
| 27 | SqlEvalHaving Go RTL | [hbrtl/sqlscan.go](../hbrtl/sqlscan.go), [_FiveSql2/src/TSqlAgg.prg EvalHaving](../_FiveSql2/src/TSqlAgg.prg) | HAVING 트리 walker를 Go로. ND_LIT/ND_NIL/ND_COL/ND_FN(5 aggs)/ND_BIN/ND_UNI 처리. 복잡 케이스는 PRG fallback | **완료 (2026-04-17, 효과 미미)** |
| 28 | VM Function() + Symbol 캐시 | [hbrt/call.go Function](../hbrt/call.go), [hbrt/vm.go GetSym](../hbrt/vm.go), [hbrt/thread.go pushPendingSym](../hbrt/thread.go), [compiler/gengo/gengo.go emitPushSymbol](../compiler/gengo/gengo.go) | 모든 PRG 함수 호출이 통과하는 경로. (a) Function()의 pop-push dance를 stack shift로 (heap alloc + 2N+2 ops → 1 copy), (b) 심볼 resolve 결과를 sym.Func에 캐시, (c) gengo가 심볼 포인터를 package-level var에 hoist해 `FindSymbol` 호출/RWMutex/map lookup을 lazy 1회로 단축. 전역 3-15% 개선 | **완료 (2026-04-17)** |
### ❌ 제외 (Harbour 호환 리스크 과다)
@@ -298,6 +299,7 @@ hbrt.Sym("SQLDISTINCT", hbrt.FsPublic, SqlDistinct),
25.#25 Plan pcode 캐시 + Flush 지연 — 완료 (B13 UPDATE **48x**)
26.#26 SELECT plan pcode 캐시 — 완료 (SELECT fast-path 캐시 확장)
27.#27 SqlEvalHaving Go RTL — 완료 (효과 미미, 복잡 HAVING 워크로드용)
28.#28 VM Function() + Symbol 캐시 — 완료 (전역 3-15%, 호출 경로 핫패스)
**전체 계획 완료 (2026-04-17).** 각 단계 후 `go test ./...` + FiveSql2 43/43 + Harbour compat 필수 원칙 준수.
@@ -762,6 +764,36 @@ dbCloseAll() // flush + close all
**검증**: go test ALL PASS · FiveSql2 43/43 (cache disabled 기본) · Harbour compat 56/56
### #28 VM Function() + Symbol hoist — 2026-04-17 완료 (전역 3-15%)
**동기**: 모든 PRG 함수 호출이 `Function()` + `FindSymbol`을 거침. 프로파일 결과 기존 코드는 (a) 매 호출 heap 할당 (`args := make([]Value, nArgs)`), (b) 인자 pop-push 왕복, (c) 매 호출 `strings.ToUpper` + `RWMutex` + `map` 조회. 개별 비용은 작지만 쿼리당 수십~수백 회 호출 → 누적이 큼.
**구현**
- [hbrt/call.go Function](../hbrt/call.go): pop-push dance를 single-slice-shift로 대체. N+2 pops + N pushes → 1 copy + sp 조정. Heap alloc 제거. Resolve 성공 시 `sym.Func = fn`으로 캐시
- [hbrt/vm.go GetSym](../hbrt/vm.go): `GetSym(cache **Symbol, name string)` — `*cache != nil`이면 즉시 반환, 아니면 `FindSymbol` 후 캐시 (nil은 init-order 재시도 허용해 캐시 안 함)
- [hbrt/thread.go pushPendingSym](../hbrt/thread.go): depth=1 호출(대부분)을 위한 scalar fast slot 추가
- [compiler/gengo/gengo.go emitPushSymbol](../compiler/gengo/gengo.go): `t.PushSymbol(t.VM().FindSymbol(%q))` → `t.PushSymbol(t.GetSym(&_sym_file_NAME, %q))`. 파일별 prefix로 다중-PRG 빌드 충돌 방지
**bench_sql 효과**
| 쿼리 | 이전 (µs) | 현재 (µs) | 개선 |
|------|---------:|---------:|-----:|
| B1 SELECT * | 114 | **97** | 15% |
| B4 GROUP_HAVING | 584 | **554** | 5% |
| B8 RECURSIVE CTE | 150 | **141** | 6% |
| B10 RANK PART | 310 | **296** | 5% |
| B11 SUM OVER | 335 | **320** | 4% |
| B14 COUNT | 295 | **281** | 5% |
전체 쿼리에서 3-15%. 평균 ~5% 단순 개선이지만 모든 쿼리가 혜택.
**버그 수정 (구현 중 발견)**
- `pendingSymFast = nil`이 "빈 슬롯"과 "nil 심볼 저장" 양쪽 의미라 ambiguous. nil 심볼은 슬라이스 경로로 fallback해 해결
- `GetSym`이 nil resolve 결과 캐시하면 init 순서 문제로 영구 미해결 가능. nil 캐시 생략해 재시도 허용
- gengo의 중복 호출 분기가 `t.PushSymbol(varName)`을 직접 emit해 lazy init 우회. 모든 호출을 `emitPushSymbol` 통일
**Harbour 호환 보장**: 43/43 · 56/56 · go test ALL PASS.
### #27 SqlEvalHaving Go RTL — 2026-04-17 완료 (효과 미미)
**구현** ([hbrtl/sqlscan.go SqlEvalHaving](../hbrtl/sqlscan.go))

View File

@@ -28,30 +28,40 @@ func (t *Thread) Function(nArgs int) {
panic(t.runtimeError("no function symbol for call"))
}
// Resolve function
// Resolve function. First call for an external/lazy symbol misses
// sym.Func and walks the VM symbol table — cache the resolved Func
// back into the Symbol so subsequent calls skip the ToUpper +
// RWMutex + map lookup. Symbols are shared read-mostly so a racy
// write is safe (both racers resolve to the same Func pointer).
fn := sym.Func
if fn == nil && t.vm != nil {
found := t.vm.FindSymbol(strings.ToUpper(sym.Name))
if found != nil {
fn = found.Func
sym.Func = fn
}
}
if fn == nil {
panic(t.runtimeError("undefined function: " + sym.Name))
}
// Collect args from stack
args := make([]Value, nArgs)
for i := nArgs - 1; i >= 0; i-- {
args[i] = t.pop()
}
t.pop() // pop NIL/self placeholder
t.pop() // pop symbol placeholder
// Push args back for Frame() to pick up
for _, arg := range args {
t.push(arg)
// Stack at entry (bottom → top):
// [sym placeholder] [self/NIL] [arg1] … [argN]
// Frame() expects only [arg1..argN] on the eval stack so it can
// copy them into the callee's locals. The old code achieved this
// by pop-popping args, popping the two placeholders, then pushing
// the args back — an O(N) copy plus a heap allocation per call.
// Shift the args two slots left in place instead: one slice move,
// zero heap.
if nArgs > 0 {
base := t.sp - nArgs - 2
copy(t.stack[base:base+nArgs], t.stack[t.sp-nArgs:t.sp])
}
// Two slots freed at top — keep them nil so the GC can release any
// references they held (matches pop()'s clearing semantics).
t.stack[t.sp-1] = cachedNil
t.stack[t.sp-2] = cachedNil
t.sp -= 2
// Set pending params count and symbol for Frame()
t.pendingParams = nArgs

View File

@@ -66,9 +66,14 @@ type Thread struct {
// Return value (passed between caller/callee)
retVal Value
// Pending function call stack (PushSymbol pushes, Function pops)
// Stack needed for nested calls: Double(Add(3,4))
pendingSyms []*Symbol
// Pending function call stack (PushSymbol pushes, Function pops).
// Depth=1 is the common case (non-nested call) and gets a scalar
// fast slot to skip slice append/trim; nested calls fall back to
// the heap slice. Balanced push/pop keeps the invariant:
// pendingSymFast set → slice empty
// slice non-empty → pendingSymFast may or may not be set.
pendingSymFast *Symbol
pendingSyms []*Symbol
pendingParams int // number of params for next Frame call
pendingCallSym *Symbol // symbol for next Frame (for PROCNAME)
@@ -600,17 +605,33 @@ func (t *Thread) PendingParams2(n int) {
}
func (t *Thread) pushPendingSym(sym *Symbol) {
// Fast path for depth=1 nesting — store in scalar slot without
// touching the slice. A nil sym (unresolved symbol, caught later
// in Function() with a descriptive error) must not use the fast
// path because `pendingSymFast == nil` already means "empty";
// falling back to the slice preserves distinguishability.
if sym != nil && t.pendingSymFast == nil && len(t.pendingSyms) == 0 {
t.pendingSymFast = sym
return
}
if t.pendingSymFast != nil {
t.pendingSyms = append(t.pendingSyms, t.pendingSymFast)
t.pendingSymFast = nil
}
t.pendingSyms = append(t.pendingSyms, sym)
}
func (t *Thread) popPendingSym() *Symbol {
n := len(t.pendingSyms)
if n == 0 {
return nil
if n := len(t.pendingSyms); n > 0 {
sym := t.pendingSyms[n-1]
t.pendingSyms = t.pendingSyms[:n-1]
return sym
}
sym := t.pendingSyms[n-1]
t.pendingSyms = t.pendingSyms[:n-1]
return sym
if sym := t.pendingSymFast; sym != nil {
t.pendingSymFast = nil
return sym
}
return nil
}
// PushAliasField pushes a field value from a named alias workarea.

View File

@@ -127,6 +127,26 @@ func (vm *VM) FindSymbol(name string) *Symbol {
return vm.symbols[name]
}
// GetSym returns the cached Symbol, performing a one-time FindSymbol
// lookup on first access and stashing the pointer in *cache for all
// subsequent calls. Generated code (gengo) declares a package-level
// `var _sym_NAME *Symbol` per unique call target and routes every
// PushSymbol through this helper so the hot path becomes a single
// non-nil check instead of vm.symbols map + RWMutex per invocation.
func (t *Thread) GetSym(cache **Symbol, name string) *Symbol {
if s := *cache; s != nil {
return s
}
s := t.vm.FindSymbol(name)
if s != nil {
// Only cache successful resolutions — nil might be due to
// init-order (another module's registrations pending);
// retry on next call once those complete.
*cache = s
}
return s
}
// NewThread creates a new Thread attached to this VM.
func (vm *VM) NewThread() *Thread {
t := NewThread(vm)