From 1f63c7fe6393c121b357a5fef58c03d99935a0e7 Mon Sep 17 00:00:00 2001 From: CharlesKWON Date: Fri, 17 Apr 2026 20:41:48 +0900 Subject: [PATCH] =?UTF-8?q?perf(vm):=20symbol=20hoist=20+=20Function()=20s?= =?UTF-8?q?tack=20shift=20=E2=80=94=20global=203-15%?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The VM call path (PushSymbol → Function → Frame) is traversed by every PRG function call. Three changes together cut per-call overhead across the entire bench suite. Changes - hbrt/call.go Function(): replace pop-push dance with a single slice shift (N+2 pops + N pushes → 1 copy of N slots + sp adjust). Kills the per-call `make([]Value, nArgs)` heap alloc. Resolved function pointer is cached back into sym.Func so subsequent calls on the same Symbol skip the VM lookup entirely. - hbrt/vm.go GetSym(): new helper. Generated code calls it with a pointer to a package-level `*Symbol` slot so FindSymbol (which takes the VM RWMutex + map lookup) runs at most once per symbol per process. Nil results are intentionally NOT cached — an init-order miss becomes a retry on the next call instead of a permanent sticky failure. - hbrt/thread.go pushPendingSym(): scalar fast slot for depth=1 call nesting (common case). Nil syms still go through the slice so the "empty vs stored nil" ambiguity can't produce a false pop. - compiler/gengo/gengo.go: emit `t.PushSymbol(t.GetSym(&_sym_file_NAME, "NAME"))` for every function call site, with a per-file prefix so multi-PRG builds don't collide on identical symbol names. Bugs fixed during bring-up - pendingSymFast == nil was ambiguous ("unused" vs "nil stored"). Nil syms now spill to the slice, preserving distinguishability. - The old varName-reuse branch at the PushSymbol emit site skipped the GetSym wrapper, emitting a raw `t.PushSymbol(varName)` against an uninitialized package-level *Symbol. Every call path now funnels through emitPushSymbol.
bench_sql deltas vs prior build - B1 SELECT * 114 → 97 µs (15%) - B4 GROUP_HAVING 584 → 554 µs (5%) - B8 RECURSIVE CTE 150 → 141 µs (6%) - B10 RANK PARTITION 310 → 296 µs (5%) - B11 SUM OVER 335 → 320 µs (4%) - B14 COUNT 295 → 281 µs (5%) - B15 CTE+WIN+JOIN 1891 → 1826 µs (3%) Verification - go test ./... ALL PASS - FiveSql2 test_sql1999 43/43 - tests/compat_harbour 56/56 Co-Authored-By: Claude Opus 4.7 (1M context) --- compiler/gengo/gengo.go | 109 +++++++++++++++++++++++++++----- docs/RTL-Go-Native-Migration.md | 32 ++++++++++ hbrt/call.go | 34 ++++++---- hbrt/thread.go | 39 +++++++++--- hbrt/vm.go | 20 ++++++ 5 files changed, 198 insertions(+), 36 deletions(-) diff --git a/compiler/gengo/gengo.go b/compiler/gengo/gengo.go index c5e89ed..0da9c9b 100644 --- a/compiler/gengo/gengo.go +++ b/compiler/gengo/gengo.go @@ -22,6 +22,7 @@ import ( "five/compiler/token" "fmt" "path/filepath" + "sort" "strings" ) @@ -120,6 +121,8 @@ func doGenerate(file *ast.File, debug, library bool) string { g.importAlias["five/hbrdd/mem"] = "_" } + g.symCache = map[string]string{} + g.emitHeader() g.emitSymbols() for _, d := range file.Decls { @@ -131,6 +134,7 @@ func doGenerate(file *ast.File, debug, library bool) string { } else { g.emitMain() } + g.emitSymCache() // Patch deferred imports: inline RTL may add "fmt"/"strings" after header was emitted. result := g.buf.String() @@ -166,6 +170,89 @@ func (g *Generator) writef(format string, args ...interface{}) { fmt.Fprintf(&g.buf, format, args...) } +// symVar returns the package-level cache variable name for the given +// symbol, registering it if first seen. The name embeds a per-file +// prefix so multi-PRG builds don't collide on identical symbol names +// across files. Characters that can't appear in a Go identifier are +// replaced with underscores. 
+func (g *Generator) symVar(name string) string { + if g.symCache == nil { + g.symCache = map[string]string{} + } + if v, ok := g.symCache[name]; ok { + return v + } + var sb strings.Builder + sb.WriteString("_sym_") + sb.WriteString(g.fileKey()) + sb.WriteByte('_') + for i := 0; i < len(name); i++ { + c := name[i] + switch { + case c >= 'A' && c <= 'Z', c >= 'a' && c <= 'z', c >= '0' && c <= '9', c == '_': + sb.WriteByte(c) + default: + sb.WriteByte('_') + } + } + v := sb.String() + g.symCache[name] = v + return v +} + +// fileKey derives a short identifier-safe prefix from g.file.Name so +// package-level symbol caches don't collide across PRG files. +func (g *Generator) fileKey() string { + base := g.file.Name + if idx := strings.LastIndex(base, "/"); idx >= 0 { + base = base[idx+1:] + } + if idx := strings.LastIndexByte(base, '.'); idx >= 0 { + base = base[:idx] + } + var sb strings.Builder + for i := 0; i < len(base); i++ { + c := base[i] + switch { + case c >= 'A' && c <= 'Z', c >= 'a' && c <= 'z', c >= '0' && c <= '9': + sb.WriteByte(c) + default: + sb.WriteByte('_') + } + } + return sb.String() +} + +// emitPushSymbol writes PushSymbol using the lazy-cached package-level +// variable. First call resolves via VM; subsequent calls skip the +// RWMutex + map lookup. +func (g *Generator) emitPushSymbol(name string) { + v := g.symVar(name) + g.writeln(fmt.Sprintf("t.PushSymbol(t.GetSym(&%s, %q))", v, name)) +} + +// emitSymCache writes the package-level `var _sym_NAME *hbrt.Symbol` +// declarations discovered during body emission. Called after all +// function bodies are emitted so every PushSymbol call site has had +// a chance to register its target name. +func (g *Generator) emitSymCache() { + if len(g.symCache) == 0 { + return + } + // Deterministic order so diffs are stable. 
+ names := make([]string, 0, len(g.symCache)) + for k := range g.symCache { + names = append(names, k) + } + sort.Strings(names) + g.writeln("") + g.writeln("// Cached symbol pointers — populated lazily on first use.") + for _, n := range names { + g.writeln(fmt.Sprintf("var %s *hbrt.Symbol", g.symCache[n])) + } + g.writeln("") +} + func (g *Generator) writeln(s string) { g.writeIndent() g.buf.WriteString(s) @@ -689,7 +776,7 @@ func (g *Generator) emitStmt(stmt ast.Stmt, locals localMap) { if onOff == "OFF" { val = "false" } - g.writeln(fmt.Sprintf(`t.PushSymbol(t.VM().FindSymbol(%q))`, funcName)) + g.emitPushSymbol(funcName) g.writeln("t.PushNil()") g.writeln(fmt.Sprintf("t.PushBool(%s)", val)) g.writeln("t.Do(1)") @@ -704,7 +791,7 @@ func (g *Generator) emitStmt(stmt ast.Stmt, locals localMap) { "EPOCH": "SETEPOCH", } if funcName, ok := valueFuncMap[upper]; ok && s.Expr != nil { - g.writeln(fmt.Sprintf(`t.PushSymbol(t.VM().FindSymbol(%q))`, funcName)) + g.emitPushSymbol(funcName) g.writeln("t.PushNil()") g.emitExpr(s.Expr) g.writeln("t.Do(1)") @@ -813,7 +900,7 @@ func (g *Generator) emitQOut(s *ast.QOutStmt, locals localMap) { if s.IsQQ { sym = "QQOUT" } - g.writeln(fmt.Sprintf("t.PushSymbol(t.VM().FindSymbol(%q))", sym)) + g.emitPushSymbol(sym) g.writeln("t.PushNil()") for _, expr := range s.Exprs { g.emitExpr(expr) @@ -835,7 +922,7 @@ func (g *Generator) emitExprStmt(s *ast.ExprStmt, locals localMap) { // Bare identifier as statement (e.g., CLS, CLEAR) — treat as zero-arg function call if ident, ok := s.X.(*ast.IdentExpr); ok { if _, found := locals[strings.ToUpper(ident.Name)]; !found { - g.writeln(fmt.Sprintf("t.PushSymbol(t.VM().FindSymbol(%q))", strings.ToUpper(ident.Name))) + g.emitPushSymbol(strings.ToUpper(ident.Name)) g.writeln("t.PushNil()") g.writeln("t.Do(0)") return @@ -1022,7 +1109,7 @@ func (g *Generator) emitAssign(a *ast.AssignExpr, locals localMap) { func (g *Generator) emitCallAsStmt(call *ast.CallExpr, locals localMap) { if ident, ok := 
call.Func.(*ast.IdentExpr); ok { - g.writeln(fmt.Sprintf("t.PushSymbol(t.VM().FindSymbol(%q))", strings.ToUpper(ident.Name))) + g.emitPushSymbol(strings.ToUpper(ident.Name)) } else { g.emitExpr(call.Func) } @@ -1783,7 +1870,7 @@ func (g *Generator) emitIndexKeyExpr(expr ast.Expr) { } return } - g.writeln(fmt.Sprintf("t.PushSymbol(t.VM().FindSymbol(%q))", upper)) + g.emitPushSymbol(upper) } else { g.emitExpr(e.Func) } @@ -1953,15 +2040,7 @@ func (g *Generator) emitCall(e *ast.CallExpr) { g.writeln("t.PushNil()") return } - if g.symCache != nil { - if varName, ok := g.symCache[upper]; ok { - g.writeln(fmt.Sprintf("t.PushSymbol(%s)", varName)) - } else { - g.writeln(fmt.Sprintf("t.PushSymbol(t.VM().FindSymbol(%q))", upper)) - } - } else { - g.writeln(fmt.Sprintf("t.PushSymbol(t.VM().FindSymbol(%q))", upper)) - } + g.emitPushSymbol(upper) } else { g.emitExpr(e.Func) } diff --git a/docs/RTL-Go-Native-Migration.md b/docs/RTL-Go-Native-Migration.md index b9d6686..d057c11 100644 --- a/docs/RTL-Go-Native-Migration.md +++ b/docs/RTL-Go-Native-Migration.md @@ -62,6 +62,7 @@ FiveSql2 성능 개선 흐름(`3caadb2 SqlOrderBy+SqlGroupBy Go RTL`, `5fc9c3b S | 25 | Plan pcode 캐시 + SqlBulkUpdate flush 지연 | [_FiveSql2/src/TSqlExecutor.prg s_hDmlPcodeCache + cCacheKey](../_FiveSql2/src/TSqlExecutor.prg), [hbrtl/sqlscan.go SqlBulkUpdate](../hbrtl/sqlscan.go) | 플랜 키별 컴파일된 pcode(aFPos/where/set_pc) 캐시 + WA cache 활성 시 Go RTL 내부 `Flush()` 스킵 → **B13 UPDATE 48x** | **완료 (2026-04-17)** | | 26 | SELECT 경로 plan pcode 캐시 | [_FiveSql2/src/TSqlExecutor.prg RunSelect fast path](../_FiveSql2/src/TSqlExecutor.prg) | #25의 패턴을 SELECT fast-path에도 적용. `TryBuildFieldPositions` + `TryCompileWhere` 결과를 `cCacheKey#sel`로 캐시. 반복 SELECT의 PRG AST walk 제거 | **완료 (2026-04-17)** | | 27 | SqlEvalHaving Go RTL | [hbrtl/sqlscan.go](../hbrtl/sqlscan.go), [_FiveSql2/src/TSqlAgg.prg EvalHaving](../_FiveSql2/src/TSqlAgg.prg) | HAVING 트리 walker를 Go로. ND_LIT/ND_NIL/ND_COL/ND_FN(5 aggs)/ND_BIN/ND_UNI 처리. 
복잡 케이스는 PRG fallback | **완료 (2026-04-17, 효과 미미)** | +| 28 | VM Function() + Symbol 캐시 | [hbrt/call.go Function](../hbrt/call.go), [hbrt/vm.go GetSym](../hbrt/vm.go), [hbrt/thread.go pushPendingSym](../hbrt/thread.go), [compiler/gengo/gengo.go emitPushSymbol](../compiler/gengo/gengo.go) | 모든 PRG 함수 호출이 통과하는 경로. (a) Function()의 pop-push dance를 stack shift로 (heap alloc + 2N+2 ops → 1 copy), (b) 심볼 resolve 결과를 sym.Func에 캐시, (c) gengo가 심볼 포인터를 package-level var에 hoist해 `FindSymbol` 호출/RWMutex/map lookup을 lazy 1회로 단축. 전역 3-15% 개선 | **완료 (2026-04-17)** | ### ❌ 제외 (Harbour 호환 리스크 과다) @@ -298,6 +299,7 @@ hbrt.Sym("SQLDISTINCT", hbrt.FsPublic, SqlDistinct), 25. ✅ #25 Plan pcode 캐시 + Flush 지연 — 완료 (B13 UPDATE **48x**) 26. ✅ #26 SELECT plan pcode 캐시 — 완료 (SELECT fast-path 캐시 확장) 27. ✅ #27 SqlEvalHaving Go RTL — 완료 (효과 미미, 복잡 HAVING 워크로드용) +28. ✅ #28 VM Function() + Symbol 캐시 — 완료 (전역 3-15%, 호출 경로 핫패스) **전체 계획 완료 (2026-04-17).** 각 단계 후 `go test ./...` + FiveSql2 43/43 + Harbour compat 필수 원칙 준수. @@ -762,6 +764,36 @@ dbCloseAll() // flush + close all **검증**: go test ALL PASS · FiveSql2 43/43 (cache disabled 기본) · Harbour compat 56/56 +### #28 VM Function() + Symbol hoist — 2026-04-17 완료 (전역 3-15%) + +**동기**: 모든 PRG 함수 호출이 `Function()` + `FindSymbol`을 거침. 프로파일 결과 기존 코드는 (a) 매 호출 heap 할당 (`args := make([]Value, nArgs)`), (b) 인자 pop-push 왕복, (c) 매 호출 `strings.ToUpper` + `RWMutex` + `map` 조회. 개별 비용은 작지만 쿼리당 수십~수백 회 호출 → 누적이 큼. + +**구현** +- [hbrt/call.go Function](../hbrt/call.go): pop-push dance를 single-slice-shift로 대체. N+2 pops + N pushes → 1 copy + sp 조정. Heap alloc 제거. 
Resolve 성공 시 `sym.Func = fn`으로 캐시 +- [hbrt/vm.go GetSym](../hbrt/vm.go): `GetSym(cache **Symbol, name string)` — `*cache != nil`이면 즉시 반환, 아니면 `FindSymbol` 후 캐시 (nil은 init-order 재시도 허용해 캐시 안 함) +- [hbrt/thread.go pushPendingSym](../hbrt/thread.go): depth=1 호출(대부분)을 위한 scalar fast slot 추가 +- [compiler/gengo/gengo.go emitPushSymbol](../compiler/gengo/gengo.go): `t.PushSymbol(t.VM().FindSymbol(%q))` → `t.PushSymbol(t.GetSym(&_sym_file_NAME, %q))`. 파일별 prefix로 다중-PRG 빌드 충돌 방지 + +**bench_sql 효과** + +| 쿼리 | 이전 (µs) | 현재 (µs) | 개선 | +|------|---------:|---------:|-----:| +| B1 SELECT * | 114 | **97** | 15% | +| B4 GROUP_HAVING | 584 | **554** | 5% | +| B8 RECURSIVE CTE | 150 | **141** | 6% | +| B10 RANK PART | 310 | **296** | 5% | +| B11 SUM OVER | 335 | **320** | 4% | +| B14 COUNT | 295 | **281** | 5% | + +전체 쿼리에서 3-15%. 평균 ~5% 단순 개선이지만 모든 쿼리가 혜택. + +**버그 수정 (구현 중 발견)** +- `pendingSymFast = nil`이 "빈 슬롯"과 "nil 심볼 저장" 양쪽 의미라 ambiguous. nil 심볼은 슬라이스 경로로 fallback해 해결 +- `GetSym`이 nil resolve 결과 캐시하면 init 순서 문제로 영구 미해결 가능. nil 캐시 생략해 재시도 허용 +- gengo의 중복 호출 분기가 `t.PushSymbol(varName)`을 직접 emit해 lazy init 우회. 모든 호출을 `emitPushSymbol` 통일 + +**Harbour 호환 보장**: 43/43 · 56/56 · go test ALL PASS. + ### #27 SqlEvalHaving Go RTL — 2026-04-17 완료 (효과 미미) **구현** ([hbrtl/sqlscan.go SqlEvalHaving](../hbrtl/sqlscan.go)) diff --git a/hbrt/call.go b/hbrt/call.go index fa5c6cd..76ddc23 100644 --- a/hbrt/call.go +++ b/hbrt/call.go @@ -28,30 +28,40 @@ func (t *Thread) Function(nArgs int) { panic(t.runtimeError("no function symbol for call")) } - // Resolve function + // Resolve function. First call for an external/lazy symbol misses + // sym.Func and walks the VM symbol table — cache the resolved Func + // back into the Symbol so subsequent calls skip the ToUpper + + // RWMutex + map lookup. Symbols are shared read-mostly so a racy + // write is safe (both racers resolve to the same Func pointer). 
fn := sym.Func if fn == nil && t.vm != nil { found := t.vm.FindSymbol(strings.ToUpper(sym.Name)) if found != nil { fn = found.Func + sym.Func = fn } } if fn == nil { panic(t.runtimeError("undefined function: " + sym.Name)) } - // Collect args from stack - args := make([]Value, nArgs) - for i := nArgs - 1; i >= 0; i-- { - args[i] = t.pop() - } - t.pop() // pop NIL/self placeholder - t.pop() // pop symbol placeholder - - // Push args back for Frame() to pick up - for _, arg := range args { - t.push(arg) + // Stack at entry (bottom → top): + // [sym placeholder] [self/NIL] [arg1] … [argN] + // Frame() expects only [arg1..argN] on the eval stack so it can + // copy them into the callee's locals. The old code achieved this + // by pop-popping args, popping the two placeholders, then pushing + // the args back — an O(N) copy plus a heap allocation per call. + // Shift the args two slots left in place instead: one slice move, + // zero heap. + if nArgs > 0 { + base := t.sp - nArgs - 2 + copy(t.stack[base:base+nArgs], t.stack[t.sp-nArgs:t.sp]) } + // Two slots freed at top — keep them nil so the GC can release any + // references they held (matches pop()'s clearing semantics). + t.stack[t.sp-1] = cachedNil + t.stack[t.sp-2] = cachedNil + t.sp -= 2 // Set pending params count and symbol for Frame() t.pendingParams = nArgs diff --git a/hbrt/thread.go b/hbrt/thread.go index 163066f..69629ff 100644 --- a/hbrt/thread.go +++ b/hbrt/thread.go @@ -66,9 +66,14 @@ type Thread struct { // Return value (passed between caller/callee) retVal Value - // Pending function call stack (PushSymbol pushes, Function pops) - // Stack needed for nested calls: Double(Add(3,4)) - pendingSyms []*Symbol + // Pending function call stack (PushSymbol pushes, Function pops). + // Depth=1 is the common case (non-nested call) and gets a scalar + // fast slot to skip slice append/trim; nested calls fall back to + // the heap slice. 
Balanced push/pop keeps the invariant: + // pendingSymFast set → slice empty + // slice non-empty → pendingSymFast is nil. + pendingSymFast *Symbol + pendingSyms []*Symbol pendingParams int // number of params for next Frame call pendingCallSym *Symbol // symbol for next Frame (for PROCNAME) @@ -600,17 +605,33 @@ func (t *Thread) PendingParams2(n int) { } func (t *Thread) pushPendingSym(sym *Symbol) { + // Fast path for depth=1 nesting — store in scalar slot without + // touching the slice. A nil sym (unresolved symbol, caught later + // in Function() with a descriptive error) must not use the fast + // path because `pendingSymFast == nil` already means "empty"; + // falling back to the slice preserves distinguishability. + if sym != nil && t.pendingSymFast == nil && len(t.pendingSyms) == 0 { + t.pendingSymFast = sym + return + } + if t.pendingSymFast != nil { + t.pendingSyms = append(t.pendingSyms, t.pendingSymFast) + t.pendingSymFast = nil + } t.pendingSyms = append(t.pendingSyms, sym) } func (t *Thread) popPendingSym() *Symbol { - n := len(t.pendingSyms) - if n == 0 { - return nil + if n := len(t.pendingSyms); n > 0 { + sym := t.pendingSyms[n-1] + t.pendingSyms = t.pendingSyms[:n-1] + return sym } - sym := t.pendingSyms[n-1] - t.pendingSyms = t.pendingSyms[:n-1] - return sym + if sym := t.pendingSymFast; sym != nil { + t.pendingSymFast = nil + return sym + } + return nil } // PushAliasField pushes a field value from a named alias workarea. diff --git a/hbrt/vm.go b/hbrt/vm.go index f475278..8486df3 100644 --- a/hbrt/vm.go +++ b/hbrt/vm.go @@ -127,6 +127,26 @@ func (vm *VM) FindSymbol(name string) *Symbol { return vm.symbols[name] } +// GetSym returns the cached Symbol, performing a one-time FindSymbol +// lookup on first access and stashing the pointer in *cache for all +// subsequent calls.
Generated code (gengo) declares a package-level +// `var _sym_NAME *Symbol` per unique call target and routes every +// PushSymbol through this helper so the hot path becomes a single +// non-nil check instead of vm.symbols map + RWMutex per invocation. +func (t *Thread) GetSym(cache **Symbol, name string) *Symbol { + if s := *cache; s != nil { + return s + } + s := t.vm.FindSymbol(name) + if s != nil { + // Only cache successful resolutions — nil might be due to + // init-order (another module's registrations pending); + // retry on next call once those complete. + *cache = s + } + return s +} + // NewThread creates a new Thread attached to this VM. func (vm *VM) NewThread() *Thread { t := NewThread(vm)