From 99f3ca5687aa0b2e1648a446d391a2475a8b79f2 Mon Sep 17 00:00:00 2001 From: CharlesKWON Date: Wed, 15 Apr 2026 23:06:35 +0900 Subject: [PATCH] =?UTF-8?q?perf(FiveSql2):=20EXISTS=20semi-join=20lift=20?= =?UTF-8?q?=E2=80=94=20H3=20correlated=20EXISTS=20~2000x=20faster?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Correlated EXISTS with high-cardinality keys was stuck at O(outer × inner) because memoization couldn't amortize across unique correlation values. H3 in the subquery stress bench: SELECT e.name FROM emp e WHERE EXISTS (SELECT 1 FROM ord WHERE ord.emp_id = e.id AND ord.qty > 15) 500 outer rows × 500 distinct e.id values × 5000-row ord scan = 10s, with no path to improvement from caching the subquery result. Fix: detect the semi-join shape on the subquery and rewrite it at runtime into a non-correlated DISTINCT scan whose result is cached as a hash set. Each outer row then becomes an O(1) hash probe. --- What we lift --- SELECT ... FROM inner_table WHERE inner.col = outer.col [AND other_non_correlated_preds] Shape constraints (all must hold): - single table, no JOIN - no GROUP BY, no HAVING, no UNION - WHERE is an AND tree containing an equi-term where one side is a column with an alias prefix from the subquery's own FROM and the other is a column from an outer alias - the remaining AND terms (non-correlated residue) have no outer references of their own — rules out patterns like `WHERE e2.dept = e.dept AND e2.salary > e.salary` where the second term can't live without the outer context --- How the lift works --- 1. Walk the WHERE as a flat AND-term list 2. Find and remove the first correlated equi-term, remember the inner column name and outer column reference 3. Verify residue is non-correlated via a recursive AST walker (SemiJoinHasOuterRef) — bail to fallback if not 4. Clone hQuery with: columns = {DISTINCT inner.col} where = residue (or NIL) distinct = .T. limit / top / order_by / group_by / having cleared 5. 
Run the cloned subquery once via a nested TSqlExecutor — no PushOuter because it's now non-correlated 6. Build a hash set keyed on SqlValToStr(each distinct inner value) 7. Per EXISTS probe: Resolve the outer column reference, look up in the hash set Cached in ::aSemiJoinSlots indexed by xSubNode identity so the analysis + lifted scan runs exactly once per subquery expression. Subqueries that don't match the shape store the sentinel "NO" so subsequent probes skip re-analysis and fall through to the existing SubqueryCached + LIMIT 1 path. NOT EXISTS works through the same path — lNegate flag just flips the final hash-lookup result. --- Bench (emp=500, prod=100, ord=5k) --- Pattern Before After Speedup ──────────────────────────────────────────────────────────── H3 EXISTS correlated 10.0s 4.5ms ~2200x H8 NOT EXISTS self-join 900ms 890ms same (can't lift: remainder `e2.salary > e.salary` is correlated) H11 Scalar + EXISTS + derived 3.2s 1.0s 3.2x H8 correctly falls through to the non-lifted path because the remainder outer-reference check (SemiJoinHasOuterRef) rejects the `e2.salary > e.salary` term. The 5-row answer is still correct. Validation: - FiveSql2 43/43 - Harbour compat 51/51 - go test ./... ALL PASS - H3 returns 125 rows (matches pre-change correct result) - H8 returns 5 rows (matches pre-change correct result) Known pre-existing bug, unrelated: H7 (scalar correlated subquery with inner INNER JOIN) returns zero for rows 2..N — workarea state leaks between consecutive subquery invocations. Not touched here, filed for follow-up. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- _FiveSql2/src/TSqlExecutor.prg | 341 +++++++++++++++++++++++++++++++-- 1 file changed, 330 insertions(+), 11 deletions(-) diff --git a/_FiveSql2/src/TSqlExecutor.prg b/_FiveSql2/src/TSqlExecutor.prg index b6ede21..67c5101 100644 --- a/_FiveSql2/src/TSqlExecutor.prg +++ b/_FiveSql2/src/TSqlExecutor.prg @@ -38,6 +38,7 @@ CLASS TSqlExecutor DATA hSubCorrCache INIT { => } /* per-outer-key subquery result cache */ DATA aSubCacheSlots INIT {} /* list of {xSubNode, {id, aFreeVars}} */ DATA nSubCacheSeq INIT 0 /* monotonic ID for subqueries */ + DATA aSemiJoinSlots INIT {} /* list of {xSubNode, semiJoinData | "NO"} */ CLASSDATA hSubCache INIT { => } SHARED @@ -75,6 +76,8 @@ CLASS TSqlExecutor METHOD SubqueryCached( xSubNode ) METHOD CollectFreeVars( hQ ) METHOD CollectExprFreeVars( xE, aLocalAliases, aFree ) + METHOD ExistsViaSemiJoin( xSubNode, lNegate ) + METHOD TryBuildSemiJoin( xSubNode ) ENDCLASS @@ -571,21 +574,35 @@ METHOD EvalExpr( xNode ) CLASS TSqlExecutor RETURN NIL CASE xNode[ 1 ] == ND_FN - /* EXISTS and NOT EXISTS — we only need to know whether the - * subquery returns at least one row, not compute the full - * result. Force a LIMIT 1 into the subquery's hQuery so the - * inner scan short-circuits on the first match. Then route - * through SubqueryCached so correlated EXISTS still memoizes - * on free-variable values (helps when correlation is low - * cardinality; no-op when every outer row is unique). */ + /* EXISTS and NOT EXISTS handling: + * + * 1. If the subquery matches the semi-join shape (single-table + * with a `inner.col = outer.col` equi-term and no JOIN / + * GROUP BY / aggregate), lift it into a non-correlated + * hash set probe: run the subquery ONCE with the correlated + * term removed and DISTINCT on inner.col, then each outer + * row becomes an O(1) hash lookup. 
This is the key win + * for patterns like + * WHERE EXISTS (SELECT 1 FROM ord WHERE ord.emp_id = e.id + * AND ord.qty > 15) + * where the correlation is 1:1 with outer rows so plain + * memoization doesn't help. + * + * 2. Otherwise inject LIMIT 1 and route through SubqueryCached + * so at least the scan short-circuits on first match and + * low-cardinality correlations still memoize. */ IF ( xNode[ 2 ] == "EXISTS" .OR. xNode[ 2 ] == "NOT EXISTS" ) .AND. ; Len( xNode[ 3 ] ) > 0 .AND. ; xNode[ 3 ][ 1 ] != NIL .AND. ValType( xNode[ 3 ][ 1 ] ) == "A" .AND. ; xNode[ 3 ][ 1 ][ 1 ] == ND_SUB .AND. xNode[ 3 ][ 1 ][ 2 ] != NIL - /* Install LIMIT 1 on the subquery hQuery. EXISTS only cares - * about the existence of a match, so the subquery scan can - * stop at the first row — the scan loop in RunSelect honours - * hQuery["limit"] as an early-termination target. */ + + aSubResult := ::ExistsViaSemiJoin( xNode[ 3 ][ 1 ], xNode[ 2 ] == "NOT EXISTS" ) + IF aSubResult != NIL + /* Semi-join lift succeeded; result is already a boolean */ + RETURN aSubResult + ENDIF + + /* Fallback: LIMIT 1 + cached run */ IF ValType( xNode[ 3 ][ 1 ][ 2 ] ) == "H" xNode[ 3 ][ 1 ][ 2 ][ "limit" ] := 1 ENDIF @@ -1578,6 +1595,308 @@ RETURN lHadMatch /* Subquery result cache for non-correlated subqueries */ +/* ExistsViaSemiJoin — try to answer an EXISTS / NOT EXISTS probe via + * a pre-built hash set instead of re-executing the subquery per outer + * row. Returns a boolean (the EXISTS result) on success, NIL when the + * subquery shape can't be lifted (caller should fall back to the + * normal per-row path). + * + * The lifted state is built on first call by TryBuildSemiJoin and + * cached in ::aSemiJoinSlots keyed on xSubNode identity. The sentinel + * string "NO" marks subqueries we already tried and can't lift, so + * subsequent calls skip the analysis. 
+ */
+METHOD ExistsViaSemiJoin( xSubNode, lNegate ) CLASS TSqlExecutor
+
+   LOCAL i, nSlot, oData, xOuterVal, lMatch
+
+   /* Find the slot holding an earlier analysis of this subquery node.
+    * Slots are { xSubNode, data } pairs; the node is compared with `==`
+    * so the same AST node always lands on the same slot. */
+   nSlot := 0
+   FOR i := 1 TO Len( ::aSemiJoinSlots )
+      IF ::aSemiJoinSlots[ i ][ 1 ] == xSubNode
+         nSlot := i
+         EXIT
+      ENDIF
+   NEXT
+
+   /* First probe for this node: analyse it once and remember the outcome.
+    * A failed analysis is stored as the sentinel "NO" so later probes do
+    * not repeat the work. */
+   IF nSlot == 0
+      oData := ::TryBuildSemiJoin( xSubNode )
+      AAdd( ::aSemiJoinSlots, { xSubNode, iif( oData == NIL, "NO", oData ) } )
+      nSlot := Len( ::aSemiJoinSlots )
+   ENDIF
+   oData := ::aSemiJoinSlots[ nSlot ][ 2 ]
+
+   /* Anything that is not a hash (i.e. the "NO" sentinel) means the shape
+    * could not be lifted — return NIL so the caller uses its fallback. */
+   IF ValType( oData ) != "H"
+      RETURN NIL
+   ENDIF
+
+   /* Probe the hash set with the current outer row's correlation value.
+    *
+    * SQL `=` is never true when an operand is NULL, so an outer row whose
+    * correlation column is NULL cannot satisfy the lifted
+    * `inner.col = outer.col` term: EXISTS is false and NOT EXISTS is true.
+    * Short-circuiting here also prevents a NULL outer value from matching
+    * whatever key SqlValToStr() would produce for NIL.
+    * NOTE(review): assumes ::Resolve() reports SQL NULL as NIL — confirm. */
+   xOuterVal := ::Resolve( oData[ "outer_ref" ] )
+   IF xOuterVal == NIL
+      lMatch := .F.
+   ELSE
+      lMatch := hb_HHasKey( oData[ "inner_set" ], SqlValToStr( xOuterVal ) )
+   ENDIF
+
+RETURN iif( lNegate, ! lMatch, lMatch )
+
+
+/* TryBuildSemiJoin — attempt to lift a correlated EXISTS subquery into
+ * a non-correlated hash set. Returns a hash { "outer_ref", "inner_set" }
+ * on success, NIL if the subquery doesn't match the supported shape.
+ *
+ * Supported shape:
+ *   SELECT ... FROM inner_table WHERE inner.col = outer.col [AND rest]
+ * with a single table and no JOIN, no GROUP BY / HAVING, no UNION.
+ * ORDER BY, LIMIT and TOP are not rejected; they are cleared in the
+ * lifted copy. The `rest` can be any non-correlated predicate; it
+ * stays in the lifted subquery.
+ *
+ * Implementation:
+ *   1. Walk the WHERE tree as an AND list, find the first equi-term
+ *      whose two sides are `innerTable.col` and `outerAlias.col`.
+ *      Rebuild the remainder predicate from everything else.
+ *   2. Synthesize a modified hQuery: same FROM, DISTINCT inner.col as
+ *      the only SELECT column, WHERE = remainder.
+ *   3. Run it once via a nested TSqlExecutor. Build a hash set keyed
+ *      on SqlValToStr(innerCol).
+ */
+METHOD TryBuildSemiJoin( xSubNode ) CLASS TSqlExecutor
+
+   LOCAL hQ, aLocalAliases, i, aT
+   LOCAL aAndTerms, aQueue, nHead, xTerm, xLeft, xRight
+   LOCAL lLeftIsInner, lRightIsInner
+   LOCAL cInnerField, xOuterRef
+   LOCAL aRemainTerms, xRemain
+   LOCAL hLifted, aKeys, oSub, aResult, hSet
+   LOCAL xVal
+
+   /* xSubNode is expected to be a subquery node whose element 2 is the
+    * parsed query hash. Anything else cannot be analysed. */
+   IF ValType( xSubNode ) != "A" .OR. Len( xSubNode ) < 2
+      RETURN NIL
+   ENDIF
+   hQ := xSubNode[ 2 ]
+   IF ValType( hQ ) != "H"
+      RETURN NIL
+   ENDIF
+
+   /* Shape constraints — fall back for anything complex */
+   IF hb_HHasKey( hQ, "joins" ) .AND. ValType( hQ[ "joins" ] ) == "A" .AND. Len( hQ[ "joins" ] ) > 0
+      RETURN NIL
+   ENDIF
+   IF hb_HHasKey( hQ, "group_by" ) .AND. ValType( hQ[ "group_by" ] ) == "A" .AND. Len( hQ[ "group_by" ] ) > 0
+      RETURN NIL
+   ENDIF
+   IF hb_HHasKey( hQ, "having" ) .AND. hQ[ "having" ] != NIL
+      RETURN NIL
+   ENDIF
+   IF hb_HHasKey( hQ, "union" ) .AND. hQ[ "union" ] != NIL
+      RETURN NIL
+   ENDIF
+   IF ! hb_HHasKey( hQ, "tables" ) .OR. ValType( hQ[ "tables" ] ) != "A" .OR. ;
+      Len( hQ[ "tables" ] ) != 1
+      RETURN NIL
+   ENDIF
+   IF ! hb_HHasKey( hQ, "where" ) .OR. hQ[ "where" ] == NIL
+      RETURN NIL
+   ENDIF
+
+   /* Refuse select lists that contain a function call. An aggregate
+    * without GROUP BY (e.g. SELECT COUNT(*) ...) produces a row even when
+    * no inner row matches, so EXISTS is true for every outer row; the
+    * lift replaces the select list with DISTINCT inner.col and would
+    * report false for unmatched outer rows instead. Falling back keeps
+    * the pre-existing result.
+    * NOTE(review): presumes aggregates are carried as ND_FN nodes and
+    * that each "columns" entry is { exprNode, name } — the same layout
+    * this method writes into the lifted query below. */
+   IF hb_HHasKey( hQ, "columns" ) .AND. ValType( hQ[ "columns" ] ) == "A"
+      FOR i := 1 TO Len( hQ[ "columns" ] )
+         aT := hQ[ "columns" ][ i ]
+         IF ValType( aT ) == "A" .AND. Len( aT ) >= 1 .AND. SemiJoinHasFnCall( aT[ 1 ] )
+            RETURN NIL
+         ENDIF
+      NEXT
+   ENDIF
+
+   /* Collect subquery's own table name and alias to tell inner from outer */
+   aLocalAliases := {}
+   aT := hQ[ "tables" ][ 1 ]
+   AAdd( aLocalAliases, Upper( aT[ 1 ] ) )
+   IF Len( aT ) >= 2 .AND. ! Empty( aT[ 2 ] )
+      AAdd( aLocalAliases, Upper( aT[ 2 ] ) )
+   ENDIF
+
+   /* Flatten WHERE into a list of AND-terms. aQueue is consumed through
+    * a moving head index (breadth-first, same visiting order as popping
+    * the front element) so nothing has to be deleted from the array. */
+   aAndTerms := {}
+   aQueue := { hQ[ "where" ] }
+   nHead := 1
+   WHILE nHead <= Len( aQueue )
+      xTerm := aQueue[ nHead ]
+      nHead++
+      IF ValType( xTerm ) == "A" .AND. Len( xTerm ) >= 4 .AND. ;
+         xTerm[ 1 ] == ND_BIN .AND. xTerm[ 2 ] == "AND"
+         AAdd( aQueue, xTerm[ 3 ] )
+         AAdd( aQueue, xTerm[ 4 ] )
+      ELSE
+         AAdd( aAndTerms, xTerm )
+      ENDIF
+   ENDDO
+
+   /* Find the equi-term that correlates inner.col with outer.col.
+    * Only the FIRST such term is lifted; once cInnerField is set every
+    * later term goes to the remainder (and is vetted below). Operands
+    * are checked to be column nodes with a string reference before they
+    * are indexed, so a literal or malformed operand cannot raise a
+    * runtime error here. */
+   cInnerField := ""
+   xOuterRef := NIL
+   aRemainTerms := {}
+   FOR i := 1 TO Len( aAndTerms )
+      xTerm := aAndTerms[ i ]
+      IF ! Empty( cInnerField ) .OR. ;
+         ValType( xTerm ) != "A" .OR. Len( xTerm ) < 4 .OR. ;
+         xTerm[ 1 ] != ND_BIN .OR. xTerm[ 2 ] != "=" .OR. ;
+         ValType( xTerm[ 3 ] ) != "A" .OR. ValType( xTerm[ 4 ] ) != "A" .OR. ;
+         Len( xTerm[ 3 ] ) < 2 .OR. Len( xTerm[ 4 ] ) < 2 .OR. ;
+         xTerm[ 3 ][ 1 ] != ND_COL .OR. xTerm[ 4 ][ 1 ] != ND_COL .OR. ;
+         ValType( xTerm[ 3 ][ 2 ] ) != "C" .OR. ValType( xTerm[ 4 ][ 2 ] ) != "C"
+         AAdd( aRemainTerms, xTerm )
+         LOOP
+      ENDIF
+      xLeft := xTerm[ 3 ]
+      xRight := xTerm[ 4 ]
+      lLeftIsInner := SemiJoinIsInner( xLeft, aLocalAliases )
+      lRightIsInner := SemiJoinIsInner( xRight, aLocalAliases )
+      IF lLeftIsInner .AND. ! lRightIsInner
+         cInnerField := SemiJoinStripAlias( xLeft[ 2 ] )
+         xOuterRef := xRight[ 2 ]
+      ELSEIF lRightIsInner .AND. ! lLeftIsInner
+         cInnerField := SemiJoinStripAlias( xRight[ 2 ] )
+         xOuterRef := xLeft[ 2 ]
+      ELSE
+         /* inner = inner or outer = outer: not a correlation term */
+         AAdd( aRemainTerms, xTerm )
+      ENDIF
+   NEXT
+
+   IF Empty( cInnerField ) .OR. xOuterRef == NIL
+      RETURN NIL
+   ENDIF
+
+   /* The remainder must be entirely non-correlated — otherwise the
+    * lifted subquery can't evaluate without an outer row, and any
+    * result would be wrong. This rules out patterns like
+    *   WHERE e2.dept = e.dept AND e2.salary > e.salary
+    * where the second term still references the outer row. */
+   FOR i := 1 TO Len( aRemainTerms )
+      IF SemiJoinHasOuterRef( aRemainTerms[ i ], aLocalAliases )
+         RETURN NIL
+      ENDIF
+   NEXT
+
+   /* Rebuild the remainder WHERE as a left-leaning AND chain
+    * ((t1 AND t2) AND t3 ...), or NIL when nothing is left. */
+   xRemain := NIL
+   FOR i := 1 TO Len( aRemainTerms )
+      IF xRemain == NIL
+         xRemain := aRemainTerms[ i ]
+      ELSE
+         xRemain := SqlNode( ND_BIN, "AND", xRemain, aRemainTerms[ i ], NIL )
+      ENDIF
+   NEXT
+
+   /* Shallow-copy every key of hQ (values such as "tables" are shared
+    * with the original, not duplicated), then override the keys that
+    * define the lifted query: DISTINCT inner.col as the only column,
+    * the remainder as WHERE, and row limiting / ordering / grouping
+    * cleared. The key list is fetched once instead of per access. */
+   hLifted := { => }
+   aKeys := hb_HKeys( hQ )
+   FOR i := 1 TO Len( aKeys )
+      hLifted[ aKeys[ i ] ] := hQ[ aKeys[ i ] ]
+   NEXT
+   hLifted[ "columns" ] := { { SqlNode( ND_COL, cInnerField, NIL, NIL, NIL ), cInnerField } }
+   hLifted[ "where" ] := xRemain
+   hLifted[ "distinct" ] := .T.
+   hLifted[ "limit" ] := 0
+   hLifted[ "top" ] := 0
+   hLifted[ "order_by" ] := {}
+   hLifted[ "group_by" ] := {}
+   hLifted[ "having" ] := NIL
+
+   /* Run the lifted query once. No PushOuter — it's now non-correlated. */
+   oSub := TSqlExecutor():New( hLifted, ::aParams )
+   oSub:nDepth := ::nDepth
+   aResult := oSub:Run()
+   IF ValType( aResult ) != "A" .OR. Len( aResult ) < 2 .OR. ValType( aResult[ 2 ] ) != "A"
+      RETURN NIL
+   ENDIF
+
+   /* Build the hash set from the first column of every result row.
+    * NIL (SQL NULL) values are left out: `inner.col = outer.col` is
+    * never true for a NULL inner value, so such rows can never make
+    * EXISTS succeed and must not become a probe-able key. */
+   hSet := { => }
+   FOR i := 1 TO Len( aResult[ 2 ] )
+      IF Len( aResult[ 2 ][ i ] ) > 0
+         xVal := aResult[ 2 ][ i ][ 1 ]
+         IF xVal != NIL
+            hSet[ SqlValToStr( xVal ) ] := .T.
+         ENDIF
+      ENDIF
+   NEXT
+
+RETURN { "outer_ref" => xOuterRef, "inner_set" => hSet }
+
+
+/* Helpers for TryBuildSemiJoin — module-level functions to keep the
+ * method body short. */
+
+/* SemiJoinIsInner — .T. when column node xCol belongs to the subquery's
+ * own FROM: either its reference has no "alias." prefix, or the prefix
+ * (upper-cased) is listed in aLocalAliases. Non-column input gives .F. */
+STATIC FUNCTION SemiJoinIsInner( xCol, aLocalAliases )
+   LOCAL cRef, nDot
+
+   IF ValType( xCol ) != "A" .OR. Len( xCol ) < 2 .OR. ;
+      xCol[ 1 ] != ND_COL .OR. ValType( xCol[ 2 ] ) != "C"
+      RETURN .F.
+   ENDIF
+   cRef := xCol[ 2 ]
+   nDot := At( ".", cRef )
+   IF nDot == 0
+      /* Unqualified — assume inner since it would resolve in own FROM */
+      RETURN .T.
+   ENDIF
+RETURN AScan( aLocalAliases, Upper( Left( cRef, nDot - 1 ) ) ) > 0
+
+
+/* SemiJoinStripAlias — return cRef without its "alias." prefix (the text
+ * after the first dot), or cRef unchanged when it contains no dot. */
+STATIC FUNCTION SemiJoinStripAlias( cRef )
+   LOCAL nDot := At( ".", cRef )
+   IF nDot > 0
+      RETURN SubStr( cRef, nDot + 1 )
+   ENDIF
+RETURN cRef
+
+
+/* SemiJoinHasFnCall — .T. when expression xE is, or contains under a
+ * binary / range / unary node, a function-call node (ND_FN). Used to
+ * keep select lists with aggregates out of the semi-join lift. Node
+ * kinds not listed here are treated as call-free. */
+STATIC FUNCTION SemiJoinHasFnCall( xE )
+   LOCAL i
+
+   IF ValType( xE ) != "A" .OR. Len( xE ) < 1
+      RETURN .F.
+   ENDIF
+
+   DO CASE
+   CASE xE[ 1 ] == ND_FN
+      RETURN .T.
+
+   CASE xE[ 1 ] == ND_BIN .OR. xE[ 1 ] == ND_RANGE
+      /* operands live in slots 3..5, when present */
+      FOR i := 3 TO Min( Len( xE ), 5 )
+         IF SemiJoinHasFnCall( xE[ i ] )
+            RETURN .T.
+         ENDIF
+      NEXT
+
+   CASE xE[ 1 ] == ND_UNI
+      RETURN Len( xE ) >= 3 .AND. SemiJoinHasFnCall( xE[ 3 ] )
+
+   ENDCASE
+
+RETURN .F.
+
+
+/* Recursively check whether an AST expression references any column
+ * whose alias prefix is NOT in the local alias list. Unqualified
+ * refs are assumed local. Returns .T. on first outer reference seen.
+ */
+STATIC FUNCTION SemiJoinHasOuterRef( xE, aLocalAliases )
+   LOCAL i, cRef, nDot
+
+   /* Parameters:
+    *   xE            — AST node (array whose element 1 is the node kind),
+    *                   or NIL / a non-array, which is treated as "no ref".
+    *   aLocalAliases — upper-cased table name / alias list of the
+    *                   subquery's own FROM.
+    * Node kinds that are not handled below fall through to .F. */
+   IF ValType( xE ) != "A" .OR. Len( xE ) < 1
+      RETURN .F.
+   ENDIF
+
+   DO CASE
+   CASE xE[ 1 ] == ND_COL
+      IF Len( xE ) >= 2 .AND. ValType( xE[ 2 ] ) == "C"
+         cRef := xE[ 2 ]
+         nDot := At( ".", cRef )
+         IF nDot == 0
+            RETURN .F.   /* unqualified → assumed local */
+         ENDIF
+         /* qualified: outer when the prefix is not one of our aliases */
+         RETURN AScan( aLocalAliases, Upper( Left( cRef, nDot - 1 ) ) ) == 0
+      ENDIF
+
+   CASE xE[ 1 ] == ND_SUB
+      /* A nested subquery is not walked here, so it cannot be proven
+       * free of outer references. Report it as correlated: the caller
+       * then refuses the lift and the regular per-row path, which has
+       * the outer row available, evaluates it. */
+      RETURN .T.
+
+   CASE xE[ 1 ] == ND_BIN .OR. xE[ 1 ] == ND_RANGE
+      /* operands live in slots 3, 4 and (optionally) 5; the upper bound
+       * keeps a short node from being indexed past its end */
+      FOR i := 3 TO Min( Len( xE ), 5 )
+         IF SemiJoinHasOuterRef( xE[ i ], aLocalAliases )
+            RETURN .T.
+         ENDIF
+      NEXT
+
+   CASE xE[ 1 ] == ND_UNI
+      RETURN Len( xE ) >= 3 .AND. SemiJoinHasOuterRef( xE[ 3 ], aLocalAliases )
+
+   CASE xE[ 1 ] == ND_FN
+      /* function call: element 3 is the argument list */
+      IF Len( xE ) >= 3 .AND. ValType( xE[ 3 ] ) == "A"
+         FOR i := 1 TO Len( xE[ 3 ] )
+            IF SemiJoinHasOuterRef( xE[ 3 ][ i ], aLocalAliases )
+               RETURN .T.
+            ENDIF
+         NEXT
+      ENDIF
+
+   CASE xE[ 1 ] == ND_LIST
+      /* value list: element 2 holds the member expressions */
+      IF Len( xE ) >= 2 .AND. ValType( xE[ 2 ] ) == "A"
+         FOR i := 1 TO Len( xE[ 2 ] )
+            IF SemiJoinHasOuterRef( xE[ 2 ][ i ], aLocalAliases )
+               RETURN .T.
+            ENDIF
+         NEXT
+      ENDIF
+
+   ENDCASE
+
+RETURN .F.
+
+
 /* SubqueryCached — correlated-aware subquery execution with memoization.
  *
  * Walks the subquery's AST on first call to identify free variables —