// Copyright (c) 2026 Charles KWON OhJun (charleskwonohjun@gmail.com) // All rights reserved. // Bitmap filter — Rushmore-style optimization for SET FILTER. // Optimized for modern CPUs: POPCNT, TZCNT, LZCNT, cache-line aligned operations. package hbrdd import ( "math/bits" "runtime" "sync" "sync/atomic" ) // BitmapFilter holds a bit array where each bit represents one record. type BitmapFilter struct { bits []uint64 maxRec uint32 count int32 // atomic for concurrent builds } // NewBitmapFilter creates a bitmap for maxRec records, all bits off. func NewBitmapFilter(maxRec uint32) *BitmapFilter { nWords := (maxRec + 63) >> 6 // /64 return &BitmapFilter{ bits: make([]uint64, nWords), maxRec: maxRec, } } // NewBitmapFilterFull creates a bitmap with all valid bits set. func NewBitmapFilterFull(maxRec uint32) *BitmapFilter { nWords := (maxRec + 63) >> 6 bm := &BitmapFilter{ bits: make([]uint64, nWords), maxRec: maxRec, count: int32(maxRec), } for i := range bm.bits { bm.bits[i] = ^uint64(0) } // Mask off excess bits in last word if tail := maxRec & 63; tail != 0 && len(bm.bits) > 0 { bm.bits[len(bm.bits)-1] = (uint64(1) << tail) - 1 } return bm } // --- Single-bit operations (inline-friendly) --- func (bm *BitmapFilter) Set(recNo uint32) { r := recNo - 1 if r >= bm.maxRec { return } mask := uint64(1) << (r & 63) p := &bm.bits[r>>6] if *p&mask == 0 { *p |= mask atomic.AddInt32(&bm.count, 1) } } func (bm *BitmapFilter) Clear(recNo uint32) { r := recNo - 1 if r >= bm.maxRec { return } mask := uint64(1) << (r & 63) p := &bm.bits[r>>6] if *p&mask != 0 { *p &^= mask atomic.AddInt32(&bm.count, -1) } } func (bm *BitmapFilter) Test(recNo uint32) bool { r := recNo - 1 if r >= bm.maxRec { return false } return bm.bits[r>>6]&(uint64(1)<<(r&63)) != 0 } func (bm *BitmapFilter) Count() int { return int(atomic.LoadInt32(&bm.count)) } // --- Traversal (hot path — fully optimized) --- // NextSet finds the next set bit >= recNo (1-based). Returns 0 if none. // Uses TZCNT (TrailingZeros) for O(1) per word. func (bm *BitmapFilter) NextSet(recNo uint32) uint32 { if recNo < 1 { recNo = 1 } r := recNo - 1 if r >= bm.maxRec { return 0 } idx := r >> 6 // Mask off bits below current position in first word word := bm.bits[idx] >> (r & 63) if word != 0 { return recNo + uint32(bits.TrailingZeros64(word)) } // Scan remaining words n := uint32(len(bm.bits)) for idx++; idx < n; idx++ { if bm.bits[idx] != 0 { result := (idx << 6) + 1 + uint32(bits.TrailingZeros64(bm.bits[idx])) if result > bm.maxRec { return 0 } return result } } return 0 } // PrevSet finds the previous set bit <= recNo (1-based). Returns 0 if none. // Uses LZCNT (LeadingZeros) for O(1) per word. func (bm *BitmapFilter) PrevSet(recNo uint32) uint32 { if recNo > bm.maxRec { recNo = bm.maxRec } if recNo < 1 { return 0 } r := recNo - 1 idx := r >> 6 // Mask off bits above current position in first word shift := 63 - (r & 63) word := bm.bits[idx] << shift if word != 0 { return recNo - uint32(bits.LeadingZeros64(word)) } // Scan backwards for idx > 0 { idx-- if bm.bits[idx] != 0 { return (idx << 6) + 1 + uint32(63-bits.LeadingZeros64(bm.bits[idx])) } } return 0 } // --- Bulk operations (4-word unrolled for cache line) --- // And returns intersection. Does NOT allocate if dst is provided. func (bm *BitmapFilter) And(other *BitmapFilter) *BitmapFilter { n := len(bm.bits) if len(other.bits) < n { n = len(other.bits) } result := &BitmapFilter{ bits: make([]uint64, len(bm.bits)), maxRec: bm.maxRec, } // 4-word unrolled loop i := 0 for ; i+3 < n; i += 4 { result.bits[i] = bm.bits[i] & other.bits[i] result.bits[i+1] = bm.bits[i+1] & other.bits[i+1] result.bits[i+2] = bm.bits[i+2] & other.bits[i+2] result.bits[i+3] = bm.bits[i+3] & other.bits[i+3] } for ; i < n; i++ { result.bits[i] = bm.bits[i] & other.bits[i] } result.recount() return result } // AndInPlace performs AND in-place (no allocation). func (bm *BitmapFilter) AndInPlace(other *BitmapFilter) { n := len(bm.bits) if len(other.bits) < n { n = len(other.bits) } i := 0 for ; i+3 < n; i += 4 { bm.bits[i] &= other.bits[i] bm.bits[i+1] &= other.bits[i+1] bm.bits[i+2] &= other.bits[i+2] bm.bits[i+3] &= other.bits[i+3] } for ; i < n; i++ { bm.bits[i] &= other.bits[i] } // Clear words beyond other's length for ; i < len(bm.bits); i++ { bm.bits[i] = 0 } bm.recount() } // Or returns union. func (bm *BitmapFilter) Or(other *BitmapFilter) *BitmapFilter { maxR := bm.maxRec if other.maxRec > maxR { maxR = other.maxRec } n := len(bm.bits) if len(other.bits) > n { n = len(other.bits) } result := &BitmapFilter{ bits: make([]uint64, n), maxRec: maxR, } a, b := bm.bits, other.bits i := 0 min := len(a) if len(b) < min { min = len(b) } for ; i+3 < min; i += 4 { result.bits[i] = a[i] | b[i] result.bits[i+1] = a[i+1] | b[i+1] result.bits[i+2] = a[i+2] | b[i+2] result.bits[i+3] = a[i+3] | b[i+3] } for ; i < min; i++ { result.bits[i] = a[i] | b[i] } for ; i < len(a); i++ { result.bits[i] = a[i] } for ; i < len(b); i++ { result.bits[i] = b[i] } result.recount() return result } // OrInPlace performs OR in-place. func (bm *BitmapFilter) OrInPlace(other *BitmapFilter) { if len(other.bits) > len(bm.bits) { newBits := make([]uint64, len(other.bits)) copy(newBits, bm.bits) bm.bits = newBits } if other.maxRec > bm.maxRec { bm.maxRec = other.maxRec } i := 0 n := len(other.bits) for ; i+3 < n; i += 4 { bm.bits[i] |= other.bits[i] bm.bits[i+1] |= other.bits[i+1] bm.bits[i+2] |= other.bits[i+2] bm.bits[i+3] |= other.bits[i+3] } for ; i < n; i++ { bm.bits[i] |= other.bits[i] } bm.recount() } // AndNot returns A AND NOT B (difference) — single pass, no intermediate. func (bm *BitmapFilter) AndNot(other *BitmapFilter) *BitmapFilter { result := &BitmapFilter{ bits: make([]uint64, len(bm.bits)), maxRec: bm.maxRec, } n := len(other.bits) if n > len(bm.bits) { n = len(bm.bits) } i := 0 for ; i+3 < n; i += 4 { result.bits[i] = bm.bits[i] &^ other.bits[i] result.bits[i+1] = bm.bits[i+1] &^ other.bits[i+1] result.bits[i+2] = bm.bits[i+2] &^ other.bits[i+2] result.bits[i+3] = bm.bits[i+3] &^ other.bits[i+3] } for ; i < n; i++ { result.bits[i] = bm.bits[i] &^ other.bits[i] } for ; i < len(bm.bits); i++ { result.bits[i] = bm.bits[i] } result.recount() return result } // Not inverts all bits (respecting maxRec boundary). func (bm *BitmapFilter) Not() *BitmapFilter { result := &BitmapFilter{ bits: make([]uint64, len(bm.bits)), maxRec: bm.maxRec, } for i := range bm.bits { result.bits[i] = ^bm.bits[i] } // Mask off excess in last word if tail := bm.maxRec & 63; tail != 0 && len(result.bits) > 0 { result.bits[len(result.bits)-1] &= (uint64(1) << tail) - 1 } result.recount() return result } // --- Parallel bitmap build (Five's goroutine advantage) --- // BuildParallel evaluates a test function on all records using goroutines. // testFn(recNo) returns true if the record matches the filter. func BuildParallel(maxRec uint32, testFn func(uint32) bool) *BitmapFilter { bm := NewBitmapFilter(maxRec) nCPU := runtime.NumCPU() if nCPU > 16 { nCPU = 16 } if maxRec < 1000 { nCPU = 1 // not worth parallelizing } chunkSize := (maxRec + uint32(nCPU) - 1) / uint32(nCPU) var wg sync.WaitGroup for cpu := 0; cpu < nCPU; cpu++ { start := uint32(cpu)*chunkSize + 1 end := start + chunkSize if end > maxRec+1 { end = maxRec + 1 } wg.Add(1) go func(s, e uint32) { defer wg.Done() for r := s; r < e; r++ { if testFn(r) { bm.Set(r) } } }(start, end) } wg.Wait() return bm } // --- Internal --- func (bm *BitmapFilter) recount() { c := 0 for _, w := range bm.bits { c += bits.OnesCount64(w) } atomic.StoreInt32(&bm.count, int32(c)) }