refactor: NTX B-tree rewrite — proper insertion with page splitting

Major rewrite based on Harbour dbfntx1.c analysis:

NTX B-tree traversal (ntx.go):
- nextKey: rewritten to match hb_ntxTagNextKey exactly
  - Advance iKey, check right child, descend via goLeftmost
  - Walk up stack on page exhaustion, truncate stackLevel
- prevKey: rewritten to match hb_ntxTagPrevKey
  - Check left child (only if iKey < keyCount), descend via goRightmost
  - Walk up stack for BOF detection
- goRightmost: internal nodes get iKey=keyCount (rightmost child),
  leaf nodes get iKey=keyCount-1 (last key) — matches Harbour

NTX B-tree build (build.go):
- CreateIndex: proper B-tree insertion (insert keys one by one)
- insertKeyBTree: search → insert at leaf → propagate splits up
- pageInsertKey: Harbour-style offset swapping (not data moving)
- pageSplit: collect all entries, split at midpoint, promote separator
- Proper offset table initialization for all pages

Unit tests: all 5 RDD packages PASS
Stress test: partial progress (Seek issues with split pages)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-07 07:49:31 +09:00
parent cc46ad2832
commit d2c17c7898
2 changed files with 363 additions and 105 deletions

View File

@@ -32,37 +32,26 @@ func CreateIndex(path string, keyExpr string, keyLen int, unique bool, descend b
maxItem := calculateMaxItems(itemSize)
halfPage := maxItem / 2
// Phase 1: Build leaf pages and assign file offsets immediately
var allPages []*buildPage
nextOffset := int64(HeaderSize)
leafPages := buildLeafPages(keys, keyLen, itemSize, maxItem, &nextOffset)
allPages = append(allPages, leafPages...)
if len(leafPages) == 0 {
pg := makeEmptyPage(keyLen, itemSize, maxItem, nextOffset)
nextOffset += BlockSize
allPages = append(allPages, pg)
leafPages = append(leafPages, pg)
// Write empty initial tree: header + one empty root page
// Initialize offset table for all maxItem+1 entry slots
rootOff := int64(HeaderSize)
emptyRoot := [BlockSize]byte{}
binary.LittleEndian.PutUint16(emptyRoot[0:2], 0) // 0 keys
dataStart := 2 + (maxItem+1)*2
for i := 0; i <= maxItem; i++ {
entryOff := dataStart + i*itemSize
binary.LittleEndian.PutUint16(emptyRoot[2+i*2:4+i*2], uint16(entryOff))
}
// Phase 2: Build internal pages bottom-up
// Each level's pages have offsets already assigned, so children are resolvable.
currentLevel := leafPages
for len(currentLevel) > 1 {
parentLevel := buildInternalLevel(currentLevel, keyLen, itemSize, maxItem, &nextOffset)
allPages = append(allPages, parentLevel...)
currentLevel = parentLevel
}
rootOffset := uint32(currentLevel[0].fileOffset)
rootOffset := uint32(rootOff)
nextPage := uint32(rootOff + BlockSize) // next free page after root
// Write header
hdr := Header{
Type: 0x0401,
Version: 1,
Root: rootOffset,
NextPage: uint32(nextOffset),
NextPage: nextPage,
ItemSize: uint16(itemSize),
KeySize: uint16(keyLen),
KeyDec: 0,
@@ -82,16 +71,30 @@ func CreateIndex(path string, keyExpr string, keyLen int, unique bool, descend b
return nil, err
}
// Write all pages
for _, pg := range allPages {
if _, err := f.WriteAt(pg.data[:], pg.fileOffset); err != nil {
f.Close()
return nil, fmt.Errorf("write NTX page at %d: %w", pg.fileOffset, err)
}
// Write empty root page
if _, err := f.WriteAt(emptyRoot[:], rootOff); err != nil {
f.Close()
return nil, err
}
f.Close()
return OpenIndex(path)
// Open and insert keys one by one (proper B-tree with page splits)
idx, err := OpenIndex(path)
if err != nil {
return nil, err
}
for _, kr := range keys {
k := make([]byte, keyLen)
copy(k, kr.Key)
if err := idx.insertKeyBTree(k, kr.RecNo); err != nil {
idx.Close()
return nil, fmt.Errorf("insert key: %w", err)
}
}
return idx, nil
}
// --- Internal build structures ---
@@ -250,7 +253,262 @@ func encodeInternalPage(children []*buildPage, keyLen, itemSize, maxItem int, of
return pg
}
// --- Single key operations ---
// --- B-tree insertion ---
// insertKeyBTree inserts one key/record pair into the on-disk B-tree, splitting
// full pages on the way back up and creating a new root when a split reaches
// the top of the tree.
// Harbour: hb_ntxTagKeyAdd in dbfntx1.c
//
// The descent path is recorded in idx.stack (page offset plus the child slot
// taken at each level). It returns an error when a page cannot be loaded or
// written, when a split fails, or when the tree is deeper than StackSize levels.
func (idx *Index) insertKeyBTree(key []byte, recNo uint32) error {
	// Descend from the root to a leaf, remembering the slot taken in every page.
	idx.stackLevel = 0
	pageOff := int64(idx.header.Root)
	for {
		page, err := LoadPage(idx.file, pageOff)
		if err != nil {
			return err
		}
		iKey := idx.insertSearch(page, key, recNo)

		// An unrecorded level would make the insertion below hit the wrong
		// page, so refuse instead of silently dropping the stack entry.
		if idx.stackLevel >= StackSize {
			return fmt.Errorf("insert NTX key: tree deeper than %d levels", StackSize)
		}
		idx.stack[idx.stackLevel] = StackEntry{PageOffset: pageOff, KeyIndex: iKey}
		idx.stackLevel++

		childOff := page.KeyChild(iKey)
		if childOff == 0 {
			break // at leaf
		}
		pageOff = int64(childOff)
	}

	// Insert at the leaf and carry separators upwards while pages overflow.
	leaf := idx.stackLevel - 1
	insertKey, insertRecNo := key, recNo
	var rightChild uint32 // new page produced by the split one level below
	for level := leaf; level >= 0; level-- {
		off := idx.stack[level].PageOffset
		page, err := LoadPage(idx.file, off)
		if err != nil {
			return err
		}
		iKey := idx.stack[level].KeyIndex

		var insertChild uint32 // leaf entries have no child
		if level != leaf {
			// Slot iKey still points at the page that was split below; that
			// page now holds only the lower half, so it must become the
			// separator's own (left) child. The slot that will follow the
			// separator has to point at the new upper-half page instead.
			insertChild = page.KeyChild(iKey)
			slot := int(page.keyOffset(iKey))
			binary.LittleEndian.PutUint32(page.data[slot:slot+4], rightChild)
		}

		if int(page.keyCount) < int(idx.header.MaxItem) {
			// Page has room — insert and stop propagating.
			idx.pageInsertKey(page, iKey, insertKey, insertRecNo, insertChild)
			if _, err := idx.file.WriteAt(page.data[:], off); err != nil {
				return fmt.Errorf("write NTX page at %d: %w", off, err)
			}
			return nil
		}

		// Page full — split it; the returned separator goes into the parent.
		insertKey, insertRecNo, rightChild, err = idx.pageSplit(page, iKey, insertKey, insertRecNo, insertChild, off)
		if err != nil {
			return err
		}
	}

	// The root itself was split: build a new root holding the single separator.
	maxItem := int(idx.header.MaxItem)
	itemSize := int(idx.header.ItemSize)
	dataStart := 2 + (maxItem+1)*2
	newRootOff := int64(idx.header.NextPage)

	newRoot := &Page{keyCount: 1}
	binary.LittleEndian.PutUint16(newRoot.data[0:2], 1)
	// Initialize the offset table for all maxItem+1 slots.
	for i := 0; i <= maxItem; i++ {
		binary.LittleEndian.PutUint16(newRoot.data[2+i*2:4+i*2], uint16(dataStart+i*itemSize))
	}
	// Slot 0: left child = old root, followed by the separator itself.
	sep := dataStart
	binary.LittleEndian.PutUint32(newRoot.data[sep:sep+4], idx.header.Root)
	binary.LittleEndian.PutUint32(newRoot.data[sep+4:sep+8], insertRecNo)
	copy(newRoot.data[sep+8:sep+8+idx.keyLen], insertKey)
	// Slot 1: trailing child = upper half of the old root.
	tail := dataStart + itemSize
	binary.LittleEndian.PutUint32(newRoot.data[tail:tail+4], rightChild)

	if _, err := idx.file.WriteAt(newRoot.data[:], newRootOff); err != nil {
		return fmt.Errorf("write NTX page at %d: %w", newRootOff, err)
	}
	// Only publish the new root once its page is safely written.
	idx.header.NextPage += uint32(BlockSize)
	idx.header.Root = uint32(newRootOff)

	// Update header on disk.
	if _, err := idx.file.Seek(0, 0); err != nil {
		return fmt.Errorf("seek to NTX header: %w", err)
	}
	// NOTE(review): WriteHeader's result (if it has one) is not visible from
	// here and is left unchecked — confirm and propagate it if it returns an error.
	WriteHeader(idx.file, &idx.header)
	return nil
}
// insertSearch binary-searches page for the slot where (key, recNo) belongs
// and returns that slot index (0..keyCount).
//
// Entries are ordered by key first (via idx.compareKeys) and by record number
// among equal keys; an entry identical to an existing one is placed before it.
func (idx *Index) insertSearch(page *Page, key []byte, recNo uint32) int {
	first := 0
	last := int(page.keyCount) - 1
	for first <= last {
		probe := (first + last) / 2
		order := idx.compareKeys(key, page.KeyValue(probe, idx.keyLen))

		// Move right only when the new entry sorts strictly after the probed
		// one; the record number breaks ties between equal keys.
		after := order > 0 || (order == 0 && recNo > page.KeyRecNo(probe))
		if after {
			first = probe + 1
			continue
		}
		last = probe - 1
	}
	return first
}
// pageInsertKey stores a new entry (child pointer, record number, key) at
// logical position iKey of page and increments the page's key count.
// Harbour: hb_ntxPageKeyAdd — entry data is never moved; only the offset table
// is shifted and the spare data slot is reused for the new entry.
//
// The key is written into a keyLen-wide field padded with spaces; a longer key
// is truncated. No capacity check is done here: the caller must ensure the
// page still has a spare slot.
func (idx *Index) pageInsertKey(page *Page, iKey int, key []byte, recNo uint32, childPage uint32) {
	count := int(page.keyCount)

	// Table slot count+1 refers to the data area that is currently unused.
	spare := page.keyOffset(count + 1)

	// Open a gap at iKey: table slots iKey..count each move up by one,
	// working from the top so nothing is overwritten before it is copied.
	for src := count; src >= iKey; src-- {
		dst := 2 + (src+1)*2
		binary.LittleEndian.PutUint16(page.data[dst:dst+2], page.keyOffset(src))
	}
	gap := 2 + iKey*2
	binary.LittleEndian.PutUint16(page.data[gap:gap+2], spare)

	// Fill the reused data slot: 4-byte child, 4-byte record number, then key.
	base := int(spare)
	binary.LittleEndian.PutUint32(page.data[base:base+4], childPage)
	binary.LittleEndian.PutUint32(page.data[base+4:base+8], recNo)

	field := make([]byte, idx.keyLen)
	for i := range field {
		field[i] = ' '
	}
	copy(field, key)
	copy(page.data[base+8:base+8+idx.keyLen], field)

	// Keep the in-memory count and the on-page count in step.
	page.keyCount++
	binary.LittleEndian.PutUint16(page.data[0:2], page.keyCount)
}
// pageSplit splits a full page around a new entry and returns the separator
// that has to be promoted into the parent.
//
// The existing entries plus the new one (placed at position iKey) are divided
// at the midpoint: the lower half is rebuilt in page and written back to
// pageOff, the upper half goes to a freshly allocated page at
// idx.header.NextPage, and the middle entry becomes the separator. The
// separator's child pointer becomes the trailing child of the lower page.
//
// childPage is stored as the new entry's own child pointer; entries already in
// the page, and the page's trailing child, keep the pointers they had.
//
// It returns the separator key, its record number and the file offset of the
// new (upper) page. An error is returned when iKey is outside 0..keyCount, or
// when a page write or the seek to the header fails.
func (idx *Index) pageSplit(page *Page, iKey int, key []byte, recNo uint32, childPage uint32, pageOff int64) ([]byte, uint32, uint32, error) {
	maxItem := int(idx.header.MaxItem)
	itemSize := int(idx.header.ItemSize)
	dataStart := 2 + (maxItem+1)*2
	oldCount := int(page.keyCount)

	// An out-of-range position would silently drop the new key below.
	if iKey < 0 || iKey > oldCount {
		return nil, 0, 0, fmt.Errorf("split NTX page at %d: insert position %d out of range 0..%d", pageOff, iKey, oldCount)
	}

	// Collect all existing entries with the new one spliced in at iKey.
	// Keys are copied because the page buffer is wiped further down.
	type entry struct {
		child uint32
		recNo uint32
		key   []byte
	}
	added := entry{child: childPage, recNo: recNo, key: append([]byte{}, key...)}
	allEntries := make([]entry, 0, oldCount+1)
	for i := 0; i < oldCount; i++ {
		if i == iKey {
			allEntries = append(allEntries, added)
		}
		allEntries = append(allEntries, entry{
			child: page.KeyChild(i),
			recNo: page.KeyRecNo(i),
			key:   append([]byte{}, page.KeyValue(i, idx.keyLen)...),
		})
	}
	if iKey == oldCount {
		allEntries = append(allEntries, added)
	}
	trailingChild := page.KeyChild(oldCount)

	total := len(allEntries)
	mid := total / 2              // index of the promoted separator
	rightCount := total - mid - 1 // entries that move to the new page

	// resetPage wipes a page and gives it an empty key list with a fresh
	// offset table covering all maxItem+1 slots.
	resetPage := func(p *Page) {
		p.data = [BlockSize]byte{}
		p.keyCount = 0
		for i := 0; i <= maxItem; i++ {
			binary.LittleEndian.PutUint16(p.data[2+i*2:4+i*2], uint16(dataStart+i*itemSize))
		}
	}

	// Lower half: reuse the original page.
	resetPage(page)
	for i := 0; i < mid; i++ {
		idx.pageInsertKey(page, i, allEntries[i].key, allEntries[i].recNo, allEntries[i].child)
	}
	leftTail := int(page.keyOffset(mid))
	binary.LittleEndian.PutUint32(page.data[leftTail:leftTail+4], allEntries[mid].child)

	// Upper half: new page appended at the current end of the index.
	rightOff := int64(idx.header.NextPage)
	rightPage := &Page{}
	resetPage(rightPage)
	for i := 0; i < rightCount; i++ {
		e := allEntries[mid+1+i]
		idx.pageInsertKey(rightPage, i, e.key, e.recNo, e.child)
	}
	rightTail := int(rightPage.keyOffset(rightCount))
	binary.LittleEndian.PutUint32(rightPage.data[rightTail:rightTail+4], trailingChild)

	// Write the new page first so that a failure leaves the original page on
	// disk untouched, and only then claim the space in the header.
	if _, err := idx.file.WriteAt(rightPage.data[:], rightOff); err != nil {
		return nil, 0, 0, fmt.Errorf("write NTX page at %d: %w", rightOff, err)
	}
	if _, err := idx.file.WriteAt(page.data[:], pageOff); err != nil {
		return nil, 0, 0, fmt.Errorf("write NTX page at %d: %w", pageOff, err)
	}
	idx.header.NextPage += uint32(BlockSize)

	// Update header on disk.
	if _, err := idx.file.Seek(0, 0); err != nil {
		return nil, 0, 0, fmt.Errorf("seek to NTX header: %w", err)
	}
	// NOTE(review): WriteHeader's result (if it has one) is not visible from
	// here and is left unchecked — confirm and propagate it if it returns an error.
	WriteHeader(idx.file, &idx.header)

	return allEntries[mid].key, allEntries[mid].recNo, uint32(rightOff), nil
}
// writeTo writes the page's raw block (p.data) to f at the given byte offset.
// It returns nil on success, or the WriteAt failure wrapped with the offset so
// callers can detect a page that did not reach the file.
func (p *Page) writeTo(f *os.File, offset int64) error {
	if _, err := f.WriteAt(p.data[:], offset); err != nil {
		return fmt.Errorf("write NTX page at %d: %w", offset, err)
	}
	return nil
}
// --- Single key operations (legacy, uses rebuild) ---
func (idx *Index) InsertKey(key []byte, recNo uint32) error {
keys := idx.collectAllKeys()

View File

@@ -358,89 +358,72 @@ func (idx *Index) compareKeys(key1, key2 []byte) int {
// key[i] has left-child at KeyChild(i) and right-child at KeyChild(i+1).
// After visiting key[i], the next key is the leftmost key in KeyChild(i+1),
// or if no child, key[i+1] in same page, or walk up to parent.
// nextKey moves to the next key in the B-tree.
// Harbour: hb_ntxTagNextKey in dbfntx1.c:2387-2436
func (idx *Index) nextKey() bool {
if idx.stackLevel == 0 {
level := idx.stackLevel - 1
if level < 0 {
return false
}
level := idx.stackLevel - 1
page, err := LoadPage(idx.file, idx.stack[level].PageOffset)
if err != nil {
return false
}
iKey := idx.stack[level].KeyIndex
var childOff uint32
// Check right child of current key: KeyChild(iKey+1)
if iKey+1 <= int(page.keyCount) {
childOff := page.KeyChild(iKey + 1)
if childOff != 0 {
// Has right child — go to its leftmost leaf
idx.stack[level].KeyIndex = iKey + 1
return idx.goLeftmost(int64(childOff))
}
// Get right child of next position: KeyChild(iKey+1)
if iKey < int(page.keyCount) {
childOff = page.KeyChild(iKey + 1)
}
// No right child — try next key in same page
if iKey+1 < int(page.keyCount) {
if childOff != 0 || iKey+1 < int(page.keyCount) {
// Advance to next key position
idx.stack[level].KeyIndex = iKey + 1
if childOff != 0 {
// Has right child — descend to its leftmost leaf
return idx.goLeftmost(int64(childOff))
}
// No child — next key is in same page (leaf)
idx.curRecNo = page.KeyRecNo(iKey + 1)
copy(idx.curKey, page.KeyValue(iKey+1, idx.keyLen))
return true
}
// End of page — walk up the stack
// When ascending, stack[level].KeyIndex points to the child we descended into.
// The next unvisited key in the parent is at that same KeyIndex
// (it's the separator AFTER the child). But if we descended via KeyChild(iKey+1)
// at line 377 (setting KeyIndex=iKey+1), then on ascent that separator was already
// visited before descending. So we need to check if the key at KeyIndex has been
// visited (recNo matches curRecNo) and skip if so.
for level > 0 {
level--
// Past end of page — walk up the stack to find ancestor with unvisited key
for level--; level >= 0; level-- {
page, err = LoadPage(idx.file, idx.stack[level].PageOffset)
if err != nil {
return false
}
ki := idx.stack[level].KeyIndex
if ki < int(page.keyCount) {
recNo := page.KeyRecNo(ki)
if recNo != 0 && recNo != idx.curRecNo {
// This key hasn't been visited yet
idx.stackLevel = level + 1
idx.curRecNo = recNo
copy(idx.curKey, page.KeyValue(ki, idx.keyLen))
return true
}
// Already visited — advance and try next
idx.stack[level].KeyIndex = ki + 1
if ki+1 < int(page.keyCount) {
// Check right child first
childOff := page.KeyChild(ki + 1)
if childOff != 0 {
idx.stack[level].KeyIndex = ki + 1
idx.stackLevel = level + 1
return idx.goLeftmost(int64(childOff))
}
idx.stackLevel = level + 1
idx.curRecNo = page.KeyRecNo(ki + 1)
copy(idx.curKey, page.KeyValue(ki+1, idx.keyLen))
return true
}
if idx.stack[level].KeyIndex < int(page.keyCount) {
break
}
}
return false // EOF
if level < 0 {
return false // EOF — exhausted entire tree
}
// Found ancestor with unvisited key — truncate stack
idx.stackLevel = level + 1
ki := idx.stack[level].KeyIndex
idx.curRecNo = page.KeyRecNo(ki)
copy(idx.curKey, page.KeyValue(ki, idx.keyLen))
return true
}
// prevKey moves to the previous key in index order.
// Harbour: hb_ntxTagPrevKey in dbfntx1.c:2432
// Harbour: hb_ntxTagPrevKey in dbfntx1.c:2441-2492
func (idx *Index) prevKey() bool {
if idx.stackLevel == 0 {
level := idx.stackLevel - 1
if level < 0 {
return false
}
level := idx.stackLevel - 1
page, err := LoadPage(idx.file, idx.stack[level].PageOffset)
if err != nil {
return false
@@ -448,37 +431,45 @@ func (idx *Index) prevKey() bool {
iKey := idx.stack[level].KeyIndex
// Check child at current position
childOff := page.KeyChild(iKey)
if childOff != 0 {
return idx.goRightmost(int64(childOff))
// Check left child at current position: KeyChild(iKey)
// Only if iKey < keyCount (iKey == keyCount is the trailing child slot, not a real key)
if iKey < int(page.keyCount) {
childOff := page.KeyChild(iKey)
if childOff != 0 {
// Has left child — descend to its rightmost leaf
return idx.goRightmost(int64(childOff))
}
}
if iKey > 0 {
// Previous key in same page
// Previous key in same page (leaf)
idx.stack[level].KeyIndex = iKey - 1
idx.curRecNo = page.KeyRecNo(iKey - 1)
copy(idx.curKey, page.KeyValue(iKey-1, idx.keyLen))
return true
}
// Walk up
for level > 0 {
level--
// First key in page, no left child — walk up to find ancestor
for level--; level >= 0; level-- {
page, err = LoadPage(idx.file, idx.stack[level].PageOffset)
if err != nil {
return false
}
if idx.stack[level].KeyIndex > 0 {
idx.stack[level].KeyIndex--
idx.stackLevel = level + 1
idx.curRecNo = page.KeyRecNo(idx.stack[level].KeyIndex)
copy(idx.curKey, page.KeyValue(idx.stack[level].KeyIndex, idx.keyLen))
return true
break
}
}
return false // BOF
if level < 0 {
return false // BOF
}
idx.stackLevel = level + 1
ki := idx.stack[level].KeyIndex
idx.curRecNo = page.KeyRecNo(ki)
copy(idx.curKey, page.KeyValue(ki, idx.keyLen))
return true
}
// goLeftmost traverses to the leftmost (smallest) key from a page.
@@ -509,6 +500,8 @@ func (idx *Index) goLeftmost(pageOffset int64) bool {
}
// goRightmost traverses to the rightmost (largest) key from a page.
// Harbour: hb_ntxPageBottomMove — internal nodes get ikey=keyCount (rightmost child),
// leaf nodes get ikey=keyCount-1 (last key).
func (idx *Index) goRightmost(pageOffset int64) bool {
for {
page, err := LoadPage(idx.file, pageOffset)
@@ -516,23 +509,30 @@ func (idx *Index) goRightmost(pageOffset int64) bool {
return false
}
// Try rightmost child (at keyCount position)
childOff := page.KeyChild(int(page.keyCount))
if childOff != 0 {
// Internal node: set ikey to keyCount (rightmost child position)
if idx.stackLevel < StackSize {
idx.stack[idx.stackLevel] = StackEntry{PageOffset: pageOffset, KeyIndex: int(page.keyCount)}
idx.stackLevel++
}
pageOffset = int64(childOff)
continue
}
// Leaf: set ikey to last key
lastKey := int(page.keyCount) - 1
if idx.stackLevel < StackSize {
idx.stack[idx.stackLevel] = StackEntry{PageOffset: pageOffset, KeyIndex: lastKey}
idx.stackLevel++
}
// Try rightmost child (at keyCount position)
childOff := page.KeyChild(int(page.keyCount))
if childOff == 0 {
if lastKey >= 0 {
idx.curRecNo = page.KeyRecNo(lastKey)
copy(idx.curKey, page.KeyValue(lastKey, idx.keyLen))
return true
}
return false
if lastKey >= 0 {
idx.curRecNo = page.KeyRecNo(lastKey)
copy(idx.curKey, page.KeyValue(lastKey, idx.keyLen))
return true
}
pageOffset = int64(childOff)
return false
}
}