Align Five's CDX file layout with Harbour's expectations: - Compound root header at 0, compound leaf at 1024, tags at 1536+ - "RCHB" signature at offset 20 in compound root - IgnoreCase/collation flags at offset 503-505 - Compound leaf: LeftPtr/RightPtr = 0xFFFFFFFF, recBits=16 fixed - Tags sorted alphabetically in compound directory B-tree - Tag IndexOpt: TypeCompact | TypeCompound (0x60) Status of Harbour cross-read verification: - CHAR-only CDX tags: layout matches Harbour byte-for-byte - Numeric tags: Harbour uses IEEE double (8-byte) key encoding, Five uses DBF ASCII key bytes — causes DBFCDX/1012 corruption when Harbour reads Five-created CDX with numeric tags - Five reading Harbour CDX: works perfectly (existing) - Five reading Five CDX: works perfectly Remaining: numeric key encoding for full Harbour write-compatibility. CLAUDE.md updated to reflect this single remaining limitation. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
533 lines
14 KiB
Go
533 lines
14 KiB
Go
// Copyright (c) 2026 Charles KWON OhJun (charleskwonohjun@gmail.com)
|
|
// All rights reserved.
|
|
|
|
// CDX compound index creation.
|
|
// Builds Harbour-compatible CDX files with bit-packed leaf pages.
|
|
|
|
package cdx
|
|
|
|
import (
|
|
"encoding/binary"
|
|
"io"
|
|
"math/bits"
|
|
"os"
|
|
"sort"
|
|
"strings"
|
|
|
|
"five/hbrdd/ntx"
|
|
)
|
|
|
|
// cdxTagMeta holds metadata for a tag during file assembly.
|
|
type cdxTagMeta struct {
|
|
name string
|
|
headerOff int64
|
|
rootPage uint32
|
|
keyExpr string
|
|
forExpr string
|
|
keyLen int
|
|
unique bool
|
|
desc bool
|
|
}
|
|
|
|
// CreateOrAddTag creates a CDX file with a new tag, or appends a tag to
|
|
// an existing CDX file. The existing file's bytes are preserved verbatim;
|
|
// only the compound root directory is rebuilt to include the new tag.
|
|
//
|
|
// Harbour: ordCreate() → hb_cdxIndexCreateTag()
|
|
func CreateOrAddTag(path string, tagName, keyExpr, forExpr string,
|
|
keyLen int, unique, descending bool, keys []ntx.KeyRecord) (*Index, error) {
|
|
|
|
if !strings.HasSuffix(strings.ToLower(path), ".cdx") {
|
|
path += ".cdx"
|
|
}
|
|
tagName = strings.ToUpper(tagName)
|
|
if len(tagName) > MaxTagNameLen {
|
|
tagName = tagName[:MaxTagNameLen]
|
|
}
|
|
|
|
// Read existing file contents (if any) and tag metadata
|
|
var existingData []byte
|
|
var existingTags []cdxTagMeta
|
|
|
|
if fi, err := os.Stat(path); err == nil && fi.Size() > 0 {
|
|
existing, err := OpenIndex(path)
|
|
if err == nil {
|
|
for _, t := range existing.Tags() {
|
|
if strings.ToUpper(t.Name) == tagName {
|
|
continue // will be replaced
|
|
}
|
|
existingTags = append(existingTags, cdxTagMeta{
|
|
name: t.Name,
|
|
headerOff: t.HeaderOffset(),
|
|
rootPage: t.RootPtr(),
|
|
keyExpr: t.KeyExpr(),
|
|
forExpr: t.ForExpr(),
|
|
keyLen: t.KeyLen(),
|
|
})
|
|
}
|
|
existing.Close()
|
|
}
|
|
// Read entire old file
|
|
existingData, _ = os.ReadFile(path)
|
|
}
|
|
|
|
// Create/truncate the file
|
|
f, err := os.Create(path)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var appendOff int64 // where to start writing new data
|
|
|
|
// Harbour CDX layout:
|
|
// 0x0000: Compound root header (1024 bytes)
|
|
// 0x0400: Compound directory leaf page (512 bytes) ← RootPtr points here
|
|
// 0x0600: Tag 1 header (1024 bytes = 2 pages)
|
|
// Tag 1 B-tree pages...
|
|
// Tag 2 header...
|
|
// etc.
|
|
// The compound leaf is always at offset HeaderLen (1024).
|
|
// Tag headers start at HeaderLen + PageLen (1536).
|
|
|
|
compoundLeafOff := int64(HeaderLen) // 1024 — fixed position
|
|
|
|
if len(existingData) > 0 {
|
|
// Write back existing data verbatim (preserves all old tag B-trees)
|
|
f.Write(existingData)
|
|
appendOff = int64(len(existingData))
|
|
// Align to page boundary
|
|
if appendOff%int64(PageLen) != 0 {
|
|
appendOff = (appendOff/int64(PageLen) + 1) * int64(PageLen)
|
|
}
|
|
} else {
|
|
// New file: skip compound root (1024) + compound leaf (512)
|
|
appendOff = int64(HeaderLen) + int64(PageLen) // 1536
|
|
}
|
|
|
|
// Write the new tag's header (1024 bytes) + B-tree pages
|
|
newTagHeaderOff := appendOff
|
|
appendOff += int64(HeaderLen) // 1024 bytes for tag header
|
|
|
|
// Build B-tree pages for the new tag
|
|
var rootPageOff uint32
|
|
if len(keys) == 0 {
|
|
pg := make([]byte, PageLen)
|
|
binary.LittleEndian.PutUint16(pg[0:2], NodeLeaf|NodeRoot)
|
|
f.WriteAt(pg, appendOff)
|
|
rootPageOff = uint32(appendOff)
|
|
appendOff += PageLen
|
|
} else {
|
|
root, next := buildCDXBTree(f, keys, keyLen, appendOff)
|
|
rootPageOff = uint32(root)
|
|
appendOff = next
|
|
}
|
|
|
|
// Write the new tag header
|
|
writeTagHeader(f, newTagHeaderOff, rootPageOff, keyExpr, forExpr,
|
|
uint16(keyLen), unique, descending)
|
|
|
|
newTag := cdxTagMeta{
|
|
name: tagName, headerOff: newTagHeaderOff, rootPage: rootPageOff,
|
|
keyExpr: keyExpr, forExpr: forExpr, keyLen: keyLen,
|
|
unique: unique, desc: descending,
|
|
}
|
|
|
|
// Collect all tags (existing + new) in offset order (= creation order)
|
|
allTags := append(existingTags, newTag)
|
|
|
|
// Write compound directory leaf page at fixed offset 1024.
|
|
// Harbour's compound B-tree stores entries in ALPHABETICAL order
|
|
// (it's a B-tree keyed by tag name). Sort a copy for the leaf.
|
|
sortedTags := make([]cdxTagMeta, len(allTags))
|
|
copy(sortedTags, allTags)
|
|
sort.Slice(sortedTags, func(i, j int) bool {
|
|
return strings.ToUpper(sortedTags[i].name) < strings.ToUpper(sortedTags[j].name)
|
|
})
|
|
writeCompoundLeaf(f, compoundLeafOff, sortedTags)
|
|
|
|
// Write compound root header at offset 0
|
|
writeCompoundHeader(f, uint32(compoundLeafOff), len(allTags))
|
|
|
|
f.Close()
|
|
return OpenIndex(path)
|
|
}
|
|
|
|
// --- Tag header writing ---
|
|
|
|
func writeTagHeader(f *os.File, offset int64, rootPtr uint32,
|
|
keyExpr, forExpr string, keySize uint16, unique, desc bool) {
|
|
|
|
buf := make([]byte, HeaderLen)
|
|
binary.LittleEndian.PutUint32(buf[0:4], rootPtr)
|
|
// Counter = 0 initially (Harbour convention)
|
|
binary.LittleEndian.PutUint16(buf[12:14], keySize)
|
|
// Harbour sets TypeCompact | TypeCompound on data tags (0x60)
|
|
opt := byte(TypeCompact | TypeCompound)
|
|
if unique {
|
|
opt |= TypeUnique
|
|
}
|
|
if forExpr != "" {
|
|
opt |= TypeForFilter
|
|
}
|
|
buf[14] = opt
|
|
buf[15] = 0x01
|
|
binary.LittleEndian.PutUint16(buf[16:18], uint16(HeaderLen))
|
|
binary.LittleEndian.PutUint16(buf[18:20], uint16(PageLen))
|
|
if desc {
|
|
binary.LittleEndian.PutUint16(buf[504:506], 1)
|
|
}
|
|
copy(buf[512:], []byte(keyExpr))
|
|
if forExpr != "" {
|
|
forOff := 512 + len(keyExpr) + 1
|
|
if forOff+len(forExpr) < HeaderLen {
|
|
copy(buf[forOff:], []byte(forExpr))
|
|
}
|
|
}
|
|
f.WriteAt(buf, offset)
|
|
}
|
|
|
|
// --- Compound root ---
|
|
|
|
func writeCompoundHeader(f *os.File, rootPagePtr uint32, nTags int) {
|
|
hdr := make([]byte, HeaderLen)
|
|
binary.LittleEndian.PutUint32(hdr[0:4], rootPagePtr)
|
|
// FreePtr = 0, Counter = 0 (Harbour convention for compound root)
|
|
binary.LittleEndian.PutUint16(hdr[12:14], MaxTagNameLen)
|
|
hdr[14] = TypeCompound | TypeStructure | TypeCompact // 0xE0
|
|
hdr[15] = 0x01
|
|
binary.LittleEndian.PutUint16(hdr[16:18], uint16(HeaderLen))
|
|
binary.LittleEndian.PutUint16(hdr[18:20], uint16(PageLen))
|
|
// Harbour writes "RCHB" signature at offset 20 in compound root
|
|
copy(hdr[20:24], []byte("RCHB"))
|
|
// IgnoreCase=1, Descending flags at offset 503-505 (Harbour convention)
|
|
hdr[503] = 1 // IgnoreCase
|
|
binary.LittleEndian.PutUint16(hdr[504:506], 1) // collation flag
|
|
f.WriteAt(hdr, 0)
|
|
}
|
|
|
|
func writeCompoundLeaf(f *os.File, offset int64, tags []cdxTagMeta) {
|
|
leaf := make([]byte, PageLen)
|
|
nTags := len(tags)
|
|
compKeyLen := MaxTagNameLen
|
|
|
|
// Harbour uses fixed 16-bit recBits for compound leaf (offsets < 64KB)
|
|
recBits := 16
|
|
dupBits := bitsNeeded(uint32(compKeyLen)) // 4
|
|
trlBits := bitsNeeded(uint32(compKeyLen)) // 4
|
|
keyBytes := (recBits + dupBits + trlBits + 7) / 8 // 3
|
|
|
|
binary.LittleEndian.PutUint16(leaf[0:2], NodeLeaf|NodeRoot)
|
|
binary.LittleEndian.PutUint16(leaf[2:4], uint16(nTags))
|
|
// Harbour sets LeftPtr/RightPtr to 0xFFFFFFFF for compound leaf
|
|
binary.LittleEndian.PutUint32(leaf[4:8], 0xFFFFFFFF)
|
|
binary.LittleEndian.PutUint32(leaf[8:12], 0xFFFFFFFF)
|
|
binary.LittleEndian.PutUint32(leaf[14:18], (1<<uint(recBits))-1)
|
|
leaf[18] = byte((1 << uint(dupBits)) - 1)
|
|
leaf[19] = byte((1 << uint(trlBits)) - 1)
|
|
leaf[20] = byte(recBits)
|
|
leaf[21] = byte(dupBits)
|
|
leaf[22] = byte(trlBits)
|
|
leaf[23] = byte(keyBytes)
|
|
|
|
prevKey := make([]byte, compKeyLen)
|
|
for j := range prevKey {
|
|
prevKey[j] = ' '
|
|
}
|
|
keyDataPos := PageLen
|
|
|
|
for i, t := range tags {
|
|
key := padTagName(t.name)
|
|
dup := commonPrefix(key, prevKey, compKeyLen)
|
|
trl := trailingSpaces(key, compKeyLen)
|
|
newBytes := compKeyLen - dup - trl
|
|
|
|
if newBytes > 0 {
|
|
keyDataPos -= newBytes
|
|
copy(leaf[keyDataPos:], key[dup:dup+newBytes])
|
|
}
|
|
|
|
recNo := uint32(t.headerOff)
|
|
var val uint64
|
|
val = uint64(trl)
|
|
val <<= uint(dupBits)
|
|
val |= uint64(dup)
|
|
val <<= uint(recBits)
|
|
val |= uint64(recNo)
|
|
|
|
entryOff := ExtHeadSize + i*int(keyBytes)
|
|
for j := 0; j < int(keyBytes); j++ {
|
|
leaf[entryOff+j] = byte(val & 0xFF)
|
|
val >>= 8
|
|
}
|
|
copy(prevKey, key)
|
|
}
|
|
|
|
freeSpc := keyDataPos - (ExtHeadSize + nTags*int(keyBytes))
|
|
if freeSpc < 0 {
|
|
freeSpc = 0
|
|
}
|
|
binary.LittleEndian.PutUint16(leaf[12:14], uint16(freeSpc))
|
|
f.WriteAt(leaf, offset)
|
|
}
|
|
|
|
// --- B-tree builder ---
|
|
|
|
func buildCDXBTree(f *os.File, keys []ntx.KeyRecord, keyLen int,
|
|
startOff int64) (rootOff int64, nextOff int64) {
|
|
|
|
maxRecNo := uint32(0)
|
|
for _, k := range keys {
|
|
if k.RecNo > maxRecNo {
|
|
maxRecNo = k.RecNo
|
|
}
|
|
}
|
|
recBits := bitsNeeded(maxRecNo)
|
|
dupBits := bitsNeeded(uint32(keyLen))
|
|
trlBits := bitsNeeded(uint32(keyLen))
|
|
keyBytesPerEntry := (recBits + dupBits + trlBits + 7) / 8
|
|
|
|
maxKeysPerLeaf := (PageLen - ExtHeadSize) / (int(keyBytesPerEntry) + keyLen)
|
|
if maxKeysPerLeaf < 2 {
|
|
maxKeysPerLeaf = 2
|
|
}
|
|
intEntrySize := keyLen + 8
|
|
maxKeysPerInt := (PageLen - 12) / intEntrySize
|
|
if maxKeysPerInt < 2 {
|
|
maxKeysPerInt = 2
|
|
}
|
|
|
|
curOff := startOff
|
|
|
|
type leafInfo struct {
|
|
pageOff int64
|
|
lastKey []byte
|
|
lastRec uint32
|
|
}
|
|
var leaves []leafInfo
|
|
|
|
for i := 0; i < len(keys); {
|
|
end := i + maxKeysPerLeaf
|
|
if end > len(keys) {
|
|
end = len(keys)
|
|
}
|
|
chunk := keys[i:end]
|
|
pageOff := curOff
|
|
writeCDXLeafPage(f, pageOff, chunk, keyLen, recBits, dupBits, trlBits, keyBytesPerEntry)
|
|
curOff += PageLen
|
|
leaves = append(leaves, leafInfo{
|
|
pageOff: pageOff,
|
|
lastKey: chunk[len(chunk)-1].Key,
|
|
lastRec: chunk[len(chunk)-1].RecNo,
|
|
})
|
|
i = end
|
|
}
|
|
|
|
// Link leaf pages
|
|
for i := range leaves {
|
|
pg := make([]byte, PageLen)
|
|
f.ReadAt(pg, leaves[i].pageOff)
|
|
if i > 0 {
|
|
binary.LittleEndian.PutUint32(pg[4:8], uint32(leaves[i-1].pageOff))
|
|
}
|
|
if i < len(leaves)-1 {
|
|
binary.LittleEndian.PutUint32(pg[8:12], uint32(leaves[i+1].pageOff))
|
|
}
|
|
f.WriteAt(pg, leaves[i].pageOff)
|
|
}
|
|
|
|
if len(leaves) == 1 {
|
|
pg := make([]byte, 2)
|
|
f.ReadAt(pg, leaves[0].pageOff)
|
|
attr := binary.LittleEndian.Uint16(pg)
|
|
attr |= NodeRoot
|
|
binary.LittleEndian.PutUint16(pg, attr)
|
|
f.WriteAt(pg, leaves[0].pageOff)
|
|
return leaves[0].pageOff, curOff
|
|
}
|
|
|
|
type childInfo struct {
|
|
pageOff int64
|
|
sepKey []byte
|
|
sepRec uint32
|
|
}
|
|
children := make([]childInfo, len(leaves))
|
|
for i, l := range leaves {
|
|
children[i] = childInfo{pageOff: l.pageOff, sepKey: l.lastKey, sepRec: l.lastRec}
|
|
}
|
|
|
|
for len(children) > 1 {
|
|
var nextLevel []childInfo
|
|
for i := 0; i < len(children); {
|
|
end := i + maxKeysPerInt + 1
|
|
if end > len(children) {
|
|
end = len(children)
|
|
}
|
|
group := children[i:end]
|
|
nKeys := len(group) - 1
|
|
|
|
pg := make([]byte, PageLen)
|
|
attr := uint16(0)
|
|
if len(children) <= maxKeysPerInt+1 {
|
|
attr |= NodeRoot
|
|
}
|
|
binary.LittleEndian.PutUint16(pg[0:2], attr)
|
|
binary.LittleEndian.PutUint16(pg[2:4], uint16(nKeys))
|
|
binary.LittleEndian.PutUint32(pg[4:8], uint32(group[0].pageOff))
|
|
|
|
off := 12
|
|
for k := 0; k < nKeys; k++ {
|
|
key := group[k].sepKey
|
|
if len(key) < keyLen {
|
|
padded := make([]byte, keyLen)
|
|
copy(padded, key)
|
|
for j := len(key); j < keyLen; j++ {
|
|
padded[j] = ' '
|
|
}
|
|
key = padded
|
|
}
|
|
copy(pg[off:off+keyLen], key[:keyLen])
|
|
off += keyLen
|
|
binary.BigEndian.PutUint32(pg[off:off+4], group[k].sepRec)
|
|
off += 4
|
|
binary.BigEndian.PutUint32(pg[off:off+4], uint32(group[k+1].pageOff))
|
|
off += 4
|
|
}
|
|
|
|
pageOff := curOff
|
|
f.WriteAt(pg, pageOff)
|
|
curOff += PageLen
|
|
|
|
ci := childInfo{pageOff: pageOff}
|
|
if end < len(children) {
|
|
ci.sepKey = group[nKeys].sepKey
|
|
ci.sepRec = group[nKeys].sepRec
|
|
}
|
|
nextLevel = append(nextLevel, ci)
|
|
i = end
|
|
}
|
|
children = nextLevel
|
|
}
|
|
|
|
return children[0].pageOff, curOff
|
|
}
|
|
|
|
// --- Leaf page writer ---
|
|
|
|
func writeCDXLeafPage(f *os.File, offset int64, keys []ntx.KeyRecord,
|
|
keyLen, recBits, dupBits, trlBits, keyBytesPerEntry int) {
|
|
|
|
pg := make([]byte, PageLen)
|
|
binary.LittleEndian.PutUint16(pg[0:2], NodeLeaf)
|
|
binary.LittleEndian.PutUint16(pg[2:4], uint16(len(keys)))
|
|
binary.LittleEndian.PutUint32(pg[14:18], (1<<uint(recBits))-1)
|
|
pg[18] = byte((1 << uint(dupBits)) - 1)
|
|
pg[19] = byte((1 << uint(trlBits)) - 1)
|
|
pg[20] = byte(recBits)
|
|
pg[21] = byte(dupBits)
|
|
pg[22] = byte(trlBits)
|
|
pg[23] = byte(keyBytesPerEntry)
|
|
|
|
prevKey := make([]byte, keyLen)
|
|
for j := range prevKey {
|
|
prevKey[j] = ' '
|
|
}
|
|
keyDataPos := PageLen
|
|
|
|
for i, kr := range keys {
|
|
key := kr.Key
|
|
if len(key) < keyLen {
|
|
padded := make([]byte, keyLen)
|
|
copy(padded, key)
|
|
for j := len(key); j < keyLen; j++ {
|
|
padded[j] = ' '
|
|
}
|
|
key = padded
|
|
} else if len(key) > keyLen {
|
|
key = key[:keyLen]
|
|
}
|
|
|
|
dup := commonPrefix(key, prevKey, keyLen)
|
|
trl := trailingSpaces(key, keyLen)
|
|
newBytes := keyLen - dup - trl
|
|
|
|
if newBytes > 0 {
|
|
keyDataPos -= newBytes
|
|
copy(pg[keyDataPos:], key[dup:dup+newBytes])
|
|
}
|
|
|
|
var val uint64
|
|
val = uint64(trl)
|
|
val <<= uint(dupBits)
|
|
val |= uint64(dup)
|
|
val <<= uint(recBits)
|
|
val |= uint64(kr.RecNo)
|
|
|
|
entryOff := ExtHeadSize + i*keyBytesPerEntry
|
|
for j := 0; j < keyBytesPerEntry; j++ {
|
|
pg[entryOff+j] = byte(val & 0xFF)
|
|
val >>= 8
|
|
}
|
|
copy(prevKey, key)
|
|
}
|
|
|
|
freeSpc := keyDataPos - (ExtHeadSize + len(keys)*keyBytesPerEntry)
|
|
if freeSpc < 0 {
|
|
freeSpc = 0
|
|
}
|
|
binary.LittleEndian.PutUint16(pg[12:14], uint16(freeSpc))
|
|
f.WriteAt(pg, offset)
|
|
}
|
|
|
|
// --- Helpers ---
|
|
|
|
func bitsNeeded(maxVal uint32) int {
|
|
if maxVal == 0 {
|
|
return 1
|
|
}
|
|
return bits.Len32(maxVal)
|
|
}
|
|
|
|
func commonPrefix(a, b []byte, maxLen int) int {
|
|
n := maxLen
|
|
if len(a) < n {
|
|
n = len(a)
|
|
}
|
|
if len(b) < n {
|
|
n = len(b)
|
|
}
|
|
for i := 0; i < n; i++ {
|
|
if a[i] != b[i] {
|
|
return i
|
|
}
|
|
}
|
|
return n
|
|
}
|
|
|
|
func trailingSpaces(key []byte, keyLen int) int {
|
|
n := keyLen
|
|
if len(key) < n {
|
|
n = len(key)
|
|
}
|
|
count := 0
|
|
for i := n - 1; i >= 0; i-- {
|
|
if key[i] == ' ' {
|
|
count++
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
return count
|
|
}
|
|
|
|
func padTagName(name string) []byte {
|
|
b := make([]byte, MaxTagNameLen)
|
|
copy(b, []byte(strings.ToUpper(name)))
|
|
for i := len(name); i < MaxTagNameLen; i++ {
|
|
b[i] = ' '
|
|
}
|
|
return b
|
|
}
|
|
|
|
// Ensure io import is used (for potential future reads)
|
|
var _ = io.EOF
|