Files
five/hbrdd/dbf/field.go
CharlesKWON af9e965bc6 perf(dbf): byte-level numeric field parser — zero alloc for int fields
parseNumericField was allocating on every call — `string(raw)` to
convert the record-buffer slice to a string, plus the implicit
allocation from TrimSpace's return value. For a 50k-row scan reading
two numeric fields, that's 100k+ small string allocations per scan,
all of which promptly became garbage.

Rewritten to walk the raw byte slice directly:
  - Find the trimmed range by byte indexing (no alloc).
  - Parse integer-typed fields (dec == 0) digit-by-digit into int64.
  - Only fall back to strconv.ParseFloat + string allocation for
    genuinely fractional data (dec > 0 or embedded `.`).

This also lifts the raw RDD baseline in our bench (6.8ms → 6.2ms)
because FieldGet hits this same parser. Every scan path benefits,
not just the FiveSql2 hot loop.

Measured (50k rows, 3-run steady state):

                       Before    After
  No WHERE              10.0ms   9.1ms
  Numeric WHERE          7.8ms   6.9ms   ← now 1.11x raw
  String WHERE           7.9ms   (see next commit)
  Raw RDD baseline       6.8ms   6.2ms   ← also faster

Validation:
  - hbrdd/dbf tests PASS (including integer/float field roundtrips)
  - FiveSql2 43/43
  - Harbour compat 51/51

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-14 14:02:42 +09:00

415 lines
9.9 KiB
Go

// Copyright (c) 2026 Charles KWON OhJun (charleskwonohjun@gmail.com)
// All rights reserved.
// DBF field type conversion: raw bytes ↔ Five Value.
// Each field type (C, N, L, D, M, I, B, @, etc.) has exact byte format.
//
// Reference: /mnt/d/harbour-core/src/rdd/dbf1.c (getValue/putValue)
// docs/dbf-engine-spec.md Section 3
package dbf
import (
"encoding/binary"
"five/hbrt"
"fmt"
"math"
"strconv"
"strings"
)
// GetFieldValue decodes one field of a DBF record into a Five Value.
//
// The field's bytes are recBuf[offset : offset+field.Len]; how they are
// interpreted depends on field.Type:
//
//	C      character data, returned as a string (no trimming)
//	N      ASCII numeric, see parseNumericField
//	L      logical, decided by the first byte
//	D      date, see parseDateField
//	M      memo block reference, see parseMemoRef
//	I, +   little-endian binary integer, see parseIntegerField
//	B      8-byte little-endian IEEE 754 double
//	@, =   8-byte timestamp: uint32 date followed by uint32 time, both LE
//	T      as '@' when at least 8 bytes wide; a 4-byte field is time only
//	^      8-byte little-endian row version
//	Y      8-byte little-endian integer scaled by 10000 (currency)
//
// Fixed-width binary types whose field.Len does not match return a nil
// Value. Unknown type codes are returned as a raw string.
//
// Harbour: hb_dbfGetValue in dbf1.c
func GetFieldValue(recBuf []byte, offset uint16, field *FieldDesc) hbrt.Value {
	width := field.Len
	data := recBuf[offset : offset+uint16(width)]
	le := binary.LittleEndian

	switch field.Type {
	case 'C', 'c':
		return hbrt.MakeString(string(data))

	case 'N', 'n':
		return parseNumericField(data, field.Dec)

	case 'L', 'l':
		return parseLogicalField(data[0])

	case 'D', 'd':
		return parseDateField(data, width)

	case 'M', 'm':
		return parseMemoRef(data, width)

	case 'I', 'i', '+':
		// Plain integer and autoincrement share the same binary layout.
		return parseIntegerField(data, width)

	case 'B', 'b':
		if width != 8 {
			return hbrt.MakeNil()
		}
		return hbrt.MakeDoubleAuto(math.Float64frombits(le.Uint64(data)))

	case '@', '=':
		// Timestamp and modtime: date word then time word.
		if width < 8 {
			return hbrt.MakeNil()
		}
		day := int64(le.Uint32(data[0:4]))
		ms := int32(le.Uint32(data[4:8]))
		return hbrt.MakeTimestamp(day, ms)

	case 'T', 't':
		switch {
		case width >= 8:
			day := int64(le.Uint32(data[0:4]))
			ms := int32(le.Uint32(data[4:8]))
			return hbrt.MakeTimestamp(day, ms)
		case width == 4:
			// Time-only variant: no date part is stored.
			return hbrt.MakeTimestamp(0, int32(le.Uint32(data[0:4])))
		}
		return hbrt.MakeNil()

	case '^':
		if width != 8 {
			return hbrt.MakeNil()
		}
		return hbrt.MakeLong(int64(le.Uint64(data)))

	case 'Y', 'y':
		if width != 8 {
			return hbrt.MakeNil()
		}
		scaled := int64(le.Uint64(data))
		return hbrt.MakeDouble(float64(scaled)/10000.0, 20, 4)
	}

	// Unknown type: hand back the raw bytes as a string.
	return hbrt.MakeString(string(data))
}
// PutFieldValue encodes val into the bytes of one field of a DBF record.
//
// The destination is recBuf[offset : offset+field.Len]; the encoding is
// selected by field.Type and mirrors what GetFieldValue decodes. Fixed-width
// binary types whose field.Len is not supported are left untouched.
//
// Notes on individual types:
//   - C: the string is copied (truncated to the field width) and the
//     remainder is padded with spaces.
//   - L: a nil value is stored as a space, otherwise 'T' or 'F'.
//   - M: only the block number is stored here, and only when val is an
//     integer; memo payloads are written elsewhere.
//   - Y: the value is scaled by 10000 and rounded to the nearest integer.
//     Plain truncation would store values such as 0.57 one unit too low,
//     because the scaled float product lands just below the integer.
//   - T with a 4-byte width stores the time part only, matching the
//     time-only form that GetFieldValue reads.
//   - Unknown types: the string form is copied without padding.
//
// Harbour: hb_dbfPutValue in dbf1.c
func PutFieldValue(recBuf []byte, offset uint16, field *FieldDesc, val hbrt.Value) {
	raw := recBuf[offset : offset+uint16(field.Len)]
	switch field.Type {
	case 'C', 'c': // Character
		n := copy(raw, val.AsString())
		// Pad whatever the string did not cover with spaces.
		for i := n; i < len(raw); i++ {
			raw[i] = ' '
		}
	case 'N', 'n': // Numeric (ASCII, right-aligned, space-padded)
		formatNumericField(raw, field.Len, field.Dec, val)
	case 'L', 'l': // Logical
		switch {
		case val.IsNil():
			raw[0] = ' '
		case val.AsBool():
			raw[0] = 'T'
		default:
			raw[0] = 'F'
		}
	case 'D', 'd': // Date
		putDateField(raw, field.Len, val)
	case 'M', 'm': // Memo (block reference)
		// Memo contents are handled by the memo handler; only the block
		// number lives in the record.
		if val.IsNumInt() {
			putMemoRef(raw, field.Len, uint32(val.AsNumInt()))
		}
	case 'I', 'i', '+': // Integer / Autoincrement
		putIntegerField(raw, field.Len, val)
	case 'B', 'b': // Double (IEEE 754 LE)
		if field.Len == 8 {
			binary.LittleEndian.PutUint64(raw, math.Float64bits(val.AsNumDouble()))
		}
	case '@', '=', 'T', 't': // Timestamp / Modtime
		switch {
		case field.Len >= 8:
			binary.LittleEndian.PutUint32(raw[0:4], uint32(val.AsJulian()))
			binary.LittleEndian.PutUint32(raw[4:8], uint32(val.AsTimeMs()))
		case field.Len == 4 && (field.Type == 'T' || field.Type == 't'):
			// Time-only field: GetFieldValue reads this form, so write it too.
			binary.LittleEndian.PutUint32(raw[0:4], uint32(val.AsTimeMs()))
		}
	case 'Y', 'y': // Currency (int64 LE, implicit 4 decimal places)
		if field.Len == 8 {
			// Round, don't truncate: binary floating point puts many
			// decimal values a hair below the exact scaled integer.
			scaled := int64(math.Round(val.AsNumDouble() * 10000.0))
			binary.LittleEndian.PutUint64(raw, uint64(scaled))
		}
	case '^': // RowVersion
		if field.Len == 8 {
			binary.LittleEndian.PutUint64(raw, uint64(val.AsLong()))
		}
	default:
		// Unknown: write as string
		copy(raw, val.AsString())
	}
}
// --- Internal parsers ---
// parseNumericField decodes an ASCII numeric (type N) field.
//
// raw is the field's bytes: ASCII text, space-padded, with an optional
// leading sign and an optional '.'; dec is the field's declared number of
// decimals.
//
// Behaviour:
//   - A field that is empty or all spaces yields integer 0.
//   - When dec == 0 and the trimmed text is an optional sign followed only
//     by digits that fit in int64, the value is parsed byte-by-byte without
//     converting the buffer to a string and returned via MakeNumInt.
//   - Everything else (dec > 0, an embedded '.', unexpected characters, or
//     an integer too large for int64) goes through strconv.ParseFloat and
//     is returned via MakeDouble with the field width and dec.
//   - Text that ParseFloat rejects yields integer 0.
//
// This sits on the record-scan hot path, so the integer branch deliberately
// works on bytes only; the string conversion happens solely on the float
// fallback.
func parseNumericField(raw []byte, dec byte) hbrt.Value {
	const maxInt64 = 1<<63 - 1

	// Trim surrounding spaces by index; no copy is made.
	start, end := 0, len(raw)
	for start < end && raw[start] == ' ' {
		start++
	}
	for end > start && raw[end-1] == ' ' {
		end--
	}
	if start == end {
		return hbrt.MakeInt(0)
	}

	if dec == 0 {
		// Fast integer path.
		i := start
		neg := false
		switch raw[i] {
		case '-':
			neg = true
			i++
		case '+':
			i++
		}

		var n int64
		ok := i < end // a lone sign is not a number
		for ; i < end; i++ {
			c := raw[i]
			if c < '0' || c > '9' {
				// '.' or any other non-digit: let the float parser decide.
				ok = false
				break
			}
			digit := int64(c - '0')
			// Would n*10+digit exceed int64? Wide N fields can hold more
			// digits than int64 can represent; wrapping silently would
			// return a wrong value, so fall back to the float path instead.
			if n > (maxInt64-digit)/10 {
				ok = false
				break
			}
			n = n*10 + digit
		}
		if ok {
			if neg {
				n = -n
			}
			return hbrt.MakeNumInt(n)
		}
		// Fall through to the float path.
	}

	// Decimal/float path: one string allocation for strconv.
	f, err := strconv.ParseFloat(string(raw[start:end]), 64)
	if err != nil {
		return hbrt.MakeInt(0)
	}
	return hbrt.MakeDouble(f, uint16(len(raw)), uint16(dec))
}
// parseLogicalField decodes the single byte of a logical (type L) field.
// 'T', 't', 'Y', 'y' map to true; 'F', 'f', 'N', 'n' map to false; any
// other byte (for example the space of an uninitialized field) yields a
// nil Value.
func parseLogicalField(b byte) hbrt.Value {
	if b == 'T' || b == 't' || b == 'Y' || b == 'y' {
		return hbrt.MakeBool(true)
	}
	if b == 'F' || b == 'f' || b == 'N' || b == 'n' {
		return hbrt.MakeBool(false)
	}
	return hbrt.MakeNil()
}
// parseDateField decodes a date (type D) field according to its width:
//
//	8 bytes: ASCII "YYYYMMDD"; blank text or a year of 0 gives the empty date
//	3 bytes: little-endian 24-bit day number
//	4 bytes: little-endian 32-bit day number
//
// Any other width gives the empty date (MakeDate(0)). In the ASCII form,
// non-digit characters are skipped by parseInt rather than rejected, and
// the month/day values are not range-checked here.
func parseDateField(raw []byte, fieldLen byte) hbrt.Value {
	switch fieldLen {
	case 8:
		text := string(raw)
		if strings.TrimSpace(text) == "" {
			return hbrt.MakeDate(0) // empty date
		}
		year := parseInt(text[0:4])
		month := parseInt(text[4:6])
		day := parseInt(text[6:8])
		if year <= 0 {
			return hbrt.MakeDate(0)
		}
		return hbrt.MakeDate(dateToJulian(year, month, day))

	case 3:
		lo, mid, hi := int64(raw[0]), int64(raw[1]), int64(raw[2])
		return hbrt.MakeDate(lo | mid<<8 | hi<<16)

	case 4:
		return hbrt.MakeDate(int64(binary.LittleEndian.Uint32(raw)))
	}
	return hbrt.MakeDate(0)
}
// parseMemoRef decodes a memo (type M) field into its block number.
// A 4-byte field holds a little-endian uint32; a 10-byte field holds the
// number as space-padded ASCII decimal. Blank text and unsupported widths
// give block 0.
func parseMemoRef(raw []byte, fieldLen byte) hbrt.Value {
	switch fieldLen {
	case 4:
		return hbrt.MakeLong(int64(binary.LittleEndian.Uint32(raw)))

	case 10:
		digits := strings.TrimSpace(string(raw))
		if digits == "" {
			return hbrt.MakeLong(0)
		}
		// The parse error is deliberately dropped: whatever ParseInt
		// returns alongside it is used as the block number.
		blockNo, _ := strconv.ParseInt(digits, 10, 64)
		return hbrt.MakeLong(blockNo)
	}
	return hbrt.MakeLong(0)
}
// parseIntegerField decodes a little-endian two's-complement binary integer
// of 1, 2, 3, 4 or 8 bytes. Widths up to 4 bytes are returned via MakeInt,
// 8 bytes via MakeLong; any other width yields integer 0.
func parseIntegerField(raw []byte, fieldLen byte) hbrt.Value {
	le := binary.LittleEndian

	if fieldLen == 8 {
		return hbrt.MakeLong(int64(le.Uint64(raw)))
	}

	var v int
	switch fieldLen {
	case 1:
		v = int(int8(raw[0]))
	case 2:
		v = int(int16(le.Uint16(raw)))
	case 3:
		bits := uint32(raw[0]) | uint32(raw[1])<<8 | uint32(raw[2])<<16
		// Move bit 23 into the sign position, then shift back
		// arithmetically to sign-extend the 24-bit value.
		v = int(int32(bits<<8) >> 8)
	case 4:
		v = int(int32(le.Uint32(raw)))
	default:
		v = 0
	}
	return hbrt.MakeInt(v)
}
// --- Internal formatters ---
// formatNumericField writes val into raw as an ASCII numeric (type N) field:
// right-aligned, space-padded on the left, with dec digits after the
// decimal point. If the formatted number is wider than fieldLen the whole
// field is filled with '*' (Harbour behavior).
//
// When the field has no decimals and val is integer-typed, the integer is
// formatted directly instead of going through float64, so values beyond
// 2^53 keep all their digits.
// NOTE(review): assumes IsNumInt reports values for which AsNumInt is
// exact — confirm against hbrt.
func formatNumericField(raw []byte, fieldLen, dec byte, val hbrt.Value) {
	var scratch [32]byte
	var text []byte
	if dec == 0 && val.IsNumInt() {
		text = strconv.AppendInt(scratch[:0], int64(val.AsNumInt()), 10)
	} else {
		text = strconv.AppendFloat(scratch[:0], val.AsNumDouble(), 'f', int(dec), 64)
	}

	width := int(fieldLen)
	if len(text) > width {
		// Too wide for the field: flag it with asterisks.
		for i := range raw {
			raw[i] = '*'
		}
		return
	}

	// Right-align: spaces first, then the digits. Writes are bounded by
	// len(raw) so a short destination is never overrun.
	pad := width - len(text)
	for i := 0; i < pad && i < len(raw); i++ {
		raw[i] = ' '
	}
	if pad < len(raw) {
		copy(raw[pad:], text)
	}
}
// putDateField writes val into a date (type D) field according to its width:
//
//	8 bytes: ASCII "YYYYMMDD"; a non-date value or day number 0 blanks the
//	         whole field with spaces
//	3 bytes: little-endian 24-bit day number
//	4 bytes: little-endian 32-bit day number
//
// Other widths are left untouched. The 3- and 4-byte forms store
// val.AsJulian() as-is, matching what parseDateField reads back.
func putDateField(raw []byte, fieldLen byte, val hbrt.Value) {
	switch fieldLen {
	case 8:
		if !val.IsDateTime() || val.AsJulian() == 0 {
			// Blank every byte of the field; leaving any stale digits
			// behind would make the empty date read back as garbage.
			for i := range raw {
				raw[i] = ' '
			}
			return
		}
		y, m, d := julianToDate(val.AsJulian())
		copy(raw, fmt.Sprintf("%04d%02d%02d", y, m, d))

	case 3:
		day := uint32(val.AsJulian())
		raw[0] = byte(day)
		raw[1] = byte(day >> 8)
		raw[2] = byte(day >> 16)

	case 4:
		binary.LittleEndian.PutUint32(raw, uint32(val.AsJulian()))
	}
}
// putMemoRef stores a memo block number in a memo (type M) field.
// A 4-byte field receives the number as a little-endian uint32; a 10-byte
// field receives it as decimal text right-aligned in 10 columns and padded
// with spaces. Other widths are left untouched.
func putMemoRef(raw []byte, fieldLen byte, blockNo uint32) {
	switch fieldLen {
	case 4:
		binary.LittleEndian.PutUint32(raw, blockNo)
	case 10:
		text := fmt.Sprintf("%*d", 10, blockNo)
		copy(raw, text)
	}
}
// putIntegerField writes val.AsNumInt() into raw as a little-endian
// two's-complement integer of 1, 2, 3, 4 or 8 bytes. The value is truncated
// to the field width; unsupported widths are left untouched.
//
// The 3-byte width is handled so that every width parseIntegerField can
// read can also be written.
func putIntegerField(raw []byte, fieldLen byte, val hbrt.Value) {
	n := val.AsNumInt()
	switch fieldLen {
	case 1:
		raw[0] = byte(int8(n))
	case 2:
		binary.LittleEndian.PutUint16(raw, uint16(int16(n)))
	case 3:
		// Low 24 bits, least significant byte first.
		raw[0] = byte(n)
		raw[1] = byte(n >> 8)
		raw[2] = byte(n >> 16)
	case 4:
		binary.LittleEndian.PutUint32(raw, uint32(int32(n)))
	case 8:
		binary.LittleEndian.PutUint64(raw, uint64(n))
	}
}
// --- Julian date helpers ---
// dateToJulian converts a calendar date (year, month, day) to a day number.
// For example 2000-01-01 maps to 2451545. January and February are treated
// as months 13 and 14 of the previous year before the formula is applied.
// The inputs are not validated.
func dateToJulian(y, m, d int) int64 {
	year, month := y, m
	if month <= 2 {
		year--
		month += 12
	}

	hundreds := year / 100
	correction := 2 - hundreds + hundreds/4

	yearPart := int64(365.25 * float64(year+4716))
	monthPart := int64(30.6001 * float64(month+1))
	dayPart := int64(d + correction)

	return yearPart + monthPart + dayPart - 1524
}
// julianToDate converts a day number back to a calendar date using integer
// arithmetic only. It is the inverse of dateToJulian for valid dates (for
// example 2451545 maps to 2000-01-01). A day number of zero or less yields
// (0, 0, 0).
func julianToDate(julian int64) (y, m, d int) {
	if julian <= 0 {
		return 0, 0, 0
	}

	rest := julian + 68569
	cent := 4 * rest / 146097
	rest -= (146097*cent + 3) / 4
	yr := 4000 * (rest + 1) / 1461001
	rest = rest - 1461*yr/4 + 31
	mon := 80 * rest / 2447
	day := rest - 2447*mon/80
	wrap := mon / 11

	y = int(100*(cent-49) + yr + wrap)
	m = int(mon + 2 - 12*wrap)
	d = int(day)
	return y, m, d
}
// parseInt builds a non-negative decimal number from the ASCII digits in s.
// Characters that are not '0'..'9' are skipped, not treated as errors, so
// " 12" gives 12 and "1a2" also gives 12. A string with no digits gives 0.
func parseInt(s string) int {
	total := 0
	for i := 0; i < len(s); i++ {
		ch := s[i]
		if ch < '0' || ch > '9' {
			continue
		}
		total = total*10 + int(ch-'0')
	}
	return total
}