feat(pp,rtl): Tier 2 audit followups — JOIN hash + PP validation + C heuristic

Three medium-priority audit items in one commit, each independently
revertible.

  * **#18 JOIN hash-join fast path.** New std.ch shape:
        JOIN WITH <alias> TO <file> [FIELDS ...] ON <mfield> = <dfield>
    expands to a 6-arg __dbJoin call with the master/detail key
    field names. Runtime detects the extra args, builds an O(M)
    hash over the detail's key column, then probes per master row
    for O(N+M) total — vs the FOR form's O(N*M). For 1k×1k that's
    2k vs 1M operations; the gap widens with N. The original FOR
    form is unchanged and stays the fallback for arbitrary
    predicates. New helper dbHashKey type-tags the key string so
    `1` (numeric), `"1"` (string), and `.T.` (logical) don't
    collide in the bucket map.

  * **#38 PP rule result-marker validation.** ParseRule now walks
    the result template after parseMarkers and warns about every
    `<name>` (or `<(name)>` / `<.name.>` / `<{name}>` / `#<name>`
    / `<"name">`) that doesn't match a pattern marker. Warnings
    flow into pp.errors via handleDirective with the directive's
    filename:line, so a typo'd `<NaMe>` in an `#xcommand`
    case-sensitive rule fails the build with a clear diagnostic
    instead of silently producing broken expansions.

  * **#44 looksLikeInlineC heuristic strengthened.** Catches more
    of the common Harbour-PRG-with-C-inline-block shapes that
    used to fall through and produce cryptic Go-side errors:
    function-like #define, `extern "C"` linkage blocks, C return-
    type declarations (`int foo(`, `static char* bar(`), and the
    hb_ret*() helper family used by Harbour's C FFI return
    setters. Two small predicate helpers (allLetters,
    allIdentChars) keep the C-vs-Go disambiguation tight enough
    that legit Go code (`func name() int { ... }`) doesn't trip.

  * **#28 LIST/DISPLAY pagination** — explicitly deferred. Proper
    pagination requires interactive terminal handling (Inkey(0)
    for the keypress) which would hang in CI / batch mode. Will
    revisit when an interactive terminal layer needs it for
    other reasons.

Test fixtures: tests/std_ch/test_join_hash.prg verifies the new
ON-form path produces the same output as the FOR form would.
std.ch runner now stands at 16/16.

Other gates green:
  go test ./...      : PASS
  FiveSql2 SQL:1999  : 43/43
  Harbour compat     : 56/56
  std.ch suite       : 16/16
  FRB suite          : 7/7

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-04 19:21:19 +09:00
parent 29ca02e1bc
commit 2008266da7
6 changed files with 379 additions and 14 deletions

View File

@@ -28,6 +28,7 @@
package pp
import (
"fmt"
"strings"
)
@@ -40,6 +41,13 @@ type Rule struct {
Keyword string // first keyword (for fast matching)
Markers []Marker // parsed pattern markers
ResultTmpl string // result template with marker references
// Warnings collected during ParseRule. Currently only one source:
// result-template markers that reference a name absent from the
// pattern. Caller can surface these to the user — a typo'd
// `<For>` instead of `<for>` used to silently produce broken
// expansion output.
Warnings []string
}
// Marker represents a pattern marker like <x>, <!x!>, <x,...>, <*x*>.
@@ -105,9 +113,86 @@ func ParseRule(directive string, isCommand, caseSens bool) *Rule {
// Parse markers from pattern
rule.Markers = parseMarkers(pattern)
// Validate result-template marker references. Each `<name>`
// (and its smart-stringify / blockify / logify / dumb-stringify
// variants) must reference a name declared in the pattern.
// Catches typos like `<For>` vs `<for>` (case-sensitive
// xcommand) before they silently produce broken output at
// expansion time.
rule.Warnings = validateResultMarkers(pattern, result, rule.Markers, caseSens)
return rule
}
// validateResultMarkers scans the result template for marker
// references and reports any name not declared in the pattern.
// Result returned as a slice of human-readable warning strings —
// caller decides whether to surface or ignore.
func validateResultMarkers(pattern, result string, markers []Marker, caseSens bool) []string {
declared := make(map[string]bool, len(markers))
for _, m := range markers {
key := m.Name
if !caseSens {
key = strings.ToUpper(key)
}
declared[key] = true
}
if len(declared) == 0 {
// Nothing to validate against — rule is keyword-only.
return nil
}
var warnings []string
seen := map[string]bool{}
i := 0
for i < len(result) {
// Marker shapes recognised here mirror applyResult's loop:
// <name>, <(name)>, <.name.>, <{name}>, <"name">, #<name>.
// findMarkerEnd already understands all of them — we just
// need the inner identifier.
if result[i] != '<' && !(result[i] == '#' && i+1 < len(result) && result[i+1] == '<') {
i++
continue
}
start := i
if result[i] == '#' {
start = i + 1
}
end := findMarkerEnd(result, start)
if end == 0 {
i++
continue
}
// Extract identifier between the wrappers.
inner := result[start+1 : end-1]
// Strip prefix `(`, `.`, `"`, `{`
for len(inner) > 0 && (inner[0] == '(' || inner[0] == '.' || inner[0] == '"' || inner[0] == '{') {
inner = inner[1:]
}
// Strip suffix `)`, `.`, `"`, `}`
for len(inner) > 0 {
c := inner[len(inner)-1]
if c == ')' || c == '.' || c == '"' || c == '}' || c == ' ' {
inner = inner[:len(inner)-1]
} else {
break
}
}
key := inner
if !caseSens {
key = strings.ToUpper(key)
}
if key != "" && !declared[key] && !seen[key] {
seen[key] = true
warnings = append(warnings,
fmt.Sprintf("result-template marker <%s> not declared in pattern: %q",
inner, pattern))
}
i = end
}
return warnings
}
// parseMarkers extracts all <...> markers from a pattern.
func parseMarkers(pattern string) []Marker {
var markers []Marker

View File

@@ -378,29 +378,37 @@ func (pp *Preprocessor) handleDirective(filename, directive string, depth int, r
return true
}
// #command / #translate — parse and store rules
if strings.HasPrefix(upper, "COMMAND ") {
if rule := ParseRule(directive[8:], true, false); rule != nil {
pp.commands = append(pp.commands, rule)
// #command / #translate — parse and store rules. ParseRule now
// validates that result-template marker references resolve to a
// pattern marker; any unresolved name flows back as a warning
// surfaced via pp.errors with the directive's filename:line so
// the user can find the typo (e.g. case-sensitive `<For>` vs
// `<for>` in an #xcommand). Without surfacing, the broken
// expansion silently produced empty / mangled output at every
// call site.
registerRule := func(r *Rule, store *[]*Rule) {
if r == nil {
return
}
*store = append(*store, r)
for _, w := range r.Warnings {
pp.errors = append(pp.errors, fmt.Sprintf("%s:%d: #command: %s", filename, lineNo, w))
}
}
if strings.HasPrefix(upper, "COMMAND ") {
registerRule(ParseRule(directive[8:], true, false), &pp.commands)
return true
}
if strings.HasPrefix(upper, "TRANSLATE ") {
if rule := ParseRule(directive[10:], false, false); rule != nil {
pp.translates = append(pp.translates, rule)
}
registerRule(ParseRule(directive[10:], false, false), &pp.translates)
return true
}
if strings.HasPrefix(upper, "XCOMMAND ") {
if rule := ParseRule(directive[9:], true, true); rule != nil {
pp.commands = append(pp.commands, rule)
}
registerRule(ParseRule(directive[9:], true, true), &pp.commands)
return true
}
if strings.HasPrefix(upper, "XTRANSLATE ") {
if rule := ParseRule(directive[11:], false, true); rule != nil {
pp.translates = append(pp.translates, rule)
}
registerRule(ParseRule(directive[11:], false, true), &pp.translates)
return true
}
@@ -714,6 +722,16 @@ func looksLikeInlineC(body string) bool {
if strings.HasPrefix(l, "#include <") || strings.HasPrefix(l, `#include "`) {
return true
}
// Function-like #define is C-only — Go uses const / generics.
// `#define FOO(x) ...`
if strings.HasPrefix(l, "#define ") {
// Find the name and check for `(` immediately after with
// no space (function-like macro signature).
rest := strings.TrimSpace(l[8:])
if i := strings.IndexAny(rest, " \t("); i > 0 && i < len(rest) && rest[i] == '(' {
return true
}
}
// Bare `HB_FUNC( NAME )` with an unquoted identifier is the
// Harbour C FFI macro. The Go-side counterpart is
// `hbrt.HB_FUNC("NAME", fn)` — lowercase package prefix and a
@@ -723,12 +741,81 @@ func looksLikeInlineC(body string) bool {
strings.HasPrefix(l, "HB_FUNC_TRANSLATE(") {
return true
}
// `extern "C"` — C / C++ linkage block, never Go.
if strings.HasPrefix(l, `extern "C"`) {
return true
}
// C declarations at line start that have no Go analogue.
if strings.HasPrefix(l, "typedef ") || strings.HasPrefix(l, "struct ") ||
strings.HasPrefix(l, "int main(") || strings.HasPrefix(l, "void main(") {
return true
}
// C return-type declarations: `int name(`, `char *name(`, etc.
// Matching exactly `<C-type> <ident>(` keeps us off Go's
// `func name(` (which starts with `func`, not a type word)
// and Go variable declarations (which use `:=` or `var`).
if isCReturnTypeDecl(l) {
return true
}
// hb_ret*(...) helpers — Harbour's C-side return setters.
// hb_retc / hb_retni / hb_retnl / hb_retd / hb_retl / hb_retptr
if strings.HasPrefix(l, "hb_ret") {
rest := l[6:]
if i := strings.IndexByte(rest, '('); i >= 0 {
name := rest[:i]
if name != "" && allLetters(name) {
return true
}
}
}
}
return false
}
// isCReturnTypeDecl reports whether the line opens a C function
// declaration like `int foo(` / `static char* bar(`. We match a
// short prefix list of C-only types so a Go declaration like
// `func name() int { ... }` doesn't trip this.
func isCReturnTypeDecl(l string) bool {
cTypePrefixes := []string{
"int ", "void ", "char ", "long ", "short ", "double ", "float ",
"unsigned ", "signed ", "size_t ", "ssize_t ", "uint",
"static int ", "static void ", "static char ", "static long ",
}
for _, p := range cTypePrefixes {
if strings.HasPrefix(l, p) {
rest := strings.TrimLeft(l[len(p):], " \t*")
// rest should now start with an identifier followed by `(`.
if i := strings.IndexByte(rest, '('); i > 0 && i < 50 {
name := rest[:i]
if allIdentChars(name) {
return true
}
}
}
}
return false
}
func allLetters(s string) bool {
for _, c := range s {
if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
return false
}
}
return s != ""
}
func allIdentChars(s string) bool {
for i, c := range s {
if c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') {
continue
}
if i > 0 && c >= '0' && c <= '9' {
continue
}
return false
}
return s != ""
}

View File

@@ -154,7 +154,26 @@
/* JOIN merges the current ("master") workarea with the named
detail alias into a fresh DBF, emitting one output row per
master/detail pair where FOR evaluates true. */
master/detail pair where FOR evaluates true.
The ON form takes the equality-key field names directly and
activates a hash-join fast path: build a hash over the detail's
key column once, then probe per master row — O(N+M) total instead
of the FOR form's O(N*M) nested-loop. Use it whenever the
join predicate is a simple `master.k = detail.k` equality. The
FOR form remains available for arbitrary predicates. Order
matters: the ON rule is more specific so it wins.
Note for callers: ON expects bare field names, not expressions.
Five doesn't auto-resolve bare identifiers to fields, but std.ch
passes them as quoted strings via <(mfield)> / <(dfield)> so the
PP captures the field names verbatim — runtime-side __dbJoin
does its own field lookup. */
#command JOIN WITH <(alias)> TO <(f)> [FIELDS <fields,...>] ;
ON <mfield> = <dfield> => ;
__dbJoin( <(alias)>, <(f)>, { <(fields)> }, NIL, ;
<(mfield)>, <(dfield)> )
#command JOIN [WITH <(alias)>] [TO <(f)>] [FIELDS <fields,...>] ;
[FOR <for>] => ;
__dbJoin( <(alias)>, <(f)>, { <(fields)> }, <{for}> )

View File

@@ -786,6 +786,33 @@ func nextTmpAlias(prefix string) string {
return fmt.Sprintf("%s_%d", prefix, n)
}
// dbHashKey turns a workarea field value into a hash-table key
// string. Numeric / Date / Logical / String types are encoded with a
// distinct one-byte tag prefix so values that happen to share the
// same string form across types ("1" vs 1 vs .T.) don't collide.
// NIL is its own bucket — Harbour's `==` says NIL never equals
// anything, but for join semantics we treat NIL keys as a single
// bucket so the user can still JOIN over rows with missing keys
// when both sides are NIL.
func dbHashKey(v hbrt.Value) string {
switch {
case v.IsNil():
return "\x00"
case v.IsNumeric():
return fmt.Sprintf("N\x01%g", v.AsNumDouble())
case v.IsLogical():
if v.AsBool() {
return "L\x01T"
}
return "L\x01F"
case v.IsDate():
return fmt.Sprintf("D\x01%d", v.AsJulian())
case v.IsString():
return "S\x01" + strings.TrimRight(v.AsString(), " ")
}
return "?\x01" + v.AsString()
}
// rtlDbNotImpl raises a runtime error explaining which xBase clause
// the user invoked that Five doesn't yet implement. std.ch routes
// SDF / DELIMITED / TO PRINTER / TO FILE variants here so they fail
@@ -1780,6 +1807,100 @@ func rtlDbJoin(t *hbrt.Thread) {
}
dstArea := wam.AreaAt(dstSel)
// Hash-join fast path. When the caller passes master and detail
// key field names (params 5 + 6), build a hash table over the
// detail in O(M), then scan master in O(N) and probe — total
// O(N+M) instead of the nested-loop's O(N*M). For 1k×1k that's
// 2k vs 1M operations, and the gap widens fast.
masterKeyName := ""
detailKeyName := ""
if nParams >= 5 && t.Local(5).IsString() {
masterKeyName = strings.ToUpper(strings.TrimSpace(t.Local(5).AsString()))
}
if nParams >= 6 && t.Local(6).IsString() {
detailKeyName = strings.ToUpper(strings.TrimSpace(t.Local(6).AsString()))
}
if masterKeyName != "" && detailKeyName != "" {
mkIdx, dkIdx := -1, -1
for i := 0; i < master.FieldCount(); i++ {
if strings.EqualFold(master.GetFieldInfo(i).Name, masterKeyName) {
mkIdx = i
break
}
}
for i := 0; i < detail.FieldCount(); i++ {
if strings.EqualFold(detail.GetFieldInfo(i).Name, detailKeyName) {
dkIdx = i
break
}
}
if mkIdx >= 0 && dkIdx >= 0 {
// Build detail hash: key string → list of cached field rows.
// We capture each detail row's wanted-field VALUES (not just
// rec numbers) so we don't have to re-select the detail area
// per probe — saves the WA-switch round trip and keeps the
// inner loop tight.
type detailRow struct {
vals []hbrt.Value
}
buckets := make(map[string][]detailRow, 1024)
wam.SelectByNum(detailSel)
detail.GoTop()
for !detail.EOF() {
k, _ := detail.GetValue(dkIdx)
key := dbHashKey(k)
row := detailRow{vals: make([]hbrt.Value, 0, len(srcRefs))}
for _, r := range srcRefs {
if r.isMaster {
row.vals = append(row.vals, hbrt.MakeNil()) // master fills later
} else {
v, _ := detail.GetValue(r.idx)
row.vals = append(row.vals, v)
}
}
buckets[key] = append(buckets[key], row)
detail.Skip(1)
}
// Scan master, probe detail buckets.
wam.SelectByNum(masterSel)
master.GoTop()
for !master.EOF() {
mk, _ := master.GetValue(mkIdx)
key := dbHashKey(mk)
rows, hit := buckets[key]
if hit {
// Cache master-side values for this row once.
mvals := make([]hbrt.Value, len(srcRefs))
for k, r := range srcRefs {
if r.isMaster {
v, _ := master.GetValue(r.idx)
mvals[k] = v
}
}
wam.SelectByNum(dstSel)
for _, drow := range rows {
dstArea.Append()
for k, r := range srcRefs {
if r.isMaster {
dstArea.PutValue(k, mvals[k])
} else {
dstArea.PutValue(k, drow.vals[k])
}
}
}
wam.SelectByNum(masterSel)
}
master.Skip(1)
}
goto closeDst
}
// Key names didn't resolve — fall through to nested-loop with
// (likely empty) bFor. User typo is reported as a no-result
// JOIN rather than crash; the destination DBF still gets
// created (matches Harbour: NO ROWS != error).
}
wam.SelectByNum(masterSel)
master.GoTop()
for !master.EOF() {
@@ -1818,6 +1939,8 @@ func rtlDbJoin(t *hbrt.Thread) {
master.Skip(1)
}
closeDst:
wam.SelectByNum(dstSel)
closeErr := wam.Close()
wam.SelectByNum(masterSel)

View File

@@ -31,6 +31,7 @@ TESTS=(
test_unsupported
test_block_comma
test_compound_lhs
test_join_hash
)
work="$(mktemp -d)"

View File

@@ -0,0 +1,50 @@
/* JOIN ... ON hash-join fast path. */
PROCEDURE Main()
LOCAL n
FErase("cust.dbf"); FErase("ord.dbf"); FErase("out.dbf")
/* Customers (master) */
dbCreate("cust.dbf", { {"CID","N",4,0}, {"CNAME","C",12,0} })
USE cust.dbf NEW EXCLUSIVE ALIAS cu
dbAppend(); FieldPut(1, 1); FieldPut(2, "Alice")
dbAppend(); FieldPut(1, 2); FieldPut(2, "Bob")
dbAppend(); FieldPut(1, 3); FieldPut(2, "Carol")
dbCommit()
dbCloseArea()
/* Orders (detail) */
dbCreate("ord.dbf", { {"OID","N",4,0}, {"CID","N",4,0}, {"AMT","N",8,2} })
USE ord.dbf NEW EXCLUSIVE ALIAS od
dbAppend(); FieldPut(1, 100); FieldPut(2, 1); FieldPut(3, 50)
dbAppend(); FieldPut(1, 101); FieldPut(2, 1); FieldPut(3, 30)
dbAppend(); FieldPut(1, 102); FieldPut(2, 2); FieldPut(3, 200)
dbAppend(); FieldPut(1, 103); FieldPut(2, 4); FieldPut(3, 99) /* no-match */
dbCommit()
dbCloseArea()
USE cust.dbf NEW EXCLUSIVE ALIAS cu
USE ord.dbf NEW EXCLUSIVE ALIAS od
SELECT cu
/* Hash-join form */
JOIN WITH od TO out.dbf FIELDS cid, cname, oid, amt ON cid = cid
SELECT cu ; dbCloseArea()
SELECT od ; dbCloseArea()
USE out.dbf NEW EXCLUSIVE
COUNT TO n
? "Hash-joined rows =", n, "(expect 3)"
dbGoTop()
DO WHILE !Eof()
? " ", FieldGet(1), AllTrim(FieldGet(2)), FieldGet(3), FieldGet(4)
dbSkip()
ENDDO
? "(expect three rows: Alice/100/50, Alice/101/30, Bob/102/200)"
dbCloseArea()
FErase("cust.dbf"); FErase("ord.dbf"); FErase("out.dbf")
? "DONE"
RETURN