Files
five/hbrtl/regex.go
CharlesKWON d5e15272d2 feat(charset): UTF-8 default string semantics with selectable charset
Five strings now operate in Unicode rune units by default. Core string
functions (LEN/CHR/ASC/SUBSTR/LEFT/RIGHT/AT/PADR/PADL) are charset-aware:
UTF-8 rune semantics by default, byte/charset semantics when a legacy
charset (CP949, CP1252, ...) is selected. Initial charset is settable via
FIVE_CHARSET / HB_CODEPAGE env vars; default UTF8.

- hbrtl/charset.go: charset state + Str* helpers + DecodeToUTF8/EncodeFromUTF8
  + RTL HB_GETCHARSET/HB_SETCHARSET/HB_CDPSELECT/HB_TRANSLATE (x/text htmlindex)
- compiler/gengo: inlined string intrinsics now call charset-aware hbrtl.Str*
  helpers instead of byte-based Go (they previously bypassed the RTL registry)
- compiler/analyzer: register HB_GETCHARSET/HB_SETCHARSET/HB_TRANSLATE as known
- hbrtl/regex.go: add HB_REGEX (array-of-submatches)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-15 12:42:33 +09:00

170 lines
3.9 KiB
Go

// Copyright (c) 2026 Charles KWON OhJun (charleskwonohjun@gmail.com)
// All rights reserved.
// Regular expression functions using Go's regexp package.
// HB_REGEXCOMP, HB_REGEXMATCH, HB_REGEXSPLIT, HB_REGEXALL, HB_REGEX, HB_REGEXREPLACE
package hbrtl
import (
"five/hbrt"
"regexp"
)
// HbRegexComp implements HB_REGEXCOMP(cPattern [, lCaseSensitive]) → pRegex.
//
// The pattern is compiled with Go's regexp package and handed back as a
// pointer value wrapping the *regexp.Regexp. Matching is case-sensitive
// unless a second argument is supplied that is non-nil and false, in which
// case the pattern is prefixed with "(?i)". Nil is returned when the
// pattern does not compile.
func HbRegexComp(t *hbrt.Thread) {
	argc := t.ParamCount()
	t.Frame(argc, 0)
	defer t.EndProc()

	expr := t.Local(1).AsString()
	if argc >= 2 {
		// Only an explicit .F. switches to case-insensitive matching;
		// a missing or nil flag keeps the case-sensitive default.
		if flag := t.Local(2); !flag.IsNil() && !flag.AsBool() {
			expr = "(?i)" + expr
		}
	}

	compiled, err := regexp.Compile(expr)
	if err != nil {
		t.RetNil()
		return
	}
	t.RetPointer(compiled)
}
// HbRegexMatch implements
// HB_REGEXMATCH(cPattern|pRegex, cString [, lCaseSensitive]) → lMatch.
//
// The first argument is resolved through getRegex (either a compiled regex
// pointer or a pattern string). The result is true when the regex matches
// anywhere in cString, and false when it does not match or when the
// pattern could not be resolved.
func HbRegexMatch(t *hbrt.Thread) {
	argc := t.ParamCount()
	t.Frame(argc, 0)
	defer t.EndProc()

	matched := false
	if rx := getRegex(t, 1, argc >= 3); rx != nil {
		matched = rx.MatchString(t.Local(2).AsString())
	}
	t.RetBool(matched)
}
// HbRegexSplit implements
// HB_REGEXSPLIT(cPattern|pRegex, cString [, lCaseSensitive]) → aResult.
//
// cString is cut at every match of the regex (no limit on the number of
// pieces) and the pieces are returned as an array of strings. Nil is
// returned when the pattern could not be resolved.
func HbRegexSplit(t *hbrt.Thread) {
	argc := t.ParamCount()
	t.Frame(argc, 0)
	defer t.EndProc()

	rx := getRegex(t, 1, argc >= 3)
	if rx == nil {
		t.RetNil()
		return
	}

	fields := rx.Split(t.Local(2).AsString(), -1)
	out := make([]hbrt.Value, 0, len(fields))
	for _, field := range fields {
		out = append(out, hbrt.MakeString(field))
	}
	t.RetVal(hbrt.MakeArrayFrom(out))
}
// HbRegexAll implements
// HB_REGEXALL(cPattern|pRegex, cString [, lCaseSensitive]) → aMatches.
//
// Every non-overlapping match of the regex in cString is collected, in
// order, into an array of strings (whole-match text only, no capture
// groups). Nil is returned when the pattern could not be resolved.
func HbRegexAll(t *hbrt.Thread) {
	argc := t.ParamCount()
	t.Frame(argc, 0)
	defer t.EndProc()

	rx := getRegex(t, 1, argc >= 3)
	if rx == nil {
		t.RetNil()
		return
	}

	found := rx.FindAllString(t.Local(2).AsString(), -1)
	result := make([]hbrt.Value, len(found))
	for idx := range found {
		result[idx] = hbrt.MakeString(found[idx])
	}
	t.RetVal(hbrt.MakeArrayFrom(result))
}
// HbRegex implements
// HB_REGEX(cPattern|pRegex, cString [, lCaseSensitive]) → aSubmatches.
//
// It returns the first match together with its capture groups, exactly as
// produced by Go's regexp.FindStringSubmatch: in the resulting PRG array,
// element 1 is the whole match and elements 2.. are the capture groups.
//
// When there is no match, or the pattern cannot be resolved, the result is
// the array built from a nil slice (an empty PRG array) rather than Nil, so
// callers can tell "no match" apart from a match whose groups are empty.
//
// Harbour parity: this is the array-of-submatches `hb_regex(p, s)`
// flavour. The Five-original capture-only callers should use this.
func HbRegex(t *hbrt.Thread) {
	argc := t.ParamCount()
	t.Frame(argc, 0)
	defer t.EndProc()

	// groups stays nil both when the regex is unusable and when the
	// subject does not match; either way the caller gets an empty array.
	var groups []string
	if rx := getRegex(t, 1, argc >= 3); rx != nil {
		groups = rx.FindStringSubmatch(t.Local(2).AsString())
	}
	if groups == nil {
		t.RetVal(hbrt.MakeArrayFrom(nil))
		return
	}

	out := make([]hbrt.Value, 0, len(groups))
	for _, g := range groups {
		out = append(out, hbrt.MakeString(g))
	}
	t.RetVal(hbrt.MakeArrayFrom(out))
}
// HbRegexReplace implements
// HB_REGEXREPLACE(cPattern|pRegex, cString, cReplace [, lCaseSensitive]) → cResult.
//
// Every match of the regex in cString is replaced with cReplace, using
// Go's regexp.ReplaceAllString (so "$1" / "${name}" in cReplace expand to
// capture groups). An empty string is returned when the pattern cannot be
// compiled.
//
// The first argument may be a compiled regex pointer, in which case it is
// used as-is and lCaseSensitive is ignored. Otherwise it is compiled as a
// pattern string; matching is case-sensitive unless the fourth argument is
// supplied, non-nil, and false.
//
// The regex is resolved here rather than through getRegex because that
// helper reads the case flag from a fixed offset of two slots after the
// pattern (slot 3), which for this function is cReplace, not the flag.
func HbRegexReplace(t *hbrt.Thread) {
	nParams := t.ParamCount()
	t.Frame(nParams, 0)
	defer t.EndProc()

	var re *regexp.Regexp
	patArg := t.Local(1)
	if patArg.IsPointer() {
		if compiled, ok := patArg.AsPointer().(*regexp.Regexp); ok {
			re = compiled
		}
	}
	if re == nil {
		pattern := patArg.AsString()
		if nParams >= 4 {
			// The case flag is the FOURTH argument here; slot 3 is cReplace.
			if flag := t.Local(4); !flag.IsNil() && !flag.AsBool() {
				pattern = "(?i)" + pattern
			}
		}
		compiled, err := regexp.Compile(pattern)
		if err != nil {
			t.RetString("")
			return
		}
		re = compiled
	}

	str := t.Local(2).AsString()
	repl := t.Local(3).AsString()
	t.RetString(re.ReplaceAllString(str, repl))
}
// getRegex resolves the regex argument held in local slot paramIdx.
//
// If the slot holds a pointer wrapping a *regexp.Regexp, that compiled
// regex is returned unchanged and no case flag is consulted. Otherwise the
// slot is read as a pattern string and compiled on the fly. When
// hasCaseParam is true the case flag is read from slot paramIdx+2; a
// non-nil false value there makes the pattern case-insensitive by
// prefixing "(?i)".
//
// NOTE(review): the flag offset is fixed at paramIdx+2, i.e. the third
// argument when paramIdx is 1. A caller whose case flag lives in another
// slot (for example a fourth argument following a replacement string)
// will have the wrong slot read as the flag if it passes hasCaseParam.
//
// Returns nil when the pattern fails to compile.
func getRegex(t *hbrt.Thread, paramIdx int, hasCaseParam bool) *regexp.Regexp {
	arg := t.Local(paramIdx)
	if arg.IsPointer() {
		if compiled, ok := arg.AsPointer().(*regexp.Regexp); ok {
			return compiled
		}
	}

	// Not a compiled regex: treat the argument as pattern text.
	expr := arg.AsString()
	if hasCaseParam {
		if flag := t.Local(paramIdx + 2); !flag.IsNil() && !flag.AsBool() {
			expr = "(?i)" + expr
		}
	}

	if compiled, err := regexp.Compile(expr); err == nil {
		return compiled
	}
	return nil
}