Five strings now operate in Unicode rune units by default. Core string functions (LEN/CHR/ASC/SUBSTR/LEFT/RIGHT/AT/PADR/PADL) are charset-aware: UTF-8 rune semantics by default, byte/charset semantics when a legacy charset (CP949, CP1252, ...) is selected. Initial charset is settable via FIVE_CHARSET / HB_CODEPAGE env vars; default UTF8. - hbrtl/charset.go: charset state + Str* helpers + DecodeToUTF8/EncodeFromUTF8 + RTL HB_GETCHARSET/HB_SETCHARSET/HB_CDPSELECT/HB_TRANSLATE (x/text htmlindex) - compiler/gengo: inlined string intrinsics now call charset-aware hbrtl.Str* helpers instead of byte-based Go (they previously bypassed the RTL registry) - compiler/analyzer: register HB_GETCHARSET/HB_SETCHARSET/HB_TRANSLATE as known - hbrtl/regex.go: add HB_REGEX (array-of-submatches) Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
170 lines
3.9 KiB
Go
170 lines
3.9 KiB
Go
// Copyright (c) 2026 Charles KWON OhJun (charleskwonohjun@gmail.com)
|
|
// All rights reserved.
|
|
|
|
// Regular expression functions using Go's regexp package.
|
|
// HB_REGEXCOMP, HB_REGEXMATCH, HB_REGEXSPLIT, HB_REGEXALL, HB_REGEX, HB_REGEXREPLACE
|
|
|
|
package hbrtl
|
|
|
|
import (
|
|
"five/hbrt"
|
|
"regexp"
|
|
)
|
|
|
|
// HB_REGEXCOMP(cPattern [, lCaseSensitive]) → pRegex
|
|
// Returns compiled regex as a pointer value.
|
|
func HbRegexComp(t *hbrt.Thread) {
|
|
nParams := t.ParamCount()
|
|
t.Frame(nParams, 0)
|
|
defer t.EndProc()
|
|
|
|
pattern := t.Local(1).AsString()
|
|
|
|
// Default case-sensitive; if 2nd param is .F., add (?i) prefix
|
|
if nParams >= 2 && !t.Local(2).IsNil() && !t.Local(2).AsBool() {
|
|
pattern = "(?i)" + pattern
|
|
}
|
|
|
|
re, err := regexp.Compile(pattern)
|
|
if err != nil {
|
|
t.RetNil()
|
|
return
|
|
}
|
|
t.RetPointer(re)
|
|
}
|
|
|
|
// HB_REGEXMATCH(cPattern|pRegex, cString [, lCaseSensitive]) → lMatch
|
|
func HbRegexMatch(t *hbrt.Thread) {
|
|
nParams := t.ParamCount()
|
|
t.Frame(nParams, 0)
|
|
defer t.EndProc()
|
|
|
|
re := getRegex(t, 1, nParams >= 3)
|
|
if re == nil {
|
|
t.RetBool(false)
|
|
return
|
|
}
|
|
|
|
str := t.Local(2).AsString()
|
|
t.RetBool(re.MatchString(str))
|
|
}
|
|
|
|
// HB_REGEXSPLIT(cPattern|pRegex, cString [, lCaseSensitive]) → aResult
|
|
func HbRegexSplit(t *hbrt.Thread) {
|
|
nParams := t.ParamCount()
|
|
t.Frame(nParams, 0)
|
|
defer t.EndProc()
|
|
|
|
re := getRegex(t, 1, nParams >= 3)
|
|
if re == nil {
|
|
t.RetNil()
|
|
return
|
|
}
|
|
|
|
str := t.Local(2).AsString()
|
|
parts := re.Split(str, -1)
|
|
|
|
items := make([]hbrt.Value, len(parts))
|
|
for i, p := range parts {
|
|
items[i] = hbrt.MakeString(p)
|
|
}
|
|
t.RetVal(hbrt.MakeArrayFrom(items))
|
|
}
|
|
|
|
// HB_REGEXALL(cPattern|pRegex, cString [, lCaseSensitive]) → aMatches
|
|
func HbRegexAll(t *hbrt.Thread) {
|
|
nParams := t.ParamCount()
|
|
t.Frame(nParams, 0)
|
|
defer t.EndProc()
|
|
|
|
re := getRegex(t, 1, nParams >= 3)
|
|
if re == nil {
|
|
t.RetNil()
|
|
return
|
|
}
|
|
|
|
str := t.Local(2).AsString()
|
|
matches := re.FindAllString(str, -1)
|
|
|
|
items := make([]hbrt.Value, len(matches))
|
|
for i, m := range matches {
|
|
items[i] = hbrt.MakeString(m)
|
|
}
|
|
t.RetVal(hbrt.MakeArrayFrom(items))
|
|
}
|
|
|
|
// HB_REGEX(cPattern|pRegex, cString [, lCaseSensitive]) → aSubmatches
|
|
//
|
|
// Returns the first match plus any capture groups. Maps directly to
|
|
// Go's regexp.FindStringSubmatch: result[1] is the whole match,
|
|
// result[2..] are the capture groups. Empty PRG array on no match —
|
|
// distinguishable from `Nil` by callers that need the "anchor present
|
|
// but groups missing" case.
|
|
//
|
|
// Harbour parity: this is the array-of-submatches `hb_regex(p, s)`
|
|
// flavour. The Five-original capture-only callers should use this.
|
|
func HbRegex(t *hbrt.Thread) {
|
|
nParams := t.ParamCount()
|
|
t.Frame(nParams, 0)
|
|
defer t.EndProc()
|
|
|
|
re := getRegex(t, 1, nParams >= 3)
|
|
if re == nil {
|
|
t.RetVal(hbrt.MakeArrayFrom(nil))
|
|
return
|
|
}
|
|
|
|
str := t.Local(2).AsString()
|
|
m := re.FindStringSubmatch(str)
|
|
if m == nil {
|
|
t.RetVal(hbrt.MakeArrayFrom(nil))
|
|
return
|
|
}
|
|
|
|
items := make([]hbrt.Value, len(m))
|
|
for i, s := range m {
|
|
items[i] = hbrt.MakeString(s)
|
|
}
|
|
t.RetVal(hbrt.MakeArrayFrom(items))
|
|
}
|
|
|
|
// HB_REGEXREPLACE(cPattern|pRegex, cString, cReplace [, lCaseSensitive]) → cResult
|
|
func HbRegexReplace(t *hbrt.Thread) {
|
|
nParams := t.ParamCount()
|
|
t.Frame(nParams, 0)
|
|
defer t.EndProc()
|
|
|
|
re := getRegex(t, 1, nParams >= 4)
|
|
if re == nil {
|
|
t.RetString("")
|
|
return
|
|
}
|
|
|
|
str := t.Local(2).AsString()
|
|
repl := t.Local(3).AsString()
|
|
t.RetString(re.ReplaceAllString(str, repl))
|
|
}
|
|
|
|
// getRegex extracts or compiles a regex from param at given index.
|
|
func getRegex(t *hbrt.Thread, paramIdx int, hasCaseParam bool) *regexp.Regexp {
|
|
v := t.Local(paramIdx)
|
|
if v.IsPointer() {
|
|
if re, ok := v.AsPointer().(*regexp.Regexp); ok {
|
|
return re
|
|
}
|
|
}
|
|
// String pattern — compile on the fly
|
|
pattern := v.AsString()
|
|
if hasCaseParam {
|
|
caseParam := t.Local(paramIdx + 2) // 3rd or 4th param
|
|
if !caseParam.IsNil() && !caseParam.AsBool() {
|
|
pattern = "(?i)" + pattern
|
|
}
|
|
}
|
|
re, err := regexp.Compile(pattern)
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
return re
|
|
}
|