feat(charset): UTF-8 default string semantics with selectable charset

Five strings now operate in Unicode rune units by default. Core string
functions (LEN/CHR/ASC/SUBSTR/LEFT/RIGHT/AT/PADR/PADL) are charset-aware:
UTF-8 rune semantics by default, byte/charset semantics when a legacy
charset (CP949, CP1252, ...) is selected. Initial charset is settable via
FIVE_CHARSET / HB_CODEPAGE env vars; default UTF8.

- hbrtl/charset.go: charset state + Str* helpers + DecodeToUTF8/EncodeFromUTF8
  + RTL HB_GETCHARSET/HB_SETCHARSET/HB_CDPSELECT/HB_TRANSLATE (x/text htmlindex)
- compiler/gengo: inlined string intrinsics now call charset-aware hbrtl.Str*
  helpers instead of byte-based Go (they previously bypassed the RTL registry)
- compiler/analyzer: register HB_GETCHARSET/HB_SETCHARSET/HB_TRANSLATE as known
- hbrtl/regex.go: add HB_REGEX (array-of-submatches)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
CharlesKWON
2026-06-15 12:42:33 +09:00
parent a8f6e53785
commit d5e15272d2
8 changed files with 476 additions and 44 deletions

View File

@@ -629,6 +629,8 @@ var rtlFunctions = map[string]bool{
"HB_DATETIME": true, "HB_HOUR": true, "HB_MINUTE": true, "HB_SEC": true,
"HB_TTOC": true, "HB_CTOT": true, "HB_SECOND": true, "HB_ATOKENS": true,
"HB_CDPSELECT": true, "HB_TTOS": true, "HB_STOT": true, "HB_MILLISECONDS": true,
// Charset (charset.go)
"HB_GETCHARSET": true, "HB_SETCHARSET": true, "HB_TRANSLATE": true,
"HB_DATE": true, "HB_CTOD": true, "HB_DTOC": true, "HB_STOD": true,
"HB_DTOT": true, "HB_TTOD": true, "HB_TTOHOUR": true, "HB_TTOMIN": true,
"HB_TTOSEC": true, "HB_TTOMSEC": true, "HB_TTON": true, "HB_NTOT": true,

View File

@@ -1396,7 +1396,7 @@ func (g *Generator) tryEmitInlineRTL(name string, args []ast.Expr) bool {
case "LEN":
if len(args) == 1 {
g.emitExpr(args[0])
g.writeln("{ _v := t.Pop2(); if _v.IsString() { t.PushInt(len(_v.AsString())) } else if _v.IsArray() { t.PushInt(len(_v.AsArray().Items)) } else if _v.IsHash() { t.PushInt(len(_v.AsHash().Keys)) } else { t.PushInt(0) } }")
g.writeln("{ _v := t.Pop2(); if _v.IsString() { t.PushInt(hbrtl.StrLen(_v.AsString())) } else if _v.IsArray() { t.PushInt(len(_v.AsArray().Items)) } else if _v.IsHash() { t.PushInt(len(_v.AsHash().Keys)) } else { t.PushInt(0) } }")
return true
}
case "EMPTY":
@@ -1409,13 +1409,13 @@ func (g *Generator) tryEmitInlineRTL(name string, args []ast.Expr) bool {
case "CHR":
if len(args) == 1 {
g.emitExpr(args[0])
g.writeln("t.PushString(string(byte(t.Pop2().AsNumInt())))")
g.writeln("t.PushString(hbrtl.StrChr(int(t.Pop2().AsNumInt())))")
return true
}
case "ASC":
if len(args) == 1 {
g.emitExpr(args[0])
g.writeln("{ _s := t.Pop2().AsString(); if len(_s)>0 { t.PushInt(int(_s[0])) } else { t.PushInt(0) } }")
g.writeln("{ _s := t.Pop2().AsString(); t.PushInt(hbrtl.StrAsc(_s)) }")
return true
}
case "EOF":
@@ -1496,8 +1496,7 @@ func (g *Generator) tryEmitInlineRTL(name string, args []ast.Expr) bool {
g.emitExpr(args[0])
g.emitExpr(args[1])
g.writeln("{ _pn := int(t.Pop2().AsNumInt()); _ps := t.Pop2().AsString()")
g.writeln("if len(_ps) >= _pn { t.PushString(_ps[:_pn])")
g.writeln("} else { t.PushString(_ps + hbrtl.Spaces(_pn - len(_ps))) } }")
g.writeln("t.PushString(hbrtl.StrPadR(_ps, _pn)) }")
return true
}
case "PADL":
@@ -1507,13 +1506,10 @@ func (g *Generator) tryEmitInlineRTL(name string, args []ast.Expr) bool {
if len(args) == 3 {
g.emitExpr(args[2])
g.writeln("{ _pf := t.Pop2().AsString(); _pn := int(t.Pop2().AsNumInt()); _ps := t.Pop2().AsString()")
g.writeln("if len(_ps) >= _pn { t.PushString(_ps[len(_ps)-_pn:])")
g.writeln("} else { t.PushString(strings.Repeat(_pf[:1], _pn-len(_ps)) + _ps) } }")
g.imports["strings"] = true
g.writeln("t.PushString(hbrtl.StrPadL(_ps, _pn, _pf)) }")
} else {
g.writeln("{ _pn := int(t.Pop2().AsNumInt()); _ps := t.Pop2().AsString()")
g.writeln("if len(_ps) >= _pn { t.PushString(_ps[len(_ps)-_pn:])")
g.writeln("} else { t.PushString(hbrtl.Spaces(_pn - len(_ps)) + _ps) } }")
g.writeln("t.PushString(hbrtl.StrPadL(_ps, _pn, \" \")) }")
}
return true
}
@@ -1523,13 +1519,12 @@ func (g *Generator) tryEmitInlineRTL(name string, args []ast.Expr) bool {
g.emitExpr(args[1])
if len(args) == 3 {
g.emitExpr(args[2])
g.writeln("{ _sl := int(t.Pop2().AsNumInt()); _sp := int(t.Pop2().AsNumInt())-1; _ss := t.Pop2().AsString()")
g.writeln("{ _sl := int(t.Pop2().AsNumInt()); _sp := int(t.Pop2().AsNumInt()); _ss := t.Pop2().AsString()")
g.writeln("t.PushString(hbrtl.StrSubStr(_ss, _sp, _sl, true)) }")
} else {
g.writeln("{ _sl := 0; _sp := int(t.Pop2().AsNumInt())-1; _ss := t.Pop2().AsString(); _sl = len(_ss) - _sp")
g.writeln("{ _sp := int(t.Pop2().AsNumInt()); _ss := t.Pop2().AsString()")
g.writeln("t.PushString(hbrtl.StrSubStr(_ss, _sp, 0, false)) }")
}
g.writeln("if _sp < 0 { _sp = 0 }; if _sp > len(_ss) { _sp = len(_ss) }")
g.writeln("if _sp+_sl > len(_ss) { _sl = len(_ss) - _sp }")
g.writeln("t.PushString(_ss[_sp:_sp+_sl]) }")
return true
}
case "LEFT":
@@ -1537,7 +1532,7 @@ func (g *Generator) tryEmitInlineRTL(name string, args []ast.Expr) bool {
g.emitExpr(args[0])
g.emitExpr(args[1])
g.writeln("{ _ln := int(t.Pop2().AsNumInt()); _ls := t.Pop2().AsString()")
g.writeln("if _ln >= len(_ls) { t.PushString(_ls) } else if _ln <= 0 { t.PushString(\"\") } else { t.PushString(_ls[:_ln]) } }")
g.writeln("t.PushString(hbrtl.StrLeft(_ls, _ln)) }")
return true
}
case "RIGHT":
@@ -1545,7 +1540,7 @@ func (g *Generator) tryEmitInlineRTL(name string, args []ast.Expr) bool {
g.emitExpr(args[0])
g.emitExpr(args[1])
g.writeln("{ _rn := int(t.Pop2().AsNumInt()); _rs := t.Pop2().AsString()")
g.writeln("if _rn >= len(_rs) { t.PushString(_rs) } else if _rn <= 0 { t.PushString(\"\") } else { t.PushString(_rs[len(_rs)-_rn:]) } }")
g.writeln("t.PushString(hbrtl.StrRight(_rs, _rn)) }")
return true
}
case "AT":
@@ -1553,9 +1548,7 @@ func (g *Generator) tryEmitInlineRTL(name string, args []ast.Expr) bool {
g.emitExpr(args[0])
g.emitExpr(args[1])
g.writeln("{ _as := t.Pop2().AsString(); _ak := t.Pop2().AsString()")
g.writeln("_ai := strings.Index(_as, _ak)")
g.writeln("if _ai >= 0 { t.PushInt(_ai+1) } else { t.PushInt(0) } }")
g.imports["strings"] = true
g.writeln("t.PushInt(hbrtl.StrAt(_ak, _as)) }")
return true
}
case "IIF":

319
hbrtl/charset.go Normal file
View File

@@ -0,0 +1,319 @@
// charset.go — Five 문자셋(charset/codepage) 서브시스템.
//
// 설계
// - Five 의 문자열 기본 인코딩은 UTF-8 이다(Go 네이티브). 코어 문자열
// 함수(CHR/ASC/LEN/SUBSTR/LEFT/RIGHT/AT)는 활성 charset 이 UTF-8 이면
// '문자(rune)' 단위로 동작한다.
// - 활성 charset 을 지정하면(예: CP949, CP1252) 그 charset 의 의미로
// 동작하고, 입출력 경계 변환은 HB_TRANSLATE / 디코드·인코드 헬퍼로 한다.
// - 정의하지 않으면 UTF-8. 초기값은 환경변수 FIVE_CHARSET(또는 HB_CODEPAGE)
// 로 지정할 수 있다.
//
// PRG surface
// HB_GETCHARSET() → cCurrent (예: "UTF8")
// HB_SETCHARSET([cName]) → cPrev (인자 없으면 조회만)
// HB_CDPSELECT([cName]) → cPrev (Harbour 호환 별칭)
// HB_TRANSLATE(cStr, cFrom, cTo) → cConverted (charset 간 변환)
package hbrtl
import (
"os"
"strings"
"sync"
"unicode/utf8"
"five/hbrt"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/htmlindex"
)
// Active-charset state. csMu is held by GetCharset and SetCharset while they
// read or write csName; csName is the active charset name and starts as "UTF8".
var (
csMu sync.RWMutex
csName = "UTF8"
)
// init seeds the active charset from the environment. FIVE_CHARSET is
// consulted first and HB_CODEPAGE only when FIVE_CHARSET is empty; when both
// are empty csName keeps its declared value.
func init() {
	name := os.Getenv("FIVE_CHARSET")
	if name == "" {
		name = os.Getenv("HB_CODEPAGE")
	}
	if name == "" {
		return
	}
	csName = normCharset(name)
}
// normCharset canonicalises a charset name: surrounding whitespace is trimmed
// and the rest is upper-cased. The spellings "UTF-8" and "UTF8" and the empty
// string all collapse to "UTF8" (e.g. "utf-8" → "UTF8"); every other name is
// returned as-is after trimming and upper-casing, hyphens included.
func normCharset(s string) string {
	upper := strings.ToUpper(strings.TrimSpace(s))
	if upper == "" || upper == "UTF8" || upper == "UTF-8" {
		return "UTF8"
	}
	return upper
}
// charsetIsUTF8 — 활성 charset 이 UTF-8 인가(기본값).
func charsetIsUTF8() bool { return csIsUTF8(GetCharset()) }
// csIsUTF8 reports whether name, once passed through normCharset, denotes
// UTF-8 (the normalised value is "UTF8", "UTF-8" or empty).
func csIsUTF8(name string) bool {
	canon := normCharset(name)
	return canon == "UTF8" || canon == "UTF-8" || canon == ""
}
// GetCharset returns the name of the active charset, read under csMu's read
// lock.
func GetCharset() string {
	csMu.RLock()
	current := csName
	csMu.RUnlock()
	return current
}

// SetCharset makes name the active charset and returns the previously active
// name. A blank name (empty or whitespace only) leaves the setting untouched,
// so SetCharset("") acts as a plain query. The stored value is the
// normCharset form of name.
func SetCharset(name string) string {
	blank := strings.TrimSpace(name) == ""

	csMu.Lock()
	defer csMu.Unlock()

	previous := csName
	if !blank {
		csName = normCharset(name)
	}
	return previous
}
// encodingFor resolves a charset name to an x/text encoding. It returns nil
// both for UTF-8 (identity, nothing to convert) and for names that htmlindex
// does not recognise.
//
// A handful of legacy spellings are mapped to labels htmlindex accepts
// (e.g. CP949 → "euc-kr", CP1252/LATIN1 → "windows-1252"); any other name is
// looked up by its trimmed, lower-cased form.
func encodingFor(name string) encoding.Encoding {
	if csIsUTF8(name) {
		return nil // identity
	}

	var label string
	switch normCharset(name) {
	case "CP949", "MS949", "EUCKR", "EUC-KR", "KSC5601":
		label = "euc-kr"
	case "CP1252", "WINDOWS1252", "WINDOWS-1252", "LATIN1", "ISO8859-1", "ISO-8859-1":
		label = "windows-1252"
	case "CP932", "SHIFTJIS", "SJIS", "SHIFT-JIS":
		label = "shift_jis"
	case "CP936", "GBK", "GB2312":
		label = "gbk"
	case "CP950", "BIG5":
		label = "big5"
	default:
		// htmlindex understands standard labels such as "euc-kr",
		// "windows-1252" and "shift_jis" directly.
		label = strings.ToLower(strings.TrimSpace(name))
	}

	enc, err := htmlindex.Get(label)
	if err != nil {
		return nil
	}
	return enc
}
// DecodeToUTF8 decodes bytes in fromCharset into the internal UTF-8 string
// form. When encodingFor yields no encoding for fromCharset, or the decoder
// reports an error, the bytes are returned unchanged as a string.
func DecodeToUTF8(b []byte, fromCharset string) string {
	if enc := encodingFor(fromCharset); enc != nil {
		decoded, err := enc.NewDecoder().Bytes(b)
		if err == nil {
			return string(decoded)
		}
	}
	return string(b)
}
// EncodeFromUTF8 encodes the internal UTF-8 string s into toCharset bytes.
//
// When encodingFor yields no encoding for toCharset the UTF-8 bytes are
// returned as-is. Runes that the target charset cannot represent are replaced
// by the encoder's own substitution character (encoding.ReplaceUnsupported)
// instead of aborting the conversion: without that wrapper a single
// unencodable rune made the whole call fail and the caller silently received
// raw UTF-8 bytes labelled as toCharset. Any remaining encoder error still
// falls back to the unconverted bytes, keeping the function best-effort.
func EncodeFromUTF8(s, toCharset string) []byte {
	enc := encodingFor(toCharset)
	if enc == nil {
		return []byte(s)
	}
	encoder := encoding.ReplaceUnsupported(enc.NewEncoder())
	out, err := encoder.Bytes([]byte(s))
	if err != nil {
		return []byte(s)
	}
	return out
}
// ── charset-aware core string helpers ────────────────────────────────────
// The compiler (gengo) calls these helpers when it expands LEN/CHR/ASC/
// SUBSTR/LEFT/RIGHT/AT inline. With the active charset set to UTF-8 (the
// default) they work in rune (character) units, otherwise in byte units.

// StrLen implements LEN(cString): the rune count when the active charset is
// UTF-8, the byte count otherwise.
func StrLen(s string) int {
	if !charsetIsUTF8() {
		return len(s)
	}
	return utf8.RuneCountInString(s)
}
// StrChr implements CHR(nCode). In UTF-8 mode n is taken as a code point
// (negative values are treated as 0) and returned UTF-8 encoded; otherwise
// the result is the single byte byte(n).
func StrChr(n int) string {
	if !charsetIsUTF8() {
		return string([]byte{byte(n)})
	}
	code := n
	if code < 0 {
		code = 0
	}
	return string(rune(code))
}
// StrAsc implements ASC(cString): the code of the first character. An empty
// string yields 0. In UTF-8 mode the result is the first decoded rune,
// otherwise the first byte.
func StrAsc(s string) int {
	switch {
	case s == "":
		return 0
	case charsetIsUTF8():
		first, _ := utf8.DecodeRuneInString(s)
		return int(first)
	default:
		return int(s[0])
	}
}
// StrSubStr implements SUBSTR(cString, nStart [, nLen]).
//
// start is 1-based. A negative start counts back from the end of the string
// (-1 is the last character), matching the RTL SubStr implementation; 0 and
// out-of-range negative values select the first character. When hasLen is
// false the result runs to the end of the string and length is ignored;
// otherwise at most length characters are returned (length <= 0 yields "").
//
// Units are runes when the active charset is UTF-8 and bytes otherwise. In
// UTF-8 mode the result is sliced out of s by byte offset, so the bytes of
// the selected characters are returned exactly as stored (invalid sequences
// are not rewritten) and no intermediate rune slice is allocated.
func StrSubStr(s string, start, length int, hasLen bool) string {
	if !charsetIsUTF8() {
		lo, hi := strSubStrSpan(start, length, len(s), hasLen)
		return s[lo:hi]
	}

	lo, hi := strSubStrSpan(start, length, utf8.RuneCountInString(s), hasLen)
	if lo == hi {
		return ""
	}

	// Translate the rune indexes [lo, hi) into byte offsets. An index equal
	// to the rune count is never visited by the loop and maps to len(s).
	from, to := len(s), len(s)
	pos := 0
	for off := range s {
		if pos == lo {
			from = off
		}
		if pos == hi {
			to = off
			break
		}
		pos++
	}
	return s[from:to]
}

// strSubStrSpan converts SUBSTR's 1-based start and optional length into a
// half-open 0-based range [lo, hi) clamped to a string of n units. It never
// adds start and length together, so extreme lengths cannot overflow.
func strSubStrSpan(start, length, n int, hasLen bool) (lo, hi int) {
	if start < 0 {
		start += n + 1 // negative start counts back from the end
	}
	if start < 1 {
		start = 1
	}
	lo = start - 1
	if lo >= n {
		return n, n
	}
	hi = n
	if hasLen {
		if length <= 0 {
			return lo, lo
		}
		if length < n-lo {
			hi = lo + length
		}
	}
	return lo, hi
}
// StrLeft implements LEFT(cString, nLen): the first n characters of s.
//
// n <= 0 yields ""; an n at or beyond the length yields s itself. Units are
// runes when the active charset is UTF-8 and bytes otherwise. In UTF-8 mode
// the prefix is cut at a byte offset rather than via a []rune round trip, so
// the returned bytes are exactly those stored in s (invalid sequences are
// not rewritten to U+FFFD) and nothing is allocated.
func StrLeft(s string, n int) string {
	if n <= 0 {
		return ""
	}
	if n >= len(s) {
		// Valid in both modes: a string never has more runes than bytes.
		return s
	}
	if !charsetIsUTF8() {
		return s[:n]
	}
	seen := 0
	for off := range s {
		if seen == n {
			return s[:off]
		}
		seen++
	}
	return s // fewer than n+1 runes: the whole string qualifies
}
// StrRight implements RIGHT(cString, nLen): the last n characters of s.
//
// n <= 0 yields ""; an n at or beyond the length yields s itself. Units are
// runes when the active charset is UTF-8 and bytes otherwise. In UTF-8 mode
// the suffix is cut at a byte offset rather than via a []rune round trip, so
// the returned bytes are exactly those stored in s (invalid sequences are
// not rewritten to U+FFFD) and nothing is allocated.
func StrRight(s string, n int) string {
	if n <= 0 {
		return ""
	}
	if n >= len(s) {
		// Valid in both modes: a string never has more runes than bytes.
		return s
	}
	if !charsetIsUTF8() {
		return s[len(s)-n:]
	}
	// Count forward (the same segmentation LEN uses) and drop the leading
	// runes that do not belong to the suffix.
	skip := utf8.RuneCountInString(s) - n
	if skip <= 0 {
		return s
	}
	for off := range s {
		if skip == 0 {
			return s[off:]
		}
		skip--
	}
	return ""
}
// StrAt implements AT(cSearch, cTarget): the 1-based position of the first
// occurrence of search inside target, or 0 when it does not occur.
//
// An empty search string never matches and yields 0, as in Harbour;
// strings.Index alone would report offset 0 for it and turn that into
// position 1. The position is counted in runes when the active charset is
// UTF-8 and in bytes otherwise.
func StrAt(search, target string) int {
	if search == "" {
		return 0
	}
	idx := strings.Index(target, search)
	if idx < 0 {
		return 0
	}
	if charsetIsUTF8() {
		return utf8.RuneCountInString(target[:idx]) + 1
	}
	return idx + 1
}
// StrPadR implements PADR(cString, nLen): s padded on the right with spaces
// to n characters. When s already has n or more characters (per StrLen) it is
// cut to its leftmost n characters via StrLeft.
func StrPadR(s string, n int) string {
	width := StrLen(s)
	if width < n {
		return s + Spaces(n-width)
	}
	return StrLeft(s, n)
}
// StrPadL implements PADL(cString, nLen [, cFill]): s padded on the left to n
// characters. When s already has n or more characters (per StrLen) it is cut
// to its rightmost n characters via StrRight.
//
// The pad unit is the first character of fill — first rune in UTF-8 mode,
// first byte otherwise — or a space when fill is empty.
func StrPadL(s string, n int, fill string) string {
	width := StrLen(s)
	if width >= n {
		return StrRight(s, n)
	}

	pad := " "
	switch {
	case fill == "":
		// keep the default space
	case charsetIsUTF8():
		pad = string([]rune(fill)[:1])
	default:
		pad = fill[:1]
	}
	return strings.Repeat(pad, n-width) + s
}
// ── PRG RTL ──────────────────────────────────────────────────────────

// HbGetCharset implements HB_GETCHARSET() → cName: it returns the active
// charset name (GetCharset) to the caller.
func HbGetCharset(t *hbrt.Thread) {
	t.Frame(0, 0)
	defer t.EndProcFast()

	current := GetCharset()
	t.PushString(current)
	t.RetValue()
}
// HbSetCharset implements HB_SETCHARSET([cName]) → cPrev. The first argument,
// when supplied, is passed to SetCharset; with no argument an empty name is
// passed, which SetCharset treats as a query. The previous charset name is
// returned either way.
func HbSetCharset(t *hbrt.Thread) {
	argc := t.ParamCount()
	t.Frame(argc, 0)
	defer t.EndProcFast()

	var requested string
	if argc >= 1 {
		requested = t.Local(1).AsString()
	}
	previous := SetCharset(requested)
	t.PushString(previous)
	t.RetValue()
}
// HbTranslate implements HB_TRANSLATE(cStr, cFrom, cTo) → cConverted. The
// string is decoded from cFrom with DecodeToUTF8 and the result re-encoded
// to cTo with EncodeFromUTF8.
func HbTranslate(t *hbrt.Thread) {
	t.Frame(3, 0)
	defer t.EndProcFast()

	src := t.Local(1).AsString()
	fromName := t.Local(2).AsString()
	toName := t.Local(3).AsString()

	decoded := DecodeToUTF8([]byte(src), fromName)
	encoded := EncodeFromUTF8(decoded, toName)
	t.PushString(string(encoded))
	t.RetValue()
}

View File

@@ -10,6 +10,7 @@ import (
"math"
"os"
"strings"
"unicode/utf8"
)
// --- String functions ---
@@ -21,63 +22,101 @@ func At(t *hbrt.Thread) {
search := t.Local(1).AsString()
target := t.Local(2).AsString()
idx := strings.Index(target, search)
if idx >= 0 {
t.RetInt(int64(idx + 1))
} else {
if idx < 0 {
t.RetInt(0)
return
}
if charsetIsUTF8() {
t.RetInt(int64(utf8.RuneCountInString(target[:idx]) + 1)) // 문자 위치
} else {
t.RetInt(int64(idx + 1)) // 바이트 위치
}
}
// Left returns leftmost n characters.
// Left returns leftmost n characters (UTF-8: rune 단위, 레거시: 바이트).
func Left(t *hbrt.Thread) {
t.Frame(2, 0)
defer t.EndProcFast()
s := t.Local(1).AsString()
n := int(t.Local(2).AsNumInt())
if n >= len(s) {
t.PushString(s)
} else if n <= 0 {
t.PushString("")
if charsetIsUTF8() {
rs := []rune(s)
if n >= len(rs) {
t.PushString(s)
} else if n <= 0 {
t.PushString("")
} else {
t.PushString(string(rs[:n]))
}
} else {
t.PushString(s[:n])
if n >= len(s) {
t.PushString(s)
} else if n <= 0 {
t.PushString("")
} else {
t.PushString(s[:n])
}
}
t.RetValue()
}
// Right returns rightmost n characters.
// Right returns rightmost n characters (UTF-8: rune 단위, 레거시: 바이트).
func Right(t *hbrt.Thread) {
t.Frame(2, 0)
defer t.EndProcFast()
s := t.Local(1).AsString()
n := int(t.Local(2).AsNumInt())
if n >= len(s) {
t.PushString(s)
} else if n <= 0 {
t.PushString("")
if charsetIsUTF8() {
rs := []rune(s)
if n >= len(rs) {
t.PushString(s)
} else if n <= 0 {
t.PushString("")
} else {
t.PushString(string(rs[len(rs)-n:]))
}
} else {
t.PushString(s[len(s)-n:])
if n >= len(s) {
t.PushString(s)
} else if n <= 0 {
t.PushString("")
} else {
t.PushString(s[len(s)-n:])
}
}
t.RetValue()
}
// Asc returns ASCII code of first character.
// Asc returns the code of the first character (UTF-8: codepoint, 레거시: byte).
func Asc(t *hbrt.Thread) {
t.Frame(1, 0)
defer t.EndProcFast()
s := t.Local(1).AsString()
if len(s) > 0 {
t.RetInt(int64(s[0]))
} else {
if len(s) == 0 {
t.RetInt(0)
return
}
if charsetIsUTF8() {
r, _ := utf8.DecodeRuneInString(s)
t.RetInt(int64(r))
} else {
t.RetInt(int64(s[0]))
}
}
// Chr returns character from ASCII code.
// Chr returns the character for a code (UTF-8: codepoint→UTF-8, 레거시: 1 byte).
func Chr(t *hbrt.Thread) {
t.Frame(1, 0)
defer t.EndProcFast()
n := int(t.Local(1).AsNumInt())
t.PushString(string([]byte{byte(n)}))
if charsetIsUTF8() {
if n < 0 {
n = 0
}
t.PushString(string(rune(n))) // 코드포인트 → UTF-8
} else {
t.PushString(string([]byte{byte(n)}))
}
t.RetValue()
}

View File

@@ -64,12 +64,16 @@ func HbATokens(t *hbrt.Thread) {
}
// hb_cdpSelect([cCodepage]) → cPrevCodepage
// Stub: Five uses UTF-8 internally, codepage selection is a no-op.
// 활성 charset 을 설정/조회한다. 기본은 UTF-8. (charset.go 참고)
func HbCdpSelect(t *hbrt.Thread) {
nParams := t.ParamCount()
t.Frame(nParams, 0)
defer t.EndProcFast()
t.RetString("")
name := ""
if nParams >= 1 {
name = t.Local(1).AsString()
}
t.RetString(SetCharset(name))
}
// Used() → lUsed — checks if current workarea is in use

View File

@@ -93,6 +93,41 @@ func HbRegexAll(t *hbrt.Thread) {
t.RetVal(hbrt.MakeArrayFrom(items))
}
// HB_REGEX(cPattern|pRegex, cString [, lCaseSensitive]) → aSubmatches
//
// HbRegex returns the first match together with its capture groups, exactly
// as produced by Go's regexp FindStringSubmatch: in PRG terms result[1] is
// the whole match and result[2..] are the groups. When getRegex yields no
// regex, or the string does not match, the result is an empty array rather
// than Nil.
//
// This is the array-of-submatches flavour of Harbour's hb_regex(p, s).
// NOTE(review): the third getRegex argument is `nParams >= 3`; presumably it
// signals that the optional lCaseSensitive argument was supplied — confirm
// against getRegex.
func HbRegex(t *hbrt.Thread) {
	argc := t.ParamCount()
	t.Frame(argc, 0)
	defer t.EndProc()

	re := getRegex(t, 1, argc >= 3)
	if re == nil {
		t.RetVal(hbrt.MakeArrayFrom(nil))
		return
	}

	groups := re.FindStringSubmatch(t.Local(2).AsString())
	if groups == nil {
		t.RetVal(hbrt.MakeArrayFrom(nil))
		return
	}

	result := make([]hbrt.Value, len(groups))
	for idx := range groups {
		result[idx] = hbrt.MakeString(groups[idx])
	}
	t.RetVal(hbrt.MakeArrayFrom(result))
}
// HB_REGEXREPLACE(cPattern|pRegex, cString, cReplace [, lCaseSensitive]) → cResult
func HbRegexReplace(t *hbrt.Thread) {
nParams := t.ParamCount()

View File

@@ -233,6 +233,7 @@ func RegisterRTL(vm *hbrt.VM) {
// Regex (Go regexp)
hbrt.Sym("HB_REGEXCOMP", hbrt.FsPublic, HbRegexComp),
hbrt.Sym("HB_REGEX", hbrt.FsPublic, HbRegex),
hbrt.Sym("HB_REGEXMATCH", hbrt.FsPublic, HbRegexMatch),
hbrt.Sym("HB_REGEXSPLIT", hbrt.FsPublic, HbRegexSplit),
hbrt.Sym("HB_REGEXALL", hbrt.FsPublic, HbRegexAll),
@@ -446,6 +447,9 @@ func RegisterRTL(vm *hbrt.VM) {
hbrt.Sym("HB_SECOND", hbrt.FsPublic, HbSecond),
hbrt.Sym("HB_ATOKENS", hbrt.FsPublic, HbATokens),
hbrt.Sym("HB_CDPSELECT", hbrt.FsPublic, HbCdpSelect),
hbrt.Sym("HB_GETCHARSET", hbrt.FsPublic, HbGetCharset),
hbrt.Sym("HB_SETCHARSET", hbrt.FsPublic, HbSetCharset),
hbrt.Sym("HB_TRANSLATE", hbrt.FsPublic, HbTranslate),
hbrt.Sym("DBSETINDEX", hbrt.FsPublic, rtlDbSetIndex),
hbrt.Sym("HB_TTOS", hbrt.FsPublic, HbTToS),
hbrt.Sym("HB_STOT", hbrt.FsPublic, HbSToT),

View File

@@ -10,6 +10,7 @@ import (
"fmt"
"math"
"strings"
"unicode/utf8"
)
// spacesCache: pre-built space strings for common pad sizes.
@@ -134,7 +135,11 @@ func Len(t *hbrt.Thread) {
v := t.Local(1)
switch {
case v.IsString():
t.RetInt(int64(v.StringLen()))
if charsetIsUTF8() {
t.RetInt(int64(utf8.RuneCountInString(v.AsString()))) // 문자(rune) 수
} else {
t.RetInt(int64(v.StringLen())) // 바이트 수(레거시 charset)
}
case v.IsArray():
t.RetInt(int64(len(v.AsArray().Items)))
case v.IsHash():
@@ -161,6 +166,37 @@ func SubStr(t *hbrt.Thread) {
s := v.AsString()
start := int(t.Local(2).AsNumInt())
// UTF-8(기본): 문자(rune) 단위. 레거시 charset: 바이트 단위.
if charsetIsUTF8() {
rs := []rune(s)
n := len(rs)
if start < 0 {
start = n + start + 1
}
if start < 1 {
start = 1
}
start--
if start >= n {
t.PushString("")
t.RetValue()
return
}
res := rs[start:]
if nParams >= 3 && !t.Local(3).IsNil() {
nLen := int(t.Local(3).AsNumInt())
if nLen < 0 {
nLen = 0
}
if nLen < len(res) {
res = res[:nLen]
}
}
t.PushString(string(res))
t.RetValue()
return
}
// Harbour: 1-based index, negative = from end
if start < 0 {
start = len(s) + start + 1